diff options
Diffstat (limited to 'usr/src')
201 files changed, 58700 insertions, 91699 deletions
diff --git a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c index 96bcec530c..1919d21356 100644 --- a/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c +++ b/usr/src/cmd/cmd-inet/usr.bin/netstat/netstat.c @@ -196,6 +196,7 @@ static void ire_report(const mib_item_t *item); static void tcp_report(const mib_item_t *item); static void udp_report(const mib_item_t *item); static void group_report(mib_item_t *item); +static void dce_report(mib_item_t *item); static void print_ip_stats(mib2_ip_t *ip); static void print_icmp_stats(mib2_icmp_t *icmp); static void print_ip6_stats(mib2_ipv6IfStatsEntry_t *ip6); @@ -236,7 +237,7 @@ static void fatal(int errcode, char *str1, ...); static boolean_t Aflag = B_FALSE; /* All sockets/ifs/rtng-tbls */ -static boolean_t Dflag = B_FALSE; /* Debug Info */ +static boolean_t Dflag = B_FALSE; /* DCE info */ static boolean_t Iflag = B_FALSE; /* IP Traffic Interfaces */ static boolean_t Mflag = B_FALSE; /* STREAMS Memory Statistics */ static boolean_t Nflag = B_FALSE; /* Numeric Network Addresses */ @@ -248,6 +249,7 @@ static boolean_t Pflag = B_FALSE; /* Net to Media Tables */ static boolean_t Gflag = B_FALSE; /* Multicast group membership */ static boolean_t MMflag = B_FALSE; /* Multicast routing table */ static boolean_t DHCPflag = B_FALSE; /* DHCP statistics */ +static boolean_t Xflag = B_FALSE; /* Debug Info */ static int v4compat = 0; /* Compatible printing format for status */ @@ -276,6 +278,8 @@ static int ipv6NetToMediaEntrySize; static int ipv6MemberEntrySize; static int ipv6GroupSourceEntrySize; +static int ipDestEntrySize; + static int transportMLPSize; static int tcpConnEntrySize; static int tcp6ConnEntrySize; @@ -298,7 +302,7 @@ static m_label_t *zone_security_label = NULL; /* Flags on routes */ #define FLF_A 0x00000001 -#define FLF_B 0x00000002 +#define FLF_b 0x00000002 #define FLF_D 0x00000004 #define FLF_G 0x00000008 #define FLF_H 0x00000010 @@ -306,7 +310,12 @@ static m_label_t 
*zone_security_label = NULL; #define FLF_U 0x00000040 #define FLF_M 0x00000080 #define FLF_S 0x00000100 -static const char flag_list[] = "ABDGHLUMS"; +#define FLF_C 0x00000200 /* IRE_IF_CLONE */ +#define FLF_I 0x00000400 /* RTF_INDIRECT */ +#define FLF_R 0x00000800 /* RTF_REJECT */ +#define FLF_B 0x00001000 /* RTF_BLACKHOLE */ + +static const char flag_list[] = "AbDGHLUMSCIRB"; typedef struct filter_rule filter_t; @@ -379,14 +388,15 @@ main(int argc, char **argv) (void) setlocale(LC_ALL, ""); (void) textdomain(TEXT_DOMAIN); - while ((c = getopt(argc, argv, "adimnrspMgvf:P:I:DRT:")) != -1) { + while ((c = getopt(argc, argv, "adimnrspMgvxf:P:I:DRT:")) != -1) { switch ((char)c) { case 'a': /* all connections */ Aflag = B_TRUE; break; - case 'd': /* turn on debugging */ + case 'd': /* DCE info */ Dflag = B_TRUE; + IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */ break; case 'i': /* interface (ill/ipif report) */ @@ -438,6 +448,10 @@ main(int argc, char **argv) IFLAGMOD(Iflag_only, 1, 0); /* see macro def'n */ break; + case 'x': /* turn on debugging */ + Xflag = B_TRUE; + break; + case 'f': process_filter(optarg); break; @@ -603,7 +617,7 @@ main(int argc, char **argv) mib_item_destroy(&previtem); } - if (!(Iflag || Rflag || Sflag || Mflag || + if (!(Dflag || Iflag || Rflag || Sflag || Mflag || MMflag || Pflag || Gflag || DHCPflag)) { if (protocol_selected(IPPROTO_UDP)) udp_report(item); @@ -634,12 +648,14 @@ main(int argc, char **argv) if (family_selected(AF_INET6)) ndp_report(item); } + if (Dflag) + dce_report(item); mib_item_destroy(&curritem); } /* netstat: AF_UNIX behaviour */ if (family_selected(AF_UNIX) && - (!(Iflag || Rflag || Sflag || Mflag || + (!(Dflag || Iflag || Rflag || Sflag || Mflag || MMflag || Pflag || Gflag))) unixpr(kc); (void) kstat_close(kc); @@ -729,7 +745,7 @@ mibget(int sd) * us information concerning IRE_MARK_TESTHIDDEN routes. 
*/ req = (struct opthdr *)&tor[1]; - req->level = EXPER_IP_AND_TESTHIDDEN; + req->level = EXPER_IP_AND_ALL_IRES; req->name = 0; req->len = 0; @@ -755,7 +771,7 @@ mibget(int sd) getcode = getmsg(sd, &ctlbuf, (struct strbuf *)0, &flags); if (getcode == -1) { perror("mibget getmsg(ctl) failed"); - if (Dflag) { + if (Xflag) { (void) fputs("# level name len\n", stderr); i = 0; @@ -774,7 +790,7 @@ mibget(int sd) toa->PRIM_type == T_OPTMGMT_ACK && toa->MGMT_flags == T_SUCCESS && req->len == 0) { - if (Dflag) + if (Xflag) (void) printf("mibget getmsg() %d returned " "EOD (level %ld, name %ld)\n", j, req->level, req->name); @@ -826,7 +842,7 @@ mibget(int sd) last_item->valp = malloc((int)req->len); if (last_item->valp == NULL) goto error_exit; - if (Dflag) + if (Xflag) (void) printf("msg %d: group = %4d mib_id = %5d" "length = %d\n", j, last_item->group, last_item->mib_id, @@ -1754,6 +1770,7 @@ mib_get_constants(mib_item_t *item) ipGroupSourceEntrySize = ip->ipGroupSourceEntrySize; ipRouteAttributeSize = ip->ipRouteAttributeSize; transportMLPSize = ip->transportMLPSize; + ipDestEntrySize = ip->ipDestEntrySize; assert(IS_P2ALIGNED(ipAddrEntrySize, sizeof (mib2_ipAddrEntry_t *))); assert(IS_P2ALIGNED(ipRouteEntrySize, @@ -1850,7 +1867,7 @@ mib_get_constants(mib_item_t *item) } } /* 'for' loop 1 ends */ - if (Dflag) { + if (Xflag) { (void) puts("mib_get_constants:"); (void) printf("\tipv6IfStatsEntrySize %d\n", ipv6IfStatsEntrySize); @@ -1872,6 +1889,7 @@ mib_get_constants(mib_item_t *item) ipv6MemberEntrySize); (void) printf("\tipv6IfIcmpEntrySize %d\n", ipv6IfIcmpEntrySize); + (void) printf("\tipDestEntrySize %d\n", ipDestEntrySize); (void) printf("\ttransportMLPSize %d\n", transportMLPSize); (void) printf("\ttcpConnEntrySize %d\n", tcpConnEntrySize); (void) printf("\ttcp6ConnEntrySize %d\n", tcp6ConnEntrySize); @@ -1895,7 +1913,7 @@ stat_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- 
Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2542,7 +2560,7 @@ mrt_stat_report(mib_item_t *curritem) for (tempitem = curritem; tempitem; tempitem = tempitem->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2603,7 +2621,7 @@ if_report(mib_item_t *item, char *matchname, /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -2632,7 +2650,7 @@ if_report(mib_item_t *item, char *matchname, boolean_t first = B_TRUE; uint32_t new_ifindex; - if (Dflag) + if (Xflag) (void) printf("if_report: %d items\n", (item->length) / sizeof (mib2_ipAddrEntry_t)); @@ -2944,7 +2962,7 @@ if_report(mib_item_t *item, char *matchname, boolean_t first = B_TRUE; uint32_t new_ifindex; - if (Dflag) + if (Xflag) (void) printf("if_report: %d items\n", (item->length) / sizeof (mib2_ipv6AddrEntry_t)); @@ -3287,10 +3305,10 @@ if_report_ip4(mib2_ipAddrEntry_t *ap, (void) pr_netaddr(ap->ipAdEntAddr, ap->ipAdEntNetMask, abuf, sizeof (abuf)); - (void) printf("%-13s %-14s %-6llu %-5s %-6llu " + (void) printf("%-13s %-14s %-6llu %-5s %-6s " "%-5s %-6s %-6llu\n", abuf, pr_addr(ap->ipAdEntAddr, dstbuf, sizeof (dstbuf)), - statptr->ipackets, "N/A", statptr->opackets, "N/A", "N/A", + statptr->ipackets, "N/A", "N/A", "N/A", "N/A", 0LL); } } @@ -3337,11 +3355,10 @@ if_report_ip6(mib2_ipv6AddrEntry_t *ap6, else (void) pr_prefix6(&ap6->ipv6AddrAddress, ap6->ipv6AddrPfxLength, abuf, sizeof (abuf)); - (void) printf("%-27s %-27s %-6llu %-5s %-6llu %-5s %-6s\n", + (void) printf("%-27s %-27s %-6llu %-5s %-6s %-5s %-6s\n", abuf, pr_addr6(&ap6->ipv6AddrAddress, dstbuf, sizeof (dstbuf)), - statptr->ipackets, "N/A", - statptr->opackets, "N/A", "N/A"); + statptr->ipackets, "N/A", "N/A", "N/A", 
"N/A"); } } @@ -3490,7 +3507,7 @@ group_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3501,12 +3518,12 @@ group_report(mib_item_t *item) switch (item->mib_id) { case EXPER_IP_GROUP_MEMBERSHIP: v4grp = item; - if (Dflag) + if (Xflag) (void) printf("item is v4grp info\n"); break; case EXPER_IP_GROUP_SOURCES: v4src = item; - if (Dflag) + if (Xflag) (void) printf("item is v4src info\n"); break; default: @@ -3518,12 +3535,12 @@ group_report(mib_item_t *item) switch (item->mib_id) { case EXPER_IP6_GROUP_MEMBERSHIP: v6grp = item; - if (Dflag) + if (Xflag) (void) printf("item is v6grp info\n"); break; case EXPER_IP6_GROUP_SOURCES: v6src = item; - if (Dflag) + if (Xflag) (void) printf("item is v6src info\n"); break; default: @@ -3533,7 +3550,7 @@ group_report(mib_item_t *item) } if (family_selected(AF_INET) && v4grp != NULL) { - if (Dflag) + if (Xflag) (void) printf("%u records for ipGroupMember:\n", v4grp->length / sizeof (ip_member_t)); @@ -3564,7 +3581,7 @@ group_report(mib_item_t *item) if (!Vflag || v4src == NULL) continue; - if (Dflag) + if (Xflag) (void) printf("scanning %u ipGroupSource " "records...\n", v4src->length/sizeof (ip_grpsrc_t)); @@ -3609,7 +3626,7 @@ group_report(mib_item_t *item) } if (family_selected(AF_INET6) && v6grp != NULL) { - if (Dflag) + if (Xflag) (void) printf("%u records for ipv6GroupMember:\n", v6grp->length / sizeof (ipv6_member_t)); @@ -3638,7 +3655,7 @@ group_report(mib_item_t *item) if (!Vflag || v6src == NULL) continue; - if (Dflag) + if (Xflag) (void) printf("scanning %u ipv6GroupSource " "records...\n", v6src->length/sizeof (ipv6_grpsrc_t)); @@ -3683,6 +3700,126 @@ group_report(mib_item_t *item) (void) fflush(stdout); } +/* --------------------- DCE_REPORT (netstat -d) ------------------------- */ + +#define FLBUFSIZE 8 + +/* Assumes flbuf is at 
least 5 characters; callers use FLBUFSIZE */ +static char * +dceflags2str(uint32_t flags, char *flbuf) +{ + char *str = flbuf; + + if (flags & DCEF_DEFAULT) + *str++ = 'D'; + if (flags & DCEF_PMTU) + *str++ = 'P'; + if (flags & DCEF_UINFO) + *str++ = 'U'; + if (flags & DCEF_TOO_SMALL_PMTU) + *str++ = 'S'; + *str++ = '\0'; + return (flbuf); +} + +static void +dce_report(mib_item_t *item) +{ + mib_item_t *v4dce = NULL; + mib_item_t *v6dce = NULL; + int jtemp = 0; + char ifname[LIFNAMSIZ + 1]; + char abuf[MAXHOSTNAMELEN + 1]; + char flbuf[FLBUFSIZE]; + boolean_t first; + dest_cache_entry_t *dce; + + /* 'for' loop 1: */ + for (; item; item = item->next_item) { + if (Xflag) { + (void) printf("\n--- Entry %d ---\n", ++jtemp); + (void) printf("Group = %d, mib_id = %d, " + "length = %d, valp = 0x%p\n", + item->group, item->mib_id, item->length, + item->valp); + } + if (item->group == MIB2_IP && family_selected(AF_INET) && + item->mib_id == EXPER_IP_DCE) { + v4dce = item; + if (Xflag) + (void) printf("item is v4dce info\n"); + } + if (item->group == MIB2_IP6 && family_selected(AF_INET6) && + item->mib_id == EXPER_IP_DCE) { + v6dce = item; + if (Xflag) + (void) printf("item is v6dce info\n"); + } + } + + if (family_selected(AF_INET) && v4dce != NULL) { + if (Xflag) + (void) printf("%u records for DestCacheEntry:\n", + v4dce->length / ipDestEntrySize); + + first = B_TRUE; + for (dce = (dest_cache_entry_t *)v4dce->valp; + (char *)dce < (char *)v4dce->valp + v4dce->length; + /* LINTED: (note 1) */ + dce = (dest_cache_entry_t *)((char *)dce + + ipDestEntrySize)) { + if (first) { + (void) putchar('\n'); + (void) puts("Destination Cache Entries: IPv4"); + (void) puts( + "Address PMTU Age Flags"); + (void) puts( + "-------------------- ------ ----- -----"); + first = B_FALSE; + } + + (void) printf("%-20s %6u %5u %-5s\n", + pr_addr(dce->DestIpv4Address, abuf, sizeof (abuf)), + dce->DestPmtu, dce->DestAge, + dceflags2str(dce->DestFlags, flbuf)); + } + } + + if 
(family_selected(AF_INET6) && v6dce != NULL) { + if (Xflag) + (void) printf("%u records for DestCacheEntry:\n", + v6dce->length / ipDestEntrySize); + + first = B_TRUE; + for (dce = (dest_cache_entry_t *)v6dce->valp; + (char *)dce < (char *)v6dce->valp + v6dce->length; + /* LINTED: (note 1) */ + dce = (dest_cache_entry_t *)((char *)dce + + ipDestEntrySize)) { + if (first) { + (void) putchar('\n'); + (void) puts("Destination Cache Entries: IPv6"); + (void) puts( + "Address PMTU " + " Age Flags If "); + (void) puts( + "--------------------------- ------ " + "----- ----- ---"); + first = B_FALSE; + } + + (void) printf("%-27s %6u %5u %-5s %s\n", + pr_addr6(&dce->DestIpv6Address, abuf, + sizeof (abuf)), + dce->DestPmtu, dce->DestAge, + dceflags2str(dce->DestFlags, flbuf), + dce->DestIfindex == 0 ? "" : + ifindex2str(dce->DestIfindex, ifname)); + } + } + (void) fflush(stdout); +} + /* --------------------- ARP_REPORT (netstat -p) -------------------------- */ static void @@ -3703,7 +3840,7 @@ arp_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3713,7 +3850,7 @@ arp_report(mib_item_t *item) if (!(item->group == MIB2_IP && item->mib_id == MIB2_IP_MEDIA)) continue; /* 'for' loop 1 */ - if (Dflag) + if (Xflag) (void) printf("%u records for " "ipNetToMediaEntryTable:\n", item->length/sizeof (mib2_ipNetToMediaEntry_t)); @@ -3798,7 +3935,7 @@ ndp_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3973,7 +4110,7 @@ ire_report(const mib_item_t *item) v4a = v4_attrs; v6a = v6_attrs; for (; item != NULL; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) 
printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -3991,7 +4128,7 @@ ire_report(const mib_item_t *item) else if (item->group == MIB2_IP6 && !family_selected(AF_INET6)) continue; /* 'for' loop 1 */ - if (Dflag) { + if (Xflag) { if (item->group == MIB2_IP) { (void) printf("%u records for " "ipRouteEntryTable:\n", @@ -4161,29 +4298,29 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags) flag_b = FLF_U; (void) strcpy(flags, "U"); - if (rp->ipRouteInfo.re_ire_type == IRE_DEFAULT || - rp->ipRouteInfo.re_ire_type == IRE_PREFIX || - rp->ipRouteInfo.re_ire_type == IRE_HOST || - rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (rp->ipRouteInfo.re_flags & RTF_INDIRECT) { + (void) strcat(flags, "I"); + flag_b |= FLF_I; + } else if (rp->ipRouteInfo.re_ire_type & IRE_OFFLINK) { (void) strcat(flags, "G"); flag_b |= FLF_G; } - if (rp->ipRouteMask == IP_HOST_MASK) { + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (rp->ipRouteInfo.re_ire_type & IRE_IF_CLONE) { + (void) strcat(flags, "C"); + flag_b |= FLF_C; + } else if (rp->ipRouteMask == IP_HOST_MASK) { (void) strcat(flags, "H"); flag_b |= FLF_H; } - if (rp->ipRouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + if (rp->ipRouteInfo.re_flags & RTF_DYNAMIC) { (void) strcat(flags, "D"); flag_b |= FLF_D; } - if (rp->ipRouteInfo.re_ire_type == IRE_CACHE) { - /* Address resolution */ - (void) strcat(flags, "A"); - flag_b |= FLF_A; - } if (rp->ipRouteInfo.re_ire_type == IRE_BROADCAST) { /* Broadcast */ - (void) strcat(flags, "B"); - flag_b |= FLF_B; + (void) strcat(flags, "b"); + flag_b |= FLF_b; } if (rp->ipRouteInfo.re_ire_type == IRE_LOCAL) { /* Local */ (void) strcat(flags, "L"); @@ -4197,6 +4334,14 @@ form_v4_route_flags(const mib2_ipRouteEntry_t *rp, char *flags) (void) strcat(flags, "S"); /* Setsrc */ flag_b |= FLF_S; } + if (rp->ipRouteInfo.re_flags & RTF_REJECT) { + (void) strcat(flags, "R"); + flag_b |= FLF_R; + } 
+ if (rp->ipRouteInfo.re_flags & RTF_BLACKHOLE) { + (void) strcat(flags, "B"); + flag_b |= FLF_B; + } return (flag_b); } @@ -4205,9 +4350,9 @@ static const char ire_hdr_v4[] = static const char ire_hdr_v4_compat[] = "\n%s Table:\n"; static const char ire_hdr_v4_verbose[] = -" Destination Mask Gateway Device Mxfrg " -"Rtt Ref Flg Out In/Fwd %s\n" -"-------------------- --------------- -------------------- ------ ----- " +" Destination Mask Gateway Device " +" MTU Ref Flg Out In/Fwd %s\n" +"-------------------- --------------- -------------------- ------ " "----- --- --- ----- ------ %s\n"; static const char ire_hdr_v4_normal[] = @@ -4226,8 +4371,10 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first, char flags[10]; /* RTF_ flags */ uint_t flag_b; - if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_CACHE && + if (!(Aflag || (rp->ipRouteInfo.re_ire_type != IRE_IF_CLONE && rp->ipRouteInfo.re_ire_type != IRE_BROADCAST && + rp->ipRouteInfo.re_ire_type != IRE_MULTICAST && + rp->ipRouteInfo.re_ire_type != IRE_NOROUTE && rp->ipRouteInfo.re_ire_type != IRE_LOCAL))) { return (first); } @@ -4253,15 +4400,13 @@ ire_report_item_v4(const mib2_ipRouteEntry_t *rp, boolean_t first, dstbuf, sizeof (dstbuf)); } if (Vflag) { - (void) printf("%-20s %-15s %-20s %-6s %5u%c %4u %3u " + (void) printf("%-20s %-15s %-20s %-6s %5u %3u " "%-4s%6u %6u %s\n", dstbuf, pr_mask(rp->ipRouteMask, maskbuf, sizeof (maskbuf)), pr_addrnz(rp->ipRouteNextHop, gwbuf, sizeof (gwbuf)), octetstr(&rp->ipRouteIfIndex, 'a', ifname, sizeof (ifname)), rp->ipRouteInfo.re_max_frag, - rp->ipRouteInfo.re_frag_flag ? 
'*' : ' ', - rp->ipRouteInfo.re_rtt, rp->ipRouteInfo.re_ref, flags, rp->ipRouteInfo.re_obpkt, @@ -4391,58 +4536,39 @@ ire_filter_match_v6(const mib2_ipv6RouteEntry_t *rp6, uint_t flag_b) return (B_TRUE); } -static const char ire_hdr_v6[] = -"\n%s Table: IPv6\n"; -static const char ire_hdr_v6_verbose[] = -" Destination/Mask Gateway If PMTU Rtt " -"Ref Flags Out In/Fwd %s\n" -"--------------------------- --------------------------- ----- ------ ----- " -"--- ----- ------ ------ %s\n"; -static const char ire_hdr_v6_normal[] = -" Destination/Mask Gateway Flags Ref Use " -" If %s\n" -"--------------------------- --------------------------- ----- --- ------- " -"----- %s\n"; - -static boolean_t -ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, - const sec_attr_list_t *attrs) +/* + * Given an IPv6 MIB2 route entry, form the list of flags for the + * route. + */ +static uint_t +form_v6_route_flags(const mib2_ipv6RouteEntry_t *rp6, char *flags) { - char dstbuf[MAXHOSTNAMELEN + 1]; - char gwbuf[MAXHOSTNAMELEN + 1]; - char ifname[LIFNAMSIZ + 1]; - char flags[10]; /* RTF_ flags */ - uint_t flag_b; - - if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_CACHE && - rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) { - return (first); - } + uint_t flag_b; flag_b = FLF_U; (void) strcpy(flags, "U"); - if (rp6->ipv6RouteInfo.re_ire_type == IRE_DEFAULT || - rp6->ipv6RouteInfo.re_ire_type == IRE_PREFIX || - rp6->ipv6RouteInfo.re_ire_type == IRE_HOST || - rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (rp6->ipv6RouteInfo.re_flags & RTF_INDIRECT) { + (void) strcat(flags, "I"); + flag_b |= FLF_I; + } else if (rp6->ipv6RouteInfo.re_ire_type & IRE_OFFLINK) { (void) strcat(flags, "G"); flag_b |= FLF_G; } - if (rp6->ipv6RoutePfxLength == IPV6_ABITS) { + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (rp6->ipv6RouteInfo.re_ire_type & IRE_IF_CLONE) { + (void) strcat(flags, "C"); + 
flag_b |= FLF_C; + } else if (rp6->ipv6RoutePfxLength == IPV6_ABITS) { (void) strcat(flags, "H"); flag_b |= FLF_H; } - if (rp6->ipv6RouteInfo.re_ire_type == IRE_HOST_REDIRECT) { + if (rp6->ipv6RouteInfo.re_flags & RTF_DYNAMIC) { (void) strcat(flags, "D"); flag_b |= FLF_D; } - if (rp6->ipv6RouteInfo.re_ire_type == IRE_CACHE) { - /* Address resolution */ - (void) strcat(flags, "A"); - flag_b |= FLF_A; - } if (rp6->ipv6RouteInfo.re_ire_type == IRE_LOCAL) { /* Local */ (void) strcat(flags, "L"); flag_b |= FLF_L; @@ -4455,6 +4581,48 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, (void) strcat(flags, "S"); /* Setsrc */ flag_b |= FLF_S; } + if (rp6->ipv6RouteInfo.re_flags & RTF_REJECT) { + (void) strcat(flags, "R"); + flag_b |= FLF_R; + } + if (rp6->ipv6RouteInfo.re_flags & RTF_BLACKHOLE) { + (void) strcat(flags, "B"); + flag_b |= FLF_B; + } + return (flag_b); +} + +static const char ire_hdr_v6[] = +"\n%s Table: IPv6\n"; +static const char ire_hdr_v6_verbose[] = +" Destination/Mask Gateway If MTU " +"Ref Flags Out In/Fwd %s\n" +"--------------------------- --------------------------- ----- ----- " +"--- ----- ------ ------ %s\n"; +static const char ire_hdr_v6_normal[] = +" Destination/Mask Gateway Flags Ref Use " +" If %s\n" +"--------------------------- --------------------------- ----- --- ------- " +"----- %s\n"; + +static boolean_t +ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, + const sec_attr_list_t *attrs) +{ + char dstbuf[MAXHOSTNAMELEN + 1]; + char gwbuf[MAXHOSTNAMELEN + 1]; + char ifname[LIFNAMSIZ + 1]; + char flags[10]; /* RTF_ flags */ + uint_t flag_b; + + if (!(Aflag || (rp6->ipv6RouteInfo.re_ire_type != IRE_IF_CLONE && + rp6->ipv6RouteInfo.re_ire_type != IRE_MULTICAST && + rp6->ipv6RouteInfo.re_ire_type != IRE_NOROUTE && + rp6->ipv6RouteInfo.re_ire_type != IRE_LOCAL))) { + return (first); + } + + flag_b = form_v6_route_flags(rp6, flags); if (!ire_filter_match_v6(rp6, flag_b)) return (first); @@ -4468,7 
+4636,7 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, } if (Vflag) { - (void) printf("%-27s %-27s %-5s %5u%c %5u %3u " + (void) printf("%-27s %-27s %-5s %5u %3u " "%-5s %6u %6u %s\n", pr_prefix6(&rp6->ipv6RouteDest, rp6->ipv6RoutePfxLength, dstbuf, sizeof (dstbuf)), @@ -4478,8 +4646,6 @@ ire_report_item_v6(const mib2_ipv6RouteEntry_t *rp6, boolean_t first, octetstr(&rp6->ipv6RouteIfIndex, 'a', ifname, sizeof (ifname)), rp6->ipv6RouteInfo.re_max_frag, - rp6->ipv6RouteInfo.re_frag_flag ? '*' : ' ', - rp6->ipv6RouteInfo.re_rtt, rp6->ipv6RouteInfo.re_ref, flags, rp6->ipv6RouteInfo.re_obpkt, @@ -4617,7 +4783,7 @@ tcp_report(const mib_item_t *item) v4a = v4_attrs; v6a = v6_attrs; for (; item != NULL; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -4841,7 +5007,7 @@ udp_report(const mib_item_t *item) v6a = v6_attrs; /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -4916,10 +5082,7 @@ udp_report_item_v4(const mib2_udpEntry_t *ude, boolean_t first, "", miudp_state(ude->udpEntryInfo.ue_state, attr)); - /* - * UDP sockets don't have remote attributes, so there's no need to - * print them here. - */ + print_transport_label(attr); return (first); } @@ -4956,10 +5119,7 @@ udp_report_item_v6(const mib2_udp6Entry_t *ude6, boolean_t first, miudp_state(ude6->udp6EntryInfo.ue_state, attr), ifnamep == NULL ? "" : ifnamep); - /* - * UDP sockets don't have remote attributes, so there's no need to - * print them here. 
- */ + print_transport_label(attr); return (first); } @@ -5321,7 +5481,7 @@ mrt_report(mib_item_t *item) /* 'for' loop 1: */ for (; item; item = item->next_item) { - if (Dflag) { + if (Xflag) { (void) printf("\n--- Entry %d ---\n", ++jtemp); (void) printf("Group = %d, mib_id = %d, " "length = %d, valp = 0x%p\n", @@ -5334,7 +5494,7 @@ mrt_report(mib_item_t *item) switch (item->mib_id) { case EXPER_DVMRP_VIF: - if (Dflag) + if (Xflag) (void) printf("%u records for ipVifTable:\n", item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { @@ -5377,7 +5537,7 @@ mrt_report(mib_item_t *item) break; case EXPER_DVMRP_MRT: - if (Dflag) + if (Xflag) (void) printf("%u records for ipMfcTable:\n", item->length/sizeof (struct vifctl)); if (item->length/sizeof (struct vifctl) == 0) { diff --git a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c index 28416c4d7f..c0621996d3 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c +++ b/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_main.c @@ -2875,7 +2875,7 @@ mibwalk(void (*proc)(mib_item_t *)) * us information concerning IRE_MARK_TESTHIDDEN routes. */ req = (struct opthdr *)&tor[1]; - req->level = EXPER_IP_AND_TESTHIDDEN; + req->level = EXPER_IP_AND_ALL_IRES; req->name = 0; req->len = 0; diff --git a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c index b76341e303..2cea11b454 100644 --- a/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c +++ b/usr/src/cmd/cmd-inet/usr.lib/mdnsd/mDNSUNP.c @@ -407,6 +407,15 @@ select_src_ifi_info_solaris(int sockfd, int numifs, if (ifflags & (IFF_NOXMIT | IFF_NOLOCAL | IFF_PRIVATE)) continue; + /* A DHCP client will have IFF_UP set yet the address is zero. 
Ignore */ + if (lifr->lifr_addr.ss_family == AF_INET) { + struct sockaddr_in *sinptr; + + sinptr = (struct sockaddr_in *) &lifr->lifr_addr; + if (sinptr->sin_addr.s_addr == INADDR_ANY) + continue; + } + if (*best_lifr != NULL) { /* * Check if we found a better interface by checking diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c index 506b15a307..868f9ab5e2 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/ifconfig.c @@ -3541,18 +3541,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) Perror2_exit("I_PUSH", IP_MOD_NAME); /* - * Push the ARP module onto the interface stream. IP uses - * this to send resolution requests up to ARP. We need to - * do this before the SLIFNAME ioctl is sent down because - * the interface becomes publicly known as soon as the SLIFNAME - * ioctl completes. Thus some other process trying to bring up - * the interface after SLIFNAME but before we have pushed ARP - * could hang. We pop the module again later if it is not needed. - */ - if (ioctl(ip_fd, I_PUSH, ARP_MOD_NAME) == -1) - Perror2_exit("I_PUSH", ARP_MOD_NAME); - - /* * Prepare to set IFF_IPV4/IFF_IPV6 flags as part of SIOCSLIFNAME. * (At this point in time the kernel also allows an override of the * IFF_CANTCHANGE flags.) @@ -3679,12 +3667,6 @@ ifplumb(const char *linkname, const char *ifname, boolean_t genppa, int af) (void) putchar('\n'); } - /* Check if arp is not actually needed */ - if (lifr.lifr_flags & (IFF_NOARP|IFF_IPV6)) { - if (ioctl(ip_fd, I_POP, 0) == -1) - Perror2_exit("I_POP", ARP_MOD_NAME); - } - /* * Open "/dev/udp" for use as a multiplexor to PLINK the * interface stream under. 
We use "/dev/udp" instead of "/dev/ip" diff --git a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c index 2a4ff60d57..d851dce613 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/ping/ping.c @@ -159,6 +159,7 @@ static int moptions; /* multicast options */ int npackets; /* number of packets to send */ static ushort_t tos; /* type-of-service value */ static int hoplimit = -1; /* time-to-live value */ +static int dontfrag; /* IP*_DONTFRAG */ static int timeout = TIMEOUT; /* timeout value (sec) for probes */ static struct if_entry out_if; /* interface argument */ int ident; /* ID for this ping run */ @@ -268,7 +269,7 @@ main(int argc, char *argv[]) setbuf(stdout, (char *)0); while ((c = getopt(argc, argv, - "abA:c:dF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) { + "abA:c:dDF:G:g:I:i:LlnN:P:p:rRSsTt:UvX:x:Y0123?")) != -1) { switch ((char)c) { case 'A': if (strcmp(optarg, "inet") == 0) { @@ -301,6 +302,10 @@ main(int argc, char *argv[]) options |= SO_DEBUG; break; + case 'D': + dontfrag = 1; + break; + case 'b': bypass = _B_TRUE; break; @@ -1303,8 +1308,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } - if (nexthop != NULL && !use_udp) - set_nexthop(family, ai_nexthop, recv_sock); /* * We always receive on raw icmp socket. But the sending socket can be * raw icmp or udp, depending on the use of -U flag. @@ -1332,9 +1335,6 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } - if (nexthop != NULL) - set_nexthop(family, ai_nexthop, send_sock); - /* * In order to distinguish replies to our UDP probes from * other pings', we need to know our source port number. 
@@ -1368,6 +1368,9 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, send_sock = recv_sock; } + if (nexthop != NULL) + set_nexthop(family, ai_nexthop, send_sock); + int_op = 48 * 1024; if (int_op < datalen) int_op = datalen; @@ -1431,6 +1434,7 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, if (moptions & MULTICAST_TTL) { char_op = hoplimit; + /* Applies to unicast and multicast. */ if (family == AF_INET) { if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_TTL, (char *)&char_op, sizeof (char)) == -1) { @@ -1454,7 +1458,10 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, */ } - /* did the user specify an interface? */ + /* + * did the user specify an interface? + * Applies to unicast, broadcast and multicast. + */ if (moptions & MULTICAST_IF) { struct ifaddrlist *al = NULL; /* interface list */ struct ifaddrlist *my_if; @@ -1496,6 +1503,8 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } if (family == AF_INET) { + struct in_pktinfo pktinfo; + if (setsockopt(send_sock, IPPROTO_IP, IP_MULTICAST_IF, (char *)&my_if->addr.addr, sizeof (struct in_addr)) == -1) { @@ -1504,6 +1513,15 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, strerror(errno)); exit(EXIT_FAILURE); } + bzero(&pktinfo, sizeof (pktinfo)); + pktinfo.ipi_ifindex = my_if->index; + if (setsockopt(send_sock, IPPROTO_IP, IP_PKTINFO, + (char *)&pktinfo, sizeof (pktinfo)) == -1) { + Fprintf(stderr, "%s: setsockopt " + "IP_PKTINFO %s\n", progname, + strerror(errno)); + exit(EXIT_FAILURE); + } } else { /* * the outgoing interface is set in set_ancillary_data() @@ -1525,6 +1543,23 @@ setup_socket(int family, int *send_sockp, int *recv_sockp, int *if_index, } } + /* We enable or disable to not depend on the kernel default */ + if (family == AF_INET) { + if (setsockopt(send_sock, IPPROTO_IP, IP_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: 
setsockopt IP_DONTFRAG %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); + } + } else { + if (setsockopt(send_sock, IPPROTO_IPV6, IPV6_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: setsockopt IPV6_DONTFRAG %s\n", + progname, strerror(errno)); + exit(EXIT_FAILURE); + } + } + /* receiving IPv6 extension headers in verbose mode */ if (verbose && family == AF_INET6) { if (setsockopt(recv_sock, IPPROTO_IPV6, IPV6_RECVHOPOPTS, @@ -2336,7 +2371,7 @@ usage(char *cmdname) Fprintf(stderr, "usage: %s host [timeout]\n", cmdname); Fprintf(stderr, /* CSTYLED */ -"usage: %s -s [-l | U] [abdLnRrv] [-A addr_family] [-c traffic_class]\n\t" +"usage: %s -s [-l | U] [abdDLnRrv] [-A addr_family] [-c traffic_class]\n\t" "[-g gateway [-g gateway ...]] [-N nexthop] [-F flow_label] [-I interval]\n\t" "[-i interface] [-P tos] [-p port] [-t ttl] host [data_size] [npackets]\n", cmdname); diff --git a/usr/src/cmd/cmd-inet/usr.sbin/route.c b/usr/src/cmd/cmd-inet/usr.sbin/route.c index b4b16d6755..aedef45409 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/route.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/route.c @@ -1,5 +1,5 @@ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -45,8 +45,6 @@ * @(#)linkaddr.c 8.1 (Berkeley) 6/4/93 */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/param.h> #include <sys/file.h> #include <sys/socket.h> @@ -175,6 +173,8 @@ static struct keytab { {"show", K_SHOW}, #define K_SECATTR 43 {"secattr", K_SECATTR}, +#define K_INDIRECT 44 + {"indirect", K_INDIRECT}, {0, 0} }; @@ -655,7 +655,7 @@ flushroutes(int argc, char *argv[]) (char *)rp < (char *)item->valp + item->length; /* LINTED */ rp = (mib2_ipRouteEntry_t *) - ((char *)rp + ipRouteEntrySize)) { + ((char *)rp + ipRouteEntrySize)) { delRouteEntry(rp, NULL, seqno); seqno++; } @@ -670,7 +670,7 @@ flushroutes(int argc, char *argv[]) if (item->group == MIB2_IP6) { ipv6RouteEntrySize = ((mib2_ipv6IfStatsEntry_t *)item->valp)-> - ipv6RouteEntrySize; + ipv6RouteEntrySize; assert(IS_P2ALIGNED(ipv6RouteEntrySize, sizeof (mib2_ipv6RouteEntry_t *))); break; @@ -692,7 +692,7 @@ flushroutes(int argc, char *argv[]) (char *)rp6 < (char *)item->valp + item->length; /* LINTED */ rp6 = (mib2_ipv6RouteEntry_t *) - ((char *)rp6 + ipv6RouteEntrySize)) { + ((char *)rp6 + ipv6RouteEntrySize)) { delRouteEntry(NULL, rp6, seqno); seqno++; } @@ -812,7 +812,7 @@ delRouteEntry(mib2_ipRouteEntry_t *rp, mib2_ipv6RouteEntry_t *rp6, int seqno) (void) printf("%-20.20s ", rtm->rtm_flags & RTF_HOST ? 
routename(sa) : - netname(sa)); + netname(sa)); /* LINTED */ sa = (struct sockaddr *)(salen(sa) + (char *)sa); (void) printf("%-20.20s ", routename(sa)); @@ -861,7 +861,7 @@ routename(const struct sockaddr *sa) cp = "default"; if (cp == NULL && !nflag) { hp = gethostbyaddr((char *)&in, sizeof (struct in_addr), - AF_INET); + AF_INET); if (hp != NULL) { if (((cp = strchr(hp->h_name, '.')) != NULL) && (strcmp(cp + 1, domain) == 0)) @@ -892,7 +892,7 @@ routename(const struct sockaddr *sa) cp = "default"; if (cp == NULL && !nflag) { hp = getipnodebyaddr((char *)&in6, - sizeof (struct in6_addr), AF_INET6, &error_num); + sizeof (struct in6_addr), AF_INET6, &error_num); if (hp != NULL) { if (((cp = strchr(hp->h_name, '.')) != NULL) && (strcmp(cp + 1, domain) == 0)) @@ -1120,8 +1120,8 @@ print_rtcmd_short(FILE *to, rtcmd_irep_t *rcip, boolean_t gw_good, break; case AF_INET6: if (inet_ntop(AF_INET6, - &rcip->ri_gate.sin6.sin6_addr, obuf, - INET6_ADDRSTRLEN) != NULL) { + &rcip->ri_gate.sin6.sin6_addr, obuf, + INET6_ADDRSTRLEN) != NULL) { if (nflag) { (void) fprintf(to, ": gateway %s", obuf); @@ -1405,6 +1405,9 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string) return (B_FALSE); } break; + case K_INDIRECT: + rcip->ri_flags |= RTF_INDIRECT; + break; default: if (dash_keyword) { syntax_bad_keyword(tok + 1); @@ -1479,8 +1482,8 @@ args_to_rtcmd(rtcmd_irep_t *rcip, char **argv, char *cmd_string) } if (rcip->ri_af == AF_INET6 && memcmp(&rcip->ri_mask.sin6.sin6_addr, - &in6_host_mask, - sizeof (struct in6_addr)) == 0) { + &in6_host_mask, + sizeof (struct in6_addr)) == 0) { rcip->ri_flags |= RTF_HOST; } } else { @@ -1853,8 +1856,8 @@ newroute(char **argv) break; case AF_INET6: if (inet_ntop(AF_INET6, - (void *)&newrt->ri_dst.sin6.sin6_addr, - obuf, INET6_ADDRSTRLEN) != NULL) { + (void *)&newrt->ri_dst.sin6.sin6_addr, + obuf, INET6_ADDRSTRLEN) != NULL) { (void) printf(" %s", obuf); break; } @@ -2236,7 +2239,7 @@ in_getaddr(char *s, struct sockaddr_in *sin, int *plenp, 
int which, inet_lnaof(sin->sin_addr) == INADDR_ANY)) { /* This looks like a network address. */ inet_makenetandmask(rcip, ntohl(val), - sin); + sin); } } return (B_TRUE); @@ -2562,7 +2565,7 @@ static char metricnames[] = static char routeflags[] = "\1UP\2GATEWAY\3HOST\4REJECT\5DYNAMIC\6MODIFIED\7DONE\010MASK_PRESENT" "\011CLONING\012XRESOLVE\013LLINFO\014STATIC\015BLACKHOLE" - "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC"; + "\016PRIVATE\017PROTO2\020PROTO1\021MULTIRT\022SETSRC\023INDIRECT"; static char ifnetflags[] = "\1UP\2BROADCAST\3DEBUG\4LOOPBACK\5PTP\6NOTRAILERS\7RUNNING\010NOARP" "\011PPROMISC\012ALLMULTI\013INTELLIGENT\014MULTICAST" @@ -2623,7 +2626,7 @@ print_rtmsg(struct rt_msghdr *rtm, int msglen) break; default: (void) printf("pid: %ld, seq %d, errno %d, flags:", - rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno); + rtm->rtm_pid, rtm->rtm_seq, rtm->rtm_errno); bprintf(stdout, rtm->rtm_flags, routeflags); pmsg_common(rtm, msglen); break; @@ -2649,7 +2652,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen) if (rtm->rtm_msglen > (ushort_t)msglen) { (void) fprintf(stderr, gettext("message length mismatch, in packet %d, " - "returned %d\n"), rtm->rtm_msglen, msglen); + "returned %d\n"), rtm->rtm_msglen, msglen); } if (rtm->rtm_errno) { (void) fprintf(stderr, "RTM_GET: %s (errno %d)\n", @@ -2675,7 +2678,7 @@ print_getmsg(rtcmd_irep_t *req_rt, struct rt_msghdr *rtm, int msglen) case RTA_IFP: if (sa->sa_family == AF_LINK && ((struct sockaddr_dl *)sa)-> - sdl_nlen != 0) + sdl_nlen != 0) ifp = (struct sockaddr_dl *)sa; break; case RTA_SRC: @@ -3122,8 +3125,8 @@ mibget(int sd) (void) fprintf(stderr, gettext("mibget %d gives " "T_ERROR_ACK: TLI_error = 0x%lx, UNIX_error = " "0x%lx\n"), j, tea->TLI_error, tea->UNIX_error); - errno = (tea->TLI_error == TSYSERR) - ? tea->UNIX_error : EPROTO; + errno = (tea->TLI_error == TSYSERR) ? 
+ tea->UNIX_error : EPROTO; break; } diff --git a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c index cae75df60d..b8b56259ad 100644 --- a/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c +++ b/usr/src/cmd/cmd-inet/usr.sbin/traceroute/traceroute.c @@ -166,6 +166,7 @@ boolean_t useicmp = _B_FALSE; /* use icmp echo instead of udp packets */ boolean_t docksum = _B_TRUE; /* calculate checksums */ static boolean_t collect_stat = _B_FALSE; /* print statistics */ boolean_t settos = _B_FALSE; /* set type-of-service field */ +int dontfrag = 0; /* IP*_DONTFRAG */ static int max_timeout = 5; /* quit after this consecutive timeouts */ static boolean_t probe_all = _B_FALSE; /* probe all the IFs of the target */ static boolean_t pick_src = _B_FALSE; /* traceroute picks the src address */ @@ -315,6 +316,7 @@ main(int argc, char **argv) case 'F': off = IP_DF; + dontfrag = 1; break; case 'g': @@ -1361,6 +1363,24 @@ setup_socket(struct pr_set *pr, int packet_len) exit(EXIT_FAILURE); } } + + /* We enable or disable to not depend on the kernel default */ + if (pr->family == AF_INET) { + if (setsockopt(ssock, IPPROTO_IP, IP_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: IP_DONTFRAG %s\n", prog, + strerror(errno)); + exit(EXIT_FAILURE); + } + } else { + if (setsockopt(ssock, IPPROTO_IPV6, IPV6_DONTFRAG, + (char *)&dontfrag, sizeof (dontfrag)) == -1) { + Fprintf(stderr, "%s: IPV6_DONTFRAG %s\n", prog, + strerror(errno)); + exit(EXIT_FAILURE); + } + } + if (pr->family == AF_INET) { rcvsock4 = rsock; sndsock4 = ssock; diff --git a/usr/src/cmd/devfsadm/misc_link.c b/usr/src/cmd/devfsadm/misc_link.c index 222699e479..84cdb42377 100644 --- a/usr/src/cmd/devfsadm/misc_link.c +++ b/usr/src/cmd/devfsadm/misc_link.c @@ -104,8 +104,7 @@ static devfsadm_create_t misc_cbt[] = { "(^ip$)|(^tcp$)|(^udp$)|(^icmp$)|(^sctp$)|" "(^ip6$)|(^tcp6$)|(^udp6$)|(^icmp6$)|(^sctp6$)|" 
"(^rts$)|(^arp$)|(^ipsecah$)|(^ipsecesp$)|(^keysock$)|(^spdsock$)|" - "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^iptunq)|" - "(^bpf$)", + "(^nca$)|(^rds$)|(^sdp$)|(^ipnet$)|(^dlpistub$)|(^bpf$)", TYPE_EXACT | DRV_RE, ILEVEL_1, minor_name }, { "pseudo", "ddi_pseudo", diff --git a/usr/src/cmd/mdb/common/modules/arp/arp.c b/usr/src/cmd/mdb/common/modules/arp/arp.c index f36a81170e..f97cdaab42 100644 --- a/usr/src/cmd/mdb/common/modules/arp/arp.c +++ b/usr/src/cmd/mdb/common/modules/arp/arp.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <stdio.h> #include <sys/types.h> #include <sys/stropts.h> @@ -36,7 +34,6 @@ #include <inet/common.h> #include <inet/mi.h> #include <inet/arp.h> -#include <inet/arp_impl.h> #include <inet/ip.h> #include <netinet/arp.h> @@ -50,541 +47,10 @@ typedef struct { } arp_cmd_tbl; /* - * Table of ARP commands and structure types used for messages between ARP and - * IP. 
- */ -static const arp_cmd_tbl act_list[] = { - { AR_ENTRY_ADD, "AR_ENTRY_ADD", "arp`area_t" }, - { AR_ENTRY_DELETE, "AR_ENTRY_DELETE", "arp`ared_t" }, - { AR_ENTRY_QUERY, "AR_ENTRY_QUERY", "arp`areq_t" }, - { AR_ENTRY_SQUERY, "AR_ENTRY_SQUERY", "arp`area_t" }, - { AR_MAPPING_ADD, "AR_MAPPING_ADD", "arp`arma_t" }, - { AR_CLIENT_NOTIFY, "AR_CLIENT_NOTIFY", "arp`arcn_t" }, - { AR_INTERFACE_UP, "AR_INTERFACE_UP", "arp`arc_t" }, - { AR_INTERFACE_DOWN, "AR_INTERFACE_DOWN", "arp`arc_t" }, - { AR_INTERFACE_ON, "AR_INTERFACE_ON", "arp`arc_t" }, - { AR_INTERFACE_OFF, "AR_INTERFACE_OFF", "arp`arc_t" }, - { AR_DLPIOP_DONE, "AR_DLPIOP_DONE", "arp`arc_t" }, - { AR_ARP_CLOSING, "AR_ARP_CLOSING", "arp`arc_t" }, - { AR_ARP_EXTEND, "AR_ARP_EXTEND", "arp`arc_t" }, - { 0, "unknown command", "arp`arc_t" } -}; - -/* - * State information kept during walk over ACE hash table and unhashed mask - * list. - */ -typedef struct ace_walk_data { - ace_t *awd_hash_tbl[ARP_HASH_SIZE]; - ace_t *awd_masks; - int awd_idx; -} ace_walk_data_t; - -/* - * Given the kernel address of an arl_t, return the stackid + * removed all the ace/arl related stuff. The only thing that remains + * is code for dealing with ioctls and printing out arp header that + * should probably be moved into the ip/mdb module. 
*/ -static int -arl_to_stackid(uintptr_t addr) -{ - arl_t arl; - queue_t rq; - ar_t ar; - arp_stack_t ass; - netstack_t nss; - - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t %p", addr); - return (0); - } - - addr = (uintptr_t)arl.arl_rq; - if (mdb_vread(&rq, sizeof (rq), addr) == -1) { - mdb_warn("failed to read queue_t %p", addr); - return (0); - } - - addr = (uintptr_t)rq.q_ptr; - if (mdb_vread(&ar, sizeof (ar), addr) == -1) { - mdb_warn("failed to read ar_t %p", addr); - return (0); - } - - addr = (uintptr_t)ar.ar_as; - if (mdb_vread(&ass, sizeof (ass), addr) == -1) { - mdb_warn("failed to read arp_stack_t %p", addr); - return (0); - } - addr = (uintptr_t)ass.as_netstack; - if (mdb_vread(&nss, sizeof (nss), addr) == -1) { - mdb_warn("failed to read netstack_t %p", addr); - return (0); - } - return (nss.netstack_stackid); -} - -static int -arp_stacks_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("netstack", wsp) == -1) { - mdb_warn("can't walk 'netstack'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -static int -arp_stacks_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - netstack_t nss; - - if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) { - mdb_warn("can't read netstack at %p", wsp->walk_addr); - return (WALK_ERR); - } - addr = (uintptr_t)nss.netstack_modules[NS_ARP]; - - return (wsp->walk_callback(addr, wsp->walk_layer, wsp->walk_cbdata)); -} - -static int -arl_stack_walk_init(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - - if (wsp->walk_addr == NULL) { - mdb_warn("arl_stack supports only local walks\n"); - return (WALK_ERR); - } - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_arl_head); - if (mdb_vread(&wsp->walk_addr, sizeof (wsp->walk_addr), - addr) == -1) { - mdb_warn("failed to read 'arl_g_head'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -static int -arl_stack_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr = wsp->walk_addr; - arl_t arl; - - if (wsp->walk_addr 
== NULL) - return (WALK_DONE); - - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t at %p", addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)arl.arl_next; - - return ((*wsp->walk_callback)(addr, &arl, wsp->walk_cbdata)); -} - -static int -arl_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("arp_stacks", wsp) == -1) { - mdb_warn("can't walk 'arp_stacks'"); - return (WALK_ERR); - } - - return (WALK_NEXT); -} - -static int -arl_walk_step(mdb_walk_state_t *wsp) -{ - if (mdb_pwalk("arl_stack", wsp->walk_callback, - wsp->walk_cbdata, wsp->walk_addr) == -1) { - mdb_warn("couldn't walk 'arl_stack' at %p", wsp->walk_addr); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -/* - * Called with walk_addr being the address of arp_stack_t - */ -static int -ace_stack_walk_init(mdb_walk_state_t *wsp) -{ - ace_walk_data_t *aw; - uintptr_t addr; - - if (wsp->walk_addr == NULL) { - mdb_warn("ace_stack supports only local walks\n"); - return (WALK_ERR); - } - - aw = mdb_alloc(sizeof (ace_walk_data_t), UM_SLEEP); - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_hash_tbl); - if (mdb_vread(aw->awd_hash_tbl, sizeof (aw->awd_hash_tbl), - addr) == -1) { - mdb_warn("failed to read 'as_ce_hash_tbl'"); - mdb_free(aw, sizeof (ace_walk_data_t)); - return (WALK_ERR); - } - - addr = wsp->walk_addr + OFFSETOF(arp_stack_t, as_ce_mask_entries); - if (mdb_vread(&aw->awd_masks, sizeof (aw->awd_masks), - addr) == -1) { - mdb_warn("failed to read 'as_ce_mask_entries'"); - mdb_free(aw, sizeof (ace_walk_data_t)); - return (WALK_ERR); - } - - /* The step routine will start off by incrementing to index 0 */ - aw->awd_idx = -1; - wsp->walk_addr = 0; - wsp->walk_data = aw; - - return (WALK_NEXT); -} - -static int -ace_stack_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t addr; - ace_walk_data_t *aw = wsp->walk_data; - ace_t ace; - - /* - * If we're at the end of the previous list, then find the start of the - * next list to process. 
- */ - while (wsp->walk_addr == NULL) { - if (aw->awd_idx == ARP_HASH_SIZE) - return (WALK_DONE); - if (++aw->awd_idx == ARP_HASH_SIZE) { - wsp->walk_addr = (uintptr_t)aw->awd_masks; - } else { - wsp->walk_addr = - (uintptr_t)aw->awd_hash_tbl[aw->awd_idx]; - } - } - - addr = wsp->walk_addr; - if (mdb_vread(&ace, sizeof (ace), addr) == -1) { - mdb_warn("failed to read ace_t at %p", addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)ace.ace_next; - - return (wsp->walk_callback(addr, &ace, wsp->walk_cbdata)); -} - -static void -ace_stack_walk_fini(mdb_walk_state_t *wsp) -{ - mdb_free(wsp->walk_data, sizeof (ace_walk_data_t)); -} - -static int -ace_walk_init(mdb_walk_state_t *wsp) -{ - if (mdb_layered_walk("arp_stacks", wsp) == -1) { - mdb_warn("can't walk 'arp_stacks'"); - return (WALK_ERR); - } - - return (WALK_NEXT); -} - -static int -ace_walk_step(mdb_walk_state_t *wsp) -{ - if (mdb_pwalk("ace_stack", wsp->walk_callback, - wsp->walk_cbdata, wsp->walk_addr) == -1) { - mdb_warn("couldn't walk 'ace_stack' at %p", wsp->walk_addr); - return (WALK_ERR); - } - return (WALK_NEXT); -} - - -/* Common routine to produce an 'ar' text description */ -static void -ar_describe(const ar_t *ar, char *buf, size_t nbytes, boolean_t addmac) -{ - if (ar->ar_arl == NULL) { - queue_t wq, ipq; - ill_t ill; - char name[LIFNAMSIZ]; - GElf_Sym sym; - boolean_t nextip; - - if (mdb_vread(&wq, sizeof (wq), (uintptr_t)ar->ar_wq) == -1 || - mdb_vread(&ipq, sizeof (ipq), (uintptr_t)wq.q_next) == -1) - return; - - nextip = - (mdb_lookup_by_obj("ip", "ipwinit", &sym) == 0 && - (uintptr_t)sym.st_value == (uintptr_t)ipq.q_qinfo); - - if (!ar->ar_on_ill_stream) { - (void) strcpy(buf, nextip ? 
"Client" : "Unknown"); - return; - } - - if (!nextip || - mdb_vread(&ill, sizeof (ill), (uintptr_t)ipq.q_ptr) == -1 || - mdb_readstr(name, sizeof (name), - (uintptr_t)ill.ill_name) == -1) { - return; - } - (void) mdb_snprintf(buf, nbytes, "IP %s", name); - } else { - arl_t arl; - arlphy_t ap; - ssize_t retv; - uint32_t alen; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - - if (mdb_vread(&arl, sizeof (arl), (uintptr_t)ar->ar_arl) == -1) - return; - retv = mdb_snprintf(buf, nbytes, "ARP %s ", arl.arl_name); - if (retv >= nbytes || !addmac) - return; - if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl.arl_phy) == -1) - return; - alen = ap.ap_hw_addrlen; - if (ap.ap_hw_addr == NULL || alen == 0 || - alen > sizeof (macaddr)) - return; - if (mdb_vread(macaddr, alen, (uintptr_t)ap.ap_hw_addr) == -1) - return; - mdb_mac_addr(macaddr, alen, buf + retv, nbytes - retv); - } -} - -/* ARGSUSED2 */ -static int -ar_cb(uintptr_t addr, const void *arptr, void *dummy) -{ - const ar_t *ar = arptr; - char ardesc[sizeof ("ARP ") + LIFNAMSIZ]; - - ar_describe(ar, ardesc, sizeof (ardesc), B_FALSE); - mdb_printf("%?p %?p %?p %s\n", addr, ar->ar_wq, ar->ar_arl, ardesc); - return (WALK_NEXT); -} - -/* - * Print out ARP client structures. 
- */ -/* ARGSUSED2 */ -static int -ar_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - ar_t ar; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %?s %?s %s%</u>\n", - "AR", "WQ", "ARL", "TYPE"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&ar, sizeof (ar), addr) == -1) { - mdb_warn("failed to read ar_t at %p", addr); - return (DCMD_ERR); - } - (void) ar_cb(addr, &ar, NULL); - } else { - if (mdb_walk("ar", ar_cb, NULL) == -1) { - mdb_warn("cannot walk ar_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} - -/* ARGSUSED2 */ -static int -arl_cb(uintptr_t addr, const void *arlptr, void *dummy) -{ - const arl_t *arl = arlptr; - arlphy_t ap; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - char macstr[ARP_MAX_ADDR_LEN*3]; - char flags[4]; - const char *primstr; - - mdb_printf("%?p ", addr); - if (arl->arl_dlpi_pending == DL_PRIM_INVAL) - mdb_printf("%16s", "--"); - else if ((primstr = mdb_dlpi_prim(arl->arl_dlpi_pending)) != NULL) - mdb_printf("%16s", primstr); - else - mdb_printf("%16x", arl->arl_dlpi_pending); - - if (mdb_vread(&ap, sizeof (ap), (uintptr_t)arl->arl_phy) == -1 || - ap.ap_hw_addrlen == 0 || ap.ap_hw_addrlen > sizeof (macaddr)) { - (void) strcpy(macstr, "--"); - } else if (mdb_vread(macaddr, ap.ap_hw_addrlen, - (uintptr_t)ap.ap_hw_addr) == -1) { - (void) strcpy(macstr, "?"); - } else { - mdb_mac_addr(macaddr, ap.ap_hw_addrlen, macstr, - sizeof (macstr)); - } - - /* Print both the link-layer state and the NOARP flag */ - flags[0] = '\0'; - if (arl->arl_flags & ARL_F_NOARP) - (void) strcat(flags, "N"); - switch (arl->arl_state) { - case ARL_S_DOWN: - (void) strcat(flags, "d"); - break; - case ARL_S_PENDING: - (void) strcat(flags, "P"); - break; - case ARL_S_UP: - (void) strcat(flags, "U"); - break; - default: - (void) strcat(flags, "?"); - break; - } - mdb_printf(" %8d %-3s %-9s %-17s %5d\n", - mdb_mblk_count(arl->arl_dlpi_deferred), flags, arl->arl_name, - macstr, 
arl_to_stackid((uintptr_t)addr)); - return (WALK_NEXT); -} - -/* - * Print out ARP link-layer elements. - */ -/* ARGSUSED2 */ -static int -arl_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - arl_t arl; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %16s %8s %3s %9s %-17s %5s%</u>\n", - "ARL", "DLPI REQ", "DLPI CNT", "FLG", "INTERFACE", - "HWADDR", "STACK"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&arl, sizeof (arl), addr) == -1) { - mdb_warn("failed to read arl_t at %p", addr); - return (DCMD_ERR); - } - (void) arl_cb(addr, &arl, NULL); - } else { - if (mdb_walk("arl", arl_cb, NULL) == -1) { - mdb_warn("cannot walk arl_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} - -/* ARGSUSED2 */ -static int -ace_cb(uintptr_t addr, const void *aceptr, void *dummy) -{ - const ace_t *ace = aceptr; - uchar_t macaddr[ARP_MAX_ADDR_LEN]; - char macstr[ARP_MAX_ADDR_LEN*3]; - /* The %b format isn't compact enough for long listings */ - static const char ace_flags[] = "SPDRMLdA ofya"; - const char *cp; - char flags[sizeof (ace_flags)], *fp; - int flg; - in_addr_t inaddr, mask; - char addrstr[sizeof ("255.255.255.255/32")]; - - /* Walk the list of flags and produce a string */ - cp = ace_flags; - fp = flags; - for (flg = 1; *cp != '\0'; flg <<= 1, cp++) { - if ((flg & ace->ace_flags) && *cp != ' ') - *fp++ = *cp; - } - *fp = '\0'; - - /* If it's not resolved, then it has no hardware address */ - if (!(ace->ace_flags & ACE_F_RESOLVED) || - ace->ace_hw_addr_length == 0 || - ace->ace_hw_addr_length > sizeof (macaddr)) { - (void) strcpy(macstr, "--"); - } else if (mdb_vread(macaddr, ace->ace_hw_addr_length, - (uintptr_t)ace->ace_hw_addr) == -1) { - (void) strcpy(macstr, "?"); - } else { - mdb_mac_addr(macaddr, ace->ace_hw_addr_length, macstr, - sizeof (macstr)); - } - - /* - * Nothing other than IP uses ARP these days, so we don't try very hard - * here to switch out on ARP protocol type. 
(Note that ARP protocol - * types are roughly Ethertypes, but are allocated separately at IANA.) - */ - if (ace->ace_proto != IP_ARP_PROTO_TYPE) { - (void) mdb_snprintf(addrstr, sizeof (addrstr), - "Unknown proto %x", ace->ace_proto); - } else if (mdb_vread(&inaddr, sizeof (inaddr), - (uintptr_t)ace->ace_proto_addr) != -1 && - mdb_vread(&mask, sizeof (mask), (uintptr_t)ace->ace_proto_mask) != - -1) { - /* - * If it's the standard host mask, then print it normally. - * Otherwise, use "/n" notation. - */ - if (mask == (in_addr_t)~0) { - (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I", - inaddr); - } else { - (void) mdb_snprintf(addrstr, sizeof (addrstr), "%I/%d", - inaddr, mask == 0 ? 0 : 33 - mdb_ffs(mask)); - } - } else { - (void) strcpy(addrstr, "?"); - } - mdb_printf("%?p %-18s %-8s %-17s %5d\n", addr, addrstr, flags, - macstr, arl_to_stackid((uintptr_t)ace->ace_arl)); - return (WALK_NEXT); -} - -/* - * Print out ARP cache entry (ace_t) elements. - */ -/* ARGSUSED2 */ -static int -ace_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - ace_t ace; - - if (DCMD_HDRSPEC(flags) && !(flags & DCMD_PIPE_OUT)) { - mdb_printf("%<u>%?s %-18s %-8s %-17s %5s%</u>\n", - "ACE", "PROTOADDR", "FLAGS", "HWADDR", "STACK"); - } - - if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&ace, sizeof (ace), addr) == -1) { - mdb_warn("failed to read ace_t at %p", addr); - return (DCMD_ERR); - } - (void) ace_cb(addr, &ace, NULL); - } else { - if (mdb_walk("ace", ace_cb, NULL) == -1) { - mdb_warn("cannot walk ace_t structures"); - return (DCMD_ERR); - } - } - return (DCMD_OK); -} /* * Print an ARP hardware and protocol address pair; used when printing an ARP @@ -696,148 +162,25 @@ arphdr_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } -/* - * Print out an arp command formatted in a reasonable manner. This implements - * the type switch used by ARP. 
- * - * It could also dump the data that follows the header (using offset and length - * in the various structures), but it currently does not. - */ -/* ARGSUSED2 */ -static int -arpcmd_cmd(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - arc_t arc; - const arp_cmd_tbl *tp; - mdb_arg_t subargv; - - if (!(flags & DCMD_ADDRSPEC)) { - mdb_warn("address required to print ARP command\n"); - return (DCMD_ERR); - } - if (mdb_vread(&arc, sizeof (arc), addr) == -1) { - mdb_warn("unable to read arc_t at %p", addr); - return (DCMD_ERR); - } - for (tp = act_list; tp->act_cmd != 0; tp++) - if (tp->act_cmd == arc.arc_cmd) - break; - mdb_printf("%p %s (%s) = ", addr, tp->act_name, tp->act_type); - subargv.a_type = MDB_TYPE_STRING; - subargv.a_un.a_str = tp->act_type; - if (mdb_call_dcmd("print", addr, DCMD_ADDRSPEC, 1, &subargv) == -1) - return (DCMD_ERR); - else - return (DCMD_OK); -} - -static size_t -mi_osize(const queue_t *q) -{ - /* - * The code in common/inet/mi.c allocates an extra word to store the - * size of the allocation. An mi_o_s is thus a size_t plus an mi_o_s. - */ - struct mi_block { - size_t mi_nbytes; - struct mi_o_s mi_o; - } m; - - if (mdb_vread(&m, sizeof (m), (uintptr_t)q->q_ptr - sizeof (m)) != -1) - return (m.mi_nbytes - sizeof (m)); - - return (0); -} - -/* - * This is called when ::stream is used and an ARP module is seen on the - * stream. Determine what sort of ARP usage is involved and show an - * appropriate message. 
- */ -static void -arp_qinfo(const queue_t *qp, char *buf, size_t nbytes) -{ - size_t size = mi_osize(qp); - ar_t ar; - - if (size != sizeof (ar_t)) - return; - if (mdb_vread(&ar, sizeof (ar), (uintptr_t)qp->q_ptr) == -1) - return; - ar_describe(&ar, buf, nbytes, B_TRUE); -} - -static uintptr_t -arp_rnext(const queue_t *q) -{ - size_t size = mi_osize(q); - ar_t ar; - - if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar), - (uintptr_t)q->q_ptr) != -1) - return ((uintptr_t)ar.ar_rq); - - return (NULL); -} - -static uintptr_t -arp_wnext(const queue_t *q) -{ - size_t size = mi_osize(q); - ar_t ar; - - if (size == sizeof (ar_t) && mdb_vread(&ar, sizeof (ar), - (uintptr_t)q->q_ptr) != -1) - return ((uintptr_t)ar.ar_wq); - - return (NULL); -} - static const mdb_dcmd_t dcmds[] = { - { "ar", "?", "display ARP client streams for all stacks", - ar_cmd, NULL }, - { "arl", "?", "display ARP link layers for all stacks", arl_cmd, NULL }, - { "ace", "?", "display ARP cache entries for all stacks", - ace_cmd, NULL }, { "arphdr", ":", "display an ARP header", arphdr_cmd, NULL }, - { "arpcmd", ":", "display an ARP command", arpcmd_cmd, NULL }, { NULL } }; /* Note: ar_t walker is in genunix.c and net.c; generic MI walker */ static const mdb_walker_t walkers[] = { - { "arl", "walk list of arl_t links for all stacks", - arl_walk_init, arl_walk_step, NULL }, - { "arl_stack", "walk list of arl_t links", - arl_stack_walk_init, arl_stack_walk_step, NULL }, - { "ace", "walk list of ace_t entries for all stacks", - ace_walk_init, ace_walk_step, NULL }, - { "ace_stack", "walk list of ace_t entries", - ace_stack_walk_init, ace_stack_walk_step, ace_stack_walk_fini }, - { "arp_stacks", "walk all the arp_stack_t", - arp_stacks_walk_init, arp_stacks_walk_step, NULL }, { NULL } }; -static const mdb_qops_t arp_qops = { arp_qinfo, arp_rnext, arp_wnext }; static const mdb_modinfo_t modinfo = { MDB_API_VERSION, dcmds, walkers }; const mdb_modinfo_t * _mdb_init(void) { - GElf_Sym sym; - - if 
(mdb_lookup_by_obj("arp", "winit", &sym) == 0) - mdb_qops_install(&arp_qops, (uintptr_t)sym.st_value); - return (&modinfo); } void _mdb_fini(void) { - GElf_Sym sym; - - if (mdb_lookup_by_obj("arp", "winit", &sym) == 0) - mdb_qops_remove(&arp_qops, (uintptr_t)sym.st_value); } diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index 3e49d9a99c..e6fe3f7dcf 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -4770,8 +4770,6 @@ static const mdb_walker_t walkers[] = { NULL, modchain_walk_step, NULL }, /* from net.c */ - { "ar", "walk ar_t structures using MI for all stacks", - mi_payload_walk_init, mi_payload_walk_step, NULL, &mi_ar_arg }, { "icmp", "walk ICMP control structures using MI for all stacks", mi_payload_walk_init, mi_payload_walk_step, NULL, &mi_icmp_arg }, @@ -4779,8 +4777,6 @@ static const mdb_walker_t walkers[] = { mi_walk_init, mi_walk_step, mi_walk_fini, NULL }, { "sonode", "given a sonode, walk its children", sonode_walk_init, sonode_walk_step, sonode_walk_fini, NULL }, - { "ar_stacks", "walk all the ar_stack_t", - ar_stacks_walk_init, ar_stacks_walk_step, NULL }, { "icmp_stacks", "walk all the icmp_stack_t", icmp_stacks_walk_init, icmp_stacks_walk_step, NULL }, { "tcp_stacks", "walk all the tcp_stack_t", diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.c b/usr/src/cmd/mdb/common/modules/genunix/net.c index d9f4717d7e..23d6202fff 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.c +++ b/usr/src/cmd/mdb/common/modules/genunix/net.c @@ -45,7 +45,6 @@ #include <sys/socketvar.h> #include <sys/cred_impl.h> #include <inet/udp_impl.h> -#include <inet/arp_impl.h> #include <inet/rawip_impl.h> #include <inet/mi.h> #include <fs/sockfs/socktpi_impl.h> @@ -71,31 +70,6 @@ typedef struct netstat_cb_data_s { int af; } netstat_cb_data_t; -/* Walkers for various *_stack_t */ -int -ar_stacks_walk_init(mdb_walk_state_t *wsp) -{ - if 
(mdb_layered_walk("netstack", wsp) == -1) { - mdb_warn("can't walk 'netstack'"); - return (WALK_ERR); - } - return (WALK_NEXT); -} - -int -ar_stacks_walk_step(mdb_walk_state_t *wsp) -{ - uintptr_t kaddr; - netstack_t nss; - - if (mdb_vread(&nss, sizeof (nss), wsp->walk_addr) == -1) { - mdb_warn("can't read netstack at %p", wsp->walk_addr); - return (WALK_ERR); - } - kaddr = (uintptr_t)nss.netstack_modules[NS_ARP]; - return (wsp->walk_callback(kaddr, wsp->walk_layer, wsp->walk_cbdata)); -} - int icmp_stacks_walk_init(mdb_walk_state_t *wsp) { @@ -201,15 +175,15 @@ net_tcp_active(const tcp_t *tcp) static int net_tcp_ipv4(const tcp_t *tcp) { - return ((tcp->tcp_ipversion == IPV4_VERSION) || - (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6) && + return ((tcp->tcp_connp->conn_ipversion == IPV4_VERSION) || + (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_connp->conn_laddr_v6) && (tcp->tcp_state <= TCPS_LISTEN))); } static int net_tcp_ipv6(const tcp_t *tcp) { - return (tcp->tcp_ipversion == IPV6_VERSION); + return (tcp->tcp_connp->conn_ipversion == IPV6_VERSION); } static int @@ -222,15 +196,15 @@ net_udp_active(const udp_t *udp) static int net_udp_ipv4(const udp_t *udp) { - return ((udp->udp_ipversion == IPV4_VERSION) || - (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src) && + return ((udp->udp_connp->conn_ipversion == IPV4_VERSION) || + (IN6_IS_ADDR_UNSPECIFIED(&udp->udp_connp->conn_laddr_v6) && (udp->udp_state <= TS_IDLE))); } static int net_udp_ipv6(const udp_t *udp) { - return (udp->udp_ipversion == IPV6_VERSION); + return (udp->udp_connp->conn_ipversion == IPV6_VERSION); } int @@ -399,11 +373,6 @@ mi_payload_walk_step(mdb_walk_state_t *wsp) return (WALK_NEXT); } -const mi_payload_walk_arg_t mi_ar_arg = { - "ar_stacks", OFFSETOF(arp_stack_t, as_head), sizeof (ar_t), - MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE -}; - const mi_payload_walk_arg_t mi_icmp_arg = { "icmp_stacks", OFFSETOF(icmp_stack_t, is_head), sizeof (icmp_t), MI_PAYLOAD_DEVICE | MI_PAYLOAD_MODULE @@ -632,7 +601,7 @@ 
netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) tcp_kaddr = (uintptr_t)connp->conn_tcp; if (mdb_vread(&tcps, sizeof (tcp_t), tcp_kaddr) == -1) { - mdb_warn("failed to read tcp_t at %p", kaddr); + mdb_warn("failed to read tcp_t at %p", tcp_kaddr); return (WALK_ERR); } @@ -648,13 +617,13 @@ netstat_tcp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %2i ", tcp_kaddr, tcp->tcp_state); if (af == AF_INET) { - net_ipv4addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv4addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - net_ipv6addrport_pr(&tcp->tcp_ip_src_v6, tcp->tcp_lport); + net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv6addrport_pr(&tcp->tcp_remote_v6, tcp->tcp_fport); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); mdb_printf(" %4i\n", connp->conn_zoneid); @@ -687,6 +656,9 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) return (WALK_ERR); } + connp->conn_udp = &udp; + udp.udp_connp = connp; + if (!((opts & NETSTAT_ALL) || net_udp_active(&udp)) || (af == AF_INET && !net_udp_ipv4(&udp)) || (af == AF_INET6 && !net_udp_ipv6(&udp))) { @@ -704,13 +676,13 @@ netstat_udp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_udp, state); if (af == AF_INET) { - net_ipv4addrport_pr(&udp.udp_v6src, udp.udp_port); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); mdb_printf(" "); - net_ipv4addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - net_ipv6addrport_pr(&udp.udp_v6src, udp.udp_port); + net_ipv6addrport_pr(&connp->conn_laddr_v6, 
connp->conn_lport); mdb_printf(" "); - net_ipv6addrport_pr(&udp.udp_v6dst, udp.udp_dstport); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); mdb_printf(" %4i\n", connp->conn_zoneid); @@ -740,8 +712,11 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) return (WALK_ERR); } - if ((af == AF_INET && icmp.icmp_ipversion != IPV4_VERSION) || - (af == AF_INET6 && icmp.icmp_ipversion != IPV6_VERSION)) { + connp->conn_icmp = &icmp; + icmp.icmp_connp = connp; + + if ((af == AF_INET && connp->conn_ipversion != IPV4_VERSION) || + (af == AF_INET6 && connp->conn_ipversion != IPV6_VERSION)) { return (WALK_NEXT); } @@ -756,16 +731,16 @@ netstat_icmp_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%0?p %10s ", (uintptr_t)connp->conn_icmp, state); if (af == AF_INET) { - mdb_printf("%*I ", ADDR_V4_WIDTH, - V4_PART_OF_V6((icmp.icmp_v6src))); - mdb_printf("%*I ", ADDR_V4_WIDTH, - V4_PART_OF_V6((icmp.icmp_v6dst.sin6_addr))); + net_ipv4addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); + mdb_printf(" "); + net_ipv4addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } else if (af == AF_INET6) { - mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6src); - mdb_printf("%*N ", ADDR_V6_WIDTH, &icmp.icmp_v6dst); + net_ipv6addrport_pr(&connp->conn_laddr_v6, connp->conn_lport); + mdb_printf(" "); + net_ipv6addrport_pr(&connp->conn_faddr_v6, connp->conn_fport); } mdb_printf(" %5i", ns_to_stackid((uintptr_t)connp->conn_netstack)); - mdb_printf(" %4i\n", icmp.icmp_zoneid); + mdb_printf(" %4i\n", connp->conn_zoneid); return (WALK_NEXT); } @@ -881,57 +856,57 @@ get_ifname(const ire_t *ire, char *intf) ill_t ill; *intf = '\0'; - if (ire->ire_type == IRE_CACHE) { - queue_t stq; - - if (mdb_vread(&stq, sizeof (stq), (uintptr_t)ire->ire_stq) == - -1) - return; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)stq.q_ptr) == -1) + if (ire->ire_ill != NULL) { + if 
(mdb_vread(&ill, sizeof (ill), + (uintptr_t)ire->ire_ill) == -1) return; (void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length), (uintptr_t)ill.ill_name); - } else if (ire->ire_ipif != NULL) { - ipif_t ipif; - char *cp; - - if (mdb_vread(&ipif, sizeof (ipif), - (uintptr_t)ire->ire_ipif) == -1) - return; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ipif.ipif_ill) == - -1) - return; - (void) mdb_readstr(intf, MIN(LIFNAMSIZ, ill.ill_name_length), - (uintptr_t)ill.ill_name); - if (ipif.ipif_id != 0) { - cp = intf + strlen(intf); - (void) mdb_snprintf(cp, LIFNAMSIZ + 1 - (cp - intf), - ":%u", ipif.ipif_id); - } } } +const in6_addr_t ipv6_all_ones = + { 0xffffffffU, 0xffffffffU, 0xffffffffU, 0xffffffffU }; + static void -get_v4flags(const ire_t *ire, char *flags) +get_ireflags(const ire_t *ire, char *flags) { (void) strcpy(flags, "U"); - if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX || - ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT) + /* RTF_INDIRECT wins over RTF_GATEWAY - don't display both */ + if (ire->ire_flags & RTF_INDIRECT) + (void) strcat(flags, "I"); + else if (ire->ire_type & IRE_OFFLINK) (void) strcat(flags, "G"); - if (ire->ire_mask == IP_HOST_MASK) - (void) strcat(flags, "H"); - if (ire->ire_type == IRE_HOST_REDIRECT) + + /* IRE_IF_CLONE wins over RTF_HOST - don't display both */ + if (ire->ire_type & IRE_IF_CLONE) + (void) strcat(flags, "C"); + else if (ire->ire_ipversion == IPV4_VERSION) { + if (ire->ire_mask == IP_HOST_MASK) + (void) strcat(flags, "H"); + } else { + if (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)) + (void) strcat(flags, "H"); + } + + if (ire->ire_flags & RTF_DYNAMIC) (void) strcat(flags, "D"); - if (ire->ire_type == IRE_CACHE) - (void) strcat(flags, "A"); if (ire->ire_type == IRE_BROADCAST) - (void) strcat(flags, "B"); + (void) strcat(flags, "b"); + if (ire->ire_type == IRE_MULTICAST) + (void) strcat(flags, "m"); if (ire->ire_type == IRE_LOCAL) (void) strcat(flags, "L"); + if 
(ire->ire_type == IRE_NOROUTE) + (void) strcat(flags, "N"); if (ire->ire_flags & RTF_MULTIRT) (void) strcat(flags, "M"); if (ire->ire_flags & RTF_SETSRC) (void) strcat(flags, "S"); + if (ire->ire_flags & RTF_REJECT) + (void) strcat(flags, "R"); + if (ire->ire_flags & RTF_BLACKHOLE) + (void) strcat(flags, "B"); } static int @@ -945,8 +920,10 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) if (ire->ire_ipversion != IPV4_VERSION) return (WALK_NEXT); - if (!(*opts & NETSTAT_ALL) && (ire->ire_type == IRE_CACHE || - ire->ire_type == IRE_BROADCAST || ire->ire_type == IRE_LOCAL)) + /* Skip certain IREs by default */ + if (!(*opts & NETSTAT_ALL) && + (ire->ire_type & + (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE))) return (WALK_NEXT); if (*opts & NETSTAT_FIRST) { @@ -966,10 +943,9 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) } } - gate = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) ? - ire->ire_src_addr : ire->ire_gateway_addr; + gate = ire->ire_gateway_addr; - get_v4flags(ire, flags); + get_ireflags(ire, flags); get_ifname(ire, intf); @@ -977,8 +953,8 @@ netstat_irev4_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) mdb_printf("%?p %-*I %-*I %-*I %-6s %5u%c %4u %3u %-3s %5u " "%u\n", kaddr, ADDR_V4_WIDTH, ire->ire_addr, ADDR_V4_WIDTH, ire->ire_mask, ADDR_V4_WIDTH, gate, intf, - ire->ire_max_frag, ire->ire_frag_flag ? 
'*' : ' ', - ire->ire_uinfo.iulp_rtt, ire->ire_refcnt, flags, + 0, ' ', + ire->ire_metrics.iulp_rtt, ire->ire_refcnt, flags, ire->ire_ob_pkt_count, ire->ire_ib_pkt_count); } else { mdb_printf("%?p %-*I %-*I %-5s %4u %5u %s\n", kaddr, @@ -1025,7 +1001,10 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) if (ire->ire_ipversion != IPV6_VERSION) return (WALK_NEXT); - if (!(*opts & NETSTAT_ALL) && ire->ire_type == IRE_CACHE) + /* Skip certain IREs by default */ + if (!(*opts & NETSTAT_ALL) && + (ire->ire_type & + (IRE_BROADCAST|IRE_LOCAL|IRE_MULTICAST|IRE_NOROUTE|IRE_IF_CLONE))) return (WALK_NEXT); if (*opts & NETSTAT_FIRST) { @@ -1045,37 +1024,21 @@ netstat_irev6_cb(uintptr_t kaddr, const void *walk_data, void *cb_data) } } - gatep = (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) ? - &ire->ire_src_addr_v6 : &ire->ire_gateway_addr_v6; + gatep = &ire->ire_gateway_addr_v6; masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); (void) mdb_snprintf(deststr, sizeof (deststr), "%N/%d", &ire->ire_addr_v6, masklen); - (void) strcpy(flags, "U"); - if (ire->ire_type == IRE_DEFAULT || ire->ire_type == IRE_PREFIX || - ire->ire_type == IRE_HOST || ire->ire_type == IRE_HOST_REDIRECT) - (void) strcat(flags, "G"); - if (masklen == IPV6_ABITS) - (void) strcat(flags, "H"); - if (ire->ire_type == IRE_HOST_REDIRECT) - (void) strcat(flags, "D"); - if (ire->ire_type == IRE_CACHE) - (void) strcat(flags, "A"); - if (ire->ire_type == IRE_LOCAL) - (void) strcat(flags, "L"); - if (ire->ire_flags & RTF_MULTIRT) - (void) strcat(flags, "M"); - if (ire->ire_flags & RTF_SETSRC) - (void) strcat(flags, "S"); + get_ireflags(ire, flags); get_ifname(ire, intf); if (*opts & NETSTAT_VERBOSE) { mdb_printf("%?p %-*s %-*N %-5s %5u%c %5u %3u %-5s %6u %u\n", kaddr, ADDR_V6_WIDTH+4, deststr, ADDR_V6_WIDTH, gatep, - intf, ire->ire_max_frag, ire->ire_frag_flag ? 
'*' : ' ', - ire->ire_uinfo.iulp_rtt, ire->ire_refcnt, + intf, 0, ' ', + ire->ire_metrics.iulp_rtt, ire->ire_refcnt, flags, ire->ire_ob_pkt_count, ire->ire_ib_pkt_count); } else { mdb_printf("%?p %-*s %-*N %-5s %3u %6u %s\n", kaddr, diff --git a/usr/src/cmd/mdb/common/modules/genunix/net.h b/usr/src/cmd/mdb/common/modules/genunix/net.h index f2d441e78c..f72d75f75a 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/net.h +++ b/usr/src/cmd/mdb/common/modules/genunix/net.h @@ -30,7 +30,6 @@ extern "C" { #endif -extern struct mi_payload_walk_arg_s mi_ar_arg; extern struct mi_payload_walk_arg_s mi_icmp_arg; extern struct mi_payload_walk_arg_s mi_ill_arg; @@ -42,8 +41,6 @@ extern int mi_walk_step(mdb_walk_state_t *); extern void mi_walk_fini(mdb_walk_state_t *); extern int mi_payload_walk_init(mdb_walk_state_t *); extern int mi_payload_walk_step(mdb_walk_state_t *); -extern int ar_stacks_walk_init(mdb_walk_state_t *); -extern int ar_stacks_walk_step(mdb_walk_state_t *); extern int icmp_stacks_walk_init(mdb_walk_state_t *); extern int icmp_stacks_walk_step(mdb_walk_state_t *); extern int tcp_stacks_walk_init(mdb_walk_state_t *); diff --git a/usr/src/cmd/mdb/common/modules/genunix/streams.c b/usr/src/cmd/mdb/common/modules/genunix/streams.c index 0458589309..d0095c7752 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/streams.c +++ b/usr/src/cmd/mdb/common/modules/genunix/streams.c @@ -172,7 +172,6 @@ static const struct str_flags mbf[] = { { SF(0x08), "unused" }, { SF(MSGMARKNEXT), "Private: b_next's first byte marked" }, { SF(MSGNOTMARKNEXT), "Private: ... 
not marked" }, - { SF(MSGHASREF), "Private: msg has reference to owner" }, { 0, NULL, NULL } }; diff --git a/usr/src/cmd/mdb/common/modules/genunix/vfs.c b/usr/src/cmd/mdb/common/modules/genunix/vfs.c index 45dc27af23..8001c41b3c 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/vfs.c +++ b/usr/src/cmd/mdb/common/modules/genunix/vfs.c @@ -572,8 +572,9 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) sin_t *sin4; int scanned = 0; boolean_t skip_lback = B_FALSE; + conn_t *connp = sctp->sctp_connp; - addr->sa_family = sctp->sctp_family; + addr->sa_family = connp->conn_family; if (sctp->sctp_nsaddrs == 0) goto done; @@ -636,18 +637,18 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) continue; } - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: /* LINTED: alignment */ sin4 = (sin_t *)addr; if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { sin4->sin_addr.s_addr = INADDR_ANY; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; } else { sin4 += added; sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; IN6_V4MAPPED_TO_INADDR(&laddr, &sin4->sin_addr); } @@ -660,15 +661,14 @@ sctp_getsockaddr(sctp_t *sctp, struct sockaddr *addr) sctp->sctp_bound_to_all) { bzero(&sin6->sin6_addr, sizeof (sin6->sin6_addr)); - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; } else { sin6 += added; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; sin6->sin6_addr = laddr; } - sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; + sin6->sin6_flowinfo = connp->conn_flowinfo; sin6->sin6_scope_id = 0; sin6->__sin6_src_id = 0; break; @@ -712,11 +712,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) struct sockaddr_in6 *sin6; sctp_faddr_t sctp_primary; in6_addr_t faddr; + conn_t *connp = sctp->sctp_connp; if (sctp->sctp_faddrs == NULL) return (-1); - 
addr->sa_family = sctp->sctp_family; + addr->sa_family = connp->conn_family; if (mdb_vread(&sctp_primary, sizeof (sctp_faddr_t), (uintptr_t)sctp->sctp_primary) == -1) { mdb_warn("failed to read sctp primary faddr"); @@ -724,12 +725,12 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) } faddr = sctp_primary.faddr; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: /* LINTED: alignment */ sin4 = (struct sockaddr_in *)addr; IN6_V4MAPPED_TO_INADDR(&faddr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; @@ -737,7 +738,7 @@ sctp_getpeeraddr(sctp_t *sctp, struct sockaddr *addr) /* LINTED: alignment */ sin6 = (struct sockaddr_in6 *)addr; sin6->sin6_addr = faddr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; @@ -797,7 +798,7 @@ tcpip_sock_print(struct sonode *socknode) mdb_printf("socket: "); mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); - mdb_printf("AF_INET %I %d ", conn_t.conn_src, port); + mdb_printf("AF_INET %I %d ", conn_t.conn_laddr_v4, port); /* * If this is a listening socket, we don't print @@ -807,7 +808,8 @@ tcpip_sock_print(struct sonode *socknode) IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { mdb_printf("remote: "); mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); - mdb_printf("AF_INET %I %d ", conn_t.conn_rem, port); + mdb_printf("AF_INET %I %d ", conn_t.conn_faddr_v4, + port); } break; @@ -826,7 +828,7 @@ tcpip_sock_print(struct sonode *socknode) mdb_printf("socket: "); mdb_nhconvert(&port, &conn_t.conn_lport, sizeof (port)); - mdb_printf("AF_INET6 %N %d ", &conn_t.conn_srcv6, port); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_laddr_v4, port); /* * If this is a listening socket, we don't print @@ -836,7 +838,8 @@ tcpip_sock_print(struct sonode *socknode) IPCL_IS_UDP(&conn_t) && IPCL_IS_CONNECTED(&conn_t)) { 
mdb_printf("remote: "); mdb_nhconvert(&port, &conn_t.conn_fport, sizeof (port)); - mdb_printf("AF_INET6 %N %d ", &conn_t.conn_remv6, port); + mdb_printf("AF_INET6 %N %d ", &conn_t.conn_faddr_v6, + port); } break; @@ -854,6 +857,7 @@ static int sctp_sock_print(struct sonode *socknode) { sctp_t sctp_t; + conn_t conns; struct sockaddr *laddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP); struct sockaddr *faddr = mdb_alloc(sizeof (struct sockaddr), UM_SLEEP); @@ -864,6 +868,14 @@ sctp_sock_print(struct sonode *socknode) return (-1); } + if (mdb_vread(&conns, sizeof (conn_t), + (uintptr_t)sctp_t.sctp_connp) == -1) { + mdb_warn("failed to read conn_t at %p", + (uintptr_t)sctp_t.sctp_connp); + return (-1); + } + sctp_t.sctp_connp = &conns; + if (sctp_getsockaddr(&sctp_t, laddr) == 0) { mdb_printf("socket:"); pfiles_print_addr(laddr); diff --git a/usr/src/cmd/mdb/common/modules/ip/ip.c b/usr/src/cmd/mdb/common/modules/ip/ip.c index 28f21efe1f..da94942eae 100644 --- a/usr/src/cmd/mdb/common/modules/ip/ip.c +++ b/usr/src/cmd/mdb/common/modules/ip/ip.c @@ -52,6 +52,7 @@ #include <ilb/ilb_nat.h> #include <ilb/ilb_conn.h> #include <sys/dlpi.h> +#include <sys/zone.h> #include <mdb/mdb_modapi.h> #include <mdb/mdb_ks.h> @@ -84,15 +85,20 @@ typedef struct illif_walk_data { ill_if_t ill_if; } illif_walk_data_t; -typedef struct nce_walk_data_s { - struct ndp_g_s nce_ip_ndp; - int nce_hash_tbl_index; - nce_t nce; -} nce_walk_data_t; +typedef struct ncec_walk_data_s { + struct ndp_g_s ncec_ip_ndp; + int ncec_hash_tbl_index; + ncec_t ncec; +} ncec_walk_data_t; + +typedef struct ncec_cbdata_s { + uintptr_t ncec_addr; + int ncec_ipversion; +} ncec_cbdata_t; typedef struct nce_cbdata_s { - uintptr_t nce_addr; - int nce_ipversion; + int nce_ipversion; + char nce_ill_name[LIFNAMSIZ]; } nce_cbdata_t; typedef struct ire_cbdata_s { @@ -100,6 +106,12 @@ typedef struct ire_cbdata_s { boolean_t verbose; } ire_cbdata_t; +typedef struct zi_cbdata_s { + const char *zone_name; + ip_stack_t *ipst; + 
boolean_t shared_ip_zone; +} zi_cbdata_t; + typedef struct th_walk_data { uint_t thw_non_zero_only; boolean_t thw_match; @@ -122,6 +134,7 @@ typedef struct ill_walk_data_s { typedef struct ill_cbdata_s { uintptr_t ill_addr; int ill_ipversion; + ip_stack_t *ill_ipst; boolean_t verbose; } ill_cbdata_t; @@ -156,7 +169,7 @@ static hash_walk_arg_t bind_hash_arg = { }; static hash_walk_arg_t proto_hash_arg = { - OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout), + OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4), 0 }; @@ -210,13 +223,15 @@ static void ip_list_walk_fini(mdb_walk_state_t *); static int srcid_walk_step(mdb_walk_state_t *); static int ire_format(uintptr_t addr, const void *, void *); -static int nce_format(uintptr_t addr, const nce_t *nce, int ipversion); -static int nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); -static int nce_walk_step(mdb_walk_state_t *wsp); -static int nce_stack_walk_init(mdb_walk_state_t *wsp); -static int nce_stack_walk_step(mdb_walk_state_t *wsp); -static void nce_stack_walk_fini(mdb_walk_state_t *wsp); -static int nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id); +static int ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion); +static int ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv); +static int ncec_walk_step(mdb_walk_state_t *wsp); +static int ncec_stack_walk_init(mdb_walk_state_t *wsp); +static int ncec_stack_walk_step(mdb_walk_state_t *wsp); +static void ncec_stack_walk_fini(mdb_walk_state_t *wsp); +static int ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw, + ncec_cbdata_t *id); +static char *nce_l2_addr(const nce_t *, const ill_t *); static int ipcl_hash_walk_init(mdb_walk_state_t *); static int ipcl_hash_walk_step(mdb_walk_state_t *); @@ -262,6 +277,69 @@ ips_to_stackid(uintptr_t kaddr) return (nss.netstack_stackid); } +/* ARGSUSED */ +static int +zone_to_ips_cb(uintptr_t addr, const void *zi_arg, void *zi_cb_arg) +{ + zi_cbdata_t *zi_cb = zi_cb_arg; + 
zone_t zone; + char zone_name[ZONENAME_MAX]; + netstack_t ns; + + if (mdb_vread(&zone, sizeof (zone_t), addr) == -1) { + mdb_warn("can't read zone at %p", addr); + return (WALK_ERR); + } + + (void) mdb_readstr(zone_name, ZONENAME_MAX, (uintptr_t)zone.zone_name); + + if (strcmp(zi_cb->zone_name, zone_name) != 0) + return (WALK_NEXT); + + zi_cb->shared_ip_zone = (!(zone.zone_flags & ZF_NET_EXCL) && + (strcmp(zone_name, "global") != 0)); + + if (mdb_vread(&ns, sizeof (netstack_t), (uintptr_t)zone.zone_netstack) + == -1) { + mdb_warn("can't read netstack at %p", zone.zone_netstack); + return (WALK_ERR); + } + + zi_cb->ipst = ns.netstack_ip; + return (WALK_DONE); +} + +static ip_stack_t * +zone_to_ips(const char *zone_name) +{ + zi_cbdata_t zi_cb; + + if (zone_name == NULL) + return (NULL); + + zi_cb.zone_name = zone_name; + zi_cb.ipst = NULL; + zi_cb.shared_ip_zone = B_FALSE; + + if (mdb_walk("zone", (mdb_walk_cb_t)zone_to_ips_cb, &zi_cb) == -1) { + mdb_warn("failed to walk zone"); + return (NULL); + } + + if (zi_cb.shared_ip_zone) { + mdb_warn("%s is a Shared-IP zone, try '-s global' instead\n", + zone_name); + return (NULL); + } + + if (zi_cb.ipst == NULL) { + mdb_warn("failed to find zone %s\n", zone_name); + return (NULL); + } + + return (zi_cb.ipst); +} + int ip_stacks_walk_init(mdb_walk_state_t *wsp) { @@ -529,10 +607,10 @@ illif_help(void) } int -ire_walk_init(mdb_walk_state_t *wsp) +nce_walk_init(mdb_walk_state_t *wsp) { - if (mdb_layered_walk("ire_cache", wsp) == -1) { - mdb_warn("can't walk 'ire_cache'"); + if (mdb_layered_walk("nce_cache", wsp) == -1) { + mdb_warn("can't walk 'nce_cache'"); return (WALK_ERR); } @@ -540,60 +618,129 @@ ire_walk_init(mdb_walk_state_t *wsp) } int -ire_walk_step(mdb_walk_state_t *wsp) +nce_walk_step(mdb_walk_state_t *wsp) { - ire_t ire; + nce_t nce; - if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) { - mdb_warn("can't read ire at %p", wsp->walk_addr); + if (mdb_vread(&nce, sizeof (nce), wsp->walk_addr) == -1) { + 
mdb_warn("can't read nce at %p", wsp->walk_addr); return (WALK_ERR); } - return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata)); + return (wsp->walk_callback(wsp->walk_addr, &nce, wsp->walk_cbdata)); } +static int +nce_format(uintptr_t addr, const nce_t *ncep, void *nce_cb_arg) +{ + nce_cbdata_t *nce_cb = nce_cb_arg; + ill_t ill; + char ill_name[LIFNAMSIZ]; + ncec_t ncec; + + if (mdb_vread(&ncec, sizeof (ncec), + (uintptr_t)ncep->nce_common) == -1) { + mdb_warn("can't read ncec at %p", ncep->nce_common); + return (WALK_NEXT); + } + if (nce_cb->nce_ipversion != 0 && + ncec.ncec_ipversion != nce_cb->nce_ipversion) + return (WALK_NEXT); + + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncep->nce_ill) == -1) { + mdb_snprintf(ill_name, sizeof (ill_name), "--"); + } else { + (void) mdb_readstr(ill_name, + MIN(LIFNAMSIZ, ill.ill_name_length), + (uintptr_t)ill.ill_name); + } + + if (nce_cb->nce_ill_name[0] != '\0' && + strncmp(nce_cb->nce_ill_name, ill_name, LIFNAMSIZ) != 0) + return (WALK_NEXT); + + if (ncec.ncec_ipversion == IPV6_VERSION) { + + mdb_printf("%?p %5s %-18s %?p %6d %N\n", + addr, ill_name, + nce_l2_addr(ncep, &ill), + ncep->nce_fp_mp, + ncep->nce_refcnt, + &ncep->nce_addr); + + } else { + struct in_addr nceaddr; + + IN6_V4MAPPED_TO_INADDR(&ncep->nce_addr, &nceaddr); + mdb_printf("%?p %5s %-18s %?p %6d %I\n", + addr, ill_name, + nce_l2_addr(ncep, &ill), + ncep->nce_fp_mp, + ncep->nce_refcnt, + nceaddr.s_addr); + } + + return (WALK_NEXT); +} int -ire_ctable_walk_step(mdb_walk_state_t *wsp) +dce_walk_init(mdb_walk_state_t *wsp) { - uintptr_t kaddr; - irb_t *irb; - uint32_t cache_table_size; - int i; - ire_cbdata_t ire_cb; + wsp->walk_data = (void *)wsp->walk_addr; - ire_cb.verbose = B_FALSE; - ire_cb.ire_ipversion = 0; + if (mdb_layered_walk("dce_cache", wsp) == -1) { + mdb_warn("can't walk 'dce_cache'"); + return (WALK_ERR); + } + return (WALK_NEXT); +} - kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table_size); +int 
+dce_walk_step(mdb_walk_state_t *wsp) +{ + dce_t dce; - if (mdb_vread(&cache_table_size, sizeof (uint32_t), kaddr) == -1) { - mdb_warn("can't read ips_ip_cache_table at %p", kaddr); + if (mdb_vread(&dce, sizeof (dce), wsp->walk_addr) == -1) { + mdb_warn("can't read dce at %p", wsp->walk_addr); return (WALK_ERR); } - kaddr = wsp->walk_addr + OFFSETOF(ip_stack_t, ips_ip_cache_table); - if (mdb_vread(&kaddr, sizeof (kaddr), kaddr) == -1) { - mdb_warn("can't read ips_ip_cache_table at %p", kaddr); + /* If ip_stack_t is specified, skip DCEs that don't belong to it. */ + if ((wsp->walk_data != NULL) && (wsp->walk_data != dce.dce_ipst)) + return (WALK_NEXT); + + return (wsp->walk_callback(wsp->walk_addr, &dce, wsp->walk_cbdata)); +} + +int +ire_walk_init(mdb_walk_state_t *wsp) +{ + wsp->walk_data = (void *)wsp->walk_addr; + + if (mdb_layered_walk("ire_cache", wsp) == -1) { + mdb_warn("can't walk 'ire_cache'"); return (WALK_ERR); } - irb = mdb_alloc(sizeof (irb_t) * cache_table_size, UM_SLEEP|UM_GC); - if (mdb_vread(irb, sizeof (irb_t) * cache_table_size, kaddr) == -1) { - mdb_warn("can't read irb at %p", kaddr); + return (WALK_NEXT); +} + +int +ire_walk_step(mdb_walk_state_t *wsp) +{ + ire_t ire; + + if (mdb_vread(&ire, sizeof (ire), wsp->walk_addr) == -1) { + mdb_warn("can't read ire at %p", wsp->walk_addr); return (WALK_ERR); } - for (i = 0; i < cache_table_size; i++) { - kaddr = (uintptr_t)irb[i].irb_ire; - if (mdb_pwalk("ire_next", ire_format, &ire_cb, - kaddr) == -1) { - mdb_warn("can't walk 'ire_next' for ire %p", kaddr); - return (WALK_ERR); - } - } - return (WALK_NEXT); + /* If ip_stack_t is specified, skip IREs that don't belong to it. 
*/ + if ((wsp->walk_data != NULL) && (wsp->walk_data != ire.ire_ipst)) + return (WALK_NEXT); + + return (wsp->walk_callback(wsp->walk_addr, &ire, wsp->walk_cbdata)); } /* ARGSUSED */ @@ -633,6 +780,9 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) const ire_t *irep = ire_arg; ire_cbdata_t *ire_cb = ire_cb_arg; boolean_t verbose = ire_cb->verbose; + ill_t ill; + char ill_name[LIFNAMSIZ]; + boolean_t condemned = irep->ire_generation == IRE_GENERATION_CONDEMNED; static const mdb_bitmask_t tmasks[] = { { "BROADCAST", IRE_BROADCAST, IRE_BROADCAST }, @@ -640,22 +790,12 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) { "LOCAL", IRE_LOCAL, IRE_LOCAL }, { "LOOPBACK", IRE_LOOPBACK, IRE_LOOPBACK }, { "PREFIX", IRE_PREFIX, IRE_PREFIX }, - { "CACHE", IRE_CACHE, IRE_CACHE }, + { "MULTICAST", IRE_MULTICAST, IRE_MULTICAST }, + { "NOROUTE", IRE_NOROUTE, IRE_NOROUTE }, { "IF_NORESOLVER", IRE_IF_NORESOLVER, IRE_IF_NORESOLVER }, { "IF_RESOLVER", IRE_IF_RESOLVER, IRE_IF_RESOLVER }, + { "IF_CLONE", IRE_IF_CLONE, IRE_IF_CLONE }, { "HOST", IRE_HOST, IRE_HOST }, - { "HOST_REDIRECT", IRE_HOST_REDIRECT, IRE_HOST_REDIRECT }, - { NULL, 0, 0 } - }; - - static const mdb_bitmask_t mmasks[] = { - { "CONDEMNED", IRE_MARK_CONDEMNED, IRE_MARK_CONDEMNED }, - { "TESTHIDDEN", IRE_MARK_TESTHIDDEN, IRE_MARK_TESTHIDDEN }, - { "NOADD", IRE_MARK_NOADD, IRE_MARK_NOADD }, - { "TEMPORARY", IRE_MARK_TEMPORARY, IRE_MARK_TEMPORARY }, - { "USESRC", IRE_MARK_USESRC_CHECK, IRE_MARK_USESRC_CHECK }, - { "PRIVATE", IRE_MARK_PRIVATE_ADDR, IRE_MARK_PRIVATE_ADDR }, - { "UNCACHED", IRE_MARK_UNCACHED, IRE_MARK_UNCACHED }, { NULL, 0, 0 } }; @@ -678,6 +818,7 @@ ire_format(uintptr_t addr, const void *ire_arg, void *ire_cb_arg) { "PROTO1", RTF_PROTO1, RTF_PROTO1 }, { "MULTIRT", RTF_MULTIRT, RTF_MULTIRT }, { "SETSRC", RTF_SETSRC, RTF_SETSRC }, + { "INDIRECT", RTF_INDIRECT, RTF_INDIRECT }, { NULL, 0, 0 } }; @@ -685,40 +826,53 @@ ire_format(uintptr_t addr, const void *ire_arg, void 
*ire_cb_arg) irep->ire_ipversion != ire_cb->ire_ipversion) return (WALK_NEXT); + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)irep->ire_ill) == -1) { + mdb_snprintf(ill_name, sizeof (ill_name), "--"); + } else { + (void) mdb_readstr(ill_name, + MIN(LIFNAMSIZ, ill.ill_name_length), + (uintptr_t)ill.ill_name); + } + if (irep->ire_ipversion == IPV6_VERSION && verbose) { - mdb_printf("%<b>%?p%</b> %40N <%hb>\n" - "%?s %40N <%hb>\n" - "%?s %40d %4d <%hb>\n", - addr, &irep->ire_src_addr_v6, irep->ire_type, tmasks, - "", &irep->ire_addr_v6, (ushort_t)irep->ire_marks, mmasks, + mdb_printf("%<b>%?p%</b>%3s %40N <%hb%s>\n" + "%?s %40N\n" + "%?s %40d %4d <%hb> %s\n", + addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6, + irep->ire_type, tmasks, + (irep->ire_testhidden ? ", HIDDEN" : ""), + "", &irep->ire_addr_v6, "", ips_to_stackid((uintptr_t)irep->ire_ipst), irep->ire_zoneid, - irep->ire_flags, fmasks); + irep->ire_flags, fmasks, ill_name); } else if (irep->ire_ipversion == IPV6_VERSION) { - mdb_printf("%?p %30N %30N %5d %4d\n", - addr, &irep->ire_src_addr_v6, + mdb_printf("%?p%3s %30N %30N %5d %4d %s\n", + addr, condemned ? "(C)" : "", &irep->ire_setsrc_addr_v6, &irep->ire_addr_v6, ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid); + irep->ire_zoneid, ill_name); } else if (verbose) { - mdb_printf("%<b>%?p%</b> %40I <%hb>\n" - "%?s %40I <%hb>\n" - "%?s %40d %4d <%hb>\n", - addr, irep->ire_src_addr, irep->ire_type, tmasks, - "", irep->ire_addr, (ushort_t)irep->ire_marks, mmasks, + mdb_printf("%<b>%?p%</b>%3s %40I <%hb%s>\n" + "%?s %40I\n" + "%?s %40d %4d <%hb> %s\n", + addr, condemned ? "(C)" : "", irep->ire_setsrc_addr, + irep->ire_type, tmasks, + (irep->ire_testhidden ? 
", HIDDEN" : ""), + "", irep->ire_addr, "", ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid, irep->ire_flags, fmasks); + irep->ire_zoneid, irep->ire_flags, fmasks, ill_name); } else { - mdb_printf("%?p %30I %30I %5d %4d\n", addr, irep->ire_src_addr, + mdb_printf("%?p%3s %30I %30I %5d %4d %s\n", addr, + condemned ? "(C)" : "", irep->ire_setsrc_addr, irep->ire_addr, ips_to_stackid((uintptr_t)irep->ire_ipst), - irep->ire_zoneid); + irep->ire_zoneid, ill_name); } return (WALK_NEXT); @@ -1040,6 +1194,140 @@ ip6hdr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } int +nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + nce_t nce; + nce_cbdata_t nce_cb; + int ipversion = 0; + const char *opt_P = NULL, *opt_ill; + + if (mdb_getopts(argc, argv, + 'i', MDB_OPT_STR, &opt_ill, + 'P', MDB_OPT_STR, &opt_P, NULL) != argc) + return (DCMD_USAGE); + + if (opt_P != NULL) { + if (strcmp("v4", opt_P) == 0) { + ipversion = IPV4_VERSION; + } else if (strcmp("v6", opt_P) == 0) { + ipversion = IPV6_VERSION; + } else { + mdb_warn("invalid protocol '%s'\n", opt_P); + return (DCMD_USAGE); + } + } + + if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { + mdb_printf("%<u>%?s %5s %18s %?s %s %s %</u>\n", + "ADDR", "INTF", "LLADDR", "FP_MP", "REFCNT", + "NCE_ADDR"); + } + + bzero(&nce_cb, sizeof (nce_cb)); + if (opt_ill != NULL) { + strcpy(nce_cb.nce_ill_name, opt_ill); + } + nce_cb.nce_ipversion = ipversion; + + if (flags & DCMD_ADDRSPEC) { + (void) mdb_vread(&nce, sizeof (nce_t), addr); + (void) nce_format(addr, &nce, &nce_cb); + } else if (mdb_walk("nce", (mdb_walk_cb_t)nce_format, &nce_cb) == -1) { + mdb_warn("failed to walk ire table"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +/* ARGSUSED */ +static int +dce_format(uintptr_t addr, const dce_t *dcep, void *dce_cb_arg) +{ + static const mdb_bitmask_t dmasks[] = { + { "D", DCEF_DEFAULT, DCEF_DEFAULT }, + { "P", DCEF_PMTU, DCEF_PMTU }, + { "U", DCEF_UINFO, DCEF_UINFO }, + { "S", 
DCEF_TOO_SMALL_PMTU, DCEF_TOO_SMALL_PMTU }, + { NULL, 0, 0 } + }; + char flagsbuf[2 * A_CNT(dmasks)]; + int ipversion = *(int *)dce_cb_arg; + boolean_t condemned = dcep->dce_generation == DCE_GENERATION_CONDEMNED; + + if (ipversion != 0 && ipversion != dcep->dce_ipversion) + return (WALK_NEXT); + + mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%b", dcep->dce_flags, + dmasks); + + switch (dcep->dce_ipversion) { + case IPV4_VERSION: + mdb_printf("%<u>%?p%3s %8s %8d %30I %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v4addr); + break; + case IPV6_VERSION: + mdb_printf("%<u>%?p%3s %8s %8d %30N %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, &dcep->dce_v6addr); + break; + default: + mdb_printf("%<u>%?p%3s %8s %8d %30s %</u>\n", addr, condemned ? + "(C)" : "", flagsbuf, dcep->dce_pmtu, ""); + } + + return (WALK_NEXT); +} + +int +dce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) +{ + dce_t dce; + const char *opt_P = NULL; + const char *zone_name = NULL; + ip_stack_t *ipst = NULL; + int ipversion = 0; + + if (mdb_getopts(argc, argv, + 's', MDB_OPT_STR, &zone_name, + 'P', MDB_OPT_STR, &opt_P, NULL) != argc) + return (DCMD_USAGE); + + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + + if (opt_P != NULL) { + if (strcmp("v4", opt_P) == 0) { + ipversion = IPV4_VERSION; + } else if (strcmp("v6", opt_P) == 0) { + ipversion = IPV6_VERSION; + } else { + mdb_warn("invalid protocol '%s'\n", opt_P); + return (DCMD_USAGE); + } + } + + if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { + mdb_printf("%<u>%?s%3s %8s %8s %30s %</u>\n", + "ADDR", "", "FLAGS", "PMTU", "DST_ADDR"); + } + + if (flags & DCMD_ADDRSPEC) { + (void) mdb_vread(&dce, sizeof (dce_t), addr); + (void) dce_format(addr, &dce, &ipversion); + } else if (mdb_pwalk("dce", (mdb_walk_cb_t)dce_format, &ipversion, + (uintptr_t)ipst) == -1) { + mdb_warn("failed to walk dce cache"); + return (DCMD_ERR); + } + + return (DCMD_OK); +} + +int ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { uint_t verbose = FALSE; @@ -1047,12 +1335,22 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) ire_cbdata_t ire_cb; int ipversion = 0; const char *opt_P = NULL; + const char *zone_name = NULL; + ip_stack_t *ipst = NULL; if (mdb_getopts(argc, argv, 'v', MDB_OPT_SETBITS, TRUE, &verbose, + 's', MDB_OPT_STR, &zone_name, 'P', MDB_OPT_STR, &opt_P, NULL) != argc) return (DCMD_USAGE); + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + if (opt_P != NULL) { if (strcmp("v4", opt_P) == 0) { ipversion = IPV4_VERSION; @@ -1069,13 +1367,13 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (verbose) { mdb_printf("%?s %40s %-20s%\n" "%?s %40s %-20s%\n" - "%<u>%?s %40s %4s %-20s%</u>\n", + "%<u>%?s %40s %4s %-20s %s%</u>\n", "ADDR", "SRC", "TYPE", "", "DST", "MARKS", - "", "STACK", "ZONE", "FLAGS"); + "", "STACK", "ZONE", "FLAGS", "INTF"); } else { - mdb_printf("%<u>%?s %30s %30s %5s %4s%</u>\n", - "ADDR", "SRC", "DST", "STACK", "ZONE"); + mdb_printf("%<u>%?s %30s %30s %5s %4s %s%</u>\n", + "ADDR", "SRC", "DST", "STACK", "ZONE", "INTF"); } } @@ -1085,7 +1383,8 @@ ire(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (flags & DCMD_ADDRSPEC) { (void) mdb_vread(&ire, sizeof (ire_t), addr); (void) ire_format(addr, &ire, &ire_cb); - } else if (mdb_walk("ire", (mdb_walk_cb_t)ire_format, &ire_cb) == -1) { + } else if (mdb_pwalk("ire", (mdb_walk_cb_t)ire_format, &ire_cb, + (uintptr_t)ipst) == -1) { mdb_warn("failed to walk ire table"); return (DCMD_ERR); } @@ -1338,7 +1637,7 @@ th_trace(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) static void th_trace_help(void) { - mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or nce_t, " + mdb_printf("If given an address of an ill_t, ipif_t, ire_t, or ncec_t, " "print the\n" "corresponding th_trace_t structure in detail. 
Otherwise, if no " "address is\n" @@ -1354,8 +1653,8 @@ static const mdb_dcmd_t dcmds[] = { { "srcid_status", ":", "display connection structures from ipcl hash tables", srcid_status }, - { "ill", "?[-v] [-P v4 | v6]", "display ill_t structures", - ill, ill_help }, + { "ill", "?[-v] [-P v4 | v6] [-s exclusive-ip-zone-name]", + "display ill_t structures", ill, ill_help }, { "illif", "?[-P v4 | v6]", "display or filter IP Lower Level InterFace structures", illif, illif_help }, @@ -1363,10 +1662,14 @@ static const mdb_dcmd_t dcmds[] = { { "ip6hdr", ":[-vf]", "display an IPv6 header", ip6hdr }, { "ipif", "?[-v] [-P v4 | v6]", "display ipif structures", ipif, ipif_help }, - { "ire", "?[-v] [-P v4|v6]", + { "ire", "?[-v] [-P v4|v6] [-s exclusive-ip-zone-name]", "display Internet Route Entry structures", ire }, - { "nce", "?[-P v4 | v6]", "display Neighbor Cache Entry structures", - nce }, + { "nce", "?[-P v4|v6] [-i <interface>]", + "display interface-specific Neighbor Cache structures", nce }, + { "ncec", "?[-P v4 | v6]", "display Neighbor Cache Entry structures", + ncec }, + { "dce", "?[-P v4|v6] [-s exclusive-ip-zone-name]", + "display Destination Cache Entry structures", dce }, { "squeue", ":[-v]", "print core squeue_t info", squeue, ip_squeue_help }, { "tcphdr", ":", "display a TCP header", tcphdr }, @@ -1385,7 +1688,7 @@ static const mdb_walker_t walkers[] = { { "illif_stack", "walk list of ill interface types", illif_stack_walk_init, illif_stack_walk_step, illif_stack_walk_fini }, - { "ill", "walk list of nce structures for all stacks", + { "ill", "walk active ill_t structures for all stacks", ill_walk_init, ill_walk_step, NULL }, { "ipif", "walk list of ipif structures for all stacks", ipif_walk_init, ipif_walk_step, NULL }, @@ -1400,19 +1703,21 @@ static const mdb_walker_t walkers[] = { &srcid_walk_arg }, { "ire", "walk active ire_t structures", ire_walk_init, ire_walk_step, NULL }, - { "ire_ctable", "walk ire_t structures in the ctable", - 
ip_stacks_common_walk_init, ire_ctable_walk_step, NULL }, { "ire_next", "walk ire_t structures in the ctable", ire_next_walk_init, ire_next_walk_step, NULL }, + { "nce", "walk active nce_t structures", + nce_walk_init, nce_walk_step, NULL }, + { "dce", "walk active dce_t structures", + dce_walk_init, dce_walk_step, NULL }, { "ip_stacks", "walk all the ip_stack_t", ip_stacks_walk_init, ip_stacks_walk_step, NULL }, { "th_hash", "walk all the th_hash_t entries", th_hash_walk_init, th_hash_walk_step, NULL }, - { "nce", "walk list of nce structures for all stacks", - ip_stacks_common_walk_init, nce_walk_step, NULL }, - { "nce_stack", "walk list of nce structures", - nce_stack_walk_init, nce_stack_walk_step, - nce_stack_walk_fini}, + { "ncec", "walk list of ncec structures for all stacks", + ip_stacks_common_walk_init, ncec_walk_step, NULL }, + { "ncec_stack", "walk list of ncec structures", + ncec_stack_walk_init, ncec_stack_walk_step, + ncec_stack_walk_fini}, { "udp_hash", "walk list of conn_t structures in ips_ipcl_udp_fanout", ipcl_hash_walk_init, ipcl_hash_walk_step, ipcl_hash_walk_fini, &udp_hash_arg}, @@ -1471,9 +1776,9 @@ _mdb_fini(void) } static char * -nce_state(int nce_state) +ncec_state(int ncec_state) { - switch (nce_state) { + switch (ncec_state) { case ND_UNCHANGED: return ("unchanged"); case ND_INCOMPLETE: @@ -1496,36 +1801,61 @@ nce_state(int nce_state) } static char * -nce_l2_addr(const nce_t *nce, const ill_t *ill) +ncec_l2_addr(const ncec_t *ncec, const ill_t *ill) { uchar_t *h; static char addr_buf[L2MAXADDRSTRLEN]; - mblk_t mp; - size_t mblen; - if (ill->ill_flags & ILLF_XRESOLV) { - return ("XRESOLV"); + if (ncec->ncec_lladdr == NULL) { + return ("None"); } - if (nce->nce_res_mp == NULL) { + if (ill->ill_net_type == IRE_IF_RESOLVER) { + + if (ill->ill_phys_addr_length == 0) + return ("None"); + h = mdb_zalloc(ill->ill_phys_addr_length, UM_SLEEP); + if (mdb_vread(h, ill->ill_phys_addr_length, + (uintptr_t)ncec->ncec_lladdr) == -1) { + 
mdb_warn("failed to read hwaddr at %p", + ncec->ncec_lladdr); + return ("Unknown"); + } + mdb_mac_addr(h, ill->ill_phys_addr_length, + addr_buf, sizeof (addr_buf)); + } else { return ("None"); } + mdb_free(h, ill->ill_phys_addr_length); + return (addr_buf); +} - if (ill->ill_net_type == IRE_IF_RESOLVER) { +static char * +nce_l2_addr(const nce_t *nce, const ill_t *ill) +{ + uchar_t *h; + static char addr_buf[L2MAXADDRSTRLEN]; + mblk_t mp; + size_t mblen; + + if (nce->nce_dlur_mp == NULL) + return ("None"); + if (ill->ill_net_type == IRE_IF_RESOLVER) { if (mdb_vread(&mp, sizeof (mblk_t), - (uintptr_t)nce->nce_res_mp) == -1) { - mdb_warn("failed to read nce_res_mp at %p", - nce->nce_res_mp); + (uintptr_t)nce->nce_dlur_mp) == -1) { + mdb_warn("failed to read nce_dlur_mp at %p", + nce->nce_dlur_mp); + return ("None"); } - - if (ill->ill_nd_lla_len == 0) + if (ill->ill_phys_addr_length == 0) return ("None"); mblen = mp.b_wptr - mp.b_rptr; if (mblen > (sizeof (dl_unitdata_req_t) + MAX_SAP_LEN) || - ill->ill_nd_lla_len > MAX_SAP_LEN || - NCE_LL_ADDR_OFFSET(ill) + ill->ill_nd_lla_len > mblen) { - return ("Truncated"); + ill->ill_phys_addr_length > MAX_SAP_LEN || + (NCE_LL_ADDR_OFFSET(ill) + + ill->ill_phys_addr_length) > mblen) { + return ("Unknown"); } h = mdb_zalloc(mblen, UM_SLEEP); if (mdb_vread(h, mblen, (uintptr_t)(mp.b_rptr)) == -1) { @@ -1533,8 +1863,8 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill) mp.b_rptr + NCE_LL_ADDR_OFFSET(ill)); return ("Unknown"); } - mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len, - addr_buf, sizeof (addr_buf)); + mdb_mac_addr(h + NCE_LL_ADDR_OFFSET(ill), + ill->ill_phys_addr_length, addr_buf, sizeof (addr_buf)); } else { return ("None"); } @@ -1543,7 +1873,7 @@ nce_l2_addr(const nce_t *nce, const ill_t *ill) } static void -nce_header(uint_t flags) +ncec_header(uint_t flags) { if ((flags & DCMD_LOOPFIRST) || !(flags & DCMD_LOOP)) { @@ -1553,10 +1883,10 @@ nce_header(uint_t flags) } int -nce(uintptr_t addr, uint_t flags, 
int argc, const mdb_arg_t *argv) +ncec(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - nce_t nce; - nce_cbdata_t id; + ncec_t ncec; + ncec_cbdata_t id; int ipversion = 0; const char *opt_P = NULL; @@ -1577,23 +1907,23 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (flags & DCMD_ADDRSPEC) { - if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce at %p\n", addr); + if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec at %p\n", addr); return (DCMD_ERR); } - if (ipversion != 0 && nce.nce_ipversion != ipversion) { + if (ipversion != 0 && ncec.ncec_ipversion != ipversion) { mdb_printf("IP Version mismatch\n"); return (DCMD_ERR); } - nce_header(flags); - return (nce_format(addr, &nce, ipversion)); + ncec_header(flags); + return (ncec_format(addr, &ncec, ipversion)); } else { - id.nce_addr = addr; - id.nce_ipversion = ipversion; - nce_header(flags); - if (mdb_walk("nce", (mdb_walk_cb_t)nce_cb, &id) == -1) { - mdb_warn("failed to walk nce table\n"); + id.ncec_addr = addr; + id.ncec_ipversion = ipversion; + ncec_header(flags); + if (mdb_walk("ncec", (mdb_walk_cb_t)ncec_cb, &id) == -1) { + mdb_warn("failed to walk ncec table\n"); return (DCMD_ERR); } } @@ -1601,10 +1931,10 @@ nce(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) } static int -nce_format(uintptr_t addr, const nce_t *nce, int ipversion) +ncec_format(uintptr_t addr, const ncec_t *ncec, int ipversion) { - static const mdb_bitmask_t nce_flags[] = { - { "P", NCE_F_PERMANENT, NCE_F_PERMANENT }, + static const mdb_bitmask_t ncec_flags[] = { + { "P", NCE_F_NONUD, NCE_F_NONUD }, { "R", NCE_F_ISROUTER, NCE_F_ISROUTER }, { "N", NCE_F_NONUD, NCE_F_NONUD }, { "A", NCE_F_ANYCAST, NCE_F_ANYCAST }, @@ -1613,15 +1943,15 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion) { "B", NCE_F_BCAST, NCE_F_BCAST }, { NULL, 0, 0 } }; -#define NCE_MAX_FLAGS (sizeof (nce_flags) / sizeof (mdb_bitmask_t)) +#define 
NCE_MAX_FLAGS (sizeof (ncec_flags) / sizeof (mdb_bitmask_t)) struct in_addr nceaddr; ill_t ill; char ill_name[LIFNAMSIZ]; char flagsbuf[NCE_MAX_FLAGS]; - if (mdb_vread(&ill, sizeof (ill), (uintptr_t)nce->nce_ill) == -1) { - mdb_warn("failed to read nce_ill at %p", - nce->nce_ill); + if (mdb_vread(&ill, sizeof (ill), (uintptr_t)ncec->ncec_ill) == -1) { + mdb_warn("failed to read ncec_ill at %p", + ncec->ncec_ill); return (DCMD_ERR); } @@ -1629,33 +1959,33 @@ nce_format(uintptr_t addr, const nce_t *nce, int ipversion) (uintptr_t)ill.ill_name); mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%hb", - nce->nce_flags, nce_flags); + ncec->ncec_flags, ncec_flags); - if (ipversion != 0 && nce->nce_ipversion != ipversion) + if (ipversion != 0 && ncec->ncec_ipversion != ipversion) return (DCMD_OK); - if (nce->nce_ipversion == IPV4_VERSION) { - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); + if (ncec->ncec_ipversion == IPV4_VERSION) { + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nceaddr); mdb_printf("%?p %-20s %-10s " "%-8s " "%-5s %I\n", - addr, nce_l2_addr(nce, &ill), - nce_state(nce->nce_state), + addr, ncec_l2_addr(ncec, &ill), + ncec_state(ncec->ncec_state), flagsbuf, ill_name, nceaddr.s_addr); } else { mdb_printf("%?p %-20s %-10s %-8s %-5s %N\n", - addr, nce_l2_addr(nce, &ill), - nce_state(nce->nce_state), + addr, ncec_l2_addr(ncec, &ill), + ncec_state(ncec->ncec_state), flagsbuf, - ill_name, &nce->nce_addr); + ill_name, &ncec->ncec_addr); } return (DCMD_OK); } static uintptr_t -nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) +ncec_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) { uintptr_t addr = start; int i = *index; @@ -1671,7 +2001,7 @@ nce_get_next_hash_tbl(uintptr_t start, int *index, struct ndp_g_s ndp) } static int -nce_walk_step(mdb_walk_state_t *wsp) +ncec_walk_step(mdb_walk_state_t *wsp) { uintptr_t kaddr4, kaddr6; @@ -1686,15 +2016,15 @@ nce_walk_step(mdb_walk_state_t *wsp) mdb_warn("can't read ips_ip_cache_table at 
%p", kaddr6); return (WALK_ERR); } - if (mdb_pwalk("nce_stack", wsp->walk_callback, wsp->walk_cbdata, + if (mdb_pwalk("ncec_stack", wsp->walk_callback, wsp->walk_cbdata, kaddr4) == -1) { - mdb_warn("couldn't walk 'nce_stack' for ips_ndp4 %p", + mdb_warn("couldn't walk 'ncec_stack' for ips_ndp4 %p", kaddr4); return (WALK_ERR); } - if (mdb_pwalk("nce_stack", wsp->walk_callback, + if (mdb_pwalk("ncec_stack", wsp->walk_callback, wsp->walk_cbdata, kaddr6) == -1) { - mdb_warn("couldn't walk 'nce_stack' for ips_ndp6 %p", + mdb_warn("couldn't walk 'ncec_stack' for ips_ndp6 %p", kaddr6); return (WALK_ERR); } @@ -1743,7 +2073,7 @@ ipcl_hash_walk_init(mdb_walk_state_t *wsp) mdb_free(iw, sizeof (ipcl_hash_walk_data_t)); return (WALK_ERR); } - if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout) || + if (arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v4) || arg->tbl_off == OFFSETOF(ip_stack_t, ips_ipcl_proto_fanout_v6)) { iw->hash_tbl_size = IPPROTO_MAX; } else { @@ -1809,72 +2139,75 @@ ipcl_hash_walk_fini(mdb_walk_state_t *wsp) * Called with walk_addr being the address of ips_ndp{4,6} */ static int -nce_stack_walk_init(mdb_walk_state_t *wsp) +ncec_stack_walk_init(mdb_walk_state_t *wsp) { - nce_walk_data_t *nw; + ncec_walk_data_t *nw; if (wsp->walk_addr == NULL) { - mdb_warn("nce_stack requires ndp_g_s address\n"); + mdb_warn("ncec_stack requires ndp_g_s address\n"); return (WALK_ERR); } - nw = mdb_alloc(sizeof (nce_walk_data_t), UM_SLEEP); + nw = mdb_alloc(sizeof (ncec_walk_data_t), UM_SLEEP); - if (mdb_vread(&nw->nce_ip_ndp, sizeof (struct ndp_g_s), + if (mdb_vread(&nw->ncec_ip_ndp, sizeof (struct ndp_g_s), wsp->walk_addr) == -1) { mdb_warn("failed to read 'ip_ndp' at %p", wsp->walk_addr); - mdb_free(nw, sizeof (nce_walk_data_t)); + mdb_free(nw, sizeof (ncec_walk_data_t)); return (WALK_ERR); } - nw->nce_hash_tbl_index = 0; - wsp->walk_addr = nce_get_next_hash_tbl(NULL, - &nw->nce_hash_tbl_index, nw->nce_ip_ndp); + /* + * ncec_get_next_hash_tbl() starts 
at ++i , so initialize index to -1 + */ + nw->ncec_hash_tbl_index = -1; + wsp->walk_addr = ncec_get_next_hash_tbl(NULL, + &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp); wsp->walk_data = nw; return (WALK_NEXT); } static int -nce_stack_walk_step(mdb_walk_state_t *wsp) +ncec_stack_walk_step(mdb_walk_state_t *wsp) { uintptr_t addr = wsp->walk_addr; - nce_walk_data_t *nw = wsp->walk_data; + ncec_walk_data_t *nw = wsp->walk_data; if (addr == NULL) return (WALK_DONE); - if (mdb_vread(&nw->nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce_t at %p", addr); + if (mdb_vread(&nw->ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec_t at %p", addr); return (WALK_ERR); } - wsp->walk_addr = (uintptr_t)nw->nce.nce_next; + wsp->walk_addr = (uintptr_t)nw->ncec.ncec_next; - wsp->walk_addr = nce_get_next_hash_tbl(wsp->walk_addr, - &nw->nce_hash_tbl_index, nw->nce_ip_ndp); + wsp->walk_addr = ncec_get_next_hash_tbl(wsp->walk_addr, + &nw->ncec_hash_tbl_index, nw->ncec_ip_ndp); return (wsp->walk_callback(addr, nw, wsp->walk_cbdata)); } static void -nce_stack_walk_fini(mdb_walk_state_t *wsp) +ncec_stack_walk_fini(mdb_walk_state_t *wsp) { - mdb_free(wsp->walk_data, sizeof (nce_walk_data_t)); + mdb_free(wsp->walk_data, sizeof (ncec_walk_data_t)); } /* ARGSUSED */ static int -nce_cb(uintptr_t addr, const nce_walk_data_t *iw, nce_cbdata_t *id) +ncec_cb(uintptr_t addr, const ncec_walk_data_t *iw, ncec_cbdata_t *id) { - nce_t nce; + ncec_t ncec; - if (mdb_vread(&nce, sizeof (nce_t), addr) == -1) { - mdb_warn("failed to read nce at %p", addr); + if (mdb_vread(&ncec, sizeof (ncec_t), addr) == -1) { + mdb_warn("failed to read ncec at %p", addr); return (WALK_NEXT); } - (void) nce_format(addr, &nce, id->nce_ipversion); + (void) ncec_format(addr, &ncec, id->ncec_ipversion); return (WALK_NEXT); } @@ -1918,6 +2251,11 @@ ill_cb(uintptr_t addr, const ill_walk_data_t *iw, ill_cbdata_t *id) mdb_warn("failed to read ill at %p", addr); return (WALK_NEXT); } + + /* If ip_stack_t 
is specified, skip ILLs that don't belong to it. */ + if (id->ill_ipst != NULL && ill.ill_ipst != id->ill_ipst) + return (WALK_NEXT); + return (ill_format((uintptr_t)addr, &ill, id)); } @@ -2013,7 +2351,7 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg) break; } cnt = ill->ill_refcnt + ill->ill_ire_cnt + ill->ill_nce_cnt + - ill->ill_ilm_walker_cnt + ill->ill_ilm_cnt; + ill->ill_ilm_cnt + ill->ill_ncec_cnt; mdb_printf("%-?p %-8s %-3s ", addr, ill_name, ill->ill_isv6 ? "v6" : "v4"); if (typebuf != NULL) @@ -2035,11 +2373,10 @@ ill_format(uintptr_t addr, const void *illptr, void *ill_cb_arg) strlen(sbuf), "", ill->ill_ire_cnt, "ill_ire_cnt"); mdb_printf("%*s %7d %-18s nces referencing this ill\n", strlen(sbuf), "", ill->ill_nce_cnt, "ill_nce_cnt"); + mdb_printf("%*s %7d %-18s ncecs referencing this ill\n", + strlen(sbuf), "", ill->ill_ncec_cnt, "ill_ncec_cnt"); mdb_printf("%*s %7d %-18s ilms referencing this ill\n", strlen(sbuf), "", ill->ill_ilm_cnt, "ill_ilm_cnt"); - mdb_printf("%*s %7d %-18s active ilm walkers\n\n", - strlen(sbuf), "", ill->ill_ilm_walker_cnt, - "ill_ilm_walker_cnt"); } else { mdb_printf("%4d %-?p %-llb\n", cnt, ill->ill_wq, @@ -2054,14 +2391,24 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) ill_t ill_data; ill_cbdata_t id; int ipversion = 0; + const char *zone_name = NULL; const char *opt_P = NULL; uint_t verbose = FALSE; + ip_stack_t *ipst = NULL; if (mdb_getopts(argc, argv, 'v', MDB_OPT_SETBITS, TRUE, &verbose, + 's', MDB_OPT_STR, &zone_name, 'P', MDB_OPT_STR, &opt_P, NULL) != argc) return (DCMD_USAGE); + /* Follow the specified zone name to find a ip_stack_t*. 
*/ + if (zone_name != NULL) { + ipst = zone_to_ips(zone_name); + if (ipst == NULL) + return (DCMD_USAGE); + } + if (opt_P != NULL) { if (strcmp("v4", opt_P) == 0) { ipversion = IPV4_VERSION; @@ -2076,6 +2423,7 @@ ill(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) id.verbose = verbose; id.ill_addr = addr; id.ill_ipversion = ipversion; + id.ill_ipst = ipst; ill_header(verbose); if (flags & DCMD_ADDRSPEC) { @@ -2254,7 +2602,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) { "CO", IPIF_CONDEMNED, IPIF_CONDEMNED}, { "CH", IPIF_CHANGING, IPIF_CHANGING}, { "SL", IPIF_SET_LINKLOCAL, IPIF_SET_LINKLOCAL}, - { "ZS", IPIF_ZERO_SOURCE, IPIF_ZERO_SOURCE}, { NULL, 0, 0 } }; static const mdb_bitmask_t fmasks[] = { @@ -2299,16 +2646,14 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) } mdb_snprintf(bitfields, sizeof (bitfields), "%s", ipif->ipif_addr_ready ? ",ADR" : "", - ipif->ipif_multicast_up ? ",MU" : "", ipif->ipif_was_up ? ",WU" : "", - ipif->ipif_was_dup ? ",WD" : "", - ipif->ipif_joined_allhosts ? ",JA" : ""); + ipif->ipif_was_dup ? 
",WD" : ""); mdb_snprintf(flagsbuf, sizeof (flagsbuf), "%llb%s", ipif->ipif_flags, fmasks, bitfields); mdb_snprintf(sflagsbuf, sizeof (sflagsbuf), "%b", ipif->ipif_state_flags, sfmasks); - cnt = ipif->ipif_refcnt + ipif->ipif_ire_cnt + ipif->ipif_ilm_cnt; + cnt = ipif->ipif_refcnt; if (ipifcb->ill.ill_isv6) { mdb_snprintf(addrstr, sizeof (addrstr), "%N", @@ -2329,12 +2674,6 @@ ipif_format(uintptr_t addr, const void *ipifptr, void *ipif_cb_arg) mdb_printf("%s |\n%s +---> %4d %-15s " "Active consistent reader cnt\n", sbuf, sbuf, ipif->ipif_refcnt, "ipif_refcnt"); - mdb_printf("%*s %10d %-15s " - "Number of ire's referencing this ipif\n", - strlen(sbuf), "", ipif->ipif_ire_cnt, "ipif_ire_cnt"); - mdb_printf("%*s %10d %-15s " - "Number of ilm's referencing this ipif\n\n", - strlen(sbuf), "", ipif->ipif_ilm_cnt, "ipif_ilm_cnt"); mdb_printf("%-s/%d\n", addrstr, mask_to_prefixlen(af, &ipif->ipif_v6net_mask)); if (ipifcb->ill.ill_isv6) { @@ -2473,16 +2812,16 @@ conn_status_cb(uintptr_t addr, const void *walk_data, mdb_printf("%-?p %-?p %?d %?d\n", addr, conn->conn_wq, nss.netstack_stackid, conn->conn_zoneid); - if (conn->conn_af_isv6) { + if (conn->conn_family == AF_INET6) { mdb_snprintf(src_addrstr, sizeof (rem_addrstr), "%N", - &conn->conn_srcv6); + &conn->conn_laddr_v6); mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%N", - &conn->conn_remv6); + &conn->conn_faddr_v6); } else { mdb_snprintf(src_addrstr, sizeof (src_addrstr), "%I", - V4_PART_OF_V6((conn->conn_srcv6))); + V4_PART_OF_V6((conn->conn_laddr_v6))); mdb_snprintf(rem_addrstr, sizeof (rem_addrstr), "%I", - V4_PART_OF_V6((conn->conn_remv6))); + V4_PART_OF_V6((conn->conn_faddr_v6))); } mdb_printf("%s:%-5d\n%s:%-5d\n", src_addrstr, conn->conn_lport, rem_addrstr, conn->conn_fport); @@ -2519,7 +2858,7 @@ conn_status_help(void) { mdb_printf("Prints conn_t structures from the following hash tables: " "\n\tips_ipcl_udp_fanout\n\tips_ipcl_bind_fanout" - "\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout" + 
"\n\tips_ipcl_conn_fanout\n\tips_ipcl_proto_fanout_v4" "\n\tips_ipcl_proto_fanout_v6\n"); } diff --git a/usr/src/cmd/mdb/common/modules/sctp/sctp.c b/usr/src/cmd/mdb/common/modules/sctp/sctp.c index 05f0c385c8..4165a56ca4 100644 --- a/usr/src/cmd/mdb/common/modules/sctp/sctp.c +++ b/usr/src/cmd/mdb/common/modules/sctp/sctp.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/mdb_modapi.h> @@ -164,7 +162,7 @@ sctp_faddr(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("lastactive\t%?ld\thb_secret\t%?#lx\n", fa->lastactive, fa->hb_secret); mdb_printf("rxt_unacked\t%?u\n", fa->rxt_unacked); - mdb_printf("timer_mp\t%?p\tire\t\t%?p\n", fa->timer_mp, fa->ire); + mdb_printf("timer_mp\t%?p\tixa\t\t%?p\n", fa->timer_mp, fa->ixa); mdb_printf("hb_enabled\t%?d\thb_pending\t%?d\n" "timer_running\t%?d\tdf\t\t%?d\n" "pmtu_discovered\t%?d\tisv4\t\t%?d\n" @@ -566,11 +564,12 @@ show_sctp_flags(sctp_t *sctp) { mdb_printf("\tunderstands_asconf\t%d\n", sctp->sctp_understands_asconf); - mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_debug); + mdb_printf("\tdebug\t\t\t%d\n", sctp->sctp_connp->conn_debug); mdb_printf("\tcchunk_pend\t\t%d\n", sctp->sctp_cchunk_pend); - mdb_printf("\tdgram_errind\t\t%d\n", sctp->sctp_dgram_errind); + mdb_printf("\tdgram_errind\t\t%d\n", + sctp->sctp_connp->conn_dgram_errind); - mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_linger); + mdb_printf("\tlinger\t\t\t%d\n", sctp->sctp_connp->conn_linger); if (sctp->sctp_lingering) return; mdb_printf("\tlingering\t\t%d\n", sctp->sctp_lingering); @@ -578,7 +577,8 @@ show_sctp_flags(sctp_t *sctp) mdb_printf("\tforce_sack\t\t%d\n", sctp->sctp_force_sack); mdb_printf("\tack_timer_runing\t%d\n", sctp->sctp_ack_timer_running); - mdb_printf("\trecvdstaddr\t\t%d\n", 
sctp->sctp_recvdstaddr); + mdb_printf("\trecvdstaddr\t\t%d\n", + sctp->sctp_connp->conn_recv_ancillary.crb_recvdstaddr); mdb_printf("\thwcksum\t\t\t%d\n", sctp->sctp_hwcksum); mdb_printf("\tunderstands_addip\t%d\n", sctp->sctp_understands_addip); @@ -654,8 +654,8 @@ print_saddr(uintptr_t ptr, const void *addr, void *cbdata) if (saddr->saddr_ipif_delete_pending == 1) mdb_printf("/DeletePending"); mdb_printf(")\n"); - mdb_printf("\t\t\tMTU %d id %d zoneid %d IPIF flags %x\n", - ipif.sctp_ipif_mtu, ipif.sctp_ipif_id, + mdb_printf("\t\t\tid %d zoneid %d IPIF flags %x\n", + ipif.sctp_ipif_id, ipif.sctp_ipif_zoneid, ipif.sctp_ipif_flags); return (WALK_NEXT); } @@ -682,8 +682,8 @@ print_faddr(uintptr_t ptr, const void *addr, void *cbdata) int sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) { - sctp_t sctp; - conn_t connp; + sctp_t sctps, *sctp; + conn_t conns, *connp; int i; uint_t opts = 0; uint_t paddr = 0; @@ -692,16 +692,23 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) if (!(flags & DCMD_ADDRSPEC)) return (DCMD_USAGE); - if (mdb_vread(&sctp, sizeof (sctp), addr) == -1) { + if (mdb_vread(&sctps, sizeof (sctps), addr) == -1) { mdb_warn("failed to read sctp_t at: %p\n", addr); return (DCMD_ERR); } - if (mdb_vread(&connp, sizeof (connp), - (uintptr_t)sctp.sctp_connp) == -1) { - mdb_warn("failed to read conn_t at: %p\n", sctp.sctp_connp); + sctp = &sctps; + + if (mdb_vread(&conns, sizeof (conns), + (uintptr_t)sctp->sctp_connp) == -1) { + mdb_warn("failed to read conn_t at: %p\n", sctp->sctp_connp); return (DCMD_ERR); } + connp = &conns; + + connp->conn_sctp = sctp; + sctp->sctp_connp = connp; + if (mdb_getopts(argc, argv, 'a', MDB_OPT_SETBITS, MDB_SCTP_SHOW_ALL, &opts, 'f', MDB_OPT_SETBITS, MDB_SCTP_SHOW_FLAGS, &opts, @@ -726,7 +733,7 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) /* non-verbose faddrs, suitable for pipelines to sctp_faddr */ if (paddr != 0) { sctp_faddr_t faddr, *fp; - for (fp = 
sctp.sctp_faddrs; fp != NULL; fp = faddr.next) { + for (fp = sctp->sctp_faddrs; fp != NULL; fp = faddr.next) { if (mdb_vread(&faddr, sizeof (faddr), (uintptr_t)fp) == -1) { mdb_warn("failed to read faddr at %p", @@ -738,16 +745,16 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) return (DCMD_OK); } - mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport)); - mdb_nhconvert(&fport, &sctp.sctp_fport, sizeof (fport)); + mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport)); + mdb_nhconvert(&fport, &connp->conn_fport, sizeof (fport)); mdb_printf("%<u>%p% %22s S=%-6hu D=%-6hu% STACK=%d ZONE=%d%</u>", addr, - state2str(&sctp), lport, fport, - ns_to_stackid((uintptr_t)connp.conn_netstack), connp.conn_zoneid); + state2str(sctp), lport, fport, + ns_to_stackid((uintptr_t)connp->conn_netstack), connp->conn_zoneid); - if (sctp.sctp_faddrs) { + if (sctp->sctp_faddrs) { sctp_faddr_t faddr; if (mdb_vread(&faddr, sizeof (faddr), - (uintptr_t)sctp.sctp_faddrs) != -1) + (uintptr_t)sctp->sctp_faddrs) != -1) mdb_printf("%<u> %N%</u>", &faddr.faddr); } mdb_printf("\n"); @@ -756,78 +763,78 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%<b>Local and Peer Addresses%</b>\n"); /* Display source addresses */ - mdb_printf("nsaddrs\t\t%?d\n", sctp.sctp_nsaddrs); + mdb_printf("nsaddrs\t\t%?d\n", sctp->sctp_nsaddrs); (void) mdb_pwalk("sctp_walk_saddr", print_saddr, NULL, addr); /* Display peer addresses */ - mdb_printf("nfaddrs\t\t%?d\n", sctp.sctp_nfaddrs); + mdb_printf("nfaddrs\t\t%?d\n", sctp->sctp_nfaddrs); i = 1; (void) mdb_pwalk("sctp_walk_faddr", print_faddr, &i, addr); mdb_printf("lastfaddr\t%?p\tprimary\t\t%?p\n", - sctp.sctp_lastfaddr, sctp.sctp_primary); + sctp->sctp_lastfaddr, sctp->sctp_primary); mdb_printf("current\t\t%?p\tlastdata\t%?p\n", - sctp.sctp_current, sctp.sctp_lastdata); + sctp->sctp_current, sctp->sctp_lastdata); } if (opts & MDB_SCTP_SHOW_OUT) { mdb_printf("%<b>Outbound Data%</b>\n"); 
mdb_printf("xmit_head\t%?p\txmit_tail\t%?p\n", - sctp.sctp_xmit_head, sctp.sctp_xmit_tail); + sctp->sctp_xmit_head, sctp->sctp_xmit_tail); mdb_printf("xmit_unsent\t%?p\txmit_unsent_tail%?p\n", - sctp.sctp_xmit_unsent, sctp.sctp_xmit_unsent_tail); - mdb_printf("xmit_unacked\t%?p\n", sctp.sctp_xmit_unacked); + sctp->sctp_xmit_unsent, sctp->sctp_xmit_unsent_tail); + mdb_printf("xmit_unacked\t%?p\n", sctp->sctp_xmit_unacked); mdb_printf("unacked\t\t%?u\tunsent\t\t%?ld\n", - sctp.sctp_unacked, sctp.sctp_unsent); + sctp->sctp_unacked, sctp->sctp_unsent); mdb_printf("ltsn\t\t%?x\tlastack_rxd\t%?x\n", - sctp.sctp_ltsn, sctp.sctp_lastack_rxd); + sctp->sctp_ltsn, sctp->sctp_lastack_rxd); mdb_printf("recovery_tsn\t%?x\tadv_pap\t\t%?x\n", - sctp.sctp_recovery_tsn, sctp.sctp_adv_pap); + sctp->sctp_recovery_tsn, sctp->sctp_adv_pap); mdb_printf("num_ostr\t%?hu\tostrcntrs\t%?p\n", - sctp.sctp_num_ostr, sctp.sctp_ostrcntrs); + sctp->sctp_num_ostr, sctp->sctp_ostrcntrs); mdb_printf("pad_mp\t\t%?p\terr_chunks\t%?p\n", - sctp.sctp_pad_mp, sctp.sctp_err_chunks); - mdb_printf("err_len\t\t%?u\n", sctp.sctp_err_len); + sctp->sctp_pad_mp, sctp->sctp_err_chunks); + mdb_printf("err_len\t\t%?u\n", sctp->sctp_err_len); mdb_printf("%<b>Default Send Parameters%</b>\n"); mdb_printf("def_stream\t%?u\tdef_flags\t%?x\n", - sctp.sctp_def_stream, sctp.sctp_def_flags); + sctp->sctp_def_stream, sctp->sctp_def_flags); mdb_printf("def_ppid\t%?x\tdef_context\t%?x\n", - sctp.sctp_def_ppid, sctp.sctp_def_context); + sctp->sctp_def_ppid, sctp->sctp_def_context); mdb_printf("def_timetolive\t%?u\n", - sctp.sctp_def_timetolive); + sctp->sctp_def_timetolive); } if (opts & MDB_SCTP_SHOW_IN) { mdb_printf("%<b>Inbound Data%</b>\n"); mdb_printf("sack_info\t%?p\tsack_gaps\t%?d\n", - sctp.sctp_sack_info, sctp.sctp_sack_gaps); - dump_sack_info((uintptr_t)sctp.sctp_sack_info); + sctp->sctp_sack_info, sctp->sctp_sack_gaps); + dump_sack_info((uintptr_t)sctp->sctp_sack_info); mdb_printf("ftsn\t\t%?x\tlastacked\t%?x\n", - 
sctp.sctp_ftsn, sctp.sctp_lastacked); + sctp->sctp_ftsn, sctp->sctp_lastacked); mdb_printf("istr_nmsgs\t%?d\tsack_toggle\t%?d\n", - sctp.sctp_istr_nmsgs, sctp.sctp_sack_toggle); - mdb_printf("ack_mp\t\t%?p\n", sctp.sctp_ack_mp); + sctp->sctp_istr_nmsgs, sctp->sctp_sack_toggle); + mdb_printf("ack_mp\t\t%?p\n", sctp->sctp_ack_mp); mdb_printf("num_istr\t%?hu\tinstr\t\t%?p\n", - sctp.sctp_num_istr, sctp.sctp_instr); - mdb_printf("unord_reass\t%?p\n", sctp.sctp_uo_frags); + sctp->sctp_num_istr, sctp->sctp_instr); + mdb_printf("unord_reass\t%?p\n", sctp->sctp_uo_frags); } if (opts & MDB_SCTP_SHOW_RTT) { mdb_printf("%<b>RTT Tracking%</b>\n"); mdb_printf("rtt_tsn\t\t%?x\tout_time\t%?ld\n", - sctp.sctp_rtt_tsn, sctp.sctp_out_time); + sctp->sctp_rtt_tsn, sctp->sctp_out_time); } if (opts & MDB_SCTP_SHOW_FLOW) { mdb_printf("%<b>Flow Control%</b>\n"); - mdb_printf("txmit_hiwater\t%?d\n" - "xmit_lowater\t%?d\tfrwnd\t\t%?u\n" + mdb_printf("tconn_sndbuf\t%?d\n" + "conn_sndlowat\t%?d\tfrwnd\t\t%?u\n" "rwnd\t\t%?u\tinitial rwnd\t%?u\n" - "rxqueued\t%?u\tcwnd_max\t%?u\n", sctp.sctp_xmit_hiwater, - sctp.sctp_xmit_lowater, sctp.sctp_frwnd, - sctp.sctp_rwnd, sctp.sctp_irwnd, sctp.sctp_rxqueued, - sctp.sctp_cwnd_max); + "rxqueued\t%?u\tcwnd_max\t%?u\n", connp->conn_sndbuf, + connp->conn_sndlowat, sctp->sctp_frwnd, + sctp->sctp_rwnd, sctp->sctp_irwnd, sctp->sctp_rxqueued, + sctp->sctp_cwnd_max); } if (opts & MDB_SCTP_SHOW_HDR) { @@ -838,21 +845,21 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "ipha\t\t%?p\tip6h\t\t%?p\n" "ip_hdr_len\t%?d\tip_hdr6_len\t%?d\n" "sctph\t\t%?p\tsctph6\t\t%?p\n" - "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp.sctp_iphc, - sctp.sctp_iphc6, sctp.sctp_iphc_len, - sctp.sctp_iphc6_len, sctp.sctp_hdr_len, - sctp.sctp_hdr6_len, sctp.sctp_ipha, sctp.sctp_ip6h, - sctp.sctp_ip_hdr_len, sctp.sctp_ip_hdr6_len, - sctp.sctp_sctph, sctp.sctp_sctph6, sctp.sctp_lvtag, - sctp.sctp_fvtag); + "lvtag\t\t%?x\tfvtag\t\t%?x\n", sctp->sctp_iphc, + sctp->sctp_iphc6, 
sctp->sctp_iphc_len, + sctp->sctp_iphc6_len, sctp->sctp_hdr_len, + sctp->sctp_hdr6_len, sctp->sctp_ipha, sctp->sctp_ip6h, + sctp->sctp_ip_hdr_len, sctp->sctp_ip_hdr6_len, + sctp->sctp_sctph, sctp->sctp_sctph6, sctp->sctp_lvtag, + sctp->sctp_fvtag); } if (opts & MDB_SCTP_SHOW_PMTUD) { mdb_printf("%<b>PMTUd%</b>\n"); mdb_printf("last_mtu_probe\t%?ld\tmtu_probe_intvl\t%?ld\n" "mss\t\t%?u\n", - sctp.sctp_last_mtu_probe, sctp.sctp_mtu_probe_intvl, - sctp.sctp_mss); + sctp->sctp_last_mtu_probe, sctp->sctp_mtu_probe_intvl, + sctp->sctp_mss); } if (opts & MDB_SCTP_SHOW_RXT) { @@ -862,33 +869,33 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "pp_max_rxt\t%?d\trto_max\t\t%?u\n" "rto_min\t\t%?u\trto_initial\t%?u\n" "init_rto_max\t%?u\n" - "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp.sctp_cookie_mp, - sctp.sctp_strikes, sctp.sctp_max_init_rxt, - sctp.sctp_pa_max_rxt, sctp.sctp_pp_max_rxt, - sctp.sctp_rto_max, sctp.sctp_rto_min, - sctp.sctp_rto_initial, sctp.sctp_init_rto_max, - sctp.sctp_rxt_nxttsn, sctp.sctp_rxt_maxtsn); + "rxt_nxttsn\t%?u\trxt_maxtsn\t%?u\n", sctp->sctp_cookie_mp, + sctp->sctp_strikes, sctp->sctp_max_init_rxt, + sctp->sctp_pa_max_rxt, sctp->sctp_pp_max_rxt, + sctp->sctp_rto_max, sctp->sctp_rto_min, + sctp->sctp_rto_initial, sctp->sctp_init_rto_max, + sctp->sctp_rxt_nxttsn, sctp->sctp_rxt_maxtsn); } if (opts & MDB_SCTP_SHOW_CONN) { mdb_printf("%<b>Connection State%</b>\n"); mdb_printf("last_secret_update%?ld\n", - sctp.sctp_last_secret_update); + sctp->sctp_last_secret_update); mdb_printf("secret\t\t"); for (i = 0; i < SCTP_SECRET_LEN; i++) { if (i % 2 == 0) - mdb_printf("0x%02x", sctp.sctp_secret[i]); + mdb_printf("0x%02x", sctp->sctp_secret[i]); else - mdb_printf("%02x ", sctp.sctp_secret[i]); + mdb_printf("%02x ", sctp->sctp_secret[i]); } mdb_printf("\n"); mdb_printf("old_secret\t"); for (i = 0; i < SCTP_SECRET_LEN; i++) { if (i % 2 == 0) - mdb_printf("0x%02x", sctp.sctp_old_secret[i]); + mdb_printf("0x%02x", 
sctp->sctp_old_secret[i]); else - mdb_printf("%02x ", sctp.sctp_old_secret[i]); + mdb_printf("%02x ", sctp->sctp_old_secret[i]); } mdb_printf("\n"); } @@ -901,40 +908,40 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "T2expire\t%?lu\tT3expire\t%?lu\n" "msgcount\t%?llu\tprsctpdrop\t%?llu\n" "AssocStartTime\t%?lu\n", - sctp.sctp_opkts, sctp.sctp_obchunks, - sctp.sctp_odchunks, sctp.sctp_oudchunks, - sctp.sctp_rxtchunks, sctp.sctp_T1expire, - sctp.sctp_T2expire, sctp.sctp_T3expire, - sctp.sctp_msgcount, sctp.sctp_prsctpdrop, - sctp.sctp_assoc_start_time); + sctp->sctp_opkts, sctp->sctp_obchunks, + sctp->sctp_odchunks, sctp->sctp_oudchunks, + sctp->sctp_rxtchunks, sctp->sctp_T1expire, + sctp->sctp_T2expire, sctp->sctp_T3expire, + sctp->sctp_msgcount, sctp->sctp_prsctpdrop, + sctp->sctp_assoc_start_time); mdb_printf("ipkts\t\t%?llu\tibchunks\t%?llu\n" "idchunks\t%?llu\tiudchunks\t%?llu\n" "fragdmsgs\t%?llu\treassmsgs\t%?llu\n", - sctp.sctp_ipkts, sctp.sctp_ibchunks, - sctp.sctp_idchunks, sctp.sctp_iudchunks, - sctp.sctp_fragdmsgs, sctp.sctp_reassmsgs); + sctp->sctp_ipkts, sctp->sctp_ibchunks, + sctp->sctp_idchunks, sctp->sctp_iudchunks, + sctp->sctp_fragdmsgs, sctp->sctp_reassmsgs); } if (opts & MDB_SCTP_SHOW_HASH) { mdb_printf("%<b>Hash Tables%</b>\n"); - mdb_printf("conn_hash_next\t%?p\t", sctp.sctp_conn_hash_next); - mdb_printf("conn_hash_prev\t%?p\n", sctp.sctp_conn_hash_prev); + mdb_printf("conn_hash_next\t%?p\t", sctp->sctp_conn_hash_next); + mdb_printf("conn_hash_prev\t%?p\n", sctp->sctp_conn_hash_prev); mdb_printf("listen_hash_next%?p\t", - sctp.sctp_listen_hash_next); + sctp->sctp_listen_hash_next); mdb_printf("listen_hash_prev%?p\n", - sctp.sctp_listen_hash_prev); - mdb_nhconvert(&lport, &sctp.sctp_lport, sizeof (lport)); + sctp->sctp_listen_hash_prev); + mdb_nhconvert(&lport, &connp->conn_lport, sizeof (lport)); mdb_printf("[ listen_hash bucket\t%?d ]\n", SCTP_LISTEN_HASH(lport)); - mdb_printf("conn_tfp\t%?p\t", sctp.sctp_conn_tfp); - 
mdb_printf("listen_tfp\t%?p\n", sctp.sctp_listen_tfp); + mdb_printf("conn_tfp\t%?p\t", sctp->sctp_conn_tfp); + mdb_printf("listen_tfp\t%?p\n", sctp->sctp_listen_tfp); mdb_printf("bind_hash\t%?p\tptpbhn\t\t%?p\n", - sctp.sctp_bind_hash, sctp.sctp_ptpbhn); + sctp->sctp_bind_hash, sctp->sctp_ptpbhn); mdb_printf("bind_lockp\t%?p\n", - sctp.sctp_bind_lockp); + sctp->sctp_bind_lockp); mdb_printf("[ bind_hash bucket\t%?d ]\n", SCTP_BIND_HASH(lport)); } @@ -943,8 +950,8 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) mdb_printf("%<b>Cleanup / Close%</b>\n"); mdb_printf("shutdown_faddr\t%?p\tclient_errno\t%?d\n" "lingertime\t%?d\trefcnt\t\t%?hu\n", - sctp.sctp_shutdown_faddr, sctp.sctp_client_errno, - sctp.sctp_lingertime, sctp.sctp_refcnt); + sctp->sctp_shutdown_faddr, sctp->sctp_client_errno, + connp->conn_lingertime, sctp->sctp_refcnt); } if (opts & MDB_SCTP_SHOW_MISC) { @@ -955,24 +962,25 @@ sctp(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) "active\t\t%?ld\ttx_adaptation_code%?x\n" "rx_adaptation_code%?x\ttimer_mp\t%?p\n" "partial_delivery_point\t%?d\n", - sctp.sctp_bound_if, sctp.sctp_heartbeat_mp, - sctp.sctp_family, sctp.sctp_ipversion, - sctp.sctp_hb_interval, sctp.sctp_autoclose, - sctp.sctp_active, sctp.sctp_tx_adaptation_code, - sctp.sctp_rx_adaptation_code, sctp.sctp_timer_mp, - sctp.sctp_pd_point); + connp->conn_bound_if, sctp->sctp_heartbeat_mp, + connp->conn_family, + connp->conn_ipversion, + sctp->sctp_hb_interval, sctp->sctp_autoclose, + sctp->sctp_active, sctp->sctp_tx_adaptation_code, + sctp->sctp_rx_adaptation_code, sctp->sctp_timer_mp, + sctp->sctp_pd_point); } if (opts & MDB_SCTP_SHOW_EXT) { mdb_printf("%<b>Extensions and Reliable Ctl Chunks%</b>\n"); mdb_printf("cxmit_list\t%?p\tlcsn\t\t%?x\n" - "fcsn\t\t%?x\n", sctp.sctp_cxmit_list, sctp.sctp_lcsn, - sctp.sctp_fcsn); + "fcsn\t\t%?x\n", sctp->sctp_cxmit_list, sctp->sctp_lcsn, + sctp->sctp_fcsn); } if (opts & MDB_SCTP_SHOW_FLAGS) { 
mdb_printf("%<b>Flags%</b>\n"); - show_sctp_flags(&sctp); + show_sctp_flags(sctp); } return (DCMD_OK); diff --git a/usr/src/common/net/patricia/radix.c b/usr/src/common/net/patricia/radix.c index 9a1d3f78ed..cf2085280f 100644 --- a/usr/src/common/net/patricia/radix.c +++ b/usr/src/common/net/patricia/radix.c @@ -1,5 +1,5 @@ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Copyright (c) 1988, 1989, 1993 @@ -367,8 +367,9 @@ rn_match_args(v_arg, head, rn_leaf_fn, rn_leaf_arg) * is looking for some other criteria as well. Continue * looking as if the exact match failed. */ - if (t->rn_parent->rn_flags & RNF_ROOT) { - /* hit the top. have to give up */ + if (t->rn_dupedkey == NULL && + (t->rn_parent->rn_flags & RNF_ROOT)) { + /* no more dupedkeys and hit the top. have to give up */ return (NULL); } b = 0; @@ -486,56 +487,70 @@ rn_insert(v_arg, head, dupentry, nodes) { caddr_t v = v_arg; struct radix_node *top = head->rnh_treetop; + struct radix_node *p, *x; int head_off = top->rn_offset, vlen = (int)LEN(v); struct radix_node *t = rn_search(v_arg, top); caddr_t cp = v + head_off; int b; struct radix_node *tt; + caddr_t cp2 = t->rn_key + head_off; + int cmp_res; + caddr_t cplim = v + vlen; /* * Find first bit at which v and t->rn_key differ */ - { - caddr_t cp2 = t->rn_key + head_off; - int cmp_res; - caddr_t cplim = v + vlen; - - while (cp < cplim) - if (*cp2++ != *cp++) - goto on1; - *dupentry = 1; - return (t); + while (cp < cplim) + if (*cp2++ != *cp++) + goto on1; + *dupentry = 1; + return (t); on1: - *dupentry = 0; - cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; - for (b = (cp - v) << 3; cmp_res; b--) - cmp_res >>= 1; - } - { - struct radix_node *p, *x = top; - cp = v; - do { - p = x; - if (cp[x->rn_offset] & x->rn_bmask) - x = x->rn_right; - else - x = x->rn_left; - } while (b > (unsigned)x->rn_bit); - /* x->rn_bit < b && x->rn_bit >= 0 */ - t = 
rn_newpair(v_arg, b, nodes); - tt = t->rn_left; - if ((cp[p->rn_offset] & p->rn_bmask) == 0) - p->rn_left = t; + *dupentry = 0; + cmp_res = (cp[-1] ^ cp2[-1]) & 0xff; + /* + * (cp - v) is the number of bytes where the match is relevant. + * Multiply by 8 to get number of bits. Then reduce this number + * by the trailing bits in the last byte where we have a match + * by looking at (cmp_res >> 1) in each iteration below. + * Note that v starts at the beginning of the key, so, when key + * is a sockaddr structure, the preliminary len/family/port bytes + * are accounted for. + */ + for (b = (cp - v) << 3; cmp_res; b--) + cmp_res >>= 1; + cp = v; + x = top; + do { + p = x; + if (cp[x->rn_offset] & x->rn_bmask) + x = x->rn_right; else - p->rn_right = t; - x->rn_parent = t; - t->rn_parent = p; - if ((cp[t->rn_offset] & t->rn_bmask) == 0) { - t->rn_right = x; - } else { - t->rn_right = tt; - t->rn_left = x; - } + x = x->rn_left; + } while (b > (unsigned)x->rn_bit); + /* x->rn_bit < b && x->rn_bit >= 0 */ + /* + * now the rightmost bit where v and rn_key differ (b) is < + * x->rn_bit. + * + * We will add a new branch at p. b cannot equal x->rn_bit + * because we know we didn't find a duplicated key. + * The tree will be re-adjusted so that t is inserted between p + * and x. + */ + t = rn_newpair(v_arg, b, nodes); + tt = t->rn_left; + if ((cp[p->rn_offset] & p->rn_bmask) == 0) + p->rn_left = t; + else + p->rn_right = t; + x->rn_parent = t; + t->rn_parent = p; + if ((cp[t->rn_offset] & t->rn_bmask) == 0) { + t->rn_right = x; + } else { + t->rn_right = tt; + t->rn_left = x; } return (tt); } @@ -718,6 +733,8 @@ rn_addroute(v_arg, n_arg, head, treenodes) * find it among possible duplicate key entries * anyway, so the above test doesn't hurt. * + * Insert treenodes before tt. + * * We sort the masks for a duplicated key the same way as * in a masklist -- most specific to least specific. 
* This may require the unfortunate nuisance of relocating @@ -758,22 +775,54 @@ rn_addroute(v_arg, n_arg, head, treenodes) tt->rn_bit = x->rn_bit; tt->rn_flags |= x->rn_flags & RNF_NORMAL; } + /* BEGIN CSTYLED */ + /* + * at this point the parent-child relationship for p, t, x, tt is + * one of the following: + * p p + * : (left/right child) : + * : : + * t t + * / \ / \ + * x tt tt x + * + * tt == saved_tt returned by rn_insert(). + */ + /* END CSTYLED */ t = saved_tt->rn_parent; if (keyduplicated) goto key_exists; b_leaf = -1 - t->rn_bit; + /* + * b_leaf is now normalized to be in the leaf rn_bit format + * (it is the rn_bit value of a leaf corresponding to netmask + * of t->rn_bit). + */ if (t->rn_right == saved_tt) x = t->rn_left; else x = t->rn_right; - /* Promote general routes from below */ + /* + * Promote general routes from below. + * Identify the less specific netmasks and add them to t->rm_mklist + */ if (x->rn_bit < 0) { - for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) - if (x->rn_mask && (x->rn_bit >= b_leaf) && x->rn_mklist == 0) { - *mp = m = rn_new_radix_mask(x, 0); - if (m) - mp = &m->rm_mklist; - } + /* x is the sibling node. it is a leaf node. */ + for (mp = &t->rn_mklist; x; x = x->rn_dupedkey) + if (x->rn_mask && (x->rn_bit >= b_leaf) && + x->rn_mklist == 0) { + /* + * x is the first node in the dupedkey chain + * without a mklist, and with a shorter mask + * than b_leaf. Create a radix_mask + * corresponding to x's mask and add it to + * t's rn_mklist. The mask list gets created + * in decreasing order of mask length. 
+ */ + *mp = m = rn_new_radix_mask(x, 0); + if (m) + mp = &m->rm_mklist; + } } else if (x->rn_mklist) { /* * Skip over masks whose index is > that of new node @@ -788,6 +837,7 @@ key_exists: if ((netmask == 0) || (b > t->rn_bit)) return (tt); /* can't lift at all */ b_leaf = tt->rn_bit; + /* b is the index of the netmask */ do { x = t; t = t->rn_parent; diff --git a/usr/src/lib/brand/native/zone/platform.xml b/usr/src/lib/brand/native/zone/platform.xml index e988200bde..0225a51dc7 100644 --- a/usr/src/lib/brand/native/zone/platform.xml +++ b/usr/src/lib/brand/native/zone/platform.xml @@ -106,7 +106,6 @@ <device match="ipsecesp" ip-type="exclusive" /> <device match="ipstate" ip-type="exclusive" /> <device match="ipsync" ip-type="exclusive" /> - <device match="iptunq" ip-type="exclusive" /> <device match="keysock" ip-type="exclusive" /> <device match="rawip" ip-type="exclusive" /> <device match="rawip6" ip-type="exclusive" /> @@ -117,6 +116,7 @@ <device match="spdsock" ip-type="exclusive" /> <device match="sppp" ip-type="exclusive" /> <device match="sppptun" ip-type="exclusive" /> + <device match="vni" ip-type="exclusive" /> <!-- Renamed devices to create under /dev --> <device match="zcons/%z/zoneconsole" name="zconsole" /> diff --git a/usr/src/lib/brand/solaris10/zone/platform.xml b/usr/src/lib/brand/solaris10/zone/platform.xml index fa396ec222..89f7035615 100644 --- a/usr/src/lib/brand/solaris10/zone/platform.xml +++ b/usr/src/lib/brand/solaris10/zone/platform.xml @@ -123,7 +123,6 @@ <device match="ipsecesp" ip-type="exclusive" /> <device match="ipstate" ip-type="exclusive" /> <device match="ipsync" ip-type="exclusive" /> - <device match="iptunq" ip-type="exclusive" /> <device match="keysock" ip-type="exclusive" /> <device match="rawip" ip-type="exclusive" /> <device match="rawip6" ip-type="exclusive" /> @@ -134,6 +133,7 @@ <device match="spdsock" ip-type="exclusive" /> <device match="sppp" ip-type="exclusive" /> <device match="sppptun" ip-type="exclusive" /> + 
<device match="vni" ip-type="exclusive" /> <!-- Renamed devices to create under /dev --> <device match="zcons/%z/zoneconsole" name="zconsole" /> diff --git a/usr/src/pkgdefs/SUNWckr/prototype_com b/usr/src/pkgdefs/SUNWckr/prototype_com index 30679b7037..86489c1422 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_com +++ b/usr/src/pkgdefs/SUNWckr/prototype_com @@ -92,7 +92,6 @@ f none kernel/drv/ippctl.conf 644 root sys f none kernel/drv/ipsecah.conf 644 root sys f none kernel/drv/ipsecesp.conf 644 root sys f none kernel/drv/iptun.conf 644 root sys -f none kernel/drv/iptunq.conf 644 root sys f none kernel/drv/iwscn.conf 644 root sys f none kernel/drv/keysock.conf 644 root sys f none kernel/drv/kmdb.conf 644 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_i386 b/usr/src/pkgdefs/SUNWckr/prototype_i386 index 2a6676197e..5f886a8d60 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_i386 +++ b/usr/src/pkgdefs/SUNWckr/prototype_i386 @@ -103,7 +103,6 @@ f none kernel/drv/ippctl 755 root sys f none kernel/drv/ipsecah 755 root sys f none kernel/drv/ipsecesp 755 root sys f none kernel/drv/iptun 755 root sys -f none kernel/drv/iptunq 755 root sys f none kernel/drv/iwscn 755 root sys f none kernel/drv/kb8042 755 root sys f none kernel/drv/keysock 755 root sys @@ -326,7 +325,6 @@ f none kernel/drv/amd64/ippctl 755 root sys f none kernel/drv/amd64/ipsecah 755 root sys f none kernel/drv/amd64/ipsecesp 755 root sys f none kernel/drv/amd64/iptun 755 root sys -f none kernel/drv/amd64/iptunq 755 root sys f none kernel/drv/amd64/iwscn 755 root sys f none kernel/drv/amd64/kb8042 755 root sys f none kernel/drv/amd64/keysock 755 root sys diff --git a/usr/src/pkgdefs/SUNWckr/prototype_sparc b/usr/src/pkgdefs/SUNWckr/prototype_sparc index e086c94862..c2824f989c 100644 --- a/usr/src/pkgdefs/SUNWckr/prototype_sparc +++ b/usr/src/pkgdefs/SUNWckr/prototype_sparc @@ -94,7 +94,6 @@ f none kernel/drv/sparcv9/ippctl 755 root sys f none kernel/drv/sparcv9/ipsecah 755 root sys f none 
kernel/drv/sparcv9/ipsecesp 755 root sys f none kernel/drv/sparcv9/iptun 755 root sys -f none kernel/drv/sparcv9/iptunq 755 root sys f none kernel/drv/sparcv9/isp 755 root sys f none kernel/drv/sparcv9/iwscn 755 root sys f none kernel/drv/sparcv9/kb8042 755 root sys diff --git a/usr/src/pkgdefs/SUNWhea/prototype_com b/usr/src/pkgdefs/SUNWhea/prototype_com index 3129ef6be5..e3bfe3f348 100644 --- a/usr/src/pkgdefs/SUNWhea/prototype_com +++ b/usr/src/pkgdefs/SUNWhea/prototype_com @@ -242,6 +242,7 @@ d none usr/include/inet 755 root bin f none usr/include/inet/arp.h 644 root bin f none usr/include/inet/common.h 644 root bin f none usr/include/inet/ip.h 644 root bin +f none usr/include/inet/ip_arp.h 644 root bin f none usr/include/inet/ip_if.h 644 root bin f none usr/include/inet/ip_ire.h 644 root bin f none usr/include/inet/ip_ftable.h 644 root bin diff --git a/usr/src/pkgdefs/etc/exception_list_i386 b/usr/src/pkgdefs/etc/exception_list_i386 index 09514a0ecc..ee760eba55 100644 --- a/usr/src/pkgdefs/etc/exception_list_i386 +++ b/usr/src/pkgdefs/etc/exception_list_i386 @@ -365,7 +365,6 @@ usr/lib/amd64/llib-like.ln i386 usr/lib/amd64/libipsecutil.so i386 usr/lib/amd64/llib-lipsecutil.ln i386 # -usr/include/inet/arp_impl.h i386 usr/include/inet/rawip_impl.h i386 usr/include/inet/udp_impl.h i386 usr/include/inet/tcp_impl.h i386 diff --git a/usr/src/pkgdefs/etc/exception_list_sparc b/usr/src/pkgdefs/etc/exception_list_sparc index 5a32c55a05..533552b058 100644 --- a/usr/src/pkgdefs/etc/exception_list_sparc +++ b/usr/src/pkgdefs/etc/exception_list_sparc @@ -354,7 +354,6 @@ usr/share/lib/locale/com/sun/dhcpmgr/cli/dhcpconfig/ResourceBundle.properties sp usr/share/lib/locale/com/sun/dhcpmgr/cli/dhtadm/ResourceBundle.properties sparc usr/share/lib/locale/com/sun/dhcpmgr/cli/pntadm/ResourceBundle.properties sparc # -usr/include/inet/arp_impl.h sparc usr/include/inet/rawip_impl.h sparc usr/include/inet/udp_impl.h sparc usr/include/inet/tcp_impl.h sparc diff --git 
a/usr/src/tools/scripts/bfu.sh b/usr/src/tools/scripts/bfu.sh index be820004e4..e4e9a36ab2 100644 --- a/usr/src/tools/scripts/bfu.sh +++ b/usr/src/tools/scripts/bfu.sh @@ -8010,6 +8010,12 @@ mondo_loop() { rm -f $root/kernel/strmod/sparcv9/tun rm -f $root/kernel/strmod/amd64/tun + # Remove obsolete iptunq + rm -f $root/kernel/drv/iptunq + rm -f $root/kernel/drv/iptunq.conf + rm -f $root/kernel/drv/amd64/iptunq + rm -f $root/kernel/drv/sparcv9/iptunq + # # Remove libtopo platform XML files that have been replaced by propmap # files. diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 042685bc5a..550606f39c 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -514,7 +514,7 @@ TOKENMT_OBJS += tokenmt.o tokenmtddi.o TSWTCL_OBJS += tswtcl.o tswtclddi.o -ARP_OBJS += arpddi.o arp.o arp_netinfo.o +ARP_OBJS += arpddi.o ICMP_OBJS += icmpddi.o @@ -535,13 +535,15 @@ IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_addr.o tn_ipopt.o tnet.o ip_netinfo.o IP_ILB_OBJS = ilb.o ilb_nat.o ilb_conn.o ilb_alg_hash.o ilb_alg_rr.o -IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o ip6_rts.o \ - ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ - ip_multi.o ip2mac.o ip_ndp.o ip_opt_data.o ip_rts.o ip_srcid.o \ +IP_OBJS += igmp.o ipmp.o ip.o ip6.o ip6_asp.o ip6_if.o ip6_ire.o \ + ip6_rts.o ip_if.o ip_ire.o ip_listutils.o ip_mroute.o \ + ip_multi.o ip2mac.o ip_ndp.o ip_rts.o ip_srcid.o \ ipddi.o ipdrop.o mi.o nd.o optcom.o snmpcom.o ipsec_loader.o \ spd.o ipclassifier.o inet_common.o ip_squeue.o squeue.o \ ip_sadb.o ip_ftable.o proto_set.o radix.o ip_dummy.o \ - ip_helper_stream.o iptunq.o \ + ip_helper_stream.o \ + ip_output.o ip_input.o ip6_input.o ip6_output.o ip_arp.o \ + conn_opt.o ip_attr.o ip_dce.o \ $(IP_ICMP_OBJS) \ $(IP_RTS_OBJS) \ $(IP_TCP_OBJS) \ @@ -644,8 +646,6 @@ MAC_IB_OBJS += mac_ib.o IPTUN_OBJS += iptun_dev.o iptun_ctl.o iptun.o -IPTUNQ_OBJS += iptunq_ddi.o - AGGR_OBJS += 
aggr_dev.o aggr_ctl.o aggr_grp.o aggr_port.o \ aggr_send.o aggr_recv.o aggr_lacp.o diff --git a/usr/src/uts/common/fs/sockfs/sockcommon.h b/usr/src/uts/common/fs/sockfs/sockcommon.h index f3ffe456f1..fac10a8935 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon.h +++ b/usr/src/uts/common/fs/sockfs/sockcommon.h @@ -184,8 +184,7 @@ extern int so_dequeue_msg(struct sonode *, mblk_t **, struct uio *, extern void so_enqueue_msg(struct sonode *, mblk_t *, size_t); extern void so_process_new_message(struct sonode *, mblk_t *, mblk_t *); -extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *, - cred_t *); +extern mblk_t *socopyinuio(uio_t *, ssize_t, size_t, ssize_t, size_t, int *); extern mblk_t *socopyoutuio(mblk_t *, struct uio *, ssize_t, int *); extern boolean_t somsghasdata(mblk_t *); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 48a3e37921..4521fdd352 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -470,8 +470,7 @@ so_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, so->so_proto_props.sopp_maxpsz, so->so_proto_props.sopp_wroff, so->so_proto_props.sopp_maxblk, - so->so_proto_props.sopp_tail, &error, - cr)) == NULL) { + so->so_proto_props.sopp_tail, &error)) == NULL) { break; } ASSERT(uiop->uio_resid >= 0); diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index a244c65bc6..9b806d0a4a 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -471,7 +471,7 @@ socket_sendsig(struct sonode *so, int event) /* Copy userdata into a new mblk_t */ mblk_t * socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk, - size_t tail_len, int *errorp, cred_t *cr) + size_t tail_len, int *errorp) { mblk_t *head = NULL, **tail = &head; @@ -499,11 +499,7 @@ socopyinuio(uio_t *uiop, 
ssize_t iosize, size_t wroff, ssize_t maxblk, blocksize = MIN(iosize, maxblk); ASSERT(blocksize >= 0); - if (is_system_labeled()) - mp = allocb_cred(wroff + blocksize + tail_len, - cr, curproc->p_pid); - else - mp = allocb(wroff + blocksize + tail_len, BPRI_MED); + mp = allocb(wroff + blocksize + tail_len, BPRI_MED); if (mp == NULL) { *errorp = ENOMEM; return (head); diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index b2a178fbcb..bfbd67ad81 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -5506,205 +5506,6 @@ sotpi_setsockopt(struct sonode *so, int level, int option_name, so_lock_single(so); /* Set SOLOCKED */ mutex_exit(&so->so_lock); - /* - * For SOCKET or TCP level options, try to set it here itself - * provided socket has not been popped and we know the tcp - * structure (stored in so_priv). - */ - if ((level == SOL_SOCKET || level == IPPROTO_TCP) && - (so->so_family == AF_INET || so->so_family == AF_INET6) && - (so->so_version == SOV_SOCKSTREAM) && - (so->so_proto_handle != NULL)) { - tcp_t *tcp = (tcp_t *)so->so_proto_handle; - boolean_t onoff; - -#define intvalue (*(int32_t *)optval) - - switch (level) { - case SOL_SOCKET: - switch (option_name) { /* Check length param */ - case SO_DEBUG: - case SO_REUSEADDR: - case SO_DONTROUTE: - case SO_BROADCAST: - case SO_USELOOPBACK: - case SO_OOBINLINE: - case SO_DGRAM_ERRIND: - if (optlen != (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - onoff = intvalue != 0; - handled = B_TRUE; - break; - case SO_SNDTIMEO: - case SO_RCVTIMEO: - if (get_udatamodel() == DATAMODEL_NONE || - get_udatamodel() == DATAMODEL_NATIVE) { - if (optlen != - sizeof (struct timeval)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - } else { - if (optlen != - sizeof (struct timeval32)) { - error = EINVAL; - 
eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - } - ASSERT(optval); - handled = B_TRUE; - break; - case SO_LINGER: - if (optlen != - (t_uscalar_t)sizeof (struct linger)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - handled = B_TRUE; - break; - } - - switch (option_name) { /* Do actions */ - case SO_LINGER: { - struct linger *lgr = (struct linger *)optval; - - if (lgr->l_onoff) { - tcp->tcp_linger = 1; - tcp->tcp_lingertime = lgr->l_linger; - so->so_linger.l_onoff = SO_LINGER; - so->so_options |= SO_LINGER; - } else { - tcp->tcp_linger = 0; - tcp->tcp_lingertime = 0; - so->so_linger.l_onoff = 0; - so->so_options &= ~SO_LINGER; - } - so->so_linger.l_linger = lgr->l_linger; - handled = B_TRUE; - break; - } - case SO_SNDTIMEO: - case SO_RCVTIMEO: { - struct timeval tl; - clock_t val; - - if (get_udatamodel() == DATAMODEL_NONE || - get_udatamodel() == DATAMODEL_NATIVE) - bcopy(&tl, (struct timeval *)optval, - sizeof (struct timeval)); - else - TIMEVAL32_TO_TIMEVAL(&tl, - (struct timeval32 *)optval); - val = tl.tv_sec * 1000 * 1000 + tl.tv_usec; - if (option_name == SO_RCVTIMEO) - so->so_rcvtimeo = drv_usectohz(val); - else - so->so_sndtimeo = drv_usectohz(val); - break; - } - - case SO_DEBUG: - tcp->tcp_debug = onoff; -#ifdef SOCK_TEST - if (intvalue & 2) - sock_test_timelimit = 10 * hz; - else - sock_test_timelimit = 0; - - if (intvalue & 4) - do_useracc = 0; - else - do_useracc = 1; -#endif /* SOCK_TEST */ - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK and - * SO_BROADCAST are only of interest to IP. - * We track them here only so - * that we can report their current value. 
- */ - tcp->tcp_dontroute = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_USELOOPBACK: - tcp->tcp_useloopback = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_BROADCAST: - tcp->tcp_broadcast = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_REUSEADDR: - tcp->tcp_reuseaddr = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_OOBINLINE: - tcp->tcp_oobinline = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - case SO_DGRAM_ERRIND: - tcp->tcp_dgram_errind = onoff; - if (onoff) - so->so_options |= option_name; - else - so->so_options &= ~option_name; - break; - } - break; - case IPPROTO_TCP: - switch (option_name) { - case TCP_NODELAY: - if (optlen != (t_uscalar_t)sizeof (int32_t)) { - error = EINVAL; - eprintsoline(so, error); - mutex_enter(&so->so_lock); - goto done2; - } - ASSERT(optval); - tcp->tcp_naglim = intvalue ? 
1 : tcp->tcp_mss; - handled = B_TRUE; - break; - } - break; - default: - handled = B_FALSE; - break; - } - } - - if (handled) { - mutex_enter(&so->so_lock); - goto done2; - } - optmgmt_req.PRIM_type = T_SVR4_OPTMGMT_REQ; optmgmt_req.MGMT_flags = T_NEGOTIATE; optmgmt_req.OPT_length = (t_scalar_t)sizeof (oh) + optlen; diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile index 052c010aea..3d45e4861c 100644 --- a/usr/src/uts/common/inet/Makefile +++ b/usr/src/uts/common/inet/Makefile @@ -28,12 +28,12 @@ # include global definitions include ../../../Makefile.master -HDRS= arp.h arp_impl.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ +HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ ipsecah.h ipsecesp.h ipsec_info.h iptun.h ip6_asp.h ip_if.h ip_ire.h \ ip_multi.h ip_netinfo.h ip_ndp.h ip_rts.h ipsec_impl.h keysock.h \ led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \ tcp_sack.h tcp_stack.h udp_impl.h rawip_impl.h ipp_common.h \ - ip_ftable.h ip_impl.h ip_stack.h tcp_impl.h wifi_ioctl.h \ + ip_ftable.h ip_impl.h ip_stack.h ip_arp.h tcp_impl.h wifi_ioctl.h \ ip2mac.h ip2mac_impl.h ROOTDIRS= $(ROOT)/usr/include/inet diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index 4351c91666..de0602e1f7 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -28,7 +28,6 @@ #define _INET_ARP_H #include <sys/types.h> -#include <net/if.h> #ifdef __cplusplus extern "C" { @@ -45,30 +44,7 @@ extern "C" { #define RARP_REQUEST 3 #define RARP_RESPONSE 4 -#define AR_IOCTL (((unsigned)'A' & 0xFF)<<8) -#define CMD_IN_PROGRESS 0x10000 - -#define AR_ENTRY_ADD (AR_IOCTL + 1) -#define AR_ENTRY_DELETE (AR_IOCTL + 2) -#define AR_ENTRY_QUERY (AR_IOCTL + 3) -#define AR_ENTRY_SQUERY (AR_IOCTL + 6) -#define AR_MAPPING_ADD (AR_IOCTL + 7) -#define AR_CLIENT_NOTIFY (AR_IOCTL + 8) -#define AR_INTERFACE_UP (AR_IOCTL + 9) -#define AR_INTERFACE_DOWN (AR_IOCTL + 10) -#define AR_INTERFACE_ON 
(AR_IOCTL + 12) -#define AR_INTERFACE_OFF (AR_IOCTL + 13) -#define AR_DLPIOP_DONE (AR_IOCTL + 14) -/* - * This is not an ARP command per se, it is used to interface between - * ARP and IP during close. - */ -#define AR_ARP_CLOSING (AR_IOCTL + 16) -#define AR_ARP_EXTEND (AR_IOCTL + 17) -#define AR_IPMP_ACTIVATE (AR_IOCTL + 18) -#define AR_IPMP_DEACTIVATE (AR_IOCTL + 19) - -/* Both ace_flags and area_flags; must also modify arp.c in mdb */ +/* Both ace_flags; must also modify arp.c in mdb */ #define ACE_F_PERMANENT 0x0001 #define ACE_F_PUBLISH 0x0002 #define ACE_F_DYING 0x0004 @@ -84,123 +60,6 @@ extern "C" { #define ACE_F_DELAYED 0x0800 /* rescheduled on arp_defend_rate */ #define ACE_F_DAD_ABORTED 0x1000 /* DAD was aborted on link down */ -/* ared_flags */ -#define ARED_F_PRESERVE_PERM 0x0001 /* preserve permanent ace */ - -/* ARP Command Structures */ - -/* arc_t - Common command overlay */ -typedef struct ar_cmd_s { - uint32_t arc_cmd; - uint32_t arc_name_offset; - uint32_t arc_name_length; -} arc_t; - -/* - * NOTE: when using area_t for an AR_ENTRY_SQUERY, the area_hw_addr_offset - * field isn't what you might think. See comments in ip_multi.c where - * the routine ill_create_squery() is called, and also in the routine - * itself, to see how this field is used *only* when the area_t holds - * an AR_ENTRY_SQUERY. 
- */ -typedef struct ar_entry_add_s { - uint32_t area_cmd; - uint32_t area_name_offset; - uint32_t area_name_length; - uint32_t area_proto; - uint32_t area_proto_addr_offset; - uint32_t area_proto_addr_length; - uint32_t area_proto_mask_offset; - uint32_t area_flags; /* Same values as ace_flags */ - uint32_t area_hw_addr_offset; - uint32_t area_hw_addr_length; -} area_t; - -typedef struct ar_entry_delete_s { - uint32_t ared_cmd; - uint32_t ared_name_offset; - uint32_t ared_name_length; - uint32_t ared_proto; - uint32_t ared_proto_addr_offset; - uint32_t ared_proto_addr_length; - uint32_t ared_flags; -} ared_t; - -typedef struct ar_entry_query_s { - uint32_t areq_cmd; - uint32_t areq_name_offset; - uint32_t areq_name_length; - uint32_t areq_proto; - uint32_t areq_target_addr_offset; - uint32_t areq_target_addr_length; - uint32_t areq_flags; - uint32_t areq_sender_addr_offset; - uint32_t areq_sender_addr_length; - uint32_t areq_xmit_count; /* 0 ==> cache lookup only */ - uint32_t areq_xmit_interval; /* # of milliseconds; 0: default */ - /* # ofquests to buffer; 0: default */ - uint32_t areq_max_buffered; - uchar_t areq_sap[8]; /* to insert in returned template */ -} areq_t; - -#define AR_EQ_DEFAULT_XMIT_COUNT 6 -#define AR_EQ_DEFAULT_XMIT_INTERVAL 1000 -#define AR_EQ_DEFAULT_MAX_BUFFERED 4 - -/* - * Structure used with AR_ENTRY_LLAQUERY to map from the link_addr - * (in Neighbor Discovery option format excluding the option type and - * length) to a hardware address. - * The response has the same format as for an AR_ENTRY_SQUERY - an M_CTL with - * arel_hw_addr updated. - * An IPv6 address will be passed in AR_ENTRY_LLAQUERY so that atmip - * can send it in AR_CLIENT_NOTIFY messages. 
- */ -typedef struct ar_entry_llaquery_s { - uint32_t arel_cmd; - uint32_t arel_name_offset; - uint32_t arel_name_length; - uint32_t arel_link_addr_offset; - uint32_t arel_link_addr_length; - uint32_t arel_hw_addr_offset; - uint32_t arel_hw_addr_length; - uint32_t arel_ip_addr_offset; - uint32_t arel_ip_addr_length; -} arel_t; - -typedef struct ar_mapping_add_s { - uint32_t arma_cmd; - uint32_t arma_name_offset; - uint32_t arma_name_length; - uint32_t arma_proto; - uint32_t arma_proto_addr_offset; - uint32_t arma_proto_addr_length; - uint32_t arma_proto_mask_offset; - uint32_t arma_proto_extract_mask_offset; - uint32_t arma_flags; - uint32_t arma_hw_addr_offset; - uint32_t arma_hw_addr_length; - /* Offset were we start placing */ - uint32_t arma_hw_mapping_start; - /* the mask&proto_addr */ -} arma_t; - -/* Structure used to notify ARP of changes to IPMP group topology */ -typedef struct ar_ipmp_event_s { - uint32_t arie_cmd; - uint32_t arie_name_offset; - uint32_t arie_name_length; - char arie_grifname[LIFNAMSIZ]; -} arie_t; - -/* Structure used to notify clients of interesting conditions. */ -typedef struct ar_client_notify_s { - uint32_t arcn_cmd; - uint32_t arcn_name_offset; - uint32_t arcn_name_length; - uint32_t arcn_code; /* Notification code. */ -} arcn_t; - /* Client Notification Codes */ #define AR_CN_BOGON 1 #define AR_CN_ANNOUNCE 2 diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c deleted file mode 100644 index abdbc39a47..0000000000 --- a/usr/src/uts/common/inet/arp/arp.c +++ /dev/null @@ -1,4883 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* Copyright (c) 1990 Mentat Inc. */ - -/* AR - Address Resolution Protocol */ - -#include <sys/types.h> -#include <sys/stream.h> -#include <sys/stropts.h> -#include <sys/strsubr.h> -#include <sys/errno.h> -#include <sys/strlog.h> -#include <sys/dlpi.h> -#include <sys/sockio.h> -#define _SUN_TPI_VERSION 2 -#include <sys/tihdr.h> -#include <sys/socket.h> -#include <sys/ddi.h> -#include <sys/sunddi.h> -#include <sys/cmn_err.h> -#include <sys/sdt.h> -#include <sys/vtrace.h> -#include <sys/strsun.h> -#include <sys/policy.h> -#include <sys/zone.h> -#include <sys/ethernet.h> -#include <sys/zone.h> -#include <sys/random.h> -#include <sys/sdt.h> -#include <sys/hook_event.h> - -#include <inet/common.h> -#include <inet/optcom.h> -#include <inet/mi.h> -#include <inet/nd.h> -#include <inet/snmpcom.h> -#include <net/if.h> -#include <inet/arp.h> -#include <netinet/ip6.h> -#include <netinet/arp.h> -#include <inet/ip.h> -#include <inet/ip_ire.h> -#include <inet/ip_ndp.h> -#include <inet/mib2.h> -#include <inet/arp_impl.h> - -/* - * ARP entry life time and design notes - * ------------------------------------ - * - * ARP entries (ACEs) must last at least as long as IP knows about a given - * MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok - * if the ARP entry lasts longer, but not ok if it is removed before the IP - * entry. 
The reason for this is that if ARP doesn't have an entry, we will be - * unable to detect the difference between an ARP broadcast that represents no - * change (same, known address of sender) and one that represents a change (new - * address for existing entry). In the former case, we must not notify IP, or - * we can suffer hurricane attack. In the latter case, we must notify IP, or - * IP will drift out of sync with the network. - * - * Note that IP controls the lifetime of entries, not ARP. - * - * We don't attempt to reconfirm aging entries. If the system is no longer - * talking to a given peer, then it doesn't matter if we have the right mapping - * for that peer. It would be possible to send queries on aging entries that - * are active, but this isn't done. - * - * IPMP Notes - * ---------- - * - * ARP is aware of IPMP. In particular, IP notifies ARP about all "active" - * (able to transmit data packets) interfaces in a given group via - * AR_IPMP_ACTIVATE and AR_IPMP_DEACTIVATE messages. These messages, combined - * with the "IPMP arl_t" that ARP creates over the IPMP DLPI stub driver, - * enable ARP to track all the arl_t's that are in the same group and thus - * ensure that ACEs are shared across each group and the arl_t that ARP - * chooses to transmit on for a given ACE is optimal. - * - * ARP relies on IP for hardware address updates. In particular, if the - * hardware address of an interface changes (DL_NOTE_PHYS_ADDR), then IP will - * bring the interface down and back up -- and as part of bringing it back - * up, will send messages to ARP that allow it to update the affected arl's - * with new hardware addresses. - * - * N.B.: One side-effect of this approach is that when an interface fails and - * then starts to repair, it will temporarily populate the ARP cache with - * addresses that are owned by it rather than the group's arl_t. 
To address - * this, we could add more messages (e.g., AR_IPMP_JOIN and AR_IPMP_LEAVE), - * but as the issue appears to be only cosmetic (redundant entries in the ARP - * cache during interace repair), we've kept things simple for now. - */ - -/* - * This is used when scanning for "old" (least recently broadcast) ACEs. We - * don't want to have to walk the list for every single one, so we gather up - * batches at a time. - */ -#define ACE_RESCHED_LIST_LEN 8 - -typedef struct { - arl_t *art_arl; - uint_t art_naces; - ace_t *art_aces[ACE_RESCHED_LIST_LEN]; -} ace_resched_t; - -#define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED) -#define ACE_NONPERM(ace) \ - (((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \ - ACE_F_RESOLVED) - -#define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */ -#define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */ - -#define AR_SNMP_MSG T_OPTMGMT_ACK -#define AR_DRAINING (void *)0x11 - -/* - * The IPv4 Link Local address space is special; we do extra duplicate checking - * there, as the entire assignment mechanism rests on random numbers. - */ -#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ - ((uchar_t *)ptr)[1] == 254) - -/* - * Check if the command needs to be enqueued by seeing if there are other - * commands ahead of us or if some DLPI response is being awaited. Usually - * there would be an enqueued command in the latter case, however if the - * stream that originated the command has closed, the close would have - * cleaned up the enqueued command. AR_DRAINING signifies that the command - * at the head of the arl_queue has been internally dequeued on completion - * of the previous command and is being called from ar_dlpi_done - */ -#define CMD_NEEDS_QUEUEING(mp, arl) \ - (mp->b_prev != AR_DRAINING && (arl->arl_queue != NULL || \ - arl->arl_dlpi_pending != DL_PRIM_INVAL)) - -#define ARH_FIXED_LEN 8 - -/* - * Macro used when creating ACEs to determine the arl that should own it. 
- */ -#define OWNING_ARL(arl) \ - ((arl)->arl_ipmp_arl != NULL ? (arl)->arl_ipmp_arl : arl) - -/* - * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK - * doesn't quite do it for us. - */ -typedef struct ar_m_s { - t_uscalar_t ar_mac_type; - uint32_t ar_mac_arp_hw_type; - t_scalar_t ar_mac_sap_length; - uint32_t ar_mac_hw_addr_length; -} ar_m_t; - -typedef struct msg2_args { - mblk_t *m2a_mpdata; - mblk_t *m2a_mptail; -} msg2_args_t; - -static mblk_t *ar_alloc(uint32_t cmd, int); -static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, - uint32_t hw_addr_len, uchar_t *proto_addr, - uint32_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint32_t hw_extract_start, - uchar_t *sender_addr, uint32_t flags); -static void ar_ce_delete(ace_t *ace); -static void ar_ce_delete_per_arl(ace_t *ace, void *arg); -static ace_t **ar_ce_hash(arp_stack_t *as, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, - ace_t *matchfn()); -static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, - const uchar_t *proto_addr, uint32_t proto_addr_length); -static ace_t *ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, - uchar_t *proto_addr, uint32_t proto_addr_length); -static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, - uint32_t hw_addr_length); -static void ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), - void *arg1); - -static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code); -static int ar_close(queue_t *q); -static int ar_cmd_dispatch(queue_t *q, mblk_t *mp, boolean_t from_wput); -static void ar_cmd_drain(arl_t *arl); -static void ar_cmd_done(arl_t *arl); -static 
mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size); -static void ar_dlpi_send(arl_t *, mblk_t *); -static void ar_dlpi_done(arl_t *, t_uscalar_t); -static int ar_entry_add(queue_t *q, mblk_t *mp); -static int ar_entry_delete(queue_t *q, mblk_t *mp); -static int ar_entry_query(queue_t *q, mblk_t *mp); -static int ar_entry_squery(queue_t *q, mblk_t *mp); -static int ar_interface_up(queue_t *q, mblk_t *mp); -static int ar_interface_down(queue_t *q, mblk_t *mp); -static int ar_interface_on(queue_t *q, mblk_t *mp); -static int ar_interface_off(queue_t *q, mblk_t *mp); -static int ar_ipmp_activate(queue_t *q, mblk_t *mp); -static int ar_ipmp_deactivate(queue_t *q, mblk_t *mp); -static void ar_ll_cleanup_arl_queue(queue_t *q); -static void ar_ll_down(arl_t *arl); -static arl_t *ar_ll_lookup_by_name(arp_stack_t *as, const char *name); -static arl_t *ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp); -static void ar_ll_init(arp_stack_t *, ar_t *, mblk_t *mp); -static void ar_ll_set_defaults(arl_t *, mblk_t *mp); -static void ar_ll_clear_defaults(arl_t *); -static int ar_ll_up(arl_t *arl); -static int ar_mapping_add(queue_t *q, mblk_t *mp); -static boolean_t ar_mask_all_ones(uchar_t *mask, uint32_t mask_len); -static ar_m_t *ar_m_lookup(t_uscalar_t mac_type); -static int ar_nd_ioctl(queue_t *q, mblk_t *mp); -static int ar_open(queue_t *q, dev_t *devp, int flag, int sflag, - cred_t *credp); -static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static boolean_t ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt); -static int ar_param_set(queue_t *q, mblk_t *mp, char *value, - caddr_t cp, cred_t *cr); -static void ar_query_delete(ace_t *ace, void *ar); -static void ar_query_reply(ace_t *ace, int ret_val, - uchar_t *proto_addr, uint32_t proto_addr_len); -static clock_t ar_query_xmit(arp_stack_t *as, ace_t *ace); -static void ar_rput(queue_t *q, mblk_t *mp_orig); -static void ar_rput_dlpi(queue_t *q, mblk_t *mp); -static void ar_set_address(ace_t *ace, 
uchar_t *addrpos, - uchar_t *proto_addr, uint32_t proto_addr_len); -static int ar_slifname(queue_t *q, mblk_t *mp); -static int ar_set_ppa(queue_t *q, mblk_t *mp); -static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig); -static void ar_snmp_msg2(ace_t *, void *); -static void ar_wput(queue_t *q, mblk_t *mp); -static void ar_wsrv(queue_t *q); -static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, - uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1, - const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr, - arp_stack_t *as); -static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, - ushort_t cmd, boolean_t); -static mblk_t *ar_cmd_dequeue(arl_t *arl); - -static void *arp_stack_init(netstackid_t stackid, netstack_t *ns); -static void arp_stack_fini(netstackid_t stackid, void *arg); -static void arp_stack_shutdown(netstackid_t stackid, void *arg); - -boolean_t arp_no_defense = B_FALSE; - -/* - * All of these are alterable, within the min/max values given, - * at run time. arp_publish_interval and arp_publish_count are - * set by default to 2 seconds and 5 respectively. This is - * useful during FAILOVER/FAILBACK to make sure that the ARP - * packets are not lost. Assumed that it does not affect the - * normal operations. 
- */ -static arpparam_t arp_param_arr[] = { - /* min max value name */ - { 30000, 3600000, 300000, "arp_cleanup_interval"}, - { 1000, 20000, 2000, "arp_publish_interval"}, - { 1, 20, 5, "arp_publish_count"}, - { 0, 20000, 1000, "arp_probe_delay"}, - { 10, 20000, 1500, "arp_probe_interval"}, - { 0, 20, 3, "arp_probe_count"}, - { 0, 20000, 100, "arp_fastprobe_delay"}, - { 10, 20000, 150, "arp_fastprobe_interval"}, - { 0, 20, 3, "arp_fastprobe_count"}, - { 0, 3600000, 300000, "arp_defend_interval"}, - { 0, 20000, 100, "arp_defend_rate"}, - { 0, 3600000, 15000, "arp_broadcast_interval"}, - { 5, 86400, 3600, "arp_defend_period"} -}; -#define as_cleanup_interval as_param_arr[0].arp_param_value -#define as_publish_interval as_param_arr[1].arp_param_value -#define as_publish_count as_param_arr[2].arp_param_value -#define as_probe_delay as_param_arr[3].arp_param_value -#define as_probe_interval as_param_arr[4].arp_param_value -#define as_probe_count as_param_arr[5].arp_param_value -#define as_fastprobe_delay as_param_arr[6].arp_param_value -#define as_fastprobe_interval as_param_arr[7].arp_param_value -#define as_fastprobe_count as_param_arr[8].arp_param_value -#define as_defend_interval as_param_arr[9].arp_param_value -#define as_defend_rate as_param_arr[10].arp_param_value -#define as_broadcast_interval as_param_arr[11].arp_param_value -#define as_defend_period as_param_arr[12].arp_param_value - -static struct module_info arp_mod_info = { - 0, "arp", 0, INFPSZ, 512, 128 -}; - -static struct qinit arprinit = { - (pfi_t)ar_rput, NULL, ar_open, ar_close, NULL, &arp_mod_info -}; - -static struct qinit arpwinit = { - (pfi_t)ar_wput, (pfi_t)ar_wsrv, ar_open, ar_close, NULL, &arp_mod_info -}; - -struct streamtab arpinfo = { - &arprinit, &arpwinit -}; - -/* - * TODO: we need a better mechanism to set the ARP hardware type since - * the DLPI mac type does not include enough predefined values. 
- */ -static ar_m_t ar_m_tbl[] = { - { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ - { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ - { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ - { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ - { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */ - { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ - { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ - { DL_OTHER, ARPHRD_ETHER, -2, 6}, /* unknown */ -}; - -/* - * Note that all routines which need to queue the message for later - * processing have to be ioctl_aware to be able to queue the complete message. - * Following are command entry flags in arct_flags - */ -#define ARF_IOCTL_AWARE 0x1 /* Arp command can come down as M_IOCTL */ -#define ARF_ONLY_CMD 0x2 /* Command is exclusive to ARP */ -#define ARF_WPUT_OK 0x4 /* Command is allowed from ar_wput */ - -/* ARP Cmd Table entry */ -typedef struct arct_s { - int (*arct_pfi)(queue_t *, mblk_t *); - uint32_t arct_cmd; - int arct_min_len; - uint32_t arct_flags; - int arct_priv_req; /* Privilege required for this cmd */ - const char *arct_txt; -} arct_t; - -/* - * AR_ENTRY_ADD, QUERY and SQUERY are used by sdp, hence they need to - * have ARF_WPUT_OK set. 
- */ -static arct_t ar_cmd_tbl[] = { - { ar_entry_add, AR_ENTRY_ADD, sizeof (area_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_CONFIG, - "AR_ENTRY_ADD" }, - { ar_entry_delete, AR_ENTRY_DELETE, sizeof (ared_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_DELETE" }, - { ar_entry_query, AR_ENTRY_QUERY, sizeof (areq_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP, - "AR_ENTRY_QUERY" }, - { ar_entry_squery, AR_ENTRY_SQUERY, sizeof (area_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD | ARF_WPUT_OK, OP_NP, - "AR_ENTRY_SQUERY" }, - { ar_mapping_add, AR_MAPPING_ADD, sizeof (arma_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" }, - { ar_interface_up, AR_INTERFACE_UP, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_UP" }, - { ar_interface_down, AR_INTERFACE_DOWN, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_DOWN" }, - { ar_interface_on, AR_INTERFACE_ON, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_ON" }, - { ar_interface_off, AR_INTERFACE_OFF, sizeof (arc_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_INTERFACE_OFF" }, - { ar_ipmp_activate, AR_IPMP_ACTIVATE, sizeof (arie_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_ACTIVATE" }, - { ar_ipmp_deactivate, AR_IPMP_DEACTIVATE, sizeof (arie_t), - ARF_ONLY_CMD, OP_CONFIG, "AR_IPMP_DEACTIVATE" }, - { ar_set_ppa, (uint32_t)IF_UNITSEL, sizeof (int), - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "IF_UNITSEL" }, - { ar_nd_ioctl, ND_GET, 1, - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_NP, "ND_GET" }, - { ar_nd_ioctl, ND_SET, 1, - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "ND_SET" }, - { ar_snmp_msg, AR_SNMP_MSG, sizeof (struct T_optmgmt_ack), - ARF_IOCTL_AWARE | ARF_WPUT_OK | ARF_ONLY_CMD, OP_NP, - "AR_SNMP_MSG" }, - { ar_slifname, (uint32_t)SIOCSLIFNAME, sizeof (struct lifreq), - ARF_IOCTL_AWARE | ARF_WPUT_OK, OP_CONFIG, "SIOCSLIFNAME" } -}; - -/* - * Lookup and return an arl appropriate for sending packets with either source - * hardware address `hw_addr' or source protocol address `ip_addr', 
in that - * order. If neither was specified or neither match, return any arl in the - * same group as `arl'. - */ -static arl_t * -ar_ipmp_lookup_xmit_arl(arl_t *arl, uchar_t *hw_addr, uint_t hw_addrlen, - uchar_t *ip_addr) -{ - arlphy_t *ap; - ace_t *src_ace; - arl_t *xmit_arl = NULL; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - ASSERT(arl->arl_flags & ARL_F_IPMP); - - if (hw_addr != NULL && hw_addrlen != 0) { - xmit_arl = as->as_arl_head; - for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) { - /* - * There may be arls with the same HW address that are - * not in our IPMP group; we don't want those. - */ - if (xmit_arl->arl_ipmp_arl != arl) - continue; - - ap = xmit_arl->arl_phy; - if (ap != NULL && ap->ap_hw_addrlen == hw_addrlen && - bcmp(ap->ap_hw_addr, hw_addr, hw_addrlen) == 0) - break; - } - - DTRACE_PROBE4(xmit_arl_hwsrc, arl_t *, arl, arl_t *, - xmit_arl, uchar_t *, hw_addr, uint_t, hw_addrlen); - } - - if (xmit_arl == NULL && ip_addr != NULL) { - src_ace = ar_ce_lookup_permanent(as, IP_ARP_PROTO_TYPE, ip_addr, - IP_ADDR_LEN); - if (src_ace != NULL) - xmit_arl = src_ace->ace_xmit_arl; - - DTRACE_PROBE4(xmit_arl_ipsrc, arl_t *, arl, arl_t *, - xmit_arl, uchar_t *, ip_addr, uint_t, IP_ADDR_LEN); - } - - if (xmit_arl == NULL) { - xmit_arl = as->as_arl_head; - for (; xmit_arl != NULL; xmit_arl = xmit_arl->arl_next) - if (xmit_arl->arl_ipmp_arl == arl && xmit_arl != arl) - break; - - DTRACE_PROBE2(xmit_arl_any, arl_t *, arl, arl_t *, xmit_arl); - } - - return (xmit_arl); -} - -/* - * ARP Cache Entry creation routine. - * Cache entries are allocated within timer messages and inserted into - * the global hash list based on protocol and protocol address. 
- */ -static int -ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, - uchar_t *proto_addr, uint_t proto_addr_len, uchar_t *proto_mask, - uchar_t *proto_extract_mask, uint_t hw_extract_start, uchar_t *sender_addr, - uint_t flags) -{ - static ace_t ace_null; - ace_t *ace; - ace_t **acep; - uchar_t *dst; - mblk_t *mp; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - arl_t *xmit_arl; - arlphy_t *ap; - - if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) - return (EINVAL); - - if (proto_addr == NULL || proto_addr_len == 0 || - (proto == IP_ARP_PROTO_TYPE && proto_addr_len != IP_ADDR_LEN)) - return (EINVAL); - - if (flags & ACE_F_MYADDR) - flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; - - /* - * Latch a transmit arl for this ace. - */ - if (arl->arl_flags & ARL_F_IPMP) { - ASSERT(proto == IP_ARP_PROTO_TYPE); - xmit_arl = ar_ipmp_lookup_xmit_arl(arl, hw_addr, hw_addr_len, - sender_addr); - } else { - xmit_arl = arl; - } - - if (xmit_arl == NULL || xmit_arl->arl_phy == NULL) - return (EINVAL); - - ap = xmit_arl->arl_phy; - - if (!hw_addr && hw_addr_len == 0) { - if (flags == ACE_F_PERMANENT) { /* Not publish */ - /* 224.0.0.0 to zero length address */ - flags |= ACE_F_RESOLVED; - } else { /* local address and unresolved case */ - hw_addr = ap->ap_hw_addr; - hw_addr_len = ap->ap_hw_addrlen; - if (flags & ACE_F_PUBLISH) - flags |= ACE_F_RESOLVED; - } - } else { - flags |= ACE_F_RESOLVED; - } - - /* Handle hw_addr_len == 0 for DL_ENABMULTI_REQ etc. */ - if (hw_addr_len != 0 && hw_addr == NULL) - return (EINVAL); - if (hw_addr_len < ap->ap_hw_addrlen && hw_addr_len != 0) - return (EINVAL); - if (!proto_extract_mask && (flags & ACE_F_MAPPING)) - return (EINVAL); - - /* - * If the underlying link doesn't have reliable up/down notification or - * if we're working with the IPv4 169.254.0.0/16 Link Local Address - * space, then don't use the fast timers. Otherwise, use them. 
- */ - if (ap->ap_notifies && - !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) { - flags |= ACE_F_FAST; - } - - /* - * Allocate the timer block to hold the ace. - * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr) - */ - mp = mi_timer_alloc(sizeof (ace_t) + proto_addr_len + proto_addr_len + - proto_addr_len + hw_addr_len); - if (!mp) - return (ENOMEM); - ace = (ace_t *)mp->b_rptr; - *ace = ace_null; - ace->ace_proto = proto; - ace->ace_mp = mp; - ace->ace_arl = arl; - ace->ace_xmit_arl = xmit_arl; - - dst = (uchar_t *)&ace[1]; - - ace->ace_proto_addr = dst; - ace->ace_proto_addr_length = proto_addr_len; - bcopy(proto_addr, dst, proto_addr_len); - dst += proto_addr_len; - /* - * The proto_mask allows us to add entries which will let us respond - * to requests for a group of addresses. This makes it easy to provide - * proxy ARP service for machines that don't understand about the local - * subnet structure, if, for example, there are BSD4.2 systems lurking. 
- */ - ace->ace_proto_mask = dst; - if (proto_mask != NULL) { - bcopy(proto_mask, dst, proto_addr_len); - dst += proto_addr_len; - } else { - while (proto_addr_len-- > 0) - *dst++ = (uchar_t)~0; - } - - if (proto_extract_mask != NULL) { - ace->ace_proto_extract_mask = dst; - bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length); - dst += ace->ace_proto_addr_length; - } else { - ace->ace_proto_extract_mask = NULL; - } - ace->ace_hw_extract_start = hw_extract_start; - ace->ace_hw_addr_length = hw_addr_len; - ace->ace_hw_addr = dst; - if (hw_addr != NULL) { - bcopy(hw_addr, dst, hw_addr_len); - dst += hw_addr_len; - } - - ace->ace_flags = flags; - if (ar_mask_all_ones(ace->ace_proto_mask, - ace->ace_proto_addr_length)) { - acep = ar_ce_hash(as, ace->ace_proto, ace->ace_proto_addr, - ace->ace_proto_addr_length); - } else { - acep = &as->as_ce_mask_entries; - } - if ((ace->ace_next = *acep) != NULL) - ace->ace_next->ace_ptpn = &ace->ace_next; - *acep = ace; - ace->ace_ptpn = acep; - return (0); -} - -/* Delete a cache entry. */ -static void -ar_ce_delete(ace_t *ace) -{ - ace_t **acep; - - /* Get out of the hash list. */ - acep = ace->ace_ptpn; - if (ace->ace_next) - ace->ace_next->ace_ptpn = acep; - acep[0] = ace->ace_next; - /* Mark it dying in case we have a timer about to fire. */ - ace->ace_flags |= ACE_F_DYING; - /* Complete any outstanding queries immediately. */ - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - /* Free the timer, immediately, or when it fires. */ - mi_timer_free(ace->ace_mp); -} - -/* - * ar_ce_walk routine. Delete the ace if it is associated with the arl - * that is going away. - */ -static void -ar_ce_delete_per_arl(ace_t *ace, void *arl) -{ - if (ace->ace_arl == arl || ace->ace_xmit_arl == arl) { - ace->ace_flags &= ~ACE_F_PERMANENT; - ar_ce_delete(ace); - } -} - -/* - * ar_ce_walk routine used when deactivating an `arl' in a group. Deletes - * `ace' if it was using `arl_arg' as its output interface. 
- */ -static void -ar_ce_ipmp_deactivate(ace_t *ace, void *arl_arg) -{ - arl_t *arl = arl_arg; - - ASSERT(!(arl->arl_flags & ARL_F_IPMP)); - - if (ace->ace_arl == arl) { - ASSERT(ace->ace_xmit_arl == arl); - /* - * This ACE is tied to the arl leaving the group (e.g., an - * ACE_F_PERMANENT for a test address) and is not used by the - * group, so we can leave it be. - */ - return; - } - - if (ace->ace_xmit_arl != arl) - return; - - ASSERT(ace->ace_arl == arl->arl_ipmp_arl); - - /* - * IP should've already sent us messages asking us to move any - * ACE_F_MYADDR entries to another arl, but there are two exceptions: - * - * 1. The group was misconfigured with interfaces that have duplicate - * hardware addresses, but in.mpathd was unable to offline those - * duplicate interfaces. - * - * 2. The messages from IP were lost or never created (e.g. due to - * memory pressure). - * - * We handle the first case by just quietly deleting the ACE. Since - * the second case cannot be distinguished from a more serious bug in - * the IPMP framework, we ASSERT() that this can't happen on DEBUG - * systems, but quietly delete the ACE on production systems (the - * deleted ACE will render the IP address unreachable). - */ - if (ace->ace_flags & ACE_F_MYADDR) { - arlphy_t *ap = arl->arl_phy; - uint_t hw_addrlen = ap->ap_hw_addrlen; - - ASSERT(hw_addrlen == ace->ace_hw_addr_length && - bcmp(ap->ap_hw_addr, ace->ace_hw_addr, hw_addrlen) == 0); - } - - /* - * NOTE: it's possible this arl got selected as the ace_xmit_arl when - * creating an ACE_F_PERMANENT ACE on behalf of an SIOCS*ARP ioctl for - * an IPMP IP interface. But it's still OK for us to delete such an - * ACE since ipmp_illgrp_refresh_arpent() will ask us to recreate it - * and we'll pick another arl then. - */ - ar_ce_delete(ace); -} - -/* Cache entry hash routine, based on protocol and protocol address. 
*/ -static ace_t ** -ar_ce_hash(arp_stack_t *as, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - const uchar_t *up = proto_addr; - unsigned int hval = proto; - int len = proto_addr_length; - - while (--len >= 0) - hval ^= *up++; - return (&as->as_ce_hash_tbl[hval % ARP_HASH_SIZE]); -} - -/* Cache entry lookup. Try to find an ace matching the parameters passed. */ -ace_t * -ar_ce_lookup(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - - ace = ar_ce_lookup_entry(arl, proto, proto_addr, proto_addr_length); - if (!ace) - ace = ar_ce_lookup_mapping(arl, proto, proto_addr, - proto_addr_length); - return (ace); -} - -/* - * Cache entry lookup. Try to find an ace matching the parameters passed. - * Look only for exact entries (no mappings) - */ -static ace_t * -ar_ce_lookup_entry(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if (!proto_addr) - return (NULL); - ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); - for (; ace; ace = ace->ace_next) { - if ((ace->ace_arl == arl || - ace->ace_arl == arl->arl_ipmp_arl) && - ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. - */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * Extract cache entry lookup parameters from an external command message, then - * call the supplied match function. 
- */ -static ace_t * -ar_ce_lookup_from_area(arp_stack_t *as, mblk_t *mp, ace_t *matchfn()) -{ - uchar_t *proto_addr; - area_t *area = (area_t *)mp->b_rptr; - - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - area->area_proto_addr_length); - if (!proto_addr) - return (NULL); - return ((*matchfn)(ar_ll_lookup_from_mp(as, mp), area->area_proto, - proto_addr, area->area_proto_addr_length)); -} - -/* - * Cache entry lookup. Try to find an ace matching the parameters passed. - * Look only for mappings. - */ -static ace_t * -ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, const uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if (!proto_addr) - return (NULL); - ace = as->as_ce_mask_entries; - for (; ace; ace = ace->ace_next) { - if (ace->ace_arl == arl && - ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. - */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * Look for a permanent entry for proto_addr across all interfaces. - */ -static ace_t * -ar_ce_lookup_permanent(arp_stack_t *as, uint32_t proto, uchar_t *proto_addr, - uint32_t proto_addr_length) -{ - ace_t *ace; - - ace = *ar_ce_hash(as, proto, proto_addr, proto_addr_length); - for (; ace != NULL; ace = ace->ace_next) { - if (!(ace->ace_flags & ACE_F_PERMANENT)) - continue; - if (ace->ace_proto_addr_length == proto_addr_length && - ace->ace_proto == proto) { - int i1 = proto_addr_length; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; - - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. 
- */ - do { - if (--i1 < 0) - return (ace); - } while ((proto_addr[i1] & mask[i1]) == ace_addr[i1]); - } - } - return (ace); -} - -/* - * ar_ce_resolve is called when a response comes in to an outstanding request. - * Returns 'true' if the address has changed and we need to tell the client. - * (We don't need to tell the client if there's still an outstanding query.) - */ -static boolean_t -ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) -{ - boolean_t hwchanged; - - if (hw_addr_length == ace->ace_hw_addr_length) { - ASSERT(ace->ace_hw_addr != NULL); - hwchanged = bcmp(hw_addr, ace->ace_hw_addr, - hw_addr_length) != 0; - if (hwchanged) - bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length); - /* - * No need to bother with ar_query_reply if no queries are - * waiting. - */ - ace->ace_flags |= ACE_F_RESOLVED; - if (ace->ace_query_mp != NULL) - ar_query_reply(ace, 0, NULL, (uint32_t)0); - if (hwchanged) - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * There are 2 functions performed by this function. - * 1. Resolution of unresolved entries and update of resolved entries. - * 2. Detection of nodes with our own IP address (duplicates). - * - * If the resolving ARL is in the same group as a matching ACE's ARL, then - * update the ACE. Otherwise, make no updates. - * - * For all entries, we first check to see if this is a duplicate (probable - * loopback) message. If so, then just ignore it. - * - * Next, check to see if the entry has completed DAD. If not, then we've - * failed, because someone is already using the address. Notify IP of the DAD - * failure and remove the broken ace. - * - * Next, we check if we're the authority for this address. If so, then it's - * time to defend it, because the other node is a duplicate. Report it as a - * 'bogon' and let IP decide how to defend. - * - * Finally, if it's unresolved or if the arls match, we just update the MAC - * address. 
This allows a published 'static' entry to be updated by an ARP - * request from the node for which we're a proxy ARP server. - * - * Note that this logic does not update published ARP entries for mismatched - * arls, as for example when we proxy arp across 2 subnets with differing - * subnet masks. - * - * Return Values below - */ - -#define AR_NOTFOUND 1 /* No matching ace found in cache */ -#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ -#define AR_LOOPBACK 3 /* Our own arp packet was received */ -#define AR_BOGON 4 /* Another host has our IP addr. */ -#define AR_FAILED 5 /* Duplicate Address Detection has failed */ -#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ - -static int -ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, - uint32_t hlen, const uchar_t *src_paddr, uint32_t plen, arl_t **ace_arlp) -{ - ace_t *ace; - ace_t *ace_next; - int i1; - const uchar_t *paddr; - uchar_t *ace_addr; - uchar_t *mask; - int retv = AR_NOTFOUND; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - ace = *ar_ce_hash(as, proto, src_paddr, plen); - for (; ace != NULL; ace = ace_next) { - - /* ar_ce_resolve may delete the ace; fetch next pointer now */ - ace_next = ace->ace_next; - - if (ace->ace_proto_addr_length != plen || - ace->ace_proto != proto) { - continue; - } - - /* - * Note that the ace_proto_mask is applied to the proto_addr - * before comparing to the ace_addr. - */ - paddr = src_paddr; - i1 = plen; - ace_addr = ace->ace_proto_addr; - mask = ace->ace_proto_mask; - while (--i1 >= 0) { - if ((*paddr++ & *mask++) != *ace_addr++) - break; - } - if (i1 >= 0) - continue; - - *ace_arlp = ace->ace_arl; - - /* - * If the IP address is ours, and the hardware address matches - * one of our own arls, then this is a broadcast packet - * emitted by one of our interfaces, reflected by the switch - * and received on another interface. We return AR_LOOPBACK. 
- */ - if (ace->ace_flags & ACE_F_MYADDR) { - arl_t *hw_arl = as->as_arl_head; - arlphy_t *ap; - - for (; hw_arl != NULL; hw_arl = hw_arl->arl_next) { - ap = hw_arl->arl_phy; - if (ap != NULL && ap->ap_hw_addrlen == hlen && - bcmp(ap->ap_hw_addr, src_haddr, hlen) == 0) - return (AR_LOOPBACK); - } - } - - /* - * If the entry is unverified, then we've just verified that - * someone else already owns this address, because this is a - * message with the same protocol address but different - * hardware address. NOTE: the ace_xmit_arl check ensures we - * don't send duplicate AR_FAILEDs if arl is in an IPMP group. - */ - if ((ace->ace_flags & ACE_F_UNVERIFIED) && - arl == ace->ace_xmit_arl) { - ar_ce_delete(ace); - return (AR_FAILED); - } - - /* - * If the IP address matches ours and we're authoritative for - * this entry, then some other node is using our IP addr, so - * return AR_BOGON. Also reset the transmit count to zero so - * that, if we're currently in initial announcement mode, we - * switch back to the lazier defense mode. Knowing that - * there's at least one duplicate out there, we ought not - * blindly announce. NOTE: the ace_xmit_arl check ensures we - * don't send duplicate AR_BOGONs if arl is in an IPMP group. - */ - if ((ace->ace_flags & ACE_F_AUTHORITY) && - arl == ace->ace_xmit_arl) { - ace->ace_xmit_count = 0; - return (AR_BOGON); - } - - /* - * Only update this ACE if it's on the same network -- i.e., - * it's for our ARL or another ARL in the same IPMP group. - */ - if (ace->ace_arl == arl || ace->ace_arl == arl->arl_ipmp_arl) { - if (ar_ce_resolve(ace, src_haddr, hlen)) - retv = AR_CHANGED; - else if (retv == AR_NOTFOUND) - retv = AR_MERGED; - } - } - - if (retv == AR_NOTFOUND) - *ace_arlp = NULL; - return (retv); -} - -/* Pass arg1 to the pfi supplied, along with each ace in existence. 
*/ -static void -ar_ce_walk(arp_stack_t *as, void (*pfi)(ace_t *, void *), void *arg1) -{ - ace_t *ace; - ace_t *ace1; - int i; - - for (i = 0; i < ARP_HASH_SIZE; i++) { - /* - * We walk the hash chain in a way that allows the current - * ace to get blown off by the called routine. - */ - for (ace = as->as_ce_hash_tbl[i]; ace; ace = ace1) { - ace1 = ace->ace_next; - (*pfi)(ace, arg1); - } - } - for (ace = as->as_ce_mask_entries; ace; ace = ace1) { - ace1 = ace->ace_next; - (*pfi)(ace, arg1); - } -} - -/* - * Send a copy of interesting packets to the corresponding IP instance. - * The corresponding IP instance is the ARP-IP-DEV instance for this - * DEV (i.e. ARL). - */ -static void -ar_client_notify(const arl_t *arl, mblk_t *mp, int code) -{ - ar_t *ar = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc; - arcn_t *arcn; - mblk_t *mp1; - int arl_namelen = strlen(arl->arl_name) + 1; - - /* Looks like the association disappeared */ - if (ar == NULL) { - freemsg(mp); - return; - } - - /* ar is the corresponding ARP-IP instance for this ARL */ - ASSERT(ar->ar_arl == NULL && ar->ar_wq->q_next != NULL); - - mp1 = allocb(sizeof (arcn_t) + arl_namelen, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - DB_TYPE(mp1) = M_CTL; - mp1->b_cont = mp; - arcn = (arcn_t *)mp1->b_rptr; - mp1->b_wptr = (uchar_t *)&arcn[1] + arl_namelen; - arcn->arcn_cmd = AR_CLIENT_NOTIFY; - arcn->arcn_name_offset = sizeof (arcn_t); - arcn->arcn_name_length = arl_namelen; - arcn->arcn_code = code; - bcopy(arl->arl_name, &arcn[1], arl_namelen); - - putnext(ar->ar_wq, mp1); -} - -/* - * Send a delete-notify message down to IP. We've determined that IP doesn't - * have a cache entry for the IP address itself, but it may have other cache - * entries with the same hardware address, and we don't want to see those grow - * stale. (The alternative is sending down updates for every ARP message we - * get that doesn't match an existing ace. 
That's much more expensive than an - * occasional delete and reload.) - */ -static void -ar_delete_notify(const ace_t *ace) -{ - const arl_t *arl = ace->ace_arl; - const arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - mblk_t *mp; - size_t len; - arh_t *arh; - - len = sizeof (*arh) + 2 * ace->ace_proto_addr_length; - mp = allocb(len, BPRI_MED); - if (mp == NULL) - return; - arh = (arh_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)arh + len; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(ace->ace_proto, arh->arh_proto); - arh->arh_hlen = 0; - arh->arh_plen = ace->ace_proto_addr_length; - U16_TO_BE16(ARP_RESPONSE, arh->arh_operation); - bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length); - bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) + - ace->ace_proto_addr_length, ace->ace_proto_addr_length); - ar_client_notify(arl, mp, AR_CN_ANNOUNCE); -} - -/* ARP module close routine. */ -static int -ar_close(queue_t *q) -{ - ar_t *ar = (ar_t *)q->q_ptr; - char name[LIFNAMSIZ]; - arl_t *arl, *xarl; - arl_t **arlp; - cred_t *cr; - arc_t *arc; - mblk_t *mp1; - int index; - arp_stack_t *as = ar->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_CLOSE, - "arp_close: q %p", q); - - arl = ar->ar_arl; - if (arl == NULL) { - index = 0; - /* - * If this is the <ARP-IP-Driver> stream send down - * a closing message to IP and wait for IP to send - * an ack. This helps to make sure that messages - * that are currently being sent up by IP are not lost. 
- */ - if (ar->ar_on_ill_stream) { - mp1 = allocb(sizeof (arc_t), BPRI_MED); - if (mp1 != NULL) { - DB_TYPE(mp1) = M_CTL; - arc = (arc_t *)mp1->b_rptr; - mp1->b_wptr = mp1->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_CLOSING; - putnext(WR(q), mp1); - while (!ar->ar_ip_acked_close) - /* If we are interrupted break out */ - if (qwait_sig(q) == 0) - break; - } - } - /* Delete all our pending queries, 'arl' is not dereferenced */ - ar_ce_walk(as, ar_query_delete, ar); - /* - * The request could be pending on some arl_queue also. This - * happens if the arl is not yet bound, and bind is pending. - */ - ar_ll_cleanup_arl_queue(q); - } else { - index = arl->arl_index; - (void) strcpy(name, arl->arl_name); - arl->arl_closing = 1; - while (arl->arl_queue != NULL) - qwait(arl->arl_rq); - - if (arl->arl_state == ARL_S_UP) - ar_ll_down(arl); - - while (arl->arl_state != ARL_S_DOWN) - qwait(arl->arl_rq); - - if (arl->arl_flags & ARL_F_IPMP) { - /* - * Though rude, someone could force the IPMP arl - * closed without removing the underlying interfaces. - * In that case, force the ARLs out of the group. - */ - xarl = as->as_arl_head; - for (; xarl != NULL; xarl = xarl->arl_next) { - if (xarl->arl_ipmp_arl != arl || xarl == arl) - continue; - ar_ce_walk(as, ar_ce_ipmp_deactivate, xarl); - xarl->arl_ipmp_arl = NULL; - } - } - - ar_ll_clear_defaults(arl); - /* - * If this is the control stream for an arl, delete anything - * hanging off our arl. - */ - ar_ce_walk(as, ar_ce_delete_per_arl, arl); - /* Free any messages waiting for a bind_ack */ - /* Get the arl out of the chain. 
*/ - rw_enter(&as->as_arl_lock, RW_WRITER); - for (arlp = &as->as_arl_head; *arlp; - arlp = &(*arlp)->arl_next) { - if (*arlp == arl) { - *arlp = arl->arl_next; - break; - } - } - - ASSERT(arl->arl_dlpi_deferred == NULL); - ar->ar_arl = NULL; - rw_exit(&as->as_arl_lock); - - mi_free((char *)arl); - } - /* Let's break the association between an ARL and IP instance */ - if (ar->ar_arl_ip_assoc != NULL) { - ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL && - ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar); - ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL; - ar->ar_arl_ip_assoc = NULL; - } - cr = ar->ar_credp; - /* mi_close_comm frees the instance data. */ - (void) mi_close_comm(&as->as_head, q); - qprocsoff(q); - crfree(cr); - - if (index != 0) { - hook_nic_event_t info; - - info.hne_nic = index; - info.hne_lif = 0; - info.hne_event = NE_UNPLUMB; - info.hne_data = name; - info.hne_datalen = strlen(name); - (void) hook_run(as->as_net_data->netd_hooks, - as->as_arpnicevents, (hook_data_t)&info); - } - netstack_rele(as->as_netstack); - return (0); -} - -/* - * Dispatch routine for ARP commands. This routine can be called out of - * either ar_wput or ar_rput, in response to IOCTLs or M_PROTO messages. - */ -/* TODO: error reporting for M_PROTO case */ -static int -ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig, boolean_t from_wput) -{ - arct_t *arct; - uint32_t cmd; - ssize_t len; - mblk_t *mp = mp_orig; - cred_t *cr = NULL; - - if (!mp) - return (ENOENT); - - /* We get both M_PROTO and M_IOCTL messages, so watch out! 
*/ - if (DB_TYPE(mp) == M_IOCTL) { - struct iocblk *ioc; - ioc = (struct iocblk *)mp->b_rptr; - cmd = ioc->ioc_cmd; - cr = ioc->ioc_cr; - mp = mp->b_cont; - if (!mp) - return (ENOENT); - } else { - cr = msg_getcred(mp, NULL); - /* For initial messages beteen IP and ARP, cr can be NULL */ - if (cr == NULL) - cr = ((ar_t *)q->q_ptr)->ar_credp; - } - len = MBLKL(mp); - if (len < sizeof (uint32_t) || !OK_32PTR(mp->b_rptr)) - return (ENOENT); - if (mp_orig == mp) - cmd = *(uint32_t *)mp->b_rptr; - for (arct = ar_cmd_tbl; ; arct++) { - if (arct >= A_END(ar_cmd_tbl)) - return (ENOENT); - if (arct->arct_cmd == cmd) - break; - } - if (len < arct->arct_min_len) { - /* - * If the command is exclusive to ARP, we return EINVAL, - * else we need to pass the command downstream, so return - * ENOENT - */ - return ((arct->arct_flags & ARF_ONLY_CMD) ? EINVAL : ENOENT); - } - if (arct->arct_priv_req != OP_NP) { - int error; - - if ((error = secpolicy_ip(cr, arct->arct_priv_req, - B_FALSE)) != 0) - return (error); - } - /* Disallow many commands except if from rput i.e. from IP */ - if (from_wput && !(arct->arct_flags & ARF_WPUT_OK)) { - return (EINVAL); - } - - if (arct->arct_flags & ARF_IOCTL_AWARE) - mp = mp_orig; - - DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp, - arct_t *, arct); - return (*arct->arct_pfi)(q, mp); -} - -/* Allocate and do common initializations for DLPI messages. */ -static mblk_t * -ar_dlpi_comm(t_uscalar_t prim, size_t size) -{ - mblk_t *mp; - - if ((mp = allocb(size, BPRI_HI)) == NULL) - return (NULL); - - /* - * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter - * of which we don't seem to use) are sent with M_PCPROTO, and - * that other DLPI are M_PROTO. - */ - DB_TYPE(mp) = (prim == DL_INFO_REQ) ? 
M_PCPROTO : M_PROTO; - - mp->b_wptr = mp->b_rptr + size; - bzero(mp->b_rptr, size); - ((union DL_primitives *)mp->b_rptr)->dl_primitive = prim; - - return (mp); -} - -static void -ar_dlpi_dispatch(arl_t *arl) -{ - mblk_t *mp; - t_uscalar_t primitive = DL_PRIM_INVAL; - - while (((mp = arl->arl_dlpi_deferred) != NULL) && - (arl->arl_dlpi_pending == DL_PRIM_INVAL)) { - union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; - - DTRACE_PROBE2(dlpi_dispatch, arl_t *, arl, mblk_t *, mp); - - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - arl->arl_dlpi_deferred = mp->b_next; - mp->b_next = NULL; - - /* - * If this is a DL_NOTIFY_CONF, no ack is expected. - */ - if ((primitive = dlp->dl_primitive) != DL_NOTIFY_CONF) - arl->arl_dlpi_pending = dlp->dl_primitive; - putnext(arl->arl_wq, mp); - } - - if (arl->arl_dlpi_pending == DL_PRIM_INVAL) { - /* - * No pending DLPI operation. - */ - ASSERT(mp == NULL); - DTRACE_PROBE1(dlpi_idle, arl_t *, arl); - - /* - * If the last DLPI message dispatched is DL_NOTIFY_CONF, - * it is not assoicated with any pending cmd request, drain - * the rest of pending cmd requests, otherwise call - * ar_cmd_done() to finish up the current pending cmd - * operation. - */ - if (primitive == DL_NOTIFY_CONF) - ar_cmd_drain(arl); - else - ar_cmd_done(arl); - } else if (mp != NULL) { - DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp); - } -} - -/* - * The following two functions serialize DLPI messages to the driver, much - * along the lines of ill_dlpi_send and ill_dlpi_done in IP. Basically, - * we wait for a DLPI message, sent downstream, to be acked before sending - * the next. If there are DLPI messages that have not yet been sent, queue - * this message (mp), else send it downstream. - */ -static void -ar_dlpi_send(arl_t *arl, mblk_t *mp) -{ - mblk_t **mpp; - - ASSERT(arl != NULL); - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - - /* Always queue the message. 
Tail insertion */ - mpp = &arl->arl_dlpi_deferred; - while (*mpp != NULL) - mpp = &((*mpp)->b_next); - *mpp = mp; - - ar_dlpi_dispatch(arl); -} - -/* - * Called when an DLPI control message has been acked; send down the next - * queued message (if any). - * The DLPI messages of interest being bind, attach, unbind and detach since - * these are the only ones sent by ARP via ar_dlpi_send. - */ -static void -ar_dlpi_done(arl_t *arl, t_uscalar_t prim) -{ - if (arl->arl_dlpi_pending != prim) { - DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl, - t_uscalar_t, prim); - return; - } - - DTRACE_PROBE2(dlpi_done, arl_t *, arl, t_uscalar_t, prim); - arl->arl_dlpi_pending = DL_PRIM_INVAL; - ar_dlpi_dispatch(arl); -} - -/* - * Send a DL_NOTE_REPLUMB_DONE message down to the driver to indicate - * the replumb process has already been done. Note that mp is either a - * DL_NOTIFY_IND message or an AR_INTERFACE_DOWN message (comes from IP). - */ -static void -arp_replumb_done(arl_t *arl, mblk_t *mp) -{ - ASSERT(arl->arl_state == ARL_S_DOWN && arl->arl_replumbing); - - mp = mexchange(NULL, mp, sizeof (dl_notify_conf_t), M_PROTO, - DL_NOTIFY_CONF); - ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification = - DL_NOTE_REPLUMB_DONE; - arl->arl_replumbing = B_FALSE; - ar_dlpi_send(arl, mp); -} - -static void -ar_cmd_drain(arl_t *arl) -{ - mblk_t *mp; - queue_t *q; - - /* - * Run the commands that have been enqueued while we were waiting - * for the last command (AR_INTERFACE_UP or AR_INTERFACE_DOWN) - * to complete. - */ - while ((mp = arl->arl_queue) != NULL) { - if (((uintptr_t)mp->b_prev & CMD_IN_PROGRESS) != 0) { - /* - * The current command is an AR_INTERFACE_UP or - * AR_INTERFACE_DOWN and is waiting for a DLPI ack - * from the driver. Return. We can't make progress now. 
- */ - break; - } - - mp = ar_cmd_dequeue(arl); - mp->b_prev = AR_DRAINING; - q = mp->b_queue; - mp->b_queue = NULL; - - /* - * Don't call put(q, mp) since it can lead to reorder of - * messages by sending the current messages to the end of - * arp's syncq - */ - if (q->q_flag & QREADR) - ar_rput(q, mp); - else - ar_wput(q, mp); - } -} - -static void -ar_cmd_done(arl_t *arl) -{ - mblk_t *mp; - int cmd; - int err; - mblk_t *mp1; - mblk_t *dlpi_op_done_mp = NULL; - queue_t *dlpi_op_done_q; - ar_t *ar_arl; - ar_t *ar_ip; - - ASSERT(arl->arl_state == ARL_S_UP || arl->arl_state == ARL_S_DOWN); - - /* - * If the current operation was initiated by IP there must be - * an op enqueued in arl_queue. But if ar_close has sent down - * a detach/unbind, there is no command enqueued. Also if the IP-ARP - * stream has closed the cleanup would be done and there won't be any mp - */ - if ((mp = arl->arl_queue) == NULL) - return; - - if ((cmd = (uintptr_t)mp->b_prev) & CMD_IN_PROGRESS) { - mp1 = ar_cmd_dequeue(arl); - ASSERT(mp == mp1); - - cmd &= ~CMD_IN_PROGRESS; - if (cmd == AR_INTERFACE_UP) { - /* - * There is an ioctl waiting for us... - */ - if (arl->arl_state == ARL_S_UP) - err = 0; - else - err = EINVAL; - - dlpi_op_done_mp = ar_alloc(AR_DLPIOP_DONE, err); - if (dlpi_op_done_mp != NULL) { - /* - * Better performance if we send the response - * after the potential MAPPING_ADDs command - * that are likely to follow. (Do it below the - * while loop, instead of putnext right now) - */ - dlpi_op_done_q = WR(mp->b_queue); - } - - if (err == 0) { - /* - * Now that we have the ARL instance - * corresponding to the IP instance let's make - * the association here. 
- */ - ar_ip = (ar_t *)mp->b_queue->q_ptr; - ar_arl = (ar_t *)arl->arl_rq->q_ptr; - ar_arl->ar_arl_ip_assoc = ar_ip; - ar_ip->ar_arl_ip_assoc = ar_arl; - } - - inet_freemsg(mp); - } else if (cmd == AR_INTERFACE_DOWN && arl->arl_replumbing) { - /* - * The arl is successfully brought down and this is - * a result of the DL_NOTE_REPLUMB process. Reset - * mp->b_prev first (it keeps the 'cmd' information - * at this point). - */ - mp->b_prev = NULL; - arp_replumb_done(arl, mp); - } else { - inet_freemsg(mp); - } - } - - ar_cmd_drain(arl); - - if (dlpi_op_done_mp != NULL) { - DTRACE_PROBE3(cmd_done_next, arl_t *, arl, - queue_t *, dlpi_op_done_q, mblk_t *, dlpi_op_done_mp); - putnext(dlpi_op_done_q, dlpi_op_done_mp); - } -} - -/* - * Queue all arp commands coming from clients. Typically these commands - * come from IP, but could also come from other clients. The commands - * are serviced in FIFO order. Some commands need to wait and restart - * after the DLPI response from the driver is received. Typically - * AR_INTERFACE_UP and AR_INTERFACE_DOWN. ar_dlpi_done restarts - * the command and then dequeues the queue at arl_queue and calls ar_rput - * or ar_wput for each enqueued command. AR_DRAINING is used to signify - * that the command is being executed thru a drain from ar_dlpi_done. - * Functions handling the individual commands such as ar_entry_add - * check for this flag in b_prev to determine whether the command has - * to be enqueued for later processing or must be processed now. - * - * b_next used to thread the enqueued command mblks - * b_queue used to identify the queue of the originating request(client) - * b_prev used to store the command itself for easy parsing. 
- */ -static void -ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd, - boolean_t tail_insert) -{ - mp->b_queue = q; - if (arl->arl_queue == NULL) { - ASSERT(arl->arl_queue_tail == NULL); - mp->b_prev = (void *)((uintptr_t)(cmd | CMD_IN_PROGRESS)); - mp->b_next = NULL; - arl->arl_queue = mp; - arl->arl_queue_tail = mp; - } else if (tail_insert) { - mp->b_prev = (void *)((uintptr_t)cmd); - mp->b_next = NULL; - arl->arl_queue_tail->b_next = mp; - arl->arl_queue_tail = mp; - } else { - /* head insert */ - mp->b_prev = (void *)((uintptr_t)cmd | CMD_IN_PROGRESS); - mp->b_next = arl->arl_queue; - arl->arl_queue = mp; - } -} - -static mblk_t * -ar_cmd_dequeue(arl_t *arl) -{ - mblk_t *mp; - - if (arl->arl_queue == NULL) { - ASSERT(arl->arl_queue_tail == NULL); - return (NULL); - } - mp = arl->arl_queue; - arl->arl_queue = mp->b_next; - if (arl->arl_queue == NULL) - arl->arl_queue_tail = NULL; - mp->b_next = NULL; - return (mp); -} - -/* - * Standard ACE timer handling: compute 'fuzz' around a central value or from 0 - * up to a value, and then set the timer. The randomization is necessary to - * prevent groups of systems from falling into synchronization on the network - * and producing ARP packet storms. - */ -static void -ace_set_timer(ace_t *ace, boolean_t initial_time) -{ - clock_t intv, rnd, frac; - - (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); - /* Note that clock_t is signed; must chop off bits */ - rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; - intv = ace->ace_xmit_interval; - if (initial_time) { - /* Set intv to be anywhere in the [1 .. intv] range */ - if (intv <= 0) - intv = 1; - else - intv = (rnd % intv) + 1; - } else { - /* Compute 'frac' as 20% of the configured interval */ - if ((frac = intv / 5) <= 1) - frac = 2; - /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ - if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) - intv = 1; - } - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, intv); -} - -/* - * Process entry add requests from external messages. - * It is also called by ip_rput_dlpi_writer() through - * ipif_resolver_up() to change hardware address when - * an asynchronous hardware address change notification - * arrives from the driver. - */ -static int -ar_entry_add(queue_t *q, mblk_t *mp_orig) -{ - area_t *area; - ace_t *ace; - uchar_t *hw_addr; - uint32_t hw_addr_len; - uchar_t *proto_addr; - uint32_t proto_addr_len; - uchar_t *proto_mask; - arl_t *arl; - mblk_t *mp = mp_orig; - int err; - uint_t aflags; - boolean_t unverified; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - area = (area_t *)mp->b_rptr; - aflags = area->area_flags; - - /* - * If the previous entry wasn't published and we are now going - * to publish, then we need to do address verification. The previous - * entry may have been a local unpublished address or even an external - * address. If the entry we find was in an unverified state we retain - * this. - * If it's a new published entry, then we're obligated to do - * duplicate address detection now. 
- */ - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_entry); - if (ace != NULL) { - unverified = !(ace->ace_flags & ACE_F_PUBLISH) && - (aflags & ACE_F_PUBLISH); - if (ace->ace_flags & ACE_F_UNVERIFIED) - unverified = B_TRUE; - ar_ce_delete(ace); - } else { - unverified = (aflags & ACE_F_PUBLISH) != 0; - } - - /* Allow client to request DAD restart */ - if (aflags & ACE_F_UNVERIFIED) - unverified = B_TRUE; - - /* Extract parameters from the message. */ - hw_addr_len = area->area_hw_addr_length; - hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); - proto_addr_len = area->area_proto_addr_length; - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - proto_addr_len); - proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset, - proto_addr_len); - if (proto_mask == NULL) { - DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area); - return (EINVAL); - } - err = ar_ce_create( - arl, - area->area_proto, - hw_addr, - hw_addr_len, - proto_addr, - proto_addr_len, - proto_mask, - NULL, - (uint32_t)0, - NULL, - aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); - if (err != 0) { - DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, - int, err); - return (err); - } - - if (aflags & ACE_F_PUBLISH) { - arlphy_t *ap; - - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, - proto_addr_len); - ASSERT(ace != NULL); - - ap = ace->ace_xmit_arl->arl_phy; - - if (hw_addr == NULL || hw_addr_len == 0) { - hw_addr = ap->ap_hw_addr; - } else if (aflags & ACE_F_MYADDR) { - /* - * If hardware address changes, then make sure - * that the hardware address and hardware - * address length fields in arlphy_t get updated - * too. Otherwise, they will continue carrying - * the old hardware address information. 
- */ - ASSERT((hw_addr != NULL) && (hw_addr_len != 0)); - bcopy(hw_addr, ap->ap_hw_addr, hw_addr_len); - ap->ap_hw_addrlen = hw_addr_len; - } - - if (ace->ace_flags & ACE_F_FAST) { - ace->ace_xmit_count = as->as_fastprobe_count; - ace->ace_xmit_interval = as->as_fastprobe_delay; - } else { - ace->ace_xmit_count = as->as_probe_count; - ace->ace_xmit_interval = as->as_probe_delay; - } - - /* - * If the user has disabled duplicate address detection for - * this kind of interface (fast or slow) by setting the probe - * count to zero, then pretend as if we've verified the - * address, and go right to address defense mode. - */ - if (ace->ace_xmit_count == 0) - unverified = B_FALSE; - - /* - * If we need to do duplicate address detection, then kick that - * off. Otherwise, send out a gratuitous ARP message in order - * to update everyone's caches with the new hardware address. - */ - if (unverified) { - ace->ace_flags |= ACE_F_UNVERIFIED; - if (ace->ace_xmit_interval == 0) { - /* - * User has configured us to send the first - * probe right away. Do so, and set up for - * the subsequent probes. - */ - DTRACE_PROBE2(eadd_probe, ace_t *, ace, - area_t *, area); - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - area->area_proto, proto_addr_len, - hw_addr, NULL, NULL, proto_addr, NULL, as); - ace->ace_xmit_count--; - ace->ace_xmit_interval = - (ace->ace_flags & ACE_F_FAST) ? - as->as_fastprobe_interval : - as->as_probe_interval; - ace_set_timer(ace, B_FALSE); - } else { - DTRACE_PROBE2(eadd_delay, ace_t *, ace, - area_t *, area); - /* Regular delay before initial probe */ - ace_set_timer(ace, B_TRUE); - } - } else { - DTRACE_PROBE2(eadd_announce, ace_t *, ace, - area_t *, area); - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - area->area_proto, proto_addr_len, hw_addr, - proto_addr, ap->ap_arp_addr, proto_addr, NULL, as); - ace->ace_last_bcast = ddi_get_lbolt(); - - /* - * If AUTHORITY is set, it is not just a proxy arp - * entry; we believe we're the authority for this - * entry. 
In that case, and if we're not just doing - * one-off defense of the address, we send more than - * one copy, so we'll still have a good chance of - * updating everyone even when there's a packet loss - * or two. - */ - if ((aflags & ACE_F_AUTHORITY) && - !(aflags & ACE_F_DEFEND) && - as->as_publish_count > 0) { - /* Account for the xmit we just did */ - ace->ace_xmit_count = as->as_publish_count - 1; - ace->ace_xmit_interval = - as->as_publish_interval; - if (ace->ace_xmit_count > 0) - ace_set_timer(ace, B_FALSE); - } - } - } - return (0); -} - -/* Process entry delete requests from external messages. */ -static int -ar_entry_delete(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - arl_t *arl; - mblk_t *mp = mp_orig; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - /* - * Need to know if it is a mapping or an exact match. Check exact - * match first. - */ - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup); - if (ace != NULL) { - ared_t *ared = (ared_t *)mp->b_rptr; - - /* - * If it's a permanent entry, then the client is the one who - * told us to delete it, so there's no reason to notify. - */ - if (ACE_NONPERM(ace)) - ar_delete_notify(ace); - /* - * Only delete the ARP entry if it is non-permanent, or - * ARED_F_PRESERVE_PERM flags is not set. - */ - if (ACE_NONPERM(ace) || - !(ared->ared_flags & ARED_F_PRESERVE_PERM)) { - ar_ce_delete(ace); - } - return (0); - } - return (ENXIO); -} - -/* - * Process entry query requests from external messages. 
- * Bump up the ire_stats_freed for all errors except - * EINPROGRESS - which means the packet has been queued. - * For all other errors the packet is going to be freed - * and hence we account for ire being freed if it - * is a M_PROTO message. - */ -static int -ar_entry_query(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - areq_t *areq; - arl_t *arl; - int err; - mblk_t *mp = mp_orig; - uchar_t *proto_addr; - uchar_t *sender_addr; - uint32_t proto_addr_len; - clock_t ms; - boolean_t is_mproto = B_TRUE; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) { - is_mproto = B_FALSE; - mp = mp->b_cont; - } - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp); - err = EINVAL; - goto err_ret; - } - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - areq = (areq_t *)mp->b_rptr; - proto_addr_len = areq->areq_target_addr_length; - proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset, - proto_addr_len); - if (proto_addr == NULL) { - DTRACE_PROBE1(query_illegal_address, areq_t *, areq); - err = EINVAL; - goto err_ret; - } - /* Stash the reply queue pointer for later use. */ - mp->b_prev = (mblk_t *)OTHERQ(q); - mp->b_next = NULL; - if (areq->areq_xmit_interval == 0) - areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL; - ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len); - if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) { - /* - * This is a potentially stale entry that IP's asking about. - * Since IP is asking, it must not have an answer anymore, - * either due to periodic ARP flush or due to SO_DONTROUTE. 
- * Rather than go forward with what we've got, restart - * resolution. - */ - DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq); - ar_ce_delete(ace); - ace = NULL; - } - if (ace != NULL) { - mblk_t **mpp; - uint32_t count = 0; - - /* - * There is already a cache entry. This means there is either - * a permanent entry, or address resolution is in progress. - * If the latter, there should be one or more queries queued - * up. We link the current one in at the end, if there aren't - * too many outstanding. - */ - for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) { - if (++count > areq->areq_max_buffered) { - DTRACE_PROBE2(query_overflow, ace_t *, ace, - areq_t *, areq); - mp->b_prev = NULL; - err = EALREADY; - goto err_ret; - } - } - /* Put us on the list. */ - mpp[0] = mp; - if (count != 0) { - /* - * If a query was already queued up, then we must not - * have an answer yet. - */ - DTRACE_PROBE2(query_in_progress, ace_t *, ace, - areq_t *, areq); - return (EINPROGRESS); - } - if (ACE_RESOLVED(ace)) { - /* - * We have an answer already. - * Keep a dup of mp since proto_addr points to it - * and mp has been placed on the ace_query_mp list. - */ - mblk_t *mp1; - - DTRACE_PROBE2(query_resolved, ace_t *, ace, - areq_t *, areq); - mp1 = dupmsg(mp); - ar_query_reply(ace, 0, proto_addr, proto_addr_len); - freemsg(mp1); - return (EINPROGRESS); - } - if (ace->ace_flags & ACE_F_MAPPING) { - /* Should never happen */ - DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace, - areq_t *, areq); - mpp[0] = mp->b_next; - err = ENXIO; - goto err_ret; - } - DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); - } else { - /* No ace yet. Make one now. (This is the common case.) */ - if (areq->areq_xmit_count == 0) { - DTRACE_PROBE2(query_template, arl_t *, arl, - areq_t *, areq); - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } - /* - * Check for sender addr being NULL or not before - * we create the ace. It is easy to cleanup later. 
- */ - sender_addr = mi_offset_paramc(mp, - areq->areq_sender_addr_offset, - areq->areq_sender_addr_length); - if (sender_addr == NULL) { - DTRACE_PROBE2(query_no_sender, arl_t *, arl, - areq_t *, areq); - mp->b_prev = NULL; - err = EINVAL; - goto err_ret; - } - err = ar_ce_create(OWNING_ARL(arl), areq->areq_proto, NULL, 0, - proto_addr, proto_addr_len, NULL, - NULL, (uint32_t)0, sender_addr, - areq->areq_flags); - if (err != 0) { - DTRACE_PROBE3(query_create_failed, arl_t *, arl, - areq_t *, areq, int, err); - mp->b_prev = NULL; - goto err_ret; - } - ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, - proto_addr_len); - if (ace == NULL || ace->ace_query_mp != NULL) { - /* Shouldn't happen! */ - DTRACE_PROBE3(query_lookup_failed, arl_t *, arl, - areq_t *, areq, ace_t *, ace); - mp->b_prev = NULL; - err = ENXIO; - goto err_ret; - } - ace->ace_query_mp = mp; - } - ms = ar_query_xmit(as, ace); - if (ms == 0) { - /* Immediate reply requested. */ - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - } else { - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, ms); - } - return (EINPROGRESS); -err_ret: - if (is_mproto) { - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); - } - return (err); -} - -/* Handle simple query requests. */ -static int -ar_entry_squery(queue_t *q, mblk_t *mp_orig) -{ - ace_t *ace; - area_t *area; - arl_t *arl; - uchar_t *hw_addr; - uint32_t hw_addr_len; - mblk_t *mp = mp_orig; - uchar_t *proto_addr; - int proto_addr_len; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. 
- */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - /* Extract parameters from the request message. */ - area = (area_t *)mp->b_rptr; - proto_addr_len = area->area_proto_addr_length; - proto_addr = mi_offset_paramc(mp, area->area_proto_addr_offset, - proto_addr_len); - hw_addr_len = area->area_hw_addr_length; - hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); - if (proto_addr == NULL || hw_addr == NULL) { - DTRACE_PROBE1(squery_illegal_address, area_t *, area); - return (EINVAL); - } - ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len); - if (ace == NULL) { - return (ENXIO); - } - if (hw_addr_len < ace->ace_hw_addr_length) { - return (EINVAL); - } - if (ACE_RESOLVED(ace)) { - /* Got it, prepare the response. */ - ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length); - ar_set_address(ace, hw_addr, proto_addr, proto_addr_len); - } else { - /* - * We have an incomplete entry. Set the length to zero and - * just return out the flags. - */ - area->area_hw_addr_length = 0; - } - area->area_flags = ace->ace_flags; - if (mp == mp_orig) { - /* Non-ioctl case */ - /* TODO: change message type? */ - DB_TYPE(mp) = M_CTL; /* Caught by ip_wput */ - DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - qreply(q, mp); - return (EINPROGRESS); - } - return (0); -} - -/* Process an interface down causing us to detach and unbind. */ -/* ARGSUSED */ -static int -ar_interface_down(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL || arl->arl_closing) { - DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - /* - * Newly received commands from clients go to the tail of the queue. 
- */ - if (CMD_NEEDS_QUEUEING(mp, arl)) { - DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE); - return (EINPROGRESS); - } - mp->b_prev = NULL; - /* - * The arl is already down, no work to do. - */ - if (arl->arl_state == ARL_S_DOWN) { - if (arl->arl_replumbing) { - /* - * The arl is already down and this is a result of - * the DL_NOTE_REPLUMB process. Return EINPROGRESS - * so this mp won't be freed by ar_rput(). - */ - arp_replumb_done(arl, mp); - return (EINPROGRESS); - } else { - /* ar_rput frees the mp */ - return (0); - } - } - - /* - * This command cannot complete in a single shot now itself. - * It has to be restarted after the receipt of the ack from - * the driver. So we need to enqueue the command (at the head). - */ - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_FALSE); - - ASSERT(arl->arl_state == ARL_S_UP); - - /* Free all arp entries for this interface */ - ar_ce_walk(as, ar_ce_delete_per_arl, arl); - - ar_ll_down(arl); - /* Return EINPROGRESS so that ar_rput does not free the 'mp' */ - return (EINPROGRESS); -} - - -/* Process an interface up causing the info req sequence to start. */ -/* ARGSUSED */ -static int -ar_interface_up(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - int err; - mblk_t *mp1; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL || arl->arl_closing) { - DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp); - err = EINVAL; - goto done; - } - - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp, arl)) { - DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp, - arl_t *, arl); - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE); - return (EINPROGRESS); - } - mp->b_prev = NULL; - - /* - * The arl is already up. No work to do. 
- */ - if (arl->arl_state == ARL_S_UP) { - err = 0; - goto done; - } - - /* - * This command cannot complete in a single shot now itself. - * It has to be restarted after the receipt of the ack from - * the driver. So we need to enqueue the command (at the head). - */ - ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_FALSE); - - err = ar_ll_up(arl); - - /* Return EINPROGRESS so that ar_rput does not free the 'mp' */ - return (EINPROGRESS); - -done: - /* caller frees 'mp' */ - - mp1 = ar_alloc(AR_DLPIOP_DONE, err); - if (mp1 != NULL) { - q = WR(q); - DTRACE_PROBE3(up_send_err, queue_t *, q, mblk_t *, mp1, - int, err); - putnext(q, mp1); - } - return (err); -} - -/* - * Given an arie_t `mp', find the arl_t's that it names and return them - * in `*arlp' and `*ipmp_arlp'. If they cannot be found, return B_FALSE. - */ -static boolean_t -ar_ipmp_lookup(arp_stack_t *as, mblk_t *mp, arl_t **arlp, arl_t **ipmp_arlp) -{ - arie_t *arie = (arie_t *)mp->b_rptr; - - *arlp = ar_ll_lookup_from_mp(as, mp); - if (*arlp == NULL) { - DTRACE_PROBE1(ipmp_lookup_no_arl, mblk_t *, mp); - return (B_FALSE); - } - - arie->arie_grifname[LIFNAMSIZ - 1] = '\0'; - *ipmp_arlp = ar_ll_lookup_by_name(as, arie->arie_grifname); - if (*ipmp_arlp == NULL) { - DTRACE_PROBE1(ipmp_lookup_no_ipmp_arl, mblk_t *, mp); - return (B_FALSE); - } - - DTRACE_PROBE2(ipmp_lookup, arl_t *, *arlp, arl_t *, *ipmp_arlp); - return (B_TRUE); -} - -/* - * Bind an arl_t to an IPMP group arl_t. 
- */ -static int -ar_ipmp_activate(queue_t *q, mblk_t *mp) -{ - arl_t *arl, *ipmp_arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) - return (EINVAL); - - if (arl->arl_ipmp_arl != NULL) { - DTRACE_PROBE1(ipmp_activated_already, arl_t *, arl); - return (EALREADY); - } - - DTRACE_PROBE2(ipmp_activate, arl_t *, arl, arl_t *, ipmp_arl); - arl->arl_ipmp_arl = ipmp_arl; - return (0); -} - -/* - * Unbind an arl_t from an IPMP group arl_t and update the ace_t's so - * that it is no longer part of the group. - */ -static int -ar_ipmp_deactivate(queue_t *q, mblk_t *mp) -{ - arl_t *arl, *ipmp_arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (!ar_ipmp_lookup(as, mp, &arl, &ipmp_arl)) - return (EINVAL); - - if (ipmp_arl != arl->arl_ipmp_arl) { - DTRACE_PROBE2(ipmp_deactivate_notactive, arl_t *, arl, arl_t *, - ipmp_arl); - return (EINVAL); - } - - DTRACE_PROBE2(ipmp_deactivate, arl_t *, arl, arl_t *, - arl->arl_ipmp_arl); - ar_ce_walk(as, ar_ce_ipmp_deactivate, arl); - arl->arl_ipmp_arl = NULL; - return (0); -} - -/* - * Enable an interface to process ARP_REQUEST and ARP_RESPONSE messages. 
- */ -/* ARGSUSED */ -static int -ar_interface_on(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); - arl->arl_flags &= ~ARL_F_NOARP; - return (0); -} - -/* - * Disable an interface from processing - * ARP_REQUEST and ARP_RESPONSE messages - */ -/* ARGSUSED */ -static int -ar_interface_off(queue_t *q, mblk_t *mp) -{ - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) { - DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp); - return (EINVAL); - } - - DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); - arl->arl_flags |= ARL_F_NOARP; - return (0); -} - -/* - * The queue 'q' is closing. Walk all the arl's and free any message - * pending in the arl_queue if it originated from the closing q. - * Also cleanup the ip_pending_queue, if the arp-IP stream is closing. - */ -static void -ar_ll_cleanup_arl_queue(queue_t *q) -{ - arl_t *arl; - mblk_t *mp; - mblk_t *mpnext; - mblk_t *prev; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - for (prev = NULL, mp = arl->arl_queue; mp != NULL; - mp = mpnext) { - mpnext = mp->b_next; - if ((void *)mp->b_queue == (void *)q || - (void *)mp->b_queue == (void *)OTHERQ(q)) { - if (prev == NULL) - arl->arl_queue = mp->b_next; - else - prev->b_next = mp->b_next; - if (arl->arl_queue_tail == mp) - arl->arl_queue_tail = prev; - if (DB_TYPE(mp) == M_PROTO && - *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - } - inet_freemsg(mp); - } else { - prev = mp; - } - } - } -} - -/* - * Look up a lower level tap by name. 
- */ -static arl_t * -ar_ll_lookup_by_name(arp_stack_t *as, const char *name) -{ - arl_t *arl; - - for (arl = as->as_arl_head; arl; arl = arl->arl_next) { - if (strcmp(arl->arl_name, name) == 0) { - return (arl); - } - } - return (NULL); -} - -/* - * Look up a lower level tap using parameters extracted from the common - * portion of the ARP command. - */ -static arl_t * -ar_ll_lookup_from_mp(arp_stack_t *as, mblk_t *mp) -{ - arc_t *arc = (arc_t *)mp->b_rptr; - uint8_t *name; - size_t namelen = arc->arc_name_length; - - name = mi_offset_param(mp, arc->arc_name_offset, namelen); - if (name == NULL || name[namelen - 1] != '\0') - return (NULL); - return (ar_ll_lookup_by_name(as, (char *)name)); -} - -static void -ar_ll_init(arp_stack_t *as, ar_t *ar, mblk_t *mp) -{ - arl_t *arl; - dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; - - ASSERT(ar->ar_arl == NULL); - - if ((arl = (arl_t *)mi_zalloc(sizeof (arl_t))) == NULL) - return; - - if (dlia->dl_mac_type == SUNW_DL_IPMP) { - arl->arl_flags |= ARL_F_IPMP; - arl->arl_ipmp_arl = arl; - } - - arl->arl_provider_style = dlia->dl_provider_style; - arl->arl_rq = ar->ar_rq; - arl->arl_wq = ar->ar_wq; - - arl->arl_dlpi_pending = DL_PRIM_INVAL; - - ar->ar_arl = arl; - - /* - * If/when ARP gets pushed into the IP module then this code to make - * a number uniquely identify an ARP instance can be removed and the - * ifindex from IP used. Rather than try and reinvent or copy the - * code used by IP for the purpose of allocating an index number - * (and trying to keep the number small), just allocate it in an - * ever increasing manner. This index number isn't ever exposed to - * users directly, its only use is for providing the pfhooks interface - * with a number it can use to uniquely identify an interface in time. - * - * Using a 32bit counter, over 136 plumbs would need to be done every - * second of every day (non-leap year) for it to wrap around and the - * for() loop below to kick in as a performance concern. 
- */ - if (as->as_arp_counter_wrapped) { - arl_t *arl1; - - do { - for (arl1 = as->as_arl_head; arl1 != NULL; - arl1 = arl1->arl_next) - if (arl1->arl_index == - as->as_arp_index_counter) { - as->as_arp_index_counter++; - if (as->as_arp_index_counter == 0) { - as->as_arp_counter_wrapped++; - as->as_arp_index_counter = 1; - } - break; - } - } while (arl1 != NULL); - } else { - arl->arl_index = as->as_arp_index_counter; - } - as->as_arp_index_counter++; - if (as->as_arp_index_counter == 0) { - as->as_arp_counter_wrapped++; - as->as_arp_index_counter = 1; - } -} - -/* - * This routine is called during module initialization when the DL_INFO_ACK - * comes back from the device. We set up defaults for all the device dependent - * doo-dads we are going to need. This will leave us ready to roll if we are - * attempting auto-configuration. Alternatively, these defaults can be - * overridden by initialization procedures possessing higher intelligence. - */ -static void -ar_ll_set_defaults(arl_t *arl, mblk_t *mp) -{ - ar_m_t *arm; - dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; - dl_unitdata_req_t *dlur; - uchar_t *up; - arlphy_t *ap; - - ASSERT(arl != NULL); - - /* - * Clear any stale defaults that might exist. - */ - ar_ll_clear_defaults(arl); - - if (arl->arl_flags & ARL_F_IPMP) { - /* - * If this is an IPMP arl_t, we have nothing to do, - * since we will never transmit or receive. - */ - return; - } - - ap = kmem_zalloc(sizeof (arlphy_t), KM_NOSLEEP); - if (ap == NULL) - goto bad; - arl->arl_phy = ap; - - if ((arm = ar_m_lookup(dlia->dl_mac_type)) == NULL) - arm = ar_m_lookup(DL_OTHER); - ASSERT(arm != NULL); - - /* - * We initialize based on parameters in the (currently) not too - * exhaustive ar_m_tbl. - */ - if (dlia->dl_version == DL_VERSION_2) { - /* XXX DLPI spec allows dl_sap_length of 0 before binding. 
*/ - ap->ap_saplen = dlia->dl_sap_length; - ap->ap_hw_addrlen = dlia->dl_brdcst_addr_length; - } else { - ap->ap_saplen = arm->ar_mac_sap_length; - ap->ap_hw_addrlen = arm->ar_mac_hw_addr_length; - } - ap->ap_arp_hw_type = arm->ar_mac_arp_hw_type; - - /* - * Allocate the hardware and ARP addresses; note that the hardware - * address cannot be filled in until we see the DL_BIND_ACK. - */ - ap->ap_hw_addr = kmem_zalloc(ap->ap_hw_addrlen, KM_NOSLEEP); - ap->ap_arp_addr = kmem_alloc(ap->ap_hw_addrlen, KM_NOSLEEP); - if (ap->ap_hw_addr == NULL || ap->ap_arp_addr == NULL) - goto bad; - - if (dlia->dl_version == DL_VERSION_2) { - if ((up = mi_offset_param(mp, dlia->dl_brdcst_addr_offset, - ap->ap_hw_addrlen)) == NULL) - goto bad; - bcopy(up, ap->ap_arp_addr, ap->ap_hw_addrlen); - } else { - /* - * No choice but to assume a broadcast address of all ones, - * known to work on some popular networks. - */ - (void) memset(ap->ap_arp_addr, ~0, ap->ap_hw_addrlen); - } - - /* - * Make us a template DL_UNITDATA_REQ message which we will use for - * broadcasting resolution requests, and which we will clone to hand - * back as responses to the protocols. 
- */ - ap->ap_xmit_mp = ar_dlpi_comm(DL_UNITDATA_REQ, ap->ap_hw_addrlen + - ABS(ap->ap_saplen) + sizeof (dl_unitdata_req_t)); - if (ap->ap_xmit_mp == NULL) - goto bad; - - dlur = (dl_unitdata_req_t *)ap->ap_xmit_mp->b_rptr; - dlur->dl_priority.dl_min = 0; - dlur->dl_priority.dl_max = 0; - dlur->dl_dest_addr_length = ap->ap_hw_addrlen + ABS(ap->ap_saplen); - dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); - - /* NOTE: the destination address and sap offsets are permanently set */ - ap->ap_xmit_sapoff = dlur->dl_dest_addr_offset; - ap->ap_xmit_addroff = dlur->dl_dest_addr_offset; - if (ap->ap_saplen < 0) - ap->ap_xmit_sapoff += ap->ap_hw_addrlen; /* sap last */ - else - ap->ap_xmit_addroff += ap->ap_saplen; /* addr last */ - - *(uint16_t *)((caddr_t)dlur + ap->ap_xmit_sapoff) = ETHERTYPE_ARP; - return; -bad: - ar_ll_clear_defaults(arl); -} - -static void -ar_ll_clear_defaults(arl_t *arl) -{ - arlphy_t *ap = arl->arl_phy; - - if (ap != NULL) { - arl->arl_phy = NULL; - if (ap->ap_hw_addr != NULL) - kmem_free(ap->ap_hw_addr, ap->ap_hw_addrlen); - if (ap->ap_arp_addr != NULL) - kmem_free(ap->ap_arp_addr, ap->ap_hw_addrlen); - freemsg(ap->ap_xmit_mp); - kmem_free(ap, sizeof (arlphy_t)); - } -} - -static void -ar_ll_down(arl_t *arl) -{ - mblk_t *mp; - ar_t *ar; - - ASSERT(arl->arl_state == ARL_S_UP); - - /* Let's break the association between an ARL and IP instance */ - ar = (ar_t *)arl->arl_rq->q_ptr; - if (ar->ar_arl_ip_assoc != NULL) { - ASSERT(ar->ar_arl_ip_assoc->ar_arl_ip_assoc != NULL && - ar->ar_arl_ip_assoc->ar_arl_ip_assoc == ar); - ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL; - ar->ar_arl_ip_assoc = NULL; - } - - arl->arl_state = ARL_S_PENDING; - - mp = arl->arl_unbind_mp; - ASSERT(mp != NULL); - ar_dlpi_send(arl, mp); - arl->arl_unbind_mp = NULL; - - if (arl->arl_provider_style == DL_STYLE2) { - mp = arl->arl_detach_mp; - ASSERT(mp != NULL); - ar_dlpi_send(arl, mp); - arl->arl_detach_mp = NULL; - } -} - -static int -ar_ll_up(arl_t *arl) -{ - mblk_t 
*attach_mp = NULL; - mblk_t *bind_mp = NULL; - mblk_t *detach_mp = NULL; - mblk_t *unbind_mp = NULL; - mblk_t *info_mp = NULL; - mblk_t *notify_mp = NULL; - - ASSERT(arl->arl_state == ARL_S_DOWN); - - if (arl->arl_provider_style == DL_STYLE2) { - attach_mp = - ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t)); - if (attach_mp == NULL) - goto bad; - ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = - arl->arl_ppa; - - detach_mp = - ar_dlpi_comm(DL_DETACH_REQ, sizeof (dl_detach_req_t)); - if (detach_mp == NULL) - goto bad; - } - - info_mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); - if (info_mp == NULL) - goto bad; - - /* Allocate and initialize a bind message. */ - bind_mp = ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t)); - if (bind_mp == NULL) - goto bad; - ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP; - ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; - - unbind_mp = ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t)); - if (unbind_mp == NULL) - goto bad; - - notify_mp = ar_dlpi_comm(DL_NOTIFY_REQ, sizeof (dl_notify_req_t)); - if (notify_mp == NULL) - goto bad; - ((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications = - DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN | DL_NOTE_REPLUMB; - - arl->arl_state = ARL_S_PENDING; - if (arl->arl_provider_style == DL_STYLE2) { - ar_dlpi_send(arl, attach_mp); - ASSERT(detach_mp != NULL); - arl->arl_detach_mp = detach_mp; - } - ar_dlpi_send(arl, info_mp); - ar_dlpi_send(arl, bind_mp); - arl->arl_unbind_mp = unbind_mp; - ar_dlpi_send(arl, notify_mp); - return (0); - -bad: - freemsg(attach_mp); - freemsg(bind_mp); - freemsg(detach_mp); - freemsg(unbind_mp); - freemsg(info_mp); - freemsg(notify_mp); - return (ENOMEM); -} - -/* Process mapping add requests from external messages. 
*/ -static int -ar_mapping_add(queue_t *q, mblk_t *mp_orig) -{ - arma_t *arma; - mblk_t *mp = mp_orig; - ace_t *ace; - uchar_t *hw_addr; - uint32_t hw_addr_len; - uchar_t *proto_addr; - uint32_t proto_addr_len; - uchar_t *proto_mask; - uchar_t *proto_extract_mask; - uint32_t hw_extract_start; - arl_t *arl; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(as, mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - DTRACE_PROBE3(madd_enqueued, queue_t *, q, mblk_t *, mp_orig, - arl_t *, arl); - ar_cmd_enqueue(arl, mp_orig, q, AR_MAPPING_ADD, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - arma = (arma_t *)mp->b_rptr; - ace = ar_ce_lookup_from_area(as, mp, ar_ce_lookup_mapping); - if (ace != NULL) - ar_ce_delete(ace); - hw_addr_len = arma->arma_hw_addr_length; - hw_addr = mi_offset_paramc(mp, arma->arma_hw_addr_offset, hw_addr_len); - proto_addr_len = arma->arma_proto_addr_length; - proto_addr = mi_offset_paramc(mp, arma->arma_proto_addr_offset, - proto_addr_len); - proto_mask = mi_offset_paramc(mp, arma->arma_proto_mask_offset, - proto_addr_len); - proto_extract_mask = mi_offset_paramc(mp, - arma->arma_proto_extract_mask_offset, proto_addr_len); - hw_extract_start = arma->arma_hw_mapping_start; - if (proto_mask == NULL || proto_extract_mask == NULL) { - DTRACE_PROBE2(madd_illegal_mask, arl_t *, arl, arpa_t *, arma); - return (EINVAL); - } - return (ar_ce_create( - arl, - arma->arma_proto, - hw_addr, - hw_addr_len, - proto_addr, - proto_addr_len, - proto_mask, - proto_extract_mask, - hw_extract_start, - NULL, - arma->arma_flags | ACE_F_MAPPING)); -} - -static boolean_t -ar_mask_all_ones(uchar_t *mask, uint32_t mask_len) -{ - if (mask == NULL) - return (B_TRUE); - - while (mask_len-- > 0) { - if (*mask++ != 0xFF) 
{ - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* Find an entry for a particular MAC type in the ar_m_tbl. */ -static ar_m_t * -ar_m_lookup(t_uscalar_t mac_type) -{ - ar_m_t *arm; - - for (arm = ar_m_tbl; arm < A_END(ar_m_tbl); arm++) { - if (arm->ar_mac_type == mac_type) - return (arm); - } - return (NULL); -} - -/* Respond to Named Dispatch requests. */ -static int -ar_nd_ioctl(queue_t *q, mblk_t *mp) -{ - ar_t *ar = (ar_t *)q->q_ptr; - arp_stack_t *as = ar->ar_as; - - if (DB_TYPE(mp) == M_IOCTL && nd_getset(q, as->as_nd, mp)) - return (0); - return (ENOENT); -} - -/* ARP module open routine. */ -static int -ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) -{ - ar_t *ar; - int err; - queue_t *tmp_q; - mblk_t *mp; - netstack_t *ns; - arp_stack_t *as; - - TRACE_1(TR_FAC_ARP, TR_ARP_OPEN, - "arp_open: q %p", q); - /* Allow a reopen. */ - if (q->q_ptr != NULL) { - return (0); - } - - ns = netstack_find_by_cred(credp); - ASSERT(ns != NULL); - as = ns->netstack_arp; - ASSERT(as != NULL); - - /* mi_open_comm allocates the instance data structure, etc. */ - err = mi_open_comm(&as->as_head, sizeof (ar_t), q, devp, flag, sflag, - credp); - if (err) { - netstack_rele(as->as_netstack); - return (err); - } - - /* - * We are D_MTPERMOD so it is safe to do qprocson before - * the instance data has been initialized. - */ - qprocson(q); - - ar = (ar_t *)q->q_ptr; - ar->ar_rq = q; - q = WR(q); - ar->ar_wq = q; - crhold(credp); - ar->ar_credp = credp; - ar->ar_as = as; - - /* - * Probe for the DLPI info if we are not pushed on IP or UDP. Wait for - * the reply. In case of error call ar_close() which will take - * care of doing everything required to close this instance, such - * as freeing the arl, restarting the timer on a different queue etc. - */ - if (strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0 || - strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "udp") == 0) { - arc_t *arc; - - /* - * We are pushed directly on top of IP or UDP. 
There is no need - * to send down a DL_INFO_REQ. Return success. This could - * either be an ill stream (i.e. <arp-IP-Driver> stream) - * or a stream corresponding to an open of /dev/arp - * (i.e. <arp-IP> stream). Note that we don't support - * pushing some module in between arp and IP. - * - * Tell IP, though, that we're an extended implementation, so - * it knows to expect a DAD response after bringing an - * interface up. Old ATM drivers won't do this, and IP will - * just bring the interface up immediately. - */ - ar->ar_on_ill_stream = (q->q_next->q_next != NULL); - if (!ar->ar_on_ill_stream || arp_no_defense) - return (0); - mp = allocb(sizeof (arc_t), BPRI_MED); - if (mp == NULL) { - (void) ar_close(RD(q)); - return (ENOMEM); - } - DB_TYPE(mp) = M_CTL; - arc = (arc_t *)mp->b_rptr; - mp->b_wptr = mp->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_EXTEND; - putnext(q, mp); - return (0); - } - tmp_q = q; - /* Get the driver's queue */ - while (tmp_q->q_next != NULL) - tmp_q = tmp_q->q_next; - - ASSERT(tmp_q->q_qinfo->qi_minfo != NULL); - - if (strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "ip") == 0 || - strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "udp") == 0) { - /* - * We don't support pushing ARP arbitrarily on an IP or UDP - * driver stream. ARP has to be pushed directly above IP or - * UDP. - */ - (void) ar_close(RD(q)); - return (ENOTSUP); - } else { - /* - * Send down a DL_INFO_REQ so we can find out what we are - * talking to. - */ - mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); - if (mp == NULL) { - (void) ar_close(RD(q)); - return (ENOMEM); - } - putnext(ar->ar_wq, mp); - while (ar->ar_arl == NULL) { - if (!qwait_sig(ar->ar_rq)) { - (void) ar_close(RD(q)); - return (EINTR); - } - } - } - return (0); -} - -/* Get current value of Named Dispatch item. 
*/ -/* ARGSUSED */ -static int -ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) -{ - arpparam_t *arppa = (arpparam_t *)cp; - - (void) mi_mpprintf(mp, "%d", arppa->arp_param_value); - return (0); -} - -/* - * Walk through the param array specified registering each element with the - * named dispatch handler. - */ -static boolean_t -ar_param_register(IDP *ndp, arpparam_t *arppa, int cnt) -{ - for (; cnt-- > 0; arppa++) { - if (arppa->arp_param_name && arppa->arp_param_name[0]) { - if (!nd_load(ndp, arppa->arp_param_name, - ar_param_get, ar_param_set, - (caddr_t)arppa)) { - nd_free(ndp); - return (B_FALSE); - } - } - } - return (B_TRUE); -} - -/* Set new value of Named Dispatch item. */ -/* ARGSUSED */ -static int -ar_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) -{ - long new_value; - arpparam_t *arppa = (arpparam_t *)cp; - - if (ddi_strtol(value, NULL, 10, &new_value) != 0 || - new_value < arppa->arp_param_min || - new_value > arppa->arp_param_max) { - return (EINVAL); - } - arppa->arp_param_value = new_value; - return (0); -} - -/* - * Process an I_PLINK ioctl. If the lower stream is an arp device stream, - * append another mblk to the chain, that will carry the device name, - * and the muxid. IP uses this info to lookup the corresponding ill, and - * set the ill_arp_muxid atomically, as part of the I_PLINK, instead of - * waiting for the SIOCSLIFMUXID. (which may never happen if ifconfig is - * killed, and this has the bad effect of not being able to unplumb - * subsequently) - */ -static int -ar_plink_send(queue_t *q, mblk_t *mp) -{ - char *name; - mblk_t *muxmp; - mblk_t *mp1; - ar_t *ar = (ar_t *)q->q_ptr; - arp_stack_t *as = ar->ar_as; - struct linkblk *li; - struct ipmx_s *ipmxp; - queue_t *arpwq; - - mp1 = mp->b_cont; - ASSERT((mp1 != NULL) && (mp1->b_cont == NULL)); - li = (struct linkblk *)mp1->b_rptr; - arpwq = li->l_qbot; - - /* - * Allocate a new mblk which will hold an ipmx_s and chain it to - * the M_IOCTL chain. 
The final chain will consist of 3 mblks, - * namely the M_IOCTL, followed by the linkblk, followed by the ipmx_s - */ - muxmp = allocb(sizeof (struct ipmx_s), BPRI_MED); - if (muxmp == NULL) - return (ENOMEM); - ipmxp = (struct ipmx_s *)muxmp->b_wptr; - ipmxp->ipmx_arpdev_stream = 0; - muxmp->b_wptr += sizeof (struct ipmx_s); - mp1->b_cont = muxmp; - - /* - * The l_qbot represents the uppermost write queue of the - * lower stream. Walk down this stream till we hit ARP. - * We can safely walk, since STREAMS has made sure the stream - * cannot close till the IOCACK goes up, and is not interruptible. - */ - while (arpwq != NULL) { - /* - * Beware of broken modules like logsubr.c that - * may not have a q_qinfo or qi_minfo. - */ - if ((q->q_qinfo != NULL) && (q->q_qinfo->qi_minfo != NULL)) { - name = arpwq->q_qinfo->qi_minfo->mi_idname; - if (name != NULL && name[0] != NULL && - (strcmp(name, arp_mod_info.mi_idname) == 0)) - break; - } - arpwq = arpwq->q_next; - } - - /* - * Check if arpwq corresponds to an arp device stream, by walking - * the mi list. If it does, then add the muxid and device name info - * for use by IP. IP will send the M_IOCACK. - */ - if (arpwq != NULL) { - for (ar = (ar_t *)mi_first_ptr(&as->as_head); ar != NULL; - ar = (ar_t *)mi_next_ptr(&as->as_head, (void *)ar)) { - if ((ar->ar_wq == arpwq) && (ar->ar_arl != NULL)) { - ipmxp->ipmx_arpdev_stream = 1; - (void) strcpy((char *)ipmxp->ipmx_name, - ar->ar_arl->arl_name); - break; - } - } - } - - putnext(q, mp); - return (0); -} - -/* - * ar_ce_walk routine to delete any outstanding queries for an ar that is - * going away. - */ -static void -ar_query_delete(ace_t *ace, void *arg) -{ - ar_t *ar = arg; - mblk_t **mpp = &ace->ace_query_mp; - mblk_t *mp; - arp_stack_t *as = ar->ar_as; - ip_stack_t *ipst = as->as_netstack->netstack_ip; - - while ((mp = *mpp) != NULL) { - /* The response queue was stored in the query b_prev. 
*/ - if ((queue_t *)mp->b_prev == ar->ar_wq || - (queue_t *)mp->b_prev == ar->ar_rq) { - *mpp = mp->b_next; - if (DB_TYPE(mp) == M_PROTO && - *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - } - inet_freemsg(mp); - } else { - mpp = &mp->b_next; - } - } -} - -/* - * This routine is called either when an address resolution has just been - * found, or when it is time to give, or in some other error situation. - * If a non-zero ret_val is provided, any outstanding queries for the - * specified ace will be completed using that error value. Otherwise, - * the completion status will depend on whether the address has been - * resolved. - */ -static void -ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, - uint32_t proto_addr_len) -{ - mblk_t *areq_mp; - mblk_t *mp; - mblk_t *xmit_mp; - queue_t *arl_wq = ace->ace_arl->arl_wq; - arp_stack_t *as = ARL_TO_ARPSTACK(ace->ace_arl); - ip_stack_t *ipst = as->as_netstack->netstack_ip; - arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - - /* - * On error or completion for a query, we need to shut down the timer. - * However, the timer must not be stopped for an interface doing - * Duplicate Address Detection, or it will never finish that phase. - */ - if (!(ace->ace_flags & (ACE_F_UNVERIFIED | ACE_F_AUTHORITY))) - mi_timer(arl_wq, ace->ace_mp, -1L); - - /* Establish the return value appropriate. */ - if (ret_val == 0) { - if (!ACE_RESOLVED(ace) || ap == NULL) - ret_val = ENXIO; - } - /* Terminate all outstanding queries. */ - while ((mp = ace->ace_query_mp) != 0) { - /* The response queue was saved in b_prev. */ - queue_t *q = (queue_t *)mp->b_prev; - mp->b_prev = NULL; - ace->ace_query_mp = mp->b_next; - mp->b_next = NULL; - /* - * If we have the answer, attempt to get a copy of the xmit - * template to prepare for the client. - */ - if (ret_val == 0 && - (xmit_mp = copyb(ap->ap_xmit_mp)) == NULL) { - /* Too bad, buy more memory. 
*/ - ret_val = ENOMEM; - } - /* Complete the response based on how the request arrived. */ - if (DB_TYPE(mp) == M_IOCTL) { - struct iocblk *ioc = (struct iocblk *)mp->b_rptr; - - ioc->ioc_error = ret_val; - if (ret_val != 0) { - DB_TYPE(mp) = M_IOCNAK; - ioc->ioc_count = 0; - putnext(q, mp); - continue; - } - /* - * Return the xmit mp out with the successful IOCTL. - */ - DB_TYPE(mp) = M_IOCACK; - ioc->ioc_count = MBLKL(xmit_mp); - /* Remove the areq mblk from the IOCTL. */ - areq_mp = mp->b_cont; - mp->b_cont = areq_mp->b_cont; - } else { - if (ret_val != 0) { - /* TODO: find some way to let the guy know? */ - inet_freemsg(mp); - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, - ire_stats_freed); - continue; - } - /* - * In the M_PROTO case, the areq message is followed by - * a message chain to be returned to the protocol. ARP - * doesn't know (or care) what is in this chain, but in - * the event that the reader is pondering the - * relationship between ARP and IP (for example), the - * areq is followed by an incipient IRE, and then the - * original outbound packet. Here we detach the areq. - */ - areq_mp = mp; - mp = mp->b_cont; - } - ASSERT(ret_val == 0 && ap != NULL); - if (ap->ap_saplen != 0) { - /* - * Copy the SAP type specified in the request into - * the xmit mp. - */ - areq_t *areq = (areq_t *)areq_mp->b_rptr; - bcopy(areq->areq_sap, xmit_mp->b_rptr + - ap->ap_xmit_sapoff, ABS(ap->ap_saplen)); - } - /* Done with the areq message. */ - freeb(areq_mp); - /* - * Copy the resolved hardware address into the xmit mp - * or perform the mapping operation. - */ - ar_set_address(ace, xmit_mp->b_rptr + ap->ap_xmit_addroff, - proto_addr, proto_addr_len); - /* - * Now insert the xmit mp after the response message. In - * the M_IOCTL case, it will be the returned data block. In - * the M_PROTO case, (again using IP as an example) it will - * appear after the IRE and before the outbound packet. 
- */ - xmit_mp->b_cont = mp->b_cont; - mp->b_cont = xmit_mp; - putnext(q, mp); - } - - /* - * Unless we are responding from a permanent cache entry, start the - * cleanup timer or (on error) delete the entry. - */ - if (!(ace->ace_flags & (ACE_F_PERMANENT | ACE_F_DYING))) { - if (!ACE_RESOLVED(ace) || ap == NULL) { - /* - * No need to notify IP here, because the entry was - * never resolved, so IP can't have any cached copies - * of the address. - */ - ar_ce_delete(ace); - } else { - mi_timer(arl_wq, ace->ace_mp, as->as_cleanup_interval); - } - } -} - -/* - * Returns number of milliseconds after which we should either rexmit or abort. - * Return of zero means we should abort. - */ -static clock_t -ar_query_xmit(arp_stack_t *as, ace_t *ace) -{ - areq_t *areq; - mblk_t *mp; - uchar_t *proto_addr; - uchar_t *sender_addr; - ace_t *src_ace; - arl_t *xmit_arl = ace->ace_xmit_arl; - - mp = ace->ace_query_mp; - /* - * ar_query_delete may have just blown off the outstanding - * ace_query_mp entries because the client who sent the query - * went away. If this happens just before the ace_mp timer - * goes off, we'd find a null ace_query_mp which is not an error. - * The unresolved ace itself, and the timer, will be removed - * when the arl stream goes away. - */ - if (!mp) - return (0); - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - areq = (areq_t *)mp->b_rptr; - if (areq->areq_xmit_count == 0) - return (0); - areq->areq_xmit_count--; - proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset, - areq->areq_target_addr_length); - sender_addr = mi_offset_paramc(mp, areq->areq_sender_addr_offset, - areq->areq_sender_addr_length); - - /* - * Get the ace for the sender address, so that we can verify that - * we have one and that DAD has completed. 
- */ - src_ace = ar_ce_lookup(xmit_arl, areq->areq_proto, sender_addr, - areq->areq_sender_addr_length); - if (src_ace == NULL) { - DTRACE_PROBE3(xmit_no_source, ace_t *, ace, areq_t *, areq, - uchar_t *, sender_addr); - return (0); - } - - /* - * If we haven't yet finished duplicate address checking on this source - * address, then do *not* use it on the wire. Doing so will corrupt - * the world's caches. Just allow the timer to restart. Note that - * duplicate address checking will eventually complete one way or the - * other, so this cannot go on "forever." - */ - if (src_ace->ace_flags & ACE_F_UNVERIFIED) { - DTRACE_PROBE2(xmit_source_unverified, ace_t *, ace, - ace_t *, src_ace); - areq->areq_xmit_count++; - return (areq->areq_xmit_interval); - } - - DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, - areq_t *, areq); - - ar_xmit(xmit_arl, ARP_REQUEST, areq->areq_proto, - areq->areq_sender_addr_length, xmit_arl->arl_phy->ap_hw_addr, - sender_addr, xmit_arl->arl_phy->ap_arp_addr, proto_addr, NULL, as); - src_ace->ace_last_bcast = ddi_get_lbolt(); - return (areq->areq_xmit_interval); -} - -/* Our read side put procedure. */ -static void -ar_rput(queue_t *q, mblk_t *mp) -{ - arh_t *arh; - arl_t *arl; - arl_t *client_arl; - ace_t *dst_ace; - uchar_t *dst_paddr; - int err; - uint32_t hlen; - struct iocblk *ioc; - mblk_t *mp1; - int op; - uint32_t plen; - uint32_t proto; - uchar_t *src_haddr; - uchar_t *src_paddr; - uchar_t *dst_haddr; - boolean_t is_probe; - boolean_t is_unicast = B_FALSE; - dl_unitdata_ind_t *dlindp; - int i; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_RPUT_START, - "arp_rput_start: q %p", q); - - /* - * We handle ARP commands from below both in M_IOCTL and M_PROTO - * messages. Actual ARP requests and responses will show up as - * M_PROTO messages containing DL_UNITDATA_IND blocks. 
- */ - switch (DB_TYPE(mp)) { - case M_IOCTL: - err = ar_cmd_dispatch(q, mp, B_FALSE); - switch (err) { - case ENOENT: - DB_TYPE(mp) = M_IOCNAK; - if ((mp1 = mp->b_cont) != 0) { - /* - * Collapse the data as a note to the - * originator. - */ - mp1->b_wptr = mp1->b_rptr; - } - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "ioctl/inprogress"); - return; - default: - DB_TYPE(mp) = M_IOCACK; - break; - } - ioc = (struct iocblk *)mp->b_rptr; - ioc->ioc_error = err; - if ((mp1 = mp->b_cont) != 0) - ioc->ioc_count = MBLKL(mp1); - else - ioc->ioc_count = 0; - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "ioctl"); - return; - case M_CTL: - /* - * IP is acking the AR_ARP_CLOSING message that we sent - * in ar_close. - */ - if (MBLKL(mp) == sizeof (arc_t)) { - if (((arc_t *)mp->b_rptr)->arc_cmd == AR_ARP_CLOSING) - ((ar_t *)q->q_ptr)->ar_ip_acked_close = 1; - } - freemsg(mp); - return; - case M_PCPROTO: - case M_PROTO: - dlindp = (dl_unitdata_ind_t *)mp->b_rptr; - if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && - dlindp->dl_primitive == DL_UNITDATA_IND) { - is_unicast = (dlindp->dl_group_address == 0); - arl = ((ar_t *)q->q_ptr)->ar_arl; - if (arl != NULL && arl->arl_phy != NULL) { - /* Real messages from the wire! */ - break; - } - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "default"); - return; - } - err = ar_cmd_dispatch(q, mp, B_FALSE); - switch (err) { - case ENOENT: - /* Miscellaneous DLPI messages get shuffled off. 
*/ - ar_rput_dlpi(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "proto/dlpi"); - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "proto"); - break; - default: - inet_freemsg(mp); - break; - } - return; - default: - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "default"); - return; - } - /* - * If the IFF_NOARP flag is on, then do not process any - * incoming ARP_REQUEST or incoming ARP_RESPONSE. - */ - if (arl->arl_flags & ARL_F_NOARP) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "interface has IFF_NOARP set"); - return; - } - - /* - * What we should have at this point is a DL_UNITDATA_IND message - * followed by an ARP packet. We do some initial checks and then - * get to work. - */ - mp1 = mp->b_cont; - if (mp1 == NULL) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "baddlpi"); - return; - } - if (mp1->b_cont != NULL) { - /* No fooling around with funny messages. */ - if (!pullupmsg(mp1, -1)) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "pullupmsgfail"); - return; - } - } - arh = (arh_t *)mp1->b_rptr; - hlen = arh->arh_hlen; - plen = arh->arh_plen; - if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "short"); - return; - } - /* - * hlen 0 is used for RFC 1868 UnARP. - * - * Note that the rest of the code checks that hlen is what we expect - * for this hardware address type, so might as well discard packets - * here that don't match. 
- */ - if ((hlen > 0 && hlen != arl->arl_phy->ap_hw_addrlen) || plen == 0) { - DTRACE_PROBE2(rput_bogus, arl_t *, arl, mblk_t *, mp1); - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "hlenzero/plenzero"); - return; - } - /* - * Historically, Solaris has been lenient about hardware type numbers. - * We should check here, but don't. - */ - DTRACE_PROBE2(rput_normal, arl_t *, arl, arh_t *, arh); - - DTRACE_PROBE3(arp__physical__in__start, - arl_t *, arl, arh_t *, arh, mblk_t *, mp); - - ARP_HOOK_IN(as->as_arp_physical_in_event, as->as_arp_physical_in, - arl->arl_index, arh, mp, mp1, as); - - DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp); - - if (mp == NULL) - return; - - proto = (uint32_t)BE16_TO_U16(arh->arh_proto); - src_haddr = (uchar_t *)arh; - src_haddr = &src_haddr[ARH_FIXED_LEN]; - src_paddr = &src_haddr[hlen]; - dst_haddr = &src_haddr[hlen + plen]; - dst_paddr = &src_haddr[hlen + plen + hlen]; - op = BE16_TO_U16(arh->arh_operation); - - /* Determine if this is just a probe */ - for (i = 0; i < plen; i++) - if (src_paddr[i] != 0) - break; - is_probe = i >= plen; - - /* - * RFC 826: first check if the <protocol, sender protocol address> is - * in the cache, if there is a sender protocol address. Note that this - * step also handles resolutions based on source. - * - * Note that IP expects that each notification it receives will be - * tied to the ill it received it on. Thus, we must talk to it over - * the arl tied to the resolved IP address (if any), hence client_arl. 
- */ - if (is_probe) - err = AR_NOTFOUND; - else - err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, - plen, &client_arl); - - switch (err) { - case AR_BOGON: - ar_client_notify(client_arl, mp1, AR_CN_BOGON); - mp1 = NULL; - break; - case AR_FAILED: - ar_client_notify(client_arl, mp1, AR_CN_FAILED); - mp1 = NULL; - break; - case AR_LOOPBACK: - DTRACE_PROBE2(rput_loopback, arl_t *, arl, arh_t *, arh); - freemsg(mp1); - mp1 = NULL; - break; - } - if (mp1 == NULL) { - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "unneeded"); - return; - } - - /* - * Now look up the destination address. By RFC 826, we ignore the - * packet at this step if the target isn't one of our addresses. This - * is true even if the target is something we're trying to resolve and - * the packet is a response. To avoid duplicate responses, we also - * ignore the packet if it was multicast/broadcast to an arl that's in - * an IPMP group but was not the designated xmit_arl for the ACE. - * - * Note that in order to do this correctly, we need to know when to - * notify IP of a change implied by the source address of the ARP - * message. That implies that the local ARP table has entries for all - * of the resolved entries cached in the client. This is why we must - * notify IP when we delete a resolved entry and we know that IP may - * have cached answers. - */ - dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); - if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || - (dst_ace->ace_xmit_arl != arl && !is_unicast) || - !(dst_ace->ace_flags & ACE_F_PUBLISH)) { - /* - * Let the client know if the source mapping has changed, even - * if the destination provides no useful information for the - * client. 
- */ - if (err == AR_CHANGED) - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - else - freemsg(mp1); - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "nottarget"); - return; - } - - /* - * If the target is unverified by DAD, then one of two things is true: - * either it's someone else claiming this address (on a probe or an - * announcement) or it's just a regular request. The former is - * failure, but a regular request is not. - */ - if (dst_ace->ace_flags & ACE_F_UNVERIFIED) { - /* - * Check for a reflection. Some misbehaving bridges will - * reflect our own transmitted packets back to us. - */ - if (hlen == dst_ace->ace_hw_addr_length && - bcmp(src_haddr, dst_ace->ace_hw_addr, hlen) == 0) { - DTRACE_PROBE3(rput_probe_reflected, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "reflection"); - return; - } - - /* - * Conflicts seen via the wrong interface may be bogus. - * Multiple interfaces on the same segment imply any conflict - * will also be seen via the correct interface, so we can ignore - * anything not matching the arl from the ace. - */ - if (arl != dst_ace->ace_arl) { - DTRACE_PROBE3(rput_probe_misdirect, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - return; - } - /* - * Responses targeting our HW address that are not responses to - * our DAD probe must be ignored as they are related to requests - * sent before DAD was restarted. Note: response to our DAD - * probe will have been handled by ar_ce_resolve_all() above. - */ - if (op == ARP_RESPONSE && - (bcmp(dst_haddr, dst_ace->ace_hw_addr, hlen) == 0)) { - DTRACE_PROBE3(rput_probe_stale, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freeb(mp); - freemsg(mp1); - return; - } - /* - * Responses targeted to HW addresses which are not ours but - * sent to our unverified proto address are also conflicts. 
- * These may be reported by a proxy rather than the interface - * with the conflicting address, dst_paddr is in conflict - * rather than src_paddr. To ensure IP can locate the correct - * ipif to take down, it is necessary to copy dst_paddr to - * the src_paddr field before sending it to IP. The same is - * required for probes, where src_paddr will be INADDR_ANY. - */ - if (is_probe) { - /* - * In this case, client_arl will be invalid (e.g., - * since probes don't have a valid sender address). - * But dst_ace has the appropriate arl. - */ - bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(dst_ace->ace_arl, mp1, AR_CN_FAILED); - ar_ce_delete(dst_ace); - } else if (op == ARP_RESPONSE) { - bcopy(dst_paddr, src_paddr, plen); - ar_client_notify(client_arl, mp1, AR_CN_FAILED); - ar_ce_delete(dst_ace); - } else if (err == AR_CHANGED) { - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - } else { - DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - freemsg(mp1); - } - freeb(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "unverified"); - return; - } - - /* - * If it's a request, then we reply to this, and if we think the - * sender's unknown, then we create an entry to avoid unnecessary ARPs. - * The design assumption is that someone ARPing us is likely to send us - * a packet soon, and that we'll want to reply to it. - */ - if (op == ARP_REQUEST) { - const uchar_t *dstaddr = src_haddr; - clock_t now; - - /* - * This implements periodic address defense based on a modified - * version of the RFC 3927 requirements. Instead of sending a - * broadcasted reply every time, as demanded by the RFC, we - * send at most one broadcast reply per arp_broadcast_interval. 
- */ - now = ddi_get_lbolt(); - if ((now - dst_ace->ace_last_bcast) > - MSEC_TO_TICK(as->as_broadcast_interval)) { - DTRACE_PROBE3(rput_bcast_reply, arl_t *, arl, - arh_t *, arh, ace_t *, dst_ace); - dst_ace->ace_last_bcast = now; - dstaddr = arl->arl_phy->ap_arp_addr; - /* - * If this is one of the long-suffering entries, then - * pull it out now. It no longer needs separate - * defense, because we're doing now that with this - * broadcasted reply. - */ - dst_ace->ace_flags &= ~ACE_F_DELAYED; - } - - ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen, - dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, - src_haddr, src_paddr, dstaddr, as); - if (!is_probe && err == AR_NOTFOUND && - ar_ce_create(OWNING_ARL(arl), proto, src_haddr, hlen, - src_paddr, plen, NULL, NULL, 0, NULL, 0) == 0) { - ace_t *ace; - - ace = ar_ce_lookup(arl, proto, src_paddr, plen); - ASSERT(ace != NULL); - mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, - as->as_cleanup_interval); - } - } - if (err == AR_CHANGED) { - freeb(mp); - ar_client_notify(client_arl, mp1, AR_CN_ANNOUNCE); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "reqchange"); - } else { - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "end"); - } -} - -static void -ar_ce_restart_dad(ace_t *ace, void *arl_arg) -{ - arl_t *arl = arl_arg; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - if ((ace->ace_xmit_arl == arl) && - (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == - (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { - /* - * Slight cheat here: we don't use the initial probe delay - * in this obscure case. 
- */ - if (ace->ace_flags & ACE_F_FAST) { - ace->ace_xmit_count = as->as_fastprobe_count; - ace->ace_xmit_interval = as->as_fastprobe_interval; - } else { - ace->ace_xmit_count = as->as_probe_count; - ace->ace_xmit_interval = as->as_probe_interval; - } - ace->ace_flags &= ~ACE_F_DAD_ABORTED; - ace_set_timer(ace, B_FALSE); - } -} - -/* DLPI messages, other than DL_UNITDATA_IND are handled here. */ -static void -ar_rput_dlpi(queue_t *q, mblk_t *mp) -{ - ar_t *ar = q->q_ptr; - arl_t *arl = ar->ar_arl; - arlphy_t *ap = NULL; - union DL_primitives *dlp; - const char *err_str; - arp_stack_t *as = ar->ar_as; - - if (arl != NULL) - ap = arl->arl_phy; - - if (MBLKL(mp) < sizeof (dlp->dl_primitive)) { - putnext(q, mp); - return; - } - dlp = (union DL_primitives *)mp->b_rptr; - switch (dlp->dl_primitive) { - case DL_ERROR_ACK: - /* - * ce is confused about how DLPI works, so we have to interpret - * an "error" on DL_NOTIFY_ACK (which we never could have sent) - * as really meaning an error on DL_NOTIFY_REQ. - * - * Note that supporting DL_NOTIFY_REQ is optional, so printing - * out an error message on the console isn't warranted except - * for debug. - */ - if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || - dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { - ar_dlpi_done(arl, DL_NOTIFY_REQ); - freemsg(mp); - return; - } - err_str = dl_primstr(dlp->error_ack.dl_error_primitive); - DTRACE_PROBE2(rput_dl_error, arl_t *, arl, - dl_error_ack_t *, &dlp->error_ack); - switch (dlp->error_ack.dl_error_primitive) { - case DL_UNBIND_REQ: - if (arl->arl_provider_style == DL_STYLE1) - arl->arl_state = ARL_S_DOWN; - break; - case DL_DETACH_REQ: - case DL_BIND_REQ: - arl->arl_state = ARL_S_DOWN; - break; - case DL_ATTACH_REQ: - break; - default: - /* If it's anything else, we didn't send it. 
*/ - putnext(q, mp); - return; - } - ar_dlpi_done(arl, dlp->error_ack.dl_error_primitive); - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ar_rput_dlpi: %s failed, dl_errno %d, dl_unix_errno %d", - err_str, dlp->error_ack.dl_errno, - dlp->error_ack.dl_unix_errno); - break; - case DL_INFO_ACK: - DTRACE_PROBE2(rput_dl_info, arl_t *, arl, - dl_info_ack_t *, &dlp->info_ack); - if (arl != NULL && arl->arl_dlpi_pending == DL_INFO_REQ) { - /* - * We have a response back from the driver. Go set up - * transmit defaults. - */ - ar_ll_set_defaults(arl, mp); - ar_dlpi_done(arl, DL_INFO_REQ); - } else if (arl == NULL) { - ar_ll_init(as, ar, mp); - } - /* Kick off any awaiting messages */ - qenable(WR(q)); - break; - case DL_OK_ACK: - DTRACE_PROBE2(rput_dl_ok, arl_t *, arl, - dl_ok_ack_t *, &dlp->ok_ack); - switch (dlp->ok_ack.dl_correct_primitive) { - case DL_UNBIND_REQ: - if (arl->arl_provider_style == DL_STYLE1) - arl->arl_state = ARL_S_DOWN; - break; - case DL_DETACH_REQ: - arl->arl_state = ARL_S_DOWN; - break; - case DL_ATTACH_REQ: - break; - default: - putnext(q, mp); - return; - } - ar_dlpi_done(arl, dlp->ok_ack.dl_correct_primitive); - break; - case DL_NOTIFY_ACK: - DTRACE_PROBE2(rput_dl_notify, arl_t *, arl, - dl_notify_ack_t *, &dlp->notify_ack); - /* - * We mostly care about interface-up transitions, as this is - * when we need to redo duplicate address detection. 
- */ - if (ap != NULL) { - ap->ap_notifies = (dlp->notify_ack.dl_notifications & - DL_NOTE_LINK_UP) != 0; - } - ar_dlpi_done(arl, DL_NOTIFY_REQ); - break; - case DL_BIND_ACK: - DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, - dl_bind_ack_t *, &dlp->bind_ack); - if (ap != NULL) { - caddr_t hw_addr; - - hw_addr = (caddr_t)dlp + dlp->bind_ack.dl_addr_offset; - if (ap->ap_saplen > 0) - hw_addr += ap->ap_saplen; - bcopy(hw_addr, ap->ap_hw_addr, ap->ap_hw_addrlen); - } - arl->arl_state = ARL_S_UP; - ar_dlpi_done(arl, DL_BIND_REQ); - break; - case DL_NOTIFY_IND: - DTRACE_PROBE2(rput_dl_notify_ind, arl_t *, arl, - dl_notify_ind_t *, &dlp->notify_ind); - - if (dlp->notify_ind.dl_notification == DL_NOTE_REPLUMB) { - arl->arl_replumbing = B_TRUE; - if (arl->arl_state == ARL_S_DOWN) { - arp_replumb_done(arl, mp); - return; - } - break; - } - - if (ap != NULL) { - switch (dlp->notify_ind.dl_notification) { - case DL_NOTE_LINK_UP: - ap->ap_link_down = B_FALSE; - ar_ce_walk(as, ar_ce_restart_dad, arl); - break; - case DL_NOTE_LINK_DOWN: - ap->ap_link_down = B_TRUE; - break; - } - } - break; - case DL_UDERROR_IND: - DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, - dl_uderror_ind_t *, &dlp->uderror_ind); - (void) mi_strlog(q, 1, SL_ERROR | SL_TRACE, - "ar_rput_dlpi: " - "DL_UDERROR_IND, dl_dest_addr_length %d dl_errno %d", - dlp->uderror_ind.dl_dest_addr_length, - dlp->uderror_ind.dl_errno); - putnext(q, mp); - return; - default: - DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, - union DL_primitives *, dlp); - putnext(q, mp); - return; - } - freemsg(mp); -} - -static void -ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr, - uint32_t proto_addr_len) -{ - uchar_t *mask, *to; - int len; - - ASSERT(ace->ace_hw_addr != NULL); - - bcopy(ace->ace_hw_addr, addrpos, ace->ace_hw_addr_length); - if (ace->ace_flags & ACE_F_MAPPING && - proto_addr != NULL && - ace->ace_proto_extract_mask) { /* careful */ - len = MIN((int)ace->ace_hw_addr_length - - ace->ace_hw_extract_start, - 
proto_addr_len); - mask = ace->ace_proto_extract_mask; - to = addrpos + ace->ace_hw_extract_start; - while (len-- > 0) - *to++ |= *mask++ & *proto_addr++; - } -} - -static int -ar_slifname(queue_t *q, mblk_t *mp_orig) -{ - ar_t *ar = q->q_ptr; - arl_t *arl = ar->ar_arl; - struct lifreq *lifr; - mblk_t *mp = mp_orig; - arl_t *old_arl; - mblk_t *ioccpy; - struct iocblk *iocp; - hook_nic_event_t info; - arp_stack_t *as = ar->ar_as; - - if (ar->ar_on_ill_stream) { - /* - * This command is for IP, since it is coming down - * the <arp-IP-driver> stream. Return ENOENT so that - * it will be sent downstream by the caller - */ - return (ENOENT); - } - /* We handle both M_IOCTL and M_PROTO messages */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - if (q->q_next == NULL || arl == NULL) { - /* - * If the interface was just opened and - * the info ack has not yet come back from the driver - */ - DTRACE_PROBE2(slifname_no_arl, queue_t *, q, - mblk_t *, mp_orig); - (void) putq(q, mp_orig); - return (EINPROGRESS); - } - - if (MBLKL(mp) < sizeof (struct lifreq)) { - DTRACE_PROBE2(slifname_malformed, queue_t *, q, - mblk_t *, mp); - } - - if (arl->arl_name[0] != '\0') { - DTRACE_PROBE1(slifname_already, arl_t *, arl); - return (EALREADY); - } - - lifr = (struct lifreq *)mp->b_rptr; - - if (strlen(lifr->lifr_name) >= LIFNAMSIZ) { - DTRACE_PROBE2(slifname_bad_name, arl_t *, arl, - struct lifreq *, lifr); - return (ENXIO); - } - - /* Check whether the name is already in use. */ - - old_arl = ar_ll_lookup_by_name(as, lifr->lifr_name); - if (old_arl != NULL) { - DTRACE_PROBE2(slifname_exists, arl_t *, arl, arl_t *, old_arl); - return (EEXIST); - } - - /* Make a copy of the message so we can send it downstream. 
*/ - if ((ioccpy = allocb(sizeof (struct iocblk), BPRI_MED)) == NULL || - (ioccpy->b_cont = copymsg(mp)) == NULL) { - if (ioccpy != NULL) - freeb(ioccpy); - return (ENOMEM); - } - - (void) strlcpy(arl->arl_name, lifr->lifr_name, sizeof (arl->arl_name)); - - /* The ppa is sent down by ifconfig */ - arl->arl_ppa = lifr->lifr_ppa; - - /* - * A network device is not considered to be fully plumb'd until - * its name has been set using SIOCSLIFNAME. Once it has - * been set, it cannot be set again (see code above), so there - * is currently no danger in this function causing two NE_PLUMB - * events without an intervening NE_UNPLUMB. - */ - info.hne_nic = arl->arl_index; - info.hne_lif = 0; - info.hne_event = NE_PLUMB; - info.hne_data = arl->arl_name; - info.hne_datalen = strlen(arl->arl_name); - (void) hook_run(as->as_net_data->netd_hooks, as->as_arpnicevents, - (hook_data_t)&info); - - /* Chain in the new arl. */ - rw_enter(&as->as_arl_lock, RW_WRITER); - arl->arl_next = as->as_arl_head; - as->as_arl_head = arl; - rw_exit(&as->as_arl_lock); - DTRACE_PROBE1(slifname_set, arl_t *, arl); - - /* - * Send along a copy of the ioctl; this is just for hitbox. Use - * M_CTL to avoid confusing anyone else who might be listening. - */ - DB_TYPE(ioccpy) = M_CTL; - iocp = (struct iocblk *)ioccpy->b_rptr; - bzero(iocp, sizeof (*iocp)); - iocp->ioc_cmd = SIOCSLIFNAME; - iocp->ioc_count = msgsize(ioccpy->b_cont); - ioccpy->b_wptr = (uchar_t *)(iocp + 1); - putnext(arl->arl_wq, ioccpy); - - return (0); -} - -static int -ar_set_ppa(queue_t *q, mblk_t *mp_orig) -{ - ar_t *ar = (ar_t *)q->q_ptr; - arl_t *arl = ar->ar_arl; - int ppa; - char *cp; - mblk_t *mp = mp_orig; - arl_t *old_arl; - arp_stack_t *as = ar->ar_as; - - if (ar->ar_on_ill_stream) { - /* - * This command is for IP, since it is coming down - * the <arp-IP-driver> stream. Return ENOENT so that - * it will be sent downstream by the caller - */ - return (ENOENT); - } - - /* We handle both M_IOCTL and M_PROTO messages. 
*/ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - if (q->q_next == NULL || arl == NULL) { - /* - * If the interface was just opened and - * the info ack has not yet come back from the driver. - */ - DTRACE_PROBE2(setppa_no_arl, queue_t *, q, - mblk_t *, mp_orig); - (void) putq(q, mp_orig); - return (EINPROGRESS); - } - - if (arl->arl_name[0] != '\0') { - DTRACE_PROBE1(setppa_already, arl_t *, arl); - return (EALREADY); - } - - do { - q = q->q_next; - } while (q->q_next != NULL); - cp = q->q_qinfo->qi_minfo->mi_idname; - - ppa = *(int *)(mp->b_rptr); - (void) snprintf(arl->arl_name, sizeof (arl->arl_name), "%s%d", cp, ppa); - - old_arl = ar_ll_lookup_by_name(as, arl->arl_name); - if (old_arl != NULL) { - DTRACE_PROBE2(setppa_exists, arl_t *, arl, arl_t *, old_arl); - /* Make it a null string again */ - arl->arl_name[0] = '\0'; - return (EBUSY); - } - - arl->arl_ppa = ppa; - DTRACE_PROBE1(setppa_done, arl_t *, arl); - /* Chain in the new arl. */ - rw_enter(&as->as_arl_lock, RW_WRITER); - arl->arl_next = as->as_arl_head; - as->as_arl_head = arl; - rw_exit(&as->as_arl_lock); - - return (0); -} - -static int -ar_snmp_msg(queue_t *q, mblk_t *mp_orig) -{ - mblk_t *mpdata, *mp = mp_orig; - struct opthdr *optp; - msg2_args_t args; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - if (mp == NULL) - return (0); - /* - * ar_cmd_dispatch() already checked for us that "mp->b_cont" is valid - * in case of an M_IOCTL message. - */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - - optp = (struct opthdr *)(&mp->b_rptr[sizeof (struct T_optmgmt_ack)]); - if (optp->level == MIB2_IP && optp->name == MIB2_IP_MEDIA) { - /* - * Put our ARP cache entries in the ipNetToMediaTable mp from - * IP. Due to a historical side effect of IP's MIB code, it - * always passes us a b_cont, but the b_cont should be empty. 
- */ - if ((mpdata = mp->b_cont) == NULL || MBLKL(mpdata) != 0) - return (EINVAL); - - args.m2a_mpdata = mpdata; - args.m2a_mptail = NULL; - ar_ce_walk(as, ar_snmp_msg2, &args); - optp->len = msgdsize(mpdata); - } - putnext(q, mp_orig); - return (EINPROGRESS); /* so that rput() exits doing nothing... */ -} - -static void -ar_snmp_msg2(ace_t *ace, void *arg) -{ - const char *name = "unknown"; - mib2_ipNetToMediaEntry_t ntme; - msg2_args_t *m2ap = arg; - - ASSERT(ace != NULL && ace->ace_arl != NULL); - if (ace->ace_arl != NULL) - name = ace->ace_arl->arl_name; - - /* - * Fill in ntme using the information in the ACE. - */ - ntme.ipNetToMediaType = (ace->ace_flags & ACE_F_PERMANENT) ? 4 : 3; - ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name)); - bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes, - ntme.ipNetToMediaIfIndex.o_length); - - bcopy(ace->ace_proto_addr, &ntme.ipNetToMediaNetAddress, - MIN(sizeof (uint32_t), ace->ace_proto_addr_length)); - - ntme.ipNetToMediaInfo.ntm_mask.o_length = - MIN(OCTET_LENGTH, ace->ace_proto_addr_length); - bcopy(ace->ace_proto_mask, ntme.ipNetToMediaInfo.ntm_mask.o_bytes, - ntme.ipNetToMediaInfo.ntm_mask.o_length); - ntme.ipNetToMediaInfo.ntm_flags = ace->ace_flags; - - ntme.ipNetToMediaPhysAddress.o_length = - MIN(OCTET_LENGTH, ace->ace_hw_addr_length); - if ((ace->ace_flags & ACE_F_RESOLVED) == 0) - ntme.ipNetToMediaPhysAddress.o_length = 0; - bcopy(ace->ace_hw_addr, ntme.ipNetToMediaPhysAddress.o_bytes, - ntme.ipNetToMediaPhysAddress.o_length); - - /* - * All entries within the ARP cache are unique, and there are no - * preexisting entries in the ipNetToMediaTable mp, so just add 'em. - */ - (void) snmp_append_data2(m2ap->m2a_mpdata, &m2ap->m2a_mptail, - (char *)&ntme, sizeof (ntme)); -} - -/* Write side put procedure. 
*/ -static void -ar_wput(queue_t *q, mblk_t *mp) -{ - int err; - struct iocblk *ioc; - mblk_t *mp1; - - TRACE_1(TR_FAC_ARP, TR_ARP_WPUT_START, - "arp_wput_start: q %p", q); - - /* - * Here we handle ARP commands coming from controlling processes - * either in the form of M_IOCTL messages, or M_PROTO messages. - */ - switch (DB_TYPE(mp)) { - case M_IOCTL: - switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) { - case ENOENT: - /* - * If it is an I_PLINK, process it. Otherwise - * we don't recognize it, so pass it down. - * Since ARP is a module there is always someone - * below. - */ - ASSERT(q->q_next != NULL); - ioc = (struct iocblk *)mp->b_rptr; - if ((ioc->ioc_cmd != I_PLINK) && - (ioc->ioc_cmd != I_PUNLINK)) { - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", - q, "ioctl/enoent"); - return; - } - err = ar_plink_send(q, mp); - if (err == 0) { - return; - } - if ((mp1 = mp->b_cont) != 0) - mp1->b_wptr = mp1->b_rptr; - break; - case EINPROGRESS: - /* - * If the request resulted in an attempt to resolve - * an address, we return out here. The IOCTL will - * be completed in ar_rput if something comes back, - * or as a result of the timer expiring. - */ - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "inprog"); - return; - default: - DB_TYPE(mp) = M_IOCACK; - break; - } - ioc = (struct iocblk *)mp->b_rptr; - if (err != 0) - ioc->ioc_error = err; - if (ioc->ioc_error != 0) { - /* - * Don't free b_cont as IP/IB needs - * it to identify the request. 
- */ - DB_TYPE(mp) = M_IOCNAK; - } - ioc->ioc_count = msgdsize(mp->b_cont); - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "ioctl"); - return; - case M_FLUSH: - if (*mp->b_rptr & FLUSHW) - flushq(q, FLUSHDATA); - if (*mp->b_rptr & FLUSHR) { - flushq(RD(q), FLUSHDATA); - *mp->b_rptr &= ~FLUSHW; - qreply(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "flush"); - return; - } - /* - * The normal behavior of a STREAMS module should be - * to pass down M_FLUSH messages. However there is a - * complex sequence of events during plumb/unplumb that - * can cause DLPI messages in the driver's queue to be - * flushed. So we don't send down M_FLUSH. This has been - * reported for some drivers (Eg. le) that send up an M_FLUSH - * in response to unbind request which will eventually be - * looped back at the mux head and sent down. Since IP - * does not queue messages in a module instance queue - * of IP, nothing is lost by not sending down the flush. - */ - freemsg(mp); - return; - case M_PROTO: - case M_PCPROTO: - /* - * Commands in the form of PROTO messages are handled very - * much the same as IOCTLs, but no response is returned. - */ - switch (err = ar_cmd_dispatch(q, mp, B_TRUE)) { - case ENOENT: - if (q->q_next) { - putnext(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, - "proto/enoent"); - return; - } - break; - case EINPROGRESS: - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "proto/einprog"); - return; - default: - break; - } - break; - case M_IOCDATA: - /* - * We pass M_IOCDATA downstream because it could be as a - * result of a previous M_COPYIN/M_COPYOUT message sent - * upstream. - */ - /* FALLTHRU */ - case M_CTL: - /* - * We also send any M_CTL downstream as it could - * contain control information for a module downstream. 
- */ - putnext(q, mp); - return; - default: - break; - } - /* Free any message we don't understand */ - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, - "arp_wput_end: q %p (%S)", q, "end"); -} - -static boolean_t -arp_say_ready(ace_t *ace) -{ - mblk_t *mp; - arl_t *arl = ace->ace_arl; - arlphy_t *ap = ace->ace_xmit_arl->arl_phy; - arh_t *arh; - uchar_t *cp; - - mp = allocb(sizeof (*arh) + 2 * (ace->ace_hw_addr_length + - ace->ace_proto_addr_length), BPRI_MED); - if (mp == NULL) { - /* skip a beat on allocation trouble */ - ace->ace_xmit_count = 1; - ace_set_timer(ace, B_FALSE); - return (B_FALSE); - } - /* Tell IP address is now usable */ - arh = (arh_t *)mp->b_rptr; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(ace->ace_proto, arh->arh_proto); - arh->arh_hlen = ace->ace_hw_addr_length; - arh->arh_plen = ace->ace_proto_addr_length; - U16_TO_BE16(ARP_REQUEST, arh->arh_operation); - cp = (uchar_t *)(arh + 1); - bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length); - cp += ace->ace_hw_addr_length; - bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); - cp += ace->ace_proto_addr_length; - bcopy(ace->ace_hw_addr, cp, ace->ace_hw_addr_length); - cp += ace->ace_hw_addr_length; - bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); - cp += ace->ace_proto_addr_length; - mp->b_wptr = cp; - ar_client_notify(arl, mp, AR_CN_READY); - DTRACE_PROBE1(ready, ace_t *, ace); - return (B_TRUE); -} - -/* - * Pick the longest-waiting aces for defense. - */ -static void -ace_reschedule(ace_t *ace, void *arg) -{ - ace_resched_t *art = arg; - ace_t **aces; - ace_t **acemax; - ace_t *atemp; - - if (ace->ace_xmit_arl != art->art_arl) - return; - /* - * Only published entries that are ready for announcement are eligible. 
- */ - if ((ace->ace_flags & (ACE_F_PUBLISH | ACE_F_UNVERIFIED | ACE_F_DYING | - ACE_F_DELAYED)) != ACE_F_PUBLISH) { - return; - } - if (art->art_naces < ACE_RESCHED_LIST_LEN) { - art->art_aces[art->art_naces++] = ace; - } else { - aces = art->art_aces; - acemax = aces + ACE_RESCHED_LIST_LEN; - for (; aces < acemax; aces++) { - if ((*aces)->ace_last_bcast > ace->ace_last_bcast) { - atemp = *aces; - *aces = ace; - ace = atemp; - } - } - } -} - -/* - * Reschedule the ARP defense of any long-waiting ACEs. It's assumed that this - * doesn't happen very often (if at all), and thus it needn't be highly - * optimized. (Note, though, that it's actually O(N) complexity, because the - * outer loop is bounded by a constant rather than by the length of the list.) - */ -static void -arl_reschedule(arl_t *arl) -{ - arlphy_t *ap = arl->arl_phy; - ace_resched_t art; - int i; - ace_t *ace; - arp_stack_t *as = ARL_TO_ARPSTACK(arl); - - i = ap->ap_defend_count; - ap->ap_defend_count = 0; - /* If none could be sitting around, then don't reschedule */ - if (i < as->as_defend_rate) { - DTRACE_PROBE1(reschedule_none, arl_t *, arl); - return; - } - art.art_arl = arl; - while (ap->ap_defend_count < as->as_defend_rate) { - art.art_naces = 0; - ar_ce_walk(as, ace_reschedule, &art); - for (i = 0; i < art.art_naces; i++) { - ace = art.art_aces[i]; - ace->ace_flags |= ACE_F_DELAYED; - ace_set_timer(ace, B_FALSE); - if (++ap->ap_defend_count >= as->as_defend_rate) - break; - } - if (art.art_naces < ACE_RESCHED_LIST_LEN) - break; - } - DTRACE_PROBE1(reschedule, arl_t *, arl); -} - -/* - * Write side service routine. The only action here is delivery of transmit - * timer events and delayed messages while waiting for the info_ack (ar_arl - * not yet set). 
- */ -static void -ar_wsrv(queue_t *q) -{ - ace_t *ace; - arlphy_t *ap; - mblk_t *mp; - clock_t ms; - arp_stack_t *as = ((ar_t *)q->q_ptr)->ar_as; - - TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_START, - "arp_wsrv_start: q %p", q); - - while ((mp = getq(q)) != NULL) { - switch (DB_TYPE(mp)) { - case M_PCSIG: - if (!mi_timer_valid(mp)) - continue; - ace = (ace_t *)mp->b_rptr; - if (ace->ace_flags & ACE_F_DYING) - continue; - ap = ace->ace_xmit_arl->arl_phy; - if (ace->ace_flags & ACE_F_UNVERIFIED) { - ASSERT(ace->ace_flags & ACE_F_PUBLISH); - ASSERT(ace->ace_query_mp == NULL); - /* - * If the link is down, give up for now. IP - * will give us the go-ahead to try again when - * the link restarts. - */ - if (ap->ap_link_down) { - DTRACE_PROBE1(timer_link_down, - ace_t *, ace); - ace->ace_flags |= ACE_F_DAD_ABORTED; - continue; - } - if (ace->ace_xmit_count > 0) { - DTRACE_PROBE1(timer_probe, - ace_t *, ace); - ace->ace_xmit_count--; - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - ace->ace_proto, - ace->ace_proto_addr_length, - ace->ace_hw_addr, NULL, NULL, - ace->ace_proto_addr, NULL, as); - ace_set_timer(ace, B_FALSE); - continue; - } - if (!arp_say_ready(ace)) - continue; - DTRACE_PROBE1(timer_ready, ace_t *, ace); - ace->ace_xmit_interval = - as->as_publish_interval; - ace->ace_xmit_count = as->as_publish_count; - if (ace->ace_xmit_count == 0) - ace->ace_xmit_count++; - ace->ace_flags &= ~ACE_F_UNVERIFIED; - } - if (ace->ace_flags & ACE_F_PUBLISH) { - clock_t now; - - /* - * If an hour has passed, then free up the - * entries that need defense by rescheduling - * them. - */ - now = ddi_get_lbolt(); - if (as->as_defend_rate > 0 && - now - ap->ap_defend_start > - SEC_TO_TICK(as->as_defend_period)) { - ap->ap_defend_start = now; - arl_reschedule(ace->ace_xmit_arl); - } - /* - * Finish the job that we started in - * ar_entry_add. When we get to zero - * announcement retransmits left, switch to - * address defense. 
- */ - ASSERT(ace->ace_query_mp == NULL); - if (ace->ace_xmit_count > 0) { - ace->ace_xmit_count--; - DTRACE_PROBE1(timer_announce, - ace_t *, ace); - } else if (ace->ace_flags & ACE_F_DELAYED) { - /* - * This guy was rescheduled as one of - * the really old entries needing - * on-going defense. Let him through - * now. - */ - DTRACE_PROBE1(timer_send_delayed, - ace_t *, ace); - ace->ace_flags &= ~ACE_F_DELAYED; - } else if (as->as_defend_rate > 0 && - (ap->ap_defend_count >= - as->as_defend_rate || - ++ap->ap_defend_count >= - as->as_defend_rate)) { - /* - * If we're no longer allowed to send - * unbidden defense messages, then just - * wait for rescheduling. - */ - DTRACE_PROBE1(timer_excess_defense, - ace_t *, ace); - ace_set_timer(ace, B_FALSE); - continue; - } else { - DTRACE_PROBE1(timer_defend, - ace_t *, ace); - } - ar_xmit(ace->ace_xmit_arl, ARP_REQUEST, - ace->ace_proto, - ace->ace_proto_addr_length, - ace->ace_hw_addr, - ace->ace_proto_addr, - ace->ace_xmit_arl->arl_phy->ap_arp_addr, - ace->ace_proto_addr, NULL, as); - ace->ace_last_bcast = now; - if (ace->ace_xmit_count == 0) - ace->ace_xmit_interval = - as->as_defend_interval; - if (ace->ace_xmit_interval != 0) - ace_set_timer(ace, B_FALSE); - continue; - } - - /* - * If this is a non-permanent (regular) resolved ARP - * entry, then it's now time to check if it can be - * retired. As an optimization, we check with IP - * first, and just restart the timer if the address is - * still in use. - */ - if (ACE_NONPERM(ace)) { - if (ace->ace_proto == IP_ARP_PROTO_TYPE && - ndp_lookup_ipaddr(*(ipaddr_t *) - ace->ace_proto_addr, as->as_netstack)) { - ace->ace_flags |= ACE_F_OLD; - mi_timer(ace->ace_arl->arl_wq, - ace->ace_mp, - as->as_cleanup_interval); - } else { - ar_delete_notify(ace); - ar_ce_delete(ace); - } - continue; - } - - /* - * ar_query_xmit returns the number of milliseconds to - * wait following this transmit. 
If the number of - * allowed transmissions has been exhausted, it will - * return zero without transmitting. If that happens - * we complete the operation with a failure indication. - * Otherwise, we restart the timer. - */ - ms = ar_query_xmit(as, ace); - if (ms == 0) - ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); - else - mi_timer(q, mp, ms); - continue; - default: - put(q, mp); - continue; - } - } - TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_END, - "arp_wsrv_end: q %p", q); -} - -/* ar_xmit is called to transmit an ARP Request or Response. */ -static void -ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, - const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, - const uchar_t *paddr2, const uchar_t *dstaddr, arp_stack_t *as) -{ - arh_t *arh; - uint8_t *cp; - uint_t hlen; - mblk_t *mp; - arlphy_t *ap = arl->arl_phy; - - ASSERT(!(arl->arl_flags & ARL_F_IPMP)); - - if (ap == NULL) { - DTRACE_PROBE1(xmit_no_arl_phy, arl_t *, arl); - return; - } - - /* IFF_NOARP flag is set or link down: do not send arp messages */ - if ((arl->arl_flags & ARL_F_NOARP) || ap->ap_link_down) - return; - - hlen = ap->ap_hw_addrlen; - if ((mp = copyb(ap->ap_xmit_mp)) == NULL) - return; - - mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + - plen + plen, BPRI_MED); - if (mp->b_cont == NULL) { - freeb(mp); - return; - } - - /* Get the L2 destination address for the message */ - if (haddr2 == NULL) - dstaddr = ap->ap_arp_addr; - else if (dstaddr == NULL) - dstaddr = haddr2; - - /* - * Figure out where the target hardware address goes in the - * DL_UNITDATA_REQ header, and copy it in. - */ - cp = mi_offset_param(mp, ap->ap_xmit_addroff, hlen); - ASSERT(cp != NULL); - if (cp == NULL) { - freemsg(mp); - return; - } - bcopy(dstaddr, cp, hlen); - - /* Fill in the ARP header. 
*/ - cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); - mp->b_cont->b_rptr = cp; - arh = (arh_t *)cp; - U16_TO_BE16(ap->ap_arp_hw_type, arh->arh_hardware); - U16_TO_BE16(proto, arh->arh_proto); - arh->arh_hlen = (uint8_t)hlen; - arh->arh_plen = (uint8_t)plen; - U16_TO_BE16(operation, arh->arh_operation); - cp += ARH_FIXED_LEN; - bcopy(haddr1, cp, hlen); - cp += hlen; - if (paddr1 == NULL) - bzero(cp, plen); - else - bcopy(paddr1, cp, plen); - cp += plen; - if (haddr2 == NULL) - bzero(cp, hlen); - else - bcopy(haddr2, cp, hlen); - cp += hlen; - bcopy(paddr2, cp, plen); - cp += plen; - mp->b_cont->b_wptr = cp; - - DTRACE_PROBE3(arp__physical__out__start, - arl_t *, arl, arh_t *, arh, mblk_t *, mp); - - ARP_HOOK_OUT(as->as_arp_physical_out_event, as->as_arp_physical_out, - arl->arl_index, arh, mp, mp->b_cont, as); - - DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp); - - if (mp == NULL) - return; - - /* Ship it out. */ - if (canputnext(arl->arl_wq)) - putnext(arl->arl_wq, mp); - else - freemsg(mp); -} - -static mblk_t * -ar_alloc(uint32_t cmd, int err) -{ - uint32_t len; - mblk_t *mp; - mblk_t *mp1; - char *cp; - arc_t *arc; - - /* For now only one type of command is accepted */ - if (cmd != AR_DLPIOP_DONE) - return (NULL); - len = sizeof (arc_t); - mp = allocb(len, BPRI_HI); - if (!mp) - return (NULL); - - DB_TYPE(mp) = M_CTL; - cp = (char *)mp->b_rptr; - arc = (arc_t *)(mp->b_rptr); - arc->arc_cmd = cmd; - mp->b_wptr = (uchar_t *)&cp[len]; - len = sizeof (int); - mp1 = allocb(len, BPRI_HI); - if (!mp1) { - freeb(mp); - return (NULL); - } - cp = (char *)mp->b_rptr; - /* Initialize the error code */ - *((int *)mp1->b_rptr) = err; - mp1->b_wptr = (uchar_t *)&cp[len]; - linkb(mp, mp1); - return (mp); -} - -void -arp_ddi_init(void) -{ - /* - * We want to be informed each time a stack is created or - * destroyed in the kernel, so we can maintain the - * set of arp_stack_t's. 
- */ - netstack_register(NS_ARP, arp_stack_init, arp_stack_shutdown, - arp_stack_fini); -} - -void -arp_ddi_destroy(void) -{ - netstack_unregister(NS_ARP); -} - -/* - * Initialize the ARP stack instance. - */ -/* ARGSUSED */ -static void * -arp_stack_init(netstackid_t stackid, netstack_t *ns) -{ - arp_stack_t *as; - arpparam_t *pa; - - as = (arp_stack_t *)kmem_zalloc(sizeof (*as), KM_SLEEP); - as->as_netstack = ns; - - pa = (arpparam_t *)kmem_alloc(sizeof (arp_param_arr), KM_SLEEP); - as->as_param_arr = pa; - bcopy(arp_param_arr, as->as_param_arr, sizeof (arp_param_arr)); - - (void) ar_param_register(&as->as_nd, - as->as_param_arr, A_CNT(arp_param_arr)); - - as->as_arp_index_counter = 1; - as->as_arp_counter_wrapped = 0; - - rw_init(&as->as_arl_lock, NULL, RW_DRIVER, NULL); - arp_net_init(as, stackid); - arp_hook_init(as); - - return (as); -} - -/* ARGSUSED */ -static void -arp_stack_shutdown(netstackid_t stackid, void *arg) -{ - arp_stack_t *as = (arp_stack_t *)arg; - - arp_net_shutdown(as); -} - -/* - * Free the ARP stack instance. - */ -/* ARGSUSED */ -static void -arp_stack_fini(netstackid_t stackid, void *arg) -{ - arp_stack_t *as = (arp_stack_t *)arg; - - arp_hook_destroy(as); - arp_net_destroy(as); - rw_destroy(&as->as_arl_lock); - nd_free(&as->as_nd); - kmem_free(as->as_param_arr, sizeof (arp_param_arr)); - as->as_param_arr = NULL; - kmem_free(as, sizeof (*as)); -} diff --git a/usr/src/uts/common/inet/arp/arp_netinfo.c b/usr/src/uts/common/inet/arp/arp_netinfo.c deleted file mode 100644 index 9d9c6a5bbe..0000000000 --- a/usr/src/uts/common/inet/arp/arp_netinfo.c +++ /dev/null @@ -1,376 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. 
- * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#include <sys/param.h> -#include <sys/types.h> -#include <sys/systm.h> -#include <sys/cmn_err.h> -#include <sys/stream.h> -#include <sys/sunddi.h> -#include <sys/hook.h> -#include <sys/hook_impl.h> -#include <sys/netstack.h> -#include <net/if.h> - -#include <sys/neti.h> -#include <sys/hook_event.h> -#include <inet/arp_impl.h> - -/* - * ARP netinfo entry point declarations. - */ -static int arp_getifname(net_handle_t, phy_if_t, char *, const size_t); -static int arp_getmtu(net_handle_t, phy_if_t, lif_if_t); -static int arp_getpmtuenabled(net_handle_t); -static int arp_getlifaddr(net_handle_t, phy_if_t, lif_if_t, size_t, - net_ifaddr_t [], void *); -static int arp_getlifzone(net_handle_t, phy_if_t, lif_if_t, zoneid_t *); -static int arp_getlifflags(net_handle_t, phy_if_t, lif_if_t, uint64_t *); -static phy_if_t arp_phygetnext(net_handle_t, phy_if_t); -static phy_if_t arp_phylookup(net_handle_t, const char *); -static lif_if_t arp_lifgetnext(net_handle_t, phy_if_t, lif_if_t); -static int arp_inject(net_handle_t, inject_t, net_inject_t *); -static phy_if_t arp_routeto(net_handle_t, struct sockaddr *, struct sockaddr *); -static int arp_ispartialchecksum(net_handle_t, mblk_t *); -static int arp_isvalidchecksum(net_handle_t, mblk_t *); - -static net_protocol_t arp_netinfo = { - NETINFO_VERSION, - NHF_ARP, - arp_getifname, - arp_getmtu, - arp_getpmtuenabled, - arp_getlifaddr, - 
arp_getlifzone, - arp_getlifflags, - arp_phygetnext, - arp_phylookup, - arp_lifgetnext, - arp_inject, - arp_routeto, - arp_ispartialchecksum, - arp_isvalidchecksum -}; - -/* - * Register ARP netinfo functions. - */ -void -arp_net_init(arp_stack_t *as, netstackid_t stackid) -{ - netid_t id; - - id = net_getnetidbynetstackid(stackid); - ASSERT(id != -1); - - as->as_net_data = net_protocol_register(id, &arp_netinfo); - ASSERT(as->as_net_data != NULL); -} - -void -arp_net_shutdown(arp_stack_t *as) -{ - if (as->as_arpnicevents != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_nic_events); - } - - if (as->as_arp_physical_out != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_physical_out_event); - } - - if (as->as_arp_physical_in != NULL) { - (void) net_event_shutdown(as->as_net_data, - &as->as_arp_physical_in_event); - } - - (void) net_family_shutdown(as->as_net_data, &as->as_arproot); -} - -/* - * Unregister ARP netinfo functions. - */ -void -arp_net_destroy(arp_stack_t *as) -{ - if (net_protocol_unregister(as->as_net_data) == 0) - as->as_net_data = NULL; -} - -/* - * Initialize ARP hook family and events - */ -void -arp_hook_init(arp_stack_t *as) -{ - HOOK_FAMILY_INIT(&as->as_arproot, Hn_ARP); - if (net_family_register(as->as_net_data, &as->as_arproot) != 0) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_family_register failed for arp"); - } - - HOOK_EVENT_INIT(&as->as_arp_physical_in_event, NH_PHYSICAL_IN); - as->as_arp_physical_in = net_event_register(as->as_net_data, - &as->as_arp_physical_in_event); - if (as->as_arp_physical_in == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/physical_in"); - } - - HOOK_EVENT_INIT(&as->as_arp_physical_out_event, NH_PHYSICAL_OUT); - as->as_arp_physical_out = net_event_register(as->as_net_data, - &as->as_arp_physical_out_event); - if (as->as_arp_physical_out == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/physical_out"); - 
} - - HOOK_EVENT_INIT(&as->as_arp_nic_events, NH_NIC_EVENTS); - as->as_arpnicevents = net_event_register(as->as_net_data, - &as->as_arp_nic_events); - if (as->as_arpnicevents == NULL) { - cmn_err(CE_NOTE, "arp_hook_init: " - "net_event_register failed for arp/nic_events"); - } -} - -void -arp_hook_destroy(arp_stack_t *as) -{ - if (as->as_arpnicevents != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_nic_events) == 0) - as->as_arpnicevents = NULL; - } - - if (as->as_arp_physical_out != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_physical_out_event) == 0) - as->as_arp_physical_out = NULL; - } - - if (as->as_arp_physical_in != NULL) { - if (net_event_unregister(as->as_net_data, - &as->as_arp_physical_in_event) == 0) - as->as_arp_physical_in = NULL; - } - - (void) net_family_unregister(as->as_net_data, &as->as_arproot); -} - -/* - * Determine the name of the lower level interface - */ -static int -arp_getifname(net_handle_t net, phy_if_t phy_ifdata, char *buffer, - const size_t buflen) -{ - arl_t *arl; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(buffer != NULL); - ASSERT(ns != NULL); - - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - if (arl->arl_index == phy_ifdata) { - (void) strlcpy(buffer, arl->arl_name, buflen); - rw_exit(&as->as_arl_lock); - return (0); - } - } - rw_exit(&as->as_arl_lock); - - return (1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getmtu(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getpmtuenabled(net_handle_t net) -{ - return (-1); -} - -/* - * Unsupported with ARP. 
- */ -/*ARGSUSED*/ -static int -arp_getlifaddr(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - size_t nelem, net_ifaddr_t type[], void *storage) -{ - return (-1); -} - -/* - * Determine the instance number of the next lower level interface - */ -static phy_if_t -arp_phygetnext(net_handle_t net, phy_if_t phy_ifdata) -{ - arl_t *arl; - int index; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(ns != NULL); - - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - if (phy_ifdata == 0) { - arl = as->as_arl_head; - } else { - for (arl = as->as_arl_head; arl != NULL; - arl = arl->arl_next) { - if (arl->arl_index == phy_ifdata) { - arl = arl->arl_next; - break; - } - } - } - - index = (arl != NULL) ? arl->arl_index : 0; - - rw_exit(&as->as_arl_lock); - - return (index); -} - -/* - * Given a network interface name, find its ARP layer instance number. - */ -static phy_if_t -arp_phylookup(net_handle_t net, const char *name) -{ - arl_t *arl; - int index; - arp_stack_t *as; - netstack_t *ns = net->netd_stack->nts_netstack; - - ASSERT(name != NULL); - ASSERT(ns != NULL); - - index = 0; - as = ns->netstack_arp; - rw_enter(&as->as_arl_lock, RW_READER); - for (arl = as->as_arl_head; arl != NULL; arl = arl->arl_next) { - if (strcmp(name, arl->arl_name) == 0) { - index = arl->arl_index; - break; - } - } - rw_exit(&as->as_arl_lock); - - return (index); - -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static lif_if_t -arp_lifgetnext(net_handle_t net, phy_if_t ifp, lif_if_t lif) -{ - return ((lif_if_t)-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_inject(net_handle_t net, inject_t injection, net_inject_t *neti) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static phy_if_t -arp_routeto(net_handle_t net, struct sockaddr *addr, struct sockaddr *next) -{ - return ((phy_if_t)-1); -} - -/* - * Unsupported with ARP. 
- */ -/*ARGSUSED*/ -int -arp_ispartialchecksum(net_handle_t net, mblk_t *mb) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_isvalidchecksum(net_handle_t net, mblk_t *mb) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getlifzone(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - zoneid_t *zoneid) -{ - return (-1); -} - -/* - * Unsupported with ARP. - */ -/*ARGSUSED*/ -static int -arp_getlifflags(net_handle_t net, phy_if_t phy_ifdata, lif_if_t ifdata, - uint64_t *flags) -{ - return (-1); -} diff --git a/usr/src/uts/common/inet/arp/arpddi.c b/usr/src/uts/common/inet/arp/arpddi.c index 2cc56b77fd..de8333295b 100644 --- a/usr/src/uts/common/inet/arp/arpddi.c +++ b/usr/src/uts/common/inet/arp/arpddi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -27,10 +27,8 @@ #include <sys/types.h> #include <sys/conf.h> #include <sys/modctl.h> -#include <sys/ksynch.h> #include <inet/common.h> #include <inet/ip.h> -#include <inet/arp_impl.h> #define INET_NAME "arp" #define INET_MODDESC "ARP STREAMS module" @@ -39,28 +37,16 @@ #define INET_DEVSTRTAB ipinfov4 #define INET_MODSTRTAB arpinfo #define INET_DEVMTFLAGS IP_DEVMTFLAGS /* since as a driver we're ip */ -#define INET_MODMTFLAGS (D_MP | D_MTPERMOD) +#define INET_MODMTFLAGS D_MP #include "../inetddi.c" -extern void arp_ddi_init(void); -extern void arp_ddi_destroy(void); - int _init(void) { int error; - /* - * Note: After mod_install succeeds, another thread can enter - * therefore all initialization is done before it and any - * de-initialization needed done if it fails. 
- */ - arp_ddi_init(); error = mod_install(&modlinkage); - if (error != 0) - arp_ddi_destroy(); - return (error); } @@ -70,8 +56,6 @@ _fini(void) int error; error = mod_remove(&modlinkage); - if (error == 0) - arp_ddi_destroy(); return (error); } diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h deleted file mode 100644 index 38d0d1ab65..0000000000 --- a/usr/src/uts/common/inet/arp_impl.h +++ /dev/null @@ -1,253 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#ifndef _ARP_IMPL_H -#define _ARP_IMPL_H - -#ifdef __cplusplus -extern "C" { -#endif - -#ifdef _KERNEL - -#include <sys/types.h> -#include <sys/stream.h> -#include <net/if.h> -#include <sys/netstack.h> - -/* ARP kernel hash size; used for mdb support */ -#define ARP_HASH_SIZE 256 - -/* Named Dispatch Parameter Management Structure */ -typedef struct arpparam_s { - uint32_t arp_param_min; - uint32_t arp_param_max; - uint32_t arp_param_value; - char *arp_param_name; -} arpparam_t; - -/* ARL Structure, one per link level device */ -typedef struct arl_s { - struct arl_s *arl_next; /* ARL chain at arl_g_head */ - queue_t *arl_rq; /* Read queue pointer */ - queue_t *arl_wq; /* Write queue pointer */ - t_uscalar_t arl_ppa; /* DL_ATTACH parameter */ - char arl_name[LIFNAMSIZ]; /* Lower level name */ - mblk_t *arl_unbind_mp; - mblk_t *arl_detach_mp; - t_uscalar_t arl_provider_style; /* From DL_INFO_ACK */ - mblk_t *arl_queue; /* Queued commands head */ - mblk_t *arl_queue_tail; /* Queued commands tail */ - uint32_t arl_flags; /* ARL_F_* values below */ - t_uscalar_t arl_dlpi_pending; /* pending DLPI request */ - mblk_t *arl_dlpi_deferred; /* Deferred DLPI messages */ - uint_t arl_state; /* lower interface state */ - uint_t arl_closing : 1, /* stream is closing */ - arl_replumbing : 1; /* Wait for IP to bring down */ - uint32_t arl_index; /* instance number */ - struct arlphy_s *arl_phy; /* physical info, if any */ - struct arl_s *arl_ipmp_arl; /* pointer to group arl_t */ -} arl_t; - -/* - * There is no field to get from an arl_t to an arp_stack_t, but this - * macro does it. 
- */ -#define ARL_TO_ARPSTACK(_arl) (((ar_t *)(_arl)->arl_rq->q_ptr)->ar_as) - -/* ARL physical info structure, one per physical link level device */ -typedef struct arlphy_s { - uint32_t ap_arp_hw_type; /* hardware type */ - uchar_t *ap_arp_addr; /* multicast address to use */ - uchar_t *ap_hw_addr; /* hardware address */ - uint32_t ap_hw_addrlen; /* hardware address length */ - mblk_t *ap_xmit_mp; /* DL_UNITDATA_REQ template */ - t_uscalar_t ap_xmit_addroff; /* address offset in xmit_mp */ - t_uscalar_t ap_xmit_sapoff; /* sap offset in xmit_mp */ - t_scalar_t ap_saplen; /* sap length */ - clock_t ap_defend_start; /* start of 1-hour period */ - uint_t ap_defend_count; /* # of unbidden broadcasts */ - uint_t ap_notifies : 1, /* handles DL_NOTE_LINK */ - ap_link_down : 1; /* DL_NOTE status */ -} arlphy_t; - -/* ARP Cache Entry */ -typedef struct ace_s { - struct ace_s *ace_next; /* Hash chain next pointer */ - struct ace_s **ace_ptpn; /* Pointer to previous next */ - struct arl_s *ace_arl; /* Associated arl */ - uint32_t ace_proto; /* Protocol for this ace */ - uint32_t ace_flags; - uchar_t *ace_proto_addr; - uint32_t ace_proto_addr_length; - uchar_t *ace_proto_mask; /* Mask for matching addr */ - uchar_t *ace_proto_extract_mask; /* For mappings */ - uchar_t *ace_hw_addr; - uint32_t ace_hw_addr_length; - uint32_t ace_hw_extract_start; /* For mappings */ - mblk_t *ace_mp; /* mblk we are in */ - mblk_t *ace_query_mp; /* outstanding query chain */ - clock_t ace_last_bcast; /* last broadcast Response */ - clock_t ace_xmit_interval; - int ace_xmit_count; - arl_t *ace_xmit_arl; /* xmit on this arl */ -} ace_t; - -#define ARPHOOK_INTERESTED_PHYSICAL_IN(as) \ - (as->as_arp_physical_in_event.he_interested) -#define ARPHOOK_INTERESTED_PHYSICAL_OUT(as) \ - (as->as_arp_physical_out_event.he_interested) - -#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, as) \ - \ - if ((_hook).he_interested) { \ - hook_pkt_event_t info; \ - \ - info.hpe_protocol = as->as_net_data; \ - 
info.hpe_ifp = _ilp; \ - info.hpe_ofp = 0; \ - info.hpe_hdr = _hdr; \ - info.hpe_mp = &(_fm); \ - info.hpe_mb = _m; \ - if (hook_run(as->as_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ - _hdr = NULL; \ - _m = NULL; \ - } else { \ - _hdr = info.hpe_hdr; \ - _m = info.hpe_mb; \ - } \ - } - -#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, as) \ - \ - if ((_hook).he_interested) { \ - hook_pkt_event_t info; \ - \ - info.hpe_protocol = as->as_net_data; \ - info.hpe_ifp = 0; \ - info.hpe_ofp = _olp; \ - info.hpe_hdr = _hdr; \ - info.hpe_mp = &(_fm); \ - info.hpe_mb = _m; \ - if (hook_run(as->as_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ - if (_fm != NULL) { \ - freemsg(_fm); \ - _fm = NULL; \ - } \ - _hdr = NULL; \ - _m = NULL; \ - } else { \ - _hdr = info.hpe_hdr; \ - _m = info.hpe_mb; \ - } \ - } - -#define ACE_EXTERNAL_FLAGS_MASK \ - (ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MAPPING | ACE_F_MYADDR | \ - ACE_F_AUTHORITY) - -/* - * ARP stack instances - */ -struct arp_stack { - netstack_t *as_netstack; /* Common netstack */ - void *as_head; /* AR Instance Data List Head */ - caddr_t as_nd; /* AR Named Dispatch Head */ - struct arl_s *as_arl_head; /* ARL List Head */ - arpparam_t *as_param_arr; /* ndd variable table */ - - /* ARP Cache Entry Hash Table */ - ace_t *as_ce_hash_tbl[ARP_HASH_SIZE]; - ace_t *as_ce_mask_entries; - - /* - * With the introduction of netinfo (neti kernel module), - * it is now possible to access data structures in the ARP module - * without the code being executed in the context of the IP module, - * thus there is no locking being enforced through the use of STREAMS. - * as_arl_lock is used to protect as_arl_head list. 
- */ - krwlock_t as_arl_lock; - - uint32_t as_arp_index_counter; - uint32_t as_arp_counter_wrapped; - - /* arp_neti.c */ - hook_family_t as_arproot; - - /* - * Hooks for ARP - */ - hook_event_t as_arp_physical_in_event; - hook_event_t as_arp_physical_out_event; - hook_event_t as_arp_nic_events; - - hook_event_token_t as_arp_physical_in; - hook_event_token_t as_arp_physical_out; - hook_event_token_t as_arpnicevents; - - net_handle_t as_net_data; -}; -typedef struct arp_stack arp_stack_t; - -#define ARL_F_NOARP 0x01 -#define ARL_F_IPMP 0x02 - -#define ARL_S_DOWN 0x00 -#define ARL_S_PENDING 0x01 -#define ARL_S_UP 0x02 - -/* AR Structure, one per upper stream */ -typedef struct ar_s { - queue_t *ar_rq; /* Read queue pointer */ - queue_t *ar_wq; /* Write queue pointer */ - arl_t *ar_arl; /* Associated arl */ - cred_t *ar_credp; /* Credentials associated w/ open */ - struct ar_s *ar_arl_ip_assoc; /* ARL - IP association */ - uint32_t - ar_ip_acked_close : 1, /* IP has acked the close */ - ar_on_ill_stream : 1; /* Module below is IP */ - arp_stack_t *ar_as; -} ar_t; - -extern void arp_hook_init(arp_stack_t *); -extern void arp_hook_destroy(arp_stack_t *); -extern void arp_net_init(arp_stack_t *, netstackid_t); -extern void arp_net_shutdown(arp_stack_t *); -extern void arp_net_destroy(arp_stack_t *); - -#endif /* _KERNEL */ - -#ifdef __cplusplus -} -#endif - -#endif /* _ARP_IMPL_H */ diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 5a7e05b210..88a14068bb 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -55,8 +55,6 @@ extern "C" { #include <sys/squeue.h> #include <net/route.h> #include <sys/systm.h> -#include <sys/multidata.h> -#include <sys/list.h> #include <net/radix.h> #include <sys/modhash.h> @@ -94,6 +92,7 @@ typedef uint32_t ipaddr_t; /* Number of bits in an address */ #define IP_ABITS 32 +#define IPV4_ABITS IP_ABITS #define IPV6_ABITS 128 #define IP_HOST_MASK (ipaddr_t)0xffffffffU @@ -101,14 +100,6 @@ 
typedef uint32_t ipaddr_t; #define IP_CSUM(mp, off, sum) (~ip_cksum(mp, off, sum) & 0xFFFF) #define IP_CSUM_PARTIAL(mp, off, sum) ip_cksum(mp, off, sum) #define IP_BCSUM_PARTIAL(bp, len, sum) bcksum(bp, len, sum) -#define IP_MD_CSUM(pd, off, sum) (~ip_md_cksum(pd, off, sum) & 0xffff) -#define IP_MD_CSUM_PARTIAL(pd, off, sum) ip_md_cksum(pd, off, sum) - -/* - * Flag to IP write side to indicate that the appln has sent in a pre-built - * IP header. Stored in ipha_ident (which is otherwise zero). - */ -#define IP_HDR_INCLUDED 0xFFFF #define ILL_FRAG_HASH_TBL_COUNT ((unsigned int)64) #define ILL_FRAG_HASH_TBL_SIZE (ILL_FRAG_HASH_TBL_COUNT * sizeof (ipfb_t)) @@ -137,17 +128,12 @@ typedef uint32_t ipaddr_t; #define UDPH_SIZE 8 -/* Leave room for ip_newroute to tack on the src and target addresses */ -#define OK_RESOLVER_MP(mp) \ - ((mp) && ((mp)->b_wptr - (mp)->b_rptr) >= (2 * IP_ADDR_LEN)) - /* * Constants and type definitions to support IP IOCTL commands */ #define IP_IOCTL (('i'<<8)|'p') #define IP_IOC_IRE_DELETE 4 #define IP_IOC_IRE_DELETE_NO_REPLY 5 -#define IP_IOC_IRE_ADVISE_NO_REPLY 6 #define IP_IOC_RTS_REQUEST 7 /* Common definitions used by IP IOCTL data structures */ @@ -157,31 +143,6 @@ typedef struct ipllcmd_s { uint_t ipllc_name_length; } ipllc_t; -/* IP IRE Change Command Structure. 
*/ -typedef struct ipic_s { - ipllc_t ipic_ipllc; - uint_t ipic_ire_type; - uint_t ipic_max_frag; - uint_t ipic_addr_offset; - uint_t ipic_addr_length; - uint_t ipic_mask_offset; - uint_t ipic_mask_length; - uint_t ipic_src_addr_offset; - uint_t ipic_src_addr_length; - uint_t ipic_ll_hdr_offset; - uint_t ipic_ll_hdr_length; - uint_t ipic_gateway_addr_offset; - uint_t ipic_gateway_addr_length; - clock_t ipic_rtt; - uint32_t ipic_ssthresh; - clock_t ipic_rtt_sd; - uchar_t ipic_ire_marks; -} ipic_t; - -#define ipic_cmd ipic_ipllc.ipllc_cmd -#define ipic_ll_name_length ipic_ipllc.ipllc_name_length -#define ipic_ll_name_offset ipic_ipllc.ipllc_name_offset - /* IP IRE Delete Command Structure. */ typedef struct ipid_s { ipllc_t ipid_ipllc; @@ -257,16 +218,8 @@ typedef struct ipoptp_s #define Q_TO_ICMP(q) (Q_TO_CONN((q))->conn_icmp) #define Q_TO_RTS(q) (Q_TO_CONN((q))->conn_rts) -/* - * The following two macros are used by IP to get the appropriate - * wq and rq for a conn. If it is a TCP conn, then we need - * tcp_wq/tcp_rq else, conn_wq/conn_rq. IP can use conn_wq and conn_rq - * from a conn directly if it knows that the conn is not TCP. - */ -#define CONNP_TO_WQ(connp) \ - (IPCL_IS_TCP(connp) ? (connp)->conn_tcp->tcp_wq : (connp)->conn_wq) - -#define CONNP_TO_RQ(connp) RD(CONNP_TO_WQ(connp)) +#define CONNP_TO_WQ(connp) ((connp)->conn_wq) +#define CONNP_TO_RQ(connp) ((connp)->conn_rq) #define GRAB_CONN_LOCK(q) { \ if (q != NULL && CONN_Q(q)) \ @@ -278,9 +231,6 @@ typedef struct ipoptp_s mutex_exit(&(Q_TO_CONN(q))->conn_lock); \ } -/* "Congestion controlled" protocol */ -#define IP_FLOW_CONTROLLED_ULP(p) ((p) == IPPROTO_TCP || (p) == IPPROTO_SCTP) - /* * Complete the pending operation. Usually an ioctl. Can also * be a bind or option management request that got enqueued @@ -295,63 +245,13 @@ typedef struct ipoptp_s } /* - * Flags for the various ip_fanout_* routines. 
- */ -#define IP_FF_SEND_ICMP 0x01 /* Send an ICMP error */ -#define IP_FF_HDR_COMPLETE 0x02 /* Call ip_hdr_complete if error */ -#define IP_FF_CKSUM 0x04 /* Recompute ipha_cksum if error */ -#define IP_FF_RAWIP 0x08 /* Use rawip mib variable */ -#define IP_FF_SRC_QUENCH 0x10 /* OK to send ICMP_SOURCE_QUENCH */ -#define IP_FF_SYN_ADDIRE 0x20 /* Add IRE if TCP syn packet */ -#define IP_FF_IPINFO 0x80 /* Used for both V4 and V6 */ -#define IP_FF_SEND_SLLA 0x100 /* Send source link layer info ? */ -#define IPV6_REACHABILITY_CONFIRMATION 0x200 /* Flags for ip_xmit_v6 */ -#define IP_FF_NO_MCAST_LOOP 0x400 /* No multicasts for sending zone */ - -/* - * Following flags are used by IPQoS to determine if policy processing is - * required. - */ -#define IP6_NO_IPPOLICY 0x800 /* Don't do IPQoS processing */ -#define IP6_IN_LLMCAST 0x1000 /* Multicast */ - -#define IP_FF_LOOPBACK 0x2000 /* Loopback fanout */ -#define IP_FF_SCTP_CSUM_ERR 0x4000 /* sctp pkt has failed chksum */ - -#ifndef IRE_DB_TYPE -#define IRE_DB_TYPE M_SIG -#endif - -#ifndef IRE_DB_REQ_TYPE -#define IRE_DB_REQ_TYPE M_PCSIG -#endif - -#ifndef IRE_ARPRESOLVE_TYPE -#define IRE_ARPRESOLVE_TYPE M_EVENT -#endif - -/* * Values for squeue switch: */ - #define IP_SQUEUE_ENTER_NODRAIN 1 #define IP_SQUEUE_ENTER 2 -/* - * This is part of the interface between Transport provider and - * IP which can be used to set policy information. This is usually - * accompanied with O_T_BIND_REQ/T_BIND_REQ.ip_bind assumes that - * only IPSEC_POLICY_SET is there when it is found in the chain. - * The information contained is an struct ipsec_req_t. On success - * or failure, either the T_BIND_ACK or the T_ERROR_ACK is returned. - * IPSEC_POLICY_SET is never returned. 
- */ -#define IPSEC_POLICY_SET M_SETOPTS +#define IP_SQUEUE_FILL 3 -#define IRE_IS_LOCAL(ire) ((ire != NULL) && \ - ((ire)->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) - -#define IRE_IS_TARGET(ire) ((ire != NULL) && \ - ((ire)->ire_type != IRE_BROADCAST)) +extern int ip_squeue_flag; /* IP Fragmentation Reassembly Header */ typedef struct ipf_s { @@ -387,71 +287,6 @@ typedef struct ipf_s { #define ipf_src V4_PART_OF_V6(ipf_v6src) #define ipf_dst V4_PART_OF_V6(ipf_v6dst) -typedef enum { - IB_PKT = 0x01, - OB_PKT = 0x02 -} ip_pkt_t; - -#define UPDATE_IB_PKT_COUNT(ire)\ - { \ - (ire)->ire_ib_pkt_count++; \ - if ((ire)->ire_ipif != NULL) { \ - /* \ - * forwarding packet \ - */ \ - if ((ire)->ire_type & (IRE_LOCAL|IRE_BROADCAST)) \ - atomic_add_32(&(ire)->ire_ipif->ipif_ib_pkt_count, 1);\ - else \ - atomic_add_32(&(ire)->ire_ipif->ipif_fo_pkt_count, 1);\ - } \ - } - -#define UPDATE_OB_PKT_COUNT(ire)\ - { \ - (ire)->ire_ob_pkt_count++;\ - if ((ire)->ire_ipif != NULL) { \ - atomic_add_32(&(ire)->ire_ipif->ipif_ob_pkt_count, 1); \ - } \ - } - -#define IP_RPUT_LOCAL(q, mp, ipha, ire, recv_ill) \ -{ \ - switch (ipha->ipha_protocol) { \ - case IPPROTO_UDP: \ - ip_udp_input(q, mp, ipha, ire, recv_ill); \ - break; \ - default: \ - ip_proto_input(q, mp, ipha, ire, recv_ill, 0); \ - break; \ - } \ -} - -/* - * NCE_EXPIRED is TRUE when we have a non-permanent nce that was - * found to be REACHABLE more than ip_ire_arp_interval ms ago. - * This macro is used to age existing nce_t entries. The - * nce's will get cleaned up in the following circumstances: - * - ip_ire_trash_reclaim will free nce's using ndp_cache_reclaim - * when memory is low, - * - ip_arp_news, when updates are received. - * - if the nce is NCE_EXPIRED(), it will deleted, so that a new - * arp request will need to be triggered from an ND_INITIAL nce. - * - * Note that the nce state transition follows the pattern: - * ND_INITIAL -> ND_INCOMPLETE -> ND_REACHABLE - * after which the nce is deleted when it has expired. 
- * - * nce_last is the timestamp that indicates when the nce_res_mp in the - * nce_t was last updated to a valid link-layer address. nce_last gets - * modified/updated : - * - when the nce is created - * - every time we get a sane arp response for the nce. - */ -#define NCE_EXPIRED(nce, ipst) (nce->nce_last > 0 && \ - ((nce->nce_flags & NCE_F_PERMANENT) == 0) && \ - ((TICK_TO_MSEC(lbolt64) - nce->nce_last) > \ - (ipst)->ips_ip_ire_arp_interval)) - #endif /* _KERNEL */ /* ICMP types */ @@ -560,7 +395,17 @@ typedef struct ipha_s { #define IPH_DF 0x4000 /* Don't fragment */ #define IPH_MF 0x2000 /* More fragments to come */ #define IPH_OFFSET 0x1FFF /* Where the offset lives */ -#define IPH_FRAG_HDR 0x8000 /* IPv6 don't fragment bit */ + +/* Byte-order specific values */ +#ifdef _BIG_ENDIAN +#define IPH_DF_HTONS 0x4000 /* Don't fragment */ +#define IPH_MF_HTONS 0x2000 /* More fragments to come */ +#define IPH_OFFSET_HTONS 0x1FFF /* Where the offset lives */ +#else +#define IPH_DF_HTONS 0x0040 /* Don't fragment */ +#define IPH_MF_HTONS 0x0020 /* More fragments to come */ +#define IPH_OFFSET_HTONS 0xFF1F /* Where the offset lives */ +#endif /* ECN code points for IPv4 TOS byte and IPv6 traffic class octet. 
*/ #define IPH_ECN_NECT 0x0 /* Not ECN-Capable Transport */ @@ -571,10 +416,8 @@ typedef struct ipha_s { struct ill_s; typedef void ip_v6intfid_func_t(struct ill_s *, in6_addr_t *); -typedef boolean_t ip_v6mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, - in6_addr_t *); -typedef boolean_t ip_v4mapinfo_func_t(uint_t, uint8_t *, uint8_t *, uint32_t *, - ipaddr_t *); +typedef void ip_v6mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *); +typedef void ip_v4mapinfo_func_t(struct ill_s *, uchar_t *, uchar_t *); /* IP Mac info structure */ typedef struct ip_m_s { @@ -582,8 +425,8 @@ typedef struct ip_m_s { int ip_m_type; /* From <net/if_types.h> */ t_uscalar_t ip_m_ipv4sap; t_uscalar_t ip_m_ipv6sap; - ip_v4mapinfo_func_t *ip_m_v4mapinfo; - ip_v6mapinfo_func_t *ip_m_v6mapinfo; + ip_v4mapinfo_func_t *ip_m_v4mapping; + ip_v6mapinfo_func_t *ip_m_v6mapping; ip_v6intfid_func_t *ip_m_v6intfid; ip_v6intfid_func_t *ip_m_v6destintfid; } ip_m_t; @@ -591,20 +434,14 @@ typedef struct ip_m_s { /* * The following functions attempt to reduce the link layer dependency * of the IP stack. The current set of link specific operations are: - * a. map from IPv4 class D (224.0/4) multicast address range to the link - * layer multicast address range. - * b. map from IPv6 multicast address range (ff00::/8) to the link - * layer multicast address range. - * c. derive the default IPv6 interface identifier from the interface. - * d. derive the default IPv6 destination interface identifier from + * a. map from IPv4 class D (224.0/4) multicast address range or the + * IPv6 multicast address range (ff00::/8) to the link layer multicast + * address. + * b. derive the default IPv6 interface identifier from the interface. + * c. derive the default IPv6 destination interface identifier from * the interface (point-to-point only). 
*/ -#define MEDIA_V4MINFO(ip_m, plen, bphys, maddr, hwxp, v4ptr) \ - (((ip_m)->ip_m_v4mapinfo != NULL) && \ - (*(ip_m)->ip_m_v4mapinfo)(plen, bphys, maddr, hwxp, v4ptr)) -#define MEDIA_V6MINFO(ip_m, plen, bphys, maddr, hwxp, v6ptr) \ - (((ip_m)->ip_m_v6mapinfo != NULL) && \ - (*(ip_m)->ip_m_v6mapinfo)(plen, bphys, maddr, hwxp, v6ptr)) +extern void ip_mcast_mapping(struct ill_s *, uchar_t *, uchar_t *); /* ip_m_v6*intfid return void and are never NULL */ #define MEDIA_V6INTFID(ip_m, ill, v6ptr) (ip_m)->ip_m_v6intfid(ill, v6ptr) #define MEDIA_V6DESTINTFID(ip_m, ill, v6ptr) \ @@ -616,107 +453,38 @@ typedef struct ip_m_s { #define IRE_LOCAL 0x0004 /* Route entry for local address */ #define IRE_LOOPBACK 0x0008 /* Route entry for loopback address */ #define IRE_PREFIX 0x0010 /* Route entry for prefix routes */ +#ifndef _KERNEL +/* Keep so user-level still compiles */ #define IRE_CACHE 0x0020 /* Cached Route entry */ +#endif #define IRE_IF_NORESOLVER 0x0040 /* Route entry for local interface */ /* net without any address mapping. */ #define IRE_IF_RESOLVER 0x0080 /* Route entry for local interface */ /* net with resolver. */ #define IRE_HOST 0x0100 /* Host route entry */ +/* Keep so user-level still compiles */ #define IRE_HOST_REDIRECT 0x0200 /* only used for T_SVR4_OPTMGMT_REQ */ +#define IRE_IF_CLONE 0x0400 /* Per host clone of IRE_IF */ +#define IRE_MULTICAST 0x0800 /* Special - not in table */ +#define IRE_NOROUTE 0x1000 /* Special - not in table */ #define IRE_INTERFACE (IRE_IF_NORESOLVER | IRE_IF_RESOLVER) -#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST) -#define IRE_CACHETABLE (IRE_CACHE | IRE_BROADCAST | IRE_LOCAL | \ - IRE_LOOPBACK) -#define IRE_FORWARDTABLE (IRE_INTERFACE | IRE_OFFSUBNET) - -/* - * If an IRE is marked with IRE_MARK_CONDEMNED, the last walker of - * the bucket should delete this IRE from this bucket. - */ -#define IRE_MARK_CONDEMNED 0x0001 - -/* - * An IRE with IRE_MARK_PMTU has ire_max_frag set from an ICMP error. 
- */ -#define IRE_MARK_PMTU 0x0002 - -/* - * An IRE with IRE_MARK_TESTHIDDEN is used by in.mpathd for test traffic. It - * can only be looked up by requesting MATCH_IRE_MARK_TESTHIDDEN. - */ -#define IRE_MARK_TESTHIDDEN 0x0004 - -/* - * An IRE with IRE_MARK_NOADD is created in ip_newroute_ipif when the outgoing - * interface is specified by e.g. IP_PKTINFO. The IRE is not added to the IRE - * cache table. - */ -#define IRE_MARK_NOADD 0x0008 /* Mark not to add ire in cache */ - -/* - * IRE marked with IRE_MARK_TEMPORARY means that this IRE has been used - * either for forwarding a packet or has not been used for sending - * traffic on TCP connections terminated on this system. In both - * cases, this IRE is the first to go when IRE is being cleaned up. - */ -#define IRE_MARK_TEMPORARY 0x0010 - -/* - * IRE marked with IRE_MARK_USESRC_CHECK means that while adding an IRE with - * this mark, additional atomic checks need to be performed. For eg: by the - * time an IRE_CACHE is created, sent up to ARP and then comes back to IP; the - * usesrc grouping could have changed in which case we want to fail adding - * the IRE_CACHE entry - */ -#define IRE_MARK_USESRC_CHECK 0x0020 - -/* - * IRE_MARK_PRIVATE_ADDR is used for IP_NEXTHOP. When IP_NEXTHOP is set, the - * routing table lookup for the destination is bypassed and the packet is - * sent directly to the specified nexthop. The associated IRE_CACHE entries - * should be marked with IRE_MARK_PRIVATE_ADDR flag so that they don't show up - * in regular ire cache lookups. - */ -#define IRE_MARK_PRIVATE_ADDR 0x0040 +#define IRE_IF_ALL (IRE_IF_NORESOLVER | IRE_IF_RESOLVER | \ + IRE_IF_CLONE) +#define IRE_OFFSUBNET (IRE_DEFAULT | IRE_PREFIX | IRE_HOST) +#define IRE_OFFLINK IRE_OFFSUBNET /* - * When we send an ARP resolution query for the nexthop gateway's ire, - * we use esballoc to create the ire_t in the AR_ENTRY_QUERY mblk - * chain, and mark its ire_marks with IRE_MARK_UNCACHED. 
This flag - * indicates that information from ARP has not been transferred to a - * permanent IRE_CACHE entry. The flag is reset only when the - * information is successfully transferred to an ire_cache entry (in - * ire_add()). Attempting to free the AR_ENTRY_QUERY mblk chain prior - * to ire_add (e.g., from arp, or from ip`ip_wput_nondata) will - * require that the resources (incomplete ire_cache and/or nce) must - * be cleaned up. The free callback routine (ire_freemblk()) checks - * for IRE_MARK_UNCACHED to see if any resources that are pinned down - * will need to be cleaned up or not. + * Note that we view IRE_NOROUTE as ONLINK since we can "send" to them without + * going through a router; the result of sending will be an error/icmp error. */ - -#define IRE_MARK_UNCACHED 0x0080 - -/* - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. - */ -typedef struct ire_expire_arg_s { - int iea_flush_flag; - ip_stack_t *iea_ipst; /* Does not have a netstack_hold */ -} ire_expire_arg_t; - -/* Flags with ire_expire routine */ -#define FLUSH_ARP_TIME 0x0001 /* ARP info potentially stale timer */ -#define FLUSH_REDIRECT_TIME 0x0002 /* Redirects potentially stale */ -#define FLUSH_MTU_TIME 0x0004 /* Include path MTU per RFC 1191 */ +#define IRE_ONLINK (IRE_IF_ALL|IRE_LOCAL|IRE_LOOPBACK| \ + IRE_BROADCAST|IRE_MULTICAST|IRE_NOROUTE) /* Arguments to ire_flush_cache() */ #define IRE_FLUSH_DELETE 0 #define IRE_FLUSH_ADD 1 +#define IRE_FLUSH_GWCHANGE 2 /* * Open/close synchronization flags. @@ -724,31 +492,21 @@ typedef struct ire_expire_arg_s { * depends on the atomic 32 bit access to that field. 
*/ #define CONN_CLOSING 0x01 /* ip_close waiting for ip_wsrv */ -#define CONN_IPSEC_LOAD_WAIT 0x02 /* waiting for load */ -#define CONN_CONDEMNED 0x04 /* conn is closing, no more refs */ -#define CONN_INCIPIENT 0x08 /* conn not yet visible, no refs */ -#define CONN_QUIESCED 0x10 /* conn is now quiescent */ - -/* Used to check connection state flags before caching the IRE */ -#define CONN_CACHE_IRE(connp) \ - (!((connp)->conn_state_flags & (CONN_CLOSING|CONN_CONDEMNED))) - -/* - * Parameter to ip_output giving the identity of the caller. - * IP_WSRV means the packet was enqueued in the STREAMS queue - * due to flow control and is now being reprocessed in the context of - * the STREAMS service procedure, consequent to flow control relief. - * IRE_SEND means the packet is being reprocessed consequent to an - * ire cache creation and addition and this may or may not be happening - * in the service procedure context. Anything other than the above 2 - * cases is identified as IP_WPUT. Most commonly this is the case of - * packets coming down from the application. +#define CONN_CONDEMNED 0x02 /* conn is closing, no more refs */ +#define CONN_INCIPIENT 0x04 /* conn not yet visible, no refs */ +#define CONN_QUIESCED 0x08 /* conn is now quiescent */ +#define CONN_UPDATE_ILL 0x10 /* conn_update_ill in progress */ + +/* + * Flags for dce_flags field. Specifies which information has been set. + * dce_ident is always present, but the other ones are identified by the flags. 
*/ -#ifdef _KERNEL -#define IP_WSRV 1 /* Called from ip_wsrv */ -#define IP_WPUT 2 /* Called from ip_wput */ -#define IRE_SEND 3 /* Called from ire_send */ +#define DCEF_DEFAULT 0x0001 /* Default DCE - no pmtu or uinfo */ +#define DCEF_PMTU 0x0002 /* Different than interface MTU */ +#define DCEF_UINFO 0x0004 /* dce_uinfo set */ +#define DCEF_TOO_SMALL_PMTU 0x0008 /* Smaller than IPv4/IPv6 MIN */ +#ifdef _KERNEL /* * Extra structures need for per-src-addr filtering (IGMPv3/MLDv2) */ @@ -786,90 +544,80 @@ typedef struct mrec_s { } mrec_t; /* Group membership list per upper conn */ + /* - * XXX add ilg info for ifaddr/ifindex. - * XXX can we make ilg survive an ifconfig unplumb + plumb - * by setting the ipif/ill to NULL and recover that later? + * We record the multicast information from the socket option in + * ilg_ifaddr/ilg_ifindex. This allows rejoining the group in the case when + * the ifaddr (or ifindex) disappears and later reappears, potentially on + * a different ill. The IPv6 multicast socket options and ioctls all specify + * the interface using an ifindex. For IPv4 some socket options/ioctls use + * the interface address and others use the index. We record here the method + * that was actually used (and leave the other of ilg_ifaddr or ilg_ifindex) + * at zero so that we can rejoin the way the application intended. * - * ilg_ipif is used by IPv4 as multicast groups are joined using an interface - * address (ipif). - * ilg_ill is used by IPv6 as multicast groups are joined using an interface - * index (phyint->phyint_ifindex). - * ilg_ill is NULL for IPv4 and ilg_ipif is NULL for IPv6. + * We track the ill on which we will or already have joined an ilm using + * ilg_ill. When we have succeeded joining the ilm and have a refhold on it + * then we set ilg_ilm. Thus intentionally there is a window where ilg_ill is + * set and ilg_ilm is not set. This allows clearing ilg_ill as a signal that + * the ill is being unplumbed and the ilm should be discarded. 
* * ilg records the state of multicast memberships of a socket end point. * ilm records the state of multicast memberships with the driver and is * maintained per interface. * - * There is no direct link between a given ilg and ilm. If the - * application has joined a group G with ifindex I, we will have - * an ilg with ilg_v6group and ilg_ill. There will be a corresponding - * ilm with ilm_ill/ilm_v6addr recording the multicast membership. - * To delete the membership: - * - * a) Search for ilg matching on G and I with ilg_v6group - * and ilg_ill. Delete ilg_ill. - * b) Search the corresponding ilm matching on G and I with - * ilm_v6addr and ilm_ill. Delete ilm. - * - * For IPv4 the only difference is that we look using ipifs, not ills. + * The ilg state is protected by conn_ilg_lock. + * The ilg will not be freed until ilg_refcnt drops to zero. */ - -/* - * The ilg_t and ilm_t members are protected by ipsq. They can be changed only - * by a thread executing in the ipsq. In other words add/delete of a - * multicast group has to execute in the ipsq. - */ -#define ILG_DELETED 0x1 /* ilg_flags */ typedef struct ilg_s { + struct ilg_s *ilg_next; + struct ilg_s **ilg_ptpn; + struct conn_s *ilg_connp; /* Back pointer to get lock */ in6_addr_t ilg_v6group; - struct ipif_s *ilg_ipif; /* Logical interface we are member on */ - struct ill_s *ilg_ill; /* Used by IPv6 */ - uint_t ilg_flags; + ipaddr_t ilg_ifaddr; /* For some IPv4 cases */ + uint_t ilg_ifindex; /* IPv6 and some other IPv4 cases */ + struct ill_s *ilg_ill; /* Where ilm is joined. No refhold */ + struct ilm_s *ilg_ilm; /* With ilm_refhold */ + uint_t ilg_refcnt; mcast_record_t ilg_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilg_filter; + boolean_t ilg_condemned; /* Conceptually deleted */ } ilg_t; /* * Multicast address list entry for ill. - * ilm_ipif is used by IPv4 as multicast groups are joined using ipif. - * ilm_ill is used by IPv6 as multicast groups are joined using ill. 
- * ilm_ill is NULL for IPv4 and ilm_ipif is NULL for IPv6. + * ilm_ill is used by IPv4 and IPv6 + * + * The ilm state (and other multicast state on the ill) is protected by + * ill_mcast_lock. Operations that change state on both an ilg and ilm + * in addition use ill_mcast_serializer to ensure that we can't have + * interleaving between e.g., add and delete operations for the same conn_t, + * group, and ill. * * The comment below (and for other netstack_t references) refers * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's + * such as the references from open endpoints (ill_t and conn_t's * pointers). Internally within IP we rely on IP's ability to cleanup e.g. * ire_t's when an ill goes away. */ -#define ILM_DELETED 0x1 /* ilm_flags */ typedef struct ilm_s { in6_addr_t ilm_v6addr; int ilm_refcnt; uint_t ilm_timer; /* IGMP/MLD query resp timer, in msec */ - struct ipif_s *ilm_ipif; /* Back pointer to ipif for IPv4 */ struct ilm_s *ilm_next; /* Linked list for each ill */ uint_t ilm_state; /* state of the membership */ - struct ill_s *ilm_ill; /* Back pointer to ill for IPv6 */ - uint_t ilm_flags; - boolean_t ilm_notify_driver; /* Need to notify the driver */ + struct ill_s *ilm_ill; /* Back pointer to ill - ill_ilm_cnt */ zoneid_t ilm_zoneid; int ilm_no_ilg_cnt; /* number of joins w/ no ilg */ mcast_record_t ilm_fmode; /* MODE_IS_INCLUDE/MODE_IS_EXCLUDE */ slist_t *ilm_filter; /* source filter list */ slist_t *ilm_pendsrcs; /* relevant src addrs for pending req */ rtx_state_t ilm_rtx; /* SCR retransmission state */ + ipaddr_t ilm_ifaddr; /* For IPv4 netstat */ ip_stack_t *ilm_ipst; /* Does not have a netstack_hold */ } ilm_t; #define ilm_addr V4_PART_OF_V6(ilm_v6addr) -typedef struct ilm_walker { - struct ill_s *ilw_ill; /* associated ill */ - struct ill_s *ilw_ipmp_ill; /* associated ipmp ill (if any) */ - struct ill_s *ilw_walk_ill; /* current ill being walked */ -} ilm_walker_t; - 
/* * Soft reference to an IPsec SA. * @@ -898,40 +646,28 @@ typedef struct ipsa_ref_s * In the presence of IPsec policy, fully-bound conn's bind a connection * to more than just the 5-tuple, but also a specific IPsec action and * identity-pair. - * - * As an optimization, we also cache soft references to IPsec SA's - * here so that we can fast-path around most of the work needed for + * The identity pair is accessed from both the receive and transmit side + * hence it is maintained in the ipsec_latch_t structure. conn_latch and + * ixa_ipsec_latch points to it. + * The policy and actions are stored in conn_latch_in_policy and + * conn_latch_in_action for the inbound side, and in ixa_ipsec_policy and + * ixa_ipsec_action for the transmit side. + * + * As an optimization, we also cache soft references to IPsec SA's in + * ip_xmit_attr_t so that we can fast-path around most of the work needed for * outbound IPsec SA selection. - * - * Were it not for TCP's detached connections, this state would be - * in-line in conn_t; instead, this is in a separate structure so it - * can be handed off to TCP when a connection is detached. 
*/ typedef struct ipsec_latch_s { kmutex_t ipl_lock; uint32_t ipl_refcnt; - uint64_t ipl_unique; - struct ipsec_policy_s *ipl_in_policy; /* latched policy (in) */ - struct ipsec_policy_s *ipl_out_policy; /* latched policy (out) */ - struct ipsec_action_s *ipl_in_action; /* latched action (in) */ - struct ipsec_action_s *ipl_out_action; /* latched action (out) */ - cred_t *ipl_local_id; struct ipsid_s *ipl_local_cid; struct ipsid_s *ipl_remote_cid; unsigned int - ipl_out_action_latched : 1, - ipl_in_action_latched : 1, - ipl_out_policy_latched : 1, - ipl_in_policy_latched : 1, - ipl_ids_latched : 1, - ipl_pad_to_bit_31 : 27; - - ipsa_ref_t ipl_ref[2]; /* 0: ESP, 1: AH */ - + ipl_pad_to_bit_31 : 31; } ipsec_latch_t; #define IPLATCH_REFHOLD(ipl) { \ @@ -939,97 +675,19 @@ typedef struct ipsec_latch_s ASSERT((ipl)->ipl_refcnt != 0); \ } -#define IPLATCH_REFRELE(ipl, ns) { \ +#define IPLATCH_REFRELE(ipl) { \ ASSERT((ipl)->ipl_refcnt != 0); \ membar_exit(); \ if (atomic_add_32_nv(&(ipl)->ipl_refcnt, -1) == 0) \ - iplatch_free(ipl, ns); \ + iplatch_free(ipl); \ } /* * peer identity structure. */ - typedef struct conn_s conn_t; /* - * The old IP client structure "ipc_t" is gone. All the data is stored in the - * connection structure "conn_t" now. 
The mapping of old and new fields looks - * like this: - * - * ipc_ulp conn_ulp - * ipc_rq conn_rq - * ipc_wq conn_wq - * - * ipc_laddr conn_src - * ipc_faddr conn_rem - * ipc_v6laddr conn_srcv6 - * ipc_v6faddr conn_remv6 - * - * ipc_lport conn_lport - * ipc_fport conn_fport - * ipc_ports conn_ports - * - * ipc_policy conn_policy - * ipc_latch conn_latch - * - * ipc_irc_lock conn_lock - * ipc_ire_cache conn_ire_cache - * - * ipc_state_flags conn_state_flags - * ipc_outgoing_ill conn_outgoing_ill - * - * ipc_dontroute conn_dontroute - * ipc_loopback conn_loopback - * ipc_broadcast conn_broadcast - * ipc_reuseaddr conn_reuseaddr - * - * ipc_multicast_loop conn_multicast_loop - * ipc_multi_router conn_multi_router - * ipc_draining conn_draining - * - * ipc_did_putbq conn_did_putbq - * ipc_unspec_src conn_unspec_src - * ipc_policy_cached conn_policy_cached - * - * ipc_in_enforce_policy conn_in_enforce_policy - * ipc_out_enforce_policy conn_out_enforce_policy - * ipc_af_isv6 conn_af_isv6 - * ipc_pkt_isv6 conn_pkt_isv6 - * - * ipc_ipv6_recvpktinfo conn_ipv6_recvpktinfo - * - * ipc_ipv6_recvhoplimit conn_ipv6_recvhoplimit - * ipc_ipv6_recvhopopts conn_ipv6_recvhopopts - * ipc_ipv6_recvdstopts conn_ipv6_recvdstopts - * - * ipc_ipv6_recvrthdr conn_ipv6_recvrthdr - * ipc_ipv6_recvrtdstopts conn_ipv6_recvrtdstopts - * ipc_fully_bound conn_fully_bound - * - * ipc_recvif conn_recvif - * - * ipc_recvslla conn_recvslla - * ipc_acking_unbind conn_acking_unbind - * ipc_pad_to_bit_31 conn_pad_to_bit_31 - * - * ipc_proto conn_proto - * ipc_incoming_ill conn_incoming_ill - * ipc_pending_ill conn_pending_ill - * ipc_unbind_mp conn_unbind_mp - * ipc_ilg conn_ilg - * ipc_ilg_allocated conn_ilg_allocated - * ipc_ilg_inuse conn_ilg_inuse - * ipc_ilg_walker_cnt conn_ilg_walker_cnt - * ipc_refcv conn_refcv - * ipc_multicast_ipif conn_multicast_ipif - * ipc_multicast_ill conn_multicast_ill - * ipc_drain_next conn_drain_next - * ipc_drain_prev conn_drain_prev - * ipc_idl conn_idl - */ - -/* * 
This is used to match an inbound/outbound datagram with policy. */ typedef struct ipsec_selector { @@ -1069,22 +727,6 @@ typedef struct ipsec_selector { #define IPSEC_POLICY_MAX 5 /* Always max + 1. */ /* - * Folowing macro is used whenever the code does not know whether there - * is a M_CTL present in the front and it needs to examine the actual mp - * i.e the IP header. As a M_CTL message could be in the front, this - * extracts the packet into mp and the M_CTL mp into first_mp. If M_CTL - * mp is not present, both first_mp and mp point to the same message. - */ -#define EXTRACT_PKT_MP(mp, first_mp, mctl_present) \ - (first_mp) = (mp); \ - if ((mp)->b_datap->db_type == M_CTL) { \ - (mp) = (mp)->b_cont; \ - (mctl_present) = B_TRUE; \ - } else { \ - (mctl_present) = B_FALSE; \ - } - -/* * Check with IPSEC inbound policy if * * 1) per-socket policy is present - indicated by conn_in_enforce_policy. @@ -1113,11 +755,6 @@ typedef struct ipsec_selector { /* * Information cached in IRE for upper layer protocol (ULP). - * - * Notice that ire_max_frag is not included in the iulp_t structure, which - * it may seem that it should. But ire_max_frag cannot really be cached. It - * is fixed for each interface. For MTU found by PMTUd, we may want to cache - * it. But currently, we do not do that. */ typedef struct iulp_s { boolean_t iulp_set; /* Is any metric set? */ @@ -1128,17 +765,21 @@ typedef struct iulp_s { uint32_t iulp_rpipe; /* Receive pipe size. */ uint32_t iulp_rtomax; /* Max round trip timeout. */ uint32_t iulp_sack; /* Use SACK option (TCP)? */ + uint32_t iulp_mtu; /* Setable with routing sockets */ + uint32_t iulp_tstamp_ok : 1, /* Use timestamp option (TCP)? */ iulp_wscale_ok : 1, /* Use window scale option (TCP)? */ iulp_ecn_ok : 1, /* Enable ECN (for TCP)? */ iulp_pmtud_ok : 1, /* Enable PMTUd? 
*/ - iulp_not_used : 28; -} iulp_t; + /* These three are passed out by ip_set_destination */ + iulp_localnet: 1, /* IRE_ONLINK */ + iulp_loopback: 1, /* IRE_LOOPBACK */ + iulp_local: 1, /* IRE_LOCAL */ -/* Zero iulp_t. */ -extern const iulp_t ire_uinfo_null; + iulp_not_used : 25; +} iulp_t; /* * The conn drain list structure (idl_t). @@ -1173,7 +814,6 @@ struct idl_tx_list_s { struct idl_s { conn_t *idl_conn; /* Head of drain list */ kmutex_t idl_lock; /* Lock for this list */ - conn_t *idl_conn_draining; /* conn that is draining */ uint32_t idl_repeat : 1, /* Last conn must re-enable */ /* drain list again */ @@ -1182,36 +822,38 @@ struct idl_s { }; #define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock)) + /* * Interface route structure which holds the necessary information to recreate - * routes that are tied to an interface (namely where ire_ipif != NULL). + * routes that are tied to an interface i.e. have ire_ill set. + * * These routes which were initially created via a routing socket or via the * SIOCADDRT ioctl may be gateway routes (RTF_GATEWAY being set) or may be - * traditional interface routes. When an interface comes back up after being - * marked down, this information will be used to recreate the routes. These - * are part of an mblk_t chain that hangs off of the IPIF (ipif_saved_ire_mp). + * traditional interface routes. When an ill comes back up after being + * down, this information will be used to recreate the routes. These + * are part of an mblk_t chain that hangs off of the ILL (ill_saved_ire_mp). */ typedef struct ifrt_s { ushort_t ifrt_type; /* Type of IRE */ in6_addr_t ifrt_v6addr; /* Address IRE represents. */ - in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFSUBNET */ - in6_addr_t ifrt_v6src_addr; /* Src addr if RTF_SETSRC */ + in6_addr_t ifrt_v6gateway_addr; /* Gateway if IRE_OFFLINK */ + in6_addr_t ifrt_v6setsrc_addr; /* Src addr if RTF_SETSRC */ in6_addr_t ifrt_v6mask; /* Mask for matching IRE. 
*/ uint32_t ifrt_flags; /* flags related to route */ - uint_t ifrt_max_frag; /* MTU (next hop or path). */ - iulp_t ifrt_iulp_info; /* Cached IRE ULP info. */ + iulp_t ifrt_metrics; /* Routing socket metrics */ + zoneid_t ifrt_zoneid; /* zoneid for route */ } ifrt_t; #define ifrt_addr V4_PART_OF_V6(ifrt_v6addr) #define ifrt_gateway_addr V4_PART_OF_V6(ifrt_v6gateway_addr) -#define ifrt_src_addr V4_PART_OF_V6(ifrt_v6src_addr) #define ifrt_mask V4_PART_OF_V6(ifrt_v6mask) +#define ifrt_setsrc_addr V4_PART_OF_V6(ifrt_v6setsrc_addr) /* Number of IP addresses that can be hosted on a physical interface */ #define MAX_ADDRS_PER_IF 8192 /* * Number of Source addresses to be considered for source address - * selection. Used by ipif_select_source[_v6]. + * selection. Used by ipif_select_source_v4/v6. */ #define MAX_IPIF_SELECT_SOURCE 50 @@ -1245,16 +887,13 @@ typedef struct th_hash_s { #define IPIF_CONDEMNED 0x1 /* The ipif is being removed */ #define IPIF_CHANGING 0x2 /* A critcal ipif field is changing */ #define IPIF_SET_LINKLOCAL 0x10 /* transient flag during bringup */ -#define IPIF_ZERO_SOURCE 0x20 /* transient flag during bringup */ /* IP interface structure, one per local address */ typedef struct ipif_s { struct ipif_s *ipif_next; struct ill_s *ipif_ill; /* Back pointer to our ill */ int ipif_id; /* Logical unit number */ - uint_t ipif_mtu; /* Starts at ipif_ill->ill_max_frag */ in6_addr_t ipif_v6lcl_addr; /* Local IP address for this if. */ - in6_addr_t ipif_v6src_addr; /* Source IP address for this if. */ in6_addr_t ipif_v6subnet; /* Subnet prefix for this if. */ in6_addr_t ipif_v6net_mask; /* Net mask for this interface. */ in6_addr_t ipif_v6brd_addr; /* Broadcast addr for this interface. */ @@ -1262,47 +901,29 @@ typedef struct ipif_s { uint64_t ipif_flags; /* Interface flags. */ uint_t ipif_metric; /* BSD if metric, for compatibility. 
*/ uint_t ipif_ire_type; /* IRE_LOCAL or IRE_LOOPBACK */ - mblk_t *ipif_arp_del_mp; /* Allocated at time arp comes up, to */ - /* prevent awkward out of mem */ - /* condition later */ - mblk_t *ipif_saved_ire_mp; /* Allocated for each extra */ - /* IRE_IF_NORESOLVER/IRE_IF_RESOLVER */ - /* on this interface so that they */ - /* can survive ifconfig down. */ - kmutex_t ipif_saved_ire_lock; /* Protects ipif_saved_ire_mp */ - - mrec_t *ipif_igmp_rpt; /* List of group memberships which */ - /* will be reported on. Used when */ - /* handling an igmp timeout. */ /* - * The packet counts in the ipif contain the sum of the - * packet counts in dead IREs that were affiliated with - * this ipif. + * The packet count in the ipif contain the sum of the + * packet counts in dead IRE_LOCAL/LOOPBACK for this ipif. */ - uint_t ipif_fo_pkt_count; /* Forwarded thru our dead IREs */ uint_t ipif_ib_pkt_count; /* Inbound packets for our dead IREs */ - uint_t ipif_ob_pkt_count; /* Outbound packets to our dead IREs */ + /* Exclusive bit fields, protected by ipsq_t */ unsigned int - ipif_multicast_up : 1, /* ipif_multicast_up() successful */ ipif_was_up : 1, /* ipif was up before */ ipif_addr_ready : 1, /* DAD is done */ ipif_was_dup : 1, /* DAD had failed */ - - ipif_joined_allhosts : 1, /* allhosts joined */ ipif_added_nce : 1, /* nce added for local address */ - ipif_pad_to_31 : 26; + + ipif_pad_to_31 : 28; + + ilm_t *ipif_allhosts_ilm; /* For all-nodes join */ + ilm_t *ipif_solmulti_ilm; /* For IPv6 solicited multicast join */ uint_t ipif_seqid; /* unique index across all ills */ uint_t ipif_state_flags; /* See IPIF_* flag defs above */ uint_t ipif_refcnt; /* active consistent reader cnt */ - /* Number of ire's and ilm's referencing this ipif */ - uint_t ipif_ire_cnt; - uint_t ipif_ilm_cnt; - - uint_t ipif_saved_ire_cnt; zoneid_t ipif_zoneid; /* zone ID number */ timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ boolean_t ipif_trace_disable; /* True when alloc fails */ @@ 
-1313,40 +934,12 @@ typedef struct ipif_s { * part of a group will be pointed to, and an ill cannot disappear * while it's in a group. */ - struct ill_s *ipif_bound_ill; - struct ipif_s *ipif_bound_next; /* bound ipif chain */ - boolean_t ipif_bound; /* B_TRUE if we successfully bound */ -} ipif_t; + struct ill_s *ipif_bound_ill; + struct ipif_s *ipif_bound_next; /* bound ipif chain */ + boolean_t ipif_bound; /* B_TRUE if we successfully bound */ -/* - * IPIF_FREE_OK() means that there are no incoming references - * to the ipif. Incoming refs would prevent the ipif from being freed. - */ -#define IPIF_FREE_OK(ipif) \ - ((ipif)->ipif_ire_cnt == 0 && (ipif)->ipif_ilm_cnt == 0) -/* - * IPIF_DOWN_OK() determines whether the incoming pointer reference counts - * would permit the ipif to be considered quiescent. In order for - * an ipif or ill to be considered quiescent, the ire and nce references - * to that ipif/ill must be zero. - * - * We do not require the ilm references to go to zero for quiescence - * because the quiescence checks are done to ensure that - * outgoing packets do not use addresses from the ipif/ill after it - * has been marked down, and incoming packets to addresses on a - * queiscent interface are rejected. This implies that all the - * ire/nce's using that source address need to be deleted and future - * creation of any ires using that source address must be prevented. - * Similarly incoming unicast packets destined to the 'down' address - * will not be accepted once that ire is gone. However incoming - * multicast packets are not destined to the downed address. - * They are only related to the ill in question. Furthermore - * the current API behavior allows applications to join or leave - * multicast groups, i.e., IP_ADD_MEMBERSHIP / LEAVE_MEMBERSHIP, using a - * down address. Therefore the ilm references are not included in - * the _DOWN_OK macros. 
- */ -#define IPIF_DOWN_OK(ipif) ((ipif)->ipif_ire_cnt == 0) + struct ire_s *ipif_ire_local; /* Our IRE_LOCAL or LOOPBACK */ +} ipif_t; /* * The following table lists the protection levels of the various members @@ -1371,9 +964,7 @@ typedef struct ipif_s { * ill_g_lock ill_g_lock * ipif_ill ipsq + down ipif write once * ipif_id ipsq + down ipif write once - * ipif_mtu ipsq * ipif_v6lcl_addr ipsq + down ipif up ipif - * ipif_v6src_addr ipsq + down ipif up ipif * ipif_v6subnet ipsq + down ipif up ipif * ipif_v6net_mask ipsq + down ipif up ipif * @@ -1383,28 +974,30 @@ typedef struct ipif_s { * ipif_metric * ipif_ire_type ipsq + down ill up ill * - * ipif_arp_del_mp ipsq ipsq - * ipif_saved_ire_mp ipif_saved_ire_lock ipif_saved_ire_lock - * ipif_igmp_rpt ipsq ipsq - * - * ipif_fo_pkt_count Approx * ipif_ib_pkt_count Approx - * ipif_ob_pkt_count Approx * * bit fields ill_lock ill_lock * + * ipif_allhosts_ilm ipsq ipsq + * ipif_solmulti_ilm ipsq ipsq + * * ipif_seqid ipsq Write once * * ipif_state_flags ill_lock ill_lock * ipif_refcnt ill_lock ill_lock - * ipif_ire_cnt ill_lock ill_lock - * ipif_ilm_cnt ill_lock ill_lock - * ipif_saved_ire_cnt - * * ipif_bound_ill ipsq + ipmp_lock ipsq OR ipmp_lock * ipif_bound_next ipsq ipsq * ipif_bound ipsq ipsq + * + * ipif_ire_local ipsq + ips_ill_g_lock ipsq OR ips_ill_g_lock + */ + +/* + * Return values from ip_laddr_verify_{v4,v6} */ +typedef enum { IPVL_UNICAST_UP, IPVL_UNICAST_DOWN, IPVL_MCAST, IPVL_BCAST, + IPVL_BAD} ip_laddr_t; + #define IP_TR_HASH(tid) ((((uintptr_t)tid) >> 6) & (IP_TR_HASH_MAX - 1)) @@ -1422,18 +1015,12 @@ typedef struct ipif_s { /* IPv4 compatibility macros */ #define ipif_lcl_addr V4_PART_OF_V6(ipif_v6lcl_addr) -#define ipif_src_addr V4_PART_OF_V6(ipif_v6src_addr) #define ipif_subnet V4_PART_OF_V6(ipif_v6subnet) #define ipif_net_mask V4_PART_OF_V6(ipif_v6net_mask) #define ipif_brd_addr V4_PART_OF_V6(ipif_v6brd_addr) #define ipif_pp_dst_addr V4_PART_OF_V6(ipif_v6pp_dst_addr) /* Macros for easy 
backreferences to the ill. */ -#define ipif_wq ipif_ill->ill_wq -#define ipif_rq ipif_ill->ill_rq -#define ipif_net_type ipif_ill->ill_net_type -#define ipif_ipif_up_count ipif_ill->ill_ipif_up_count -#define ipif_type ipif_ill->ill_type #define ipif_isv6 ipif_ill->ill_isv6 #define SIOCLIFADDR_NDX 112 /* ndx of SIOCLIFADDR in the ndx ioctl table */ @@ -1524,7 +1111,7 @@ typedef struct ipxop_s { boolean_t ipx_current_done; /* is the current operation done? */ int ipx_current_ioctl; /* current ioctl, or 0 if no ioctl */ ipif_t *ipx_current_ipif; /* ipif for current op */ - ipif_t *ipx_pending_ipif; /* ipif for ipsq_pending_mp */ + ipif_t *ipx_pending_ipif; /* ipif for ipx_pending_mp */ mblk_t *ipx_pending_mp; /* current ioctl mp while waiting */ boolean_t ipx_forced; /* debugging aid */ #ifdef DEBUG @@ -1642,24 +1229,62 @@ typedef struct irb { krwlock_t irb_lock; /* Protect this bucket */ uint_t irb_refcnt; /* Protected by irb_lock */ uchar_t irb_marks; /* CONDEMNED ires in this bucket ? */ -#define IRB_MARK_CONDEMNED 0x0001 -#define IRB_MARK_FTABLE 0x0002 +#define IRB_MARK_CONDEMNED 0x0001 /* Contains some IRE_IS_CONDEMNED */ +#define IRB_MARK_DYNAMIC 0x0002 /* Dynamically allocated */ + /* Once IPv6 uses radix then IRB_MARK_DYNAMIC will be always be set */ uint_t irb_ire_cnt; /* Num of active IRE in this bucket */ - uint_t irb_tmp_ire_cnt; /* Num of temporary IRE */ - struct ire_s *irb_rr_origin; /* origin for round-robin */ int irb_nire; /* Num of ftable ire's that ref irb */ ip_stack_t *irb_ipst; /* Does not have a netstack_hold */ } irb_t; #define IRB2RT(irb) (rt_t *)((caddr_t)(irb) - offsetof(rt_t, rt_irb)) -/* The following are return values of ip_xmit_v4() */ -typedef enum { - SEND_PASSED = 0, /* sent packet out on wire */ - SEND_FAILED, /* sending of packet failed */ - LOOKUP_IN_PROGRESS, /* ire cache found, ARP resolution in progress */ - LLHDR_RESLV_FAILED /* macaddr resl of onlink dst or nexthop failed */ -} ipxmit_state_t; +/* Forward declarations */ 
+struct dce_s; +typedef struct dce_s dce_t; +struct ire_s; +typedef struct ire_s ire_t; +struct ncec_s; +typedef struct ncec_s ncec_t; +struct nce_s; +typedef struct nce_s nce_t; +struct ip_recv_attr_s; +typedef struct ip_recv_attr_s ip_recv_attr_t; +struct ip_xmit_attr_s; +typedef struct ip_xmit_attr_s ip_xmit_attr_t; + +struct tsol_ire_gw_secattr_s; +typedef struct tsol_ire_gw_secattr_s tsol_ire_gw_secattr_t; + +/* + * This is a structure for a one-element route cache that is passed + * by reference between ip_input and ill_inputfn. + */ +typedef struct { + ire_t *rtc_ire; + ipaddr_t rtc_ipaddr; + in6_addr_t rtc_ip6addr; +} rtc_t; + +/* + * Note: Temporarily use 64 bits, and will probably go back to 32 bits after + * more cleanup work is done. + */ +typedef uint64_t iaflags_t; + +/* The ill input function pointer type */ +typedef void (*pfillinput_t)(mblk_t *, void *, void *, ip_recv_attr_t *, + rtc_t *); + +/* The ire receive function pointer type */ +typedef void (*pfirerecv_t)(ire_t *, mblk_t *, void *, ip_recv_attr_t *); + +/* The ire send and postfrag function pointer types */ +typedef int (*pfiresend_t)(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +typedef int (*pfirepostfrag_t)(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + zoneid_t, zoneid_t, uintptr_t *); + #define IP_V4_G_HEAD 0 #define IP_V6_G_HEAD 1 @@ -1733,26 +1358,12 @@ typedef union ill_g_head_u { /* * Capabilities, possible flags for ill_capabilities. 
*/ - -#define ILL_CAPAB_AH 0x01 /* IPsec AH acceleration */ -#define ILL_CAPAB_ESP 0x02 /* IPsec ESP acceleration */ -#define ILL_CAPAB_MDT 0x04 /* Multidata Transmit */ +#define ILL_CAPAB_LSO 0x04 /* Large Send Offload */ #define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */ #define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */ #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ -#define ILL_CAPAB_DLD_LSO 0x100 /* Large Segment Offload */ - -/* - * Per-ill Multidata Transmit capabilities. - */ -typedef struct ill_mdt_capab_s ill_mdt_capab_t; - -/* - * Per-ill IPsec capabilities. - */ -typedef struct ill_ipsec_capab_s ill_ipsec_capab_t; /* * Per-ill Hardware Checksumming capbilities. @@ -1775,15 +1386,18 @@ typedef struct ill_dld_capab_s ill_dld_capab_t; typedef struct ill_rx_ring ill_rx_ring_t; /* - * Per-ill Large Segment Offload capabilities. + * Per-ill Large Send Offload capabilities. */ typedef struct ill_lso_capab_s ill_lso_capab_t; /* The following are ill_state_flags */ #define ILL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */ #define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */ -#define ILL_CHANGING 0x04 /* ILL not globally visible */ -#define ILL_DL_UNBIND_IN_PROGRESS 0x08 /* UNBIND_REQ is sent */ +#define ILL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */ +#define ILL_DOWN_IN_PROGRESS 0x08 /* ILL is going down - no new nce's */ +#define ILL_LL_BIND_PENDING 0x0020 /* XXX Reuse ILL_LL_SUBNET_PENDING ? */ +#define ILL_LL_UP 0x0040 +#define ILL_LL_DOWN 0x0080 /* Is this an ILL whose source address is used by other ILL's ? */ #define IS_USESRC_ILL(ill) \ @@ -1796,10 +1410,9 @@ typedef struct ill_lso_capab_s ill_lso_capab_t; ((ill)->ill_usesrc_grp_next != NULL)) /* Is this an virtual network interface (vni) ILL ? 
*/ -#define IS_VNI(ill) \ - (((ill) != NULL) && \ +#define IS_VNI(ill) \ (((ill)->ill_phyint->phyint_flags & (PHYI_LOOPBACK|PHYI_VIRTUAL)) == \ - PHYI_VIRTUAL)) + PHYI_VIRTUAL) /* Is this a loopback ILL? */ #define IS_LOOPBACK(ill) \ @@ -1900,18 +1513,41 @@ typedef struct ipmp_grp_s { * ARP up-to-date as the active set of interfaces in the group changes. */ typedef struct ipmp_arpent_s { - mblk_t *ia_area_mp; /* AR_ENTRY_ADD pointer */ ipaddr_t ia_ipaddr; /* IP address for this entry */ boolean_t ia_proxyarp; /* proxy ARP entry? */ boolean_t ia_notified; /* ARP notified about this entry? */ list_node_t ia_node; /* next ARP entry in list */ + uint16_t ia_flags; /* nce_flags for the address */ + size_t ia_lladdr_len; + uchar_t *ia_lladdr; } ipmp_arpent_t; +struct arl_s; + +/* + * Per-ill capabilities. + */ +struct ill_hcksum_capab_s { + uint_t ill_hcksum_version; /* interface version */ + uint_t ill_hcksum_txflags; /* capabilities on transmit */ +}; + +struct ill_zerocopy_capab_s { + uint_t ill_zerocopy_version; /* interface version */ + uint_t ill_zerocopy_flags; /* capabilities */ +}; + +struct ill_lso_capab_s { + uint_t ill_lso_flags; /* capabilities */ + uint_t ill_lso_max; /* maximum size of payload */ +}; + /* * IP Lower level Structure. * Instance data structure in ip_open when there is a device below us. */ typedef struct ill_s { + pfillinput_t ill_inputfn; /* Fast input function selector */ ill_if_t *ill_ifptr; /* pointer to interface type */ queue_t *ill_rq; /* Read queue. */ queue_t *ill_wq; /* Write queue. */ @@ -1922,6 +1558,8 @@ typedef struct ill_s { uint_t ill_ipif_up_count; /* Number of IPIFs currently up. */ uint_t ill_max_frag; /* Max IDU from DLPI. */ + uint_t ill_current_frag; /* Current IDU from DLPI. */ + uint_t ill_mtu; /* User-specified MTU; SIOCSLIFMTU */ char *ill_name; /* Our name. */ uint_t ill_ipif_dup_count; /* Number of duplicate addresses. */ uint_t ill_name_length; /* Name length, incl. terminator. 
*/ @@ -1941,8 +1579,9 @@ typedef struct ill_s { uint8_t *ill_frag_ptr; /* Reassembly state. */ timeout_id_t ill_frag_timer_id; /* timeout id for the frag timer */ ipfb_t *ill_frag_hash_tbl; /* Fragment hash list head. */ - ipif_t *ill_pending_ipif; /* IPIF waiting for DL operation. */ + krwlock_t ill_mcast_lock; /* Protects multicast state */ + kmutex_t ill_mcast_serializer; /* Serialize across ilg and ilm state */ ilm_t *ill_ilm; /* Multicast membership for ill */ uint_t ill_global_timer; /* for IGMPv3/MLDv2 general queries */ int ill_mcast_type; /* type of router which is querier */ @@ -1955,22 +1594,20 @@ typedef struct ill_s { uint8_t ill_mcast_rv; /* IGMPv3/MLDv2 robustness variable */ int ill_mcast_qi; /* IGMPv3/MLDv2 query interval var */ - mblk_t *ill_pending_mp; /* IOCTL/DLPI awaiting completion. */ /* * All non-NULL cells between 'ill_first_mp_to_free' and * 'ill_last_mp_to_free' are freed in ill_delete. */ #define ill_first_mp_to_free ill_bcast_mp mblk_t *ill_bcast_mp; /* DLPI header for broadcasts. */ - mblk_t *ill_resolver_mp; /* Resolver template. 
*/ mblk_t *ill_unbind_mp; /* unbind mp from ill_dl_up() */ mblk_t *ill_promiscoff_mp; /* for ill_leave_allmulti() */ mblk_t *ill_dlpi_deferred; /* b_next chain of control messages */ - mblk_t *ill_ardeact_mp; /* deact mp from ipmp_ill_activate() */ mblk_t *ill_dest_addr_mp; /* mblk which holds ill_dest_addr */ mblk_t *ill_replumb_mp; /* replumb mp from ill_replumb() */ mblk_t *ill_phys_addr_mp; /* mblk which holds ill_phys_addr */ -#define ill_last_mp_to_free ill_phys_addr_mp + mblk_t *ill_mcast_deferred; /* b_next chain of IGMP/MLD packets */ +#define ill_last_mp_to_free ill_mcast_deferred cred_t *ill_credp; /* opener's credentials */ uint8_t *ill_phys_addr; /* ill_phys_addr_mp->b_rptr + off */ @@ -1986,37 +1623,33 @@ typedef struct ill_s { ill_dlpi_style_set : 1, ill_ifname_pending : 1, - ill_join_allmulti : 1, ill_logical_down : 1, ill_dl_up : 1, - ill_up_ipifs : 1, + ill_note_link : 1, /* supports link-up notification */ ill_capab_reneg : 1, /* capability renegotiation to be done */ ill_dld_capab_inprog : 1, /* direct dld capab call in prog */ - ill_need_recover_multicast : 1, - ill_pad_to_bit_31 : 19; + + ill_replumbing : 1, + ill_arl_dlpi_pending : 1, + + ill_pad_to_bit_31 : 18; /* Following bit fields protected by ill_lock */ uint_t ill_fragtimer_executing : 1, ill_fragtimer_needrestart : 1, - ill_ilm_cleanup_reqd : 1, - ill_arp_closing : 1, - - ill_arp_bringup_pending : 1, - ill_arp_extend : 1, /* ARP has DAD extensions */ ill_manual_token : 1, /* system won't override ill_token */ ill_manual_linklocal : 1, /* system won't auto-conf linklocal */ - ill_pad_bit_31 : 24; + ill_pad_bit_31 : 28; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. */ - int ill_arp_muxid; /* muxid returned from plink for arp */ - int ill_ip_muxid; /* muxid returned from plink for ip */ + int ill_muxid; /* muxid returned from plink */ /* Used for IP frag reassembly throttling on a per ILL basis. 
*/ uint_t ill_ipf_gen; /* Generation of next fragment queue */ @@ -2033,20 +1666,13 @@ typedef struct ill_s { uint_t ill_dlpi_capab_state; /* State of capability query, IDCS_* */ uint_t ill_capab_pending_cnt; uint64_t ill_capabilities; /* Enabled capabilities, ILL_CAPAB_* */ - ill_mdt_capab_t *ill_mdt_capab; /* Multidata Transmit capabilities */ - ill_ipsec_capab_t *ill_ipsec_capab_ah; /* IPsec AH capabilities */ - ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */ ill_dld_capab_t *ill_dld_capab; /* DLD capabilities */ ill_lso_capab_t *ill_lso_capab; /* Large Segment Offload capabilities */ mblk_t *ill_capab_reset_mp; /* Preallocated mblk for capab reset */ - /* - * Fields for IPv6 - */ uint8_t ill_max_hops; /* Maximum hops for any logical interface */ - uint_t ill_max_mtu; /* Maximum MTU for any logical interface */ uint_t ill_user_mtu; /* User-specified MTU via SIOCSLIFLNKINFO */ uint32_t ill_reachable_time; /* Value for ND algorithm in msec */ uint32_t ill_reachable_retrans_time; /* Value for ND algorithm msec */ @@ -2057,20 +1683,6 @@ typedef struct ill_s { uint32_t ill_xmit_count; /* ndp max multicast xmits */ mib2_ipIfStatsEntry_t *ill_ip_mib; /* ver indep. interface mib */ mib2_ipv6IfIcmpEntry_t *ill_icmp6_mib; /* Per interface mib */ - /* - * Following two mblks are allocated common to all - * the ipifs when the first interface is coming up. - * It is sent up to arp when the last ipif is coming - * down. - */ - mblk_t *ill_arp_down_mp; - mblk_t *ill_arp_del_mapping_mp; - /* - * Used for implementing IFF_NOARP. As IFF_NOARP is used - * to turn off for all the logicals, it is here instead - * of the ipif. 
- */ - mblk_t *ill_arp_on_mp; phyint_t *ill_phyint; uint64_t ill_flags; @@ -2094,11 +1706,11 @@ typedef struct ill_s { */ uint_t ill_ifname_pending_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ - void *ill_fastpath_list; /* both ire and nce hang off this */ + list_t ill_nce; /* pointer to nce_s list */ uint_t ill_refcnt; /* active refcnt by threads */ uint_t ill_ire_cnt; /* ires associated with this ill */ kcondvar_t ill_cv; - uint_t ill_ilm_walker_cnt; /* snmp ilm walkers */ + uint_t ill_ncec_cnt; /* ncecs associated with this ill */ uint_t ill_nce_cnt; /* nces associated with this ill */ uint_t ill_waiters; /* threads waiting in ipsq_enter */ /* @@ -2119,6 +1731,17 @@ typedef struct ill_s { void *ill_flownotify_mh; /* Tx flow ctl, mac cb handle */ uint_t ill_ilm_cnt; /* ilms referencing this ill */ uint_t ill_ipallmulti_cnt; /* ip_join_allmulti() calls */ + ilm_t *ill_ipallmulti_ilm; + + mblk_t *ill_saved_ire_mp; /* Allocated for each extra IRE */ + /* with ire_ill set so they can */ + /* survive the ill going down and up. */ + kmutex_t ill_saved_ire_lock; /* Protects ill_saved_ire_mp, cnt */ + uint_t ill_saved_ire_cnt; /* # entries */ + struct arl_ill_common_s *ill_common; + ire_t *ill_ire_multicast; /* IRE_MULTICAST for ill */ + clock_t ill_defend_start; /* start of 1 hour period */ + uint_t ill_defend_count; /* # of announce/defends per ill */ /* * IPMP fields. 
*/ @@ -2131,6 +1754,8 @@ typedef struct ill_s { uint_t ill_bound_cnt; /* # of data addresses bound to ill */ ipif_t *ill_bound_ipif; /* ipif chain bound to ill */ timeout_id_t ill_refresh_tid; /* ill refresh retry timeout id */ + + uint32_t ill_mrouter_cnt; /* mrouter allmulti joins */ } ill_t; /* @@ -2139,15 +1764,17 @@ typedef struct ill_s { */ #define ILL_FREE_OK(ill) \ ((ill)->ill_ire_cnt == 0 && (ill)->ill_ilm_cnt == 0 && \ - (ill)->ill_nce_cnt == 0) + (ill)->ill_ncec_cnt == 0 && (ill)->ill_nce_cnt == 0) /* - * An ipif/ill can be marked down only when the ire and nce references + * An ipif/ill can be marked down only when the ire and ncec references * to that ipif/ill goes to zero. ILL_DOWN_OK() is a necessary condition * quiescence checks. See comments above IPIF_DOWN_OK for details * on why ires and nces are selectively considered for this macro. */ -#define ILL_DOWN_OK(ill) (ill->ill_ire_cnt == 0 && ill->ill_nce_cnt == 0) +#define ILL_DOWN_OK(ill) \ + (ill->ill_ire_cnt == 0 && ill->ill_ncec_cnt == 0 && \ + ill->ill_nce_cnt == 0) /* * The following table lists the protection levels of the various members @@ -2162,7 +1789,8 @@ typedef struct ill_s { * ill_error ipsq None * ill_ipif ill_g_lock + ipsq ill_g_lock OR ipsq * ill_ipif_up_count ill_lock + ipsq ill_lock OR ipsq - * ill_max_frag ipsq Write once + * ill_max_frag ill_lock ill_lock + * ill_current_frag ill_lock ill_lock * * ill_name ill_g_lock + ipsq Write once * ill_name_length ill_g_lock + ipsq Write once @@ -2179,23 +1807,22 @@ typedef struct ill_s { * * ill_frag_timer_id ill_lock ill_lock * ill_frag_hash_tbl ipsq up ill - * ill_ilm ipsq + ill_lock ill_lock - * ill_mcast_type ill_lock ill_lock - * ill_mcast_v1_time ill_lock ill_lock - * ill_mcast_v2_time ill_lock ill_lock - * ill_mcast_v1_tset ill_lock ill_lock - * ill_mcast_v2_tset ill_lock ill_lock - * ill_mcast_rv ill_lock ill_lock - * ill_mcast_qi ill_lock ill_lock - * ill_pending_mp ill_lock ill_lock - * - * ill_bcast_mp ipsq ipsq - * 
ill_resolver_mp ipsq only when ill is up + * ill_ilm ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_global_timer ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_type ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v1_time ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v2_time ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v1_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_v2_tset ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_rv ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * ill_mcast_qi ill_mcast_lock(WRITER) ill_mcast_lock(READER) + * * ill_down_mp ipsq ipsq * ill_dlpi_deferred ill_lock ill_lock * ill_dlpi_pending ipsq + ill_lock ipsq or ill_lock or * absence of ipsq writer. * ill_phys_addr_mp ipsq + down ill only when ill is up + * ill_mcast_deferred ill_lock ill_lock * ill_phys_addr ipsq + down ill only when ill is up * ill_dest_addr_mp ipsq + down ill only when ill is up * ill_dest_addr ipsq + down ill only when ill is up @@ -2204,8 +1831,7 @@ typedef struct ill_s { * exclusive bit flags ipsq_t ipsq_t * shared bit flags ill_lock ill_lock * - * ill_arp_muxid ipsq Not atomic - * ill_ip_muxid ipsq Not atomic + * ill_muxid ipsq Not atomic * * ill_ipf_gen Not atomic * ill_frag_count atomics atomics @@ -2215,7 +1841,7 @@ typedef struct ill_s { * ill_dlpi_capab_state ipsq ipsq * ill_max_hops ipsq Not atomic * - * ill_max_mtu + * ill_mtu ill_lock None * * ill_user_mtu ipsq + ill_lock ill_lock * ill_reachable_time ipsq + ill_lock ill_lock @@ -2230,9 +1856,6 @@ typedef struct ill_s { * ill_xmit_count ipsq + down ill write once * ill_ip6_mib ipsq + down ill only when ill is up * ill_icmp6_mib ipsq + down ill only when ill is up - * ill_arp_down_mp ipsq ipsq - * ill_arp_del_mapping_mp ipsq ipsq - * ill_arp_on_mp ipsq ipsq * * ill_phyint ipsq, ill_g_lock, ill_lock Any of them * ill_flags ill_lock ill_lock @@ -2247,7 +1870,7 @@ typedef struct ill_s { * ill_refcnt ill_lock ill_lock * 
ill_ire_cnt ill_lock ill_lock * ill_cv ill_lock ill_lock - * ill_ilm_walker_cnt ill_lock ill_lock + * ill_ncec_cnt ill_lock ill_lock * ill_nce_cnt ill_lock ill_lock * ill_ilm_cnt ill_lock ill_lock * ill_src_ipif ill_g_lock ill_g_lock @@ -2256,8 +1879,12 @@ typedef struct ill_s { * ill_dhcpinit atomics atomics * ill_flownotify_mh write once write once * ill_capab_pending_cnt ipsq ipsq - * - * ill_bound_cnt ipsq ipsq + * ill_ipallmulti_cnt ill_lock ill_lock + * ill_ipallmulti_ilm ill_lock ill_lock + * ill_saved_ire_mp ill_saved_ire_lock ill_saved_ire_lock + * ill_saved_ire_cnt ill_saved_ire_lock ill_saved_ire_lock + * ill_arl ??? ??? + * ill_ire_multicast ipsq + quiescent none * ill_bound_ipif ipsq ipsq * ill_actnode ipsq + ipmp_lock ipsq OR ipmp_lock * ill_grpnode ipsq + ill_g_lock ipsq OR ill_g_lock @@ -2267,6 +1894,7 @@ typedef struct ill_s { * ill_refresh_tid ill_lock ill_lock * ill_grp (for IPMP ill) write once write once * ill_grp (for underlying ill) ipsq + ill_g_lock ipsq OR ill_g_lock + * ill_mrouter_cnt atomics atomics * * NOTE: It's OK to make heuristic decisions on an underlying interface * by using IS_UNDER_IPMP() or comparing ill_grp's raw pointer value. 
@@ -2311,7 +1939,6 @@ enum { IF_CMD = 1, LIF_CMD, ARP_CMD, XARP_CMD, MSFILT_CMD, MISC_CMD }; #define IPI_GET_CMD 0x8 /* branch to mi_copyout on success */ /* unused 0x10 */ #define IPI_NULL_BCONT 0x20 /* ioctl has not data and hence no b_cont */ -#define IPI_PASS_DOWN 0x40 /* pass this ioctl down when a module only */ extern ip_ioctl_cmd_t ip_ndx_ioctl_table[]; extern ip_ioctl_cmd_t ip_misc_ioctl_table[]; @@ -2362,6 +1989,430 @@ typedef struct ipndp_s { char *ip_ndp_name; } ipndp_t; +/* IXA Notification types */ +typedef enum { + IXAN_LSO, /* LSO capability change */ + IXAN_PMTU, /* PMTU change */ + IXAN_ZCOPY /* ZEROCOPY capability change */ +} ixa_notify_type_t; + +typedef uint_t ixa_notify_arg_t; + +typedef void (*ixa_notify_t)(void *, ip_xmit_attr_t *ixa, ixa_notify_type_t, + ixa_notify_arg_t); + +/* + * Attribute flags that are common to the transmit and receive attributes + */ +#define IAF_IS_IPV4 0x80000000 /* ipsec_*_v4 */ +#define IAF_TRUSTED_ICMP 0x40000000 /* ipsec_*_icmp_loopback */ +#define IAF_NO_LOOP_ZONEID_SET 0x20000000 /* Zone that shouldn't have */ + /* a copy */ +#define IAF_LOOPBACK_COPY 0x10000000 /* For multi and broadcast */ + +#define IAF_MASK 0xf0000000 /* Flags that are common */ + +/* + * Transmit side attributes used between the transport protocols and IP as + * well as inside IP. It is also used to cache information in the conn_t i.e. + * replaces conn_ire and the IPsec caching in the conn_t. + */ +struct ip_xmit_attr_s { + iaflags_t ixa_flags; /* IXAF_*. See below */ + + uint32_t ixa_free_flags; /* IXA_FREE_*. See below */ + uint32_t ixa_refcnt; /* Using atomics */ + + /* + * Always initialized independently of ixa_flags settings. + * Used by ip_xmit so we keep them up front for cache locality. + */ + uint32_t ixa_xmit_hint; /* For ECMP and GLD TX ring fanout */ + uint_t ixa_pktlen; /* Always set. 
For frag and stats */ + zoneid_t ixa_zoneid; /* Assumed always set */ + + /* Always set for conn_ip_output(); might be stale */ + /* + * Since TCP keeps the conn_t around past the process going away + * we need to use the "notr" (e.g, ire_refhold_notr) for ixa_ire, + * ixa_nce, and ixa_dce. + */ + ire_t *ixa_ire; /* Forwarding table entry */ + uint_t ixa_ire_generation; + nce_t *ixa_nce; /* Neighbor cache entry */ + dce_t *ixa_dce; /* Destination cache entry */ + uint_t ixa_dce_generation; + uint_t ixa_src_generation; /* If IXAF_VERIFY_SOURCE */ + + uint32_t ixa_src_preferences; /* prefs for src addr select */ + uint32_t ixa_pmtu; /* IXAF_VERIFY_PMTU */ + + /* Set by ULP if IXAF_VERIFY_PMTU; otherwise set by IP */ + uint32_t ixa_fragsize; + + int8_t ixa_use_min_mtu; /* IXAF_USE_MIN_MTU values */ + + pfirepostfrag_t ixa_postfragfn; /* Set internally in IP */ + + in6_addr_t ixa_nexthop_v6; /* IXAF_NEXTHOP_SET */ +#define ixa_nexthop_v4 V4_PART_OF_V6(ixa_nexthop_v6) + + zoneid_t ixa_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ + + uint_t ixa_scopeid; /* For IPv6 link-locals */ + + uint_t ixa_broadcast_ttl; /* IXAF_BROACAST_TTL_SET */ + + uint_t ixa_multicast_ttl; /* Assumed set for multicast */ + uint_t ixa_multicast_ifindex; /* Assumed set for multicast */ + ipaddr_t ixa_multicast_ifaddr; /* Assumed set for multicast */ + + int ixa_raw_cksum_offset; /* If IXAF_SET_RAW_CKSUM */ + + uint32_t ixa_ident; /* For IPv6 fragment header */ + + /* + * Cached LSO information. + */ + ill_lso_capab_t ixa_lso_capab; /* Valid when IXAF_LSO_CAPAB */ + + uint64_t ixa_ipsec_policy_gen; /* Generation from iph_gen */ + /* + * The following IPsec fields are only initialized when + * IXAF_IPSEC_SECURE is set. Otherwise they contain garbage. 
+ */ + ipsec_latch_t *ixa_ipsec_latch; /* Just the ids */ + struct ipsa_s *ixa_ipsec_ah_sa; /* Hard reference SA for AH */ + struct ipsa_s *ixa_ipsec_esp_sa; /* Hard reference SA for ESP */ + struct ipsec_policy_s *ixa_ipsec_policy; /* why are we here? */ + struct ipsec_action_s *ixa_ipsec_action; /* For reflected packets */ + ipsa_ref_t ixa_ipsec_ref[2]; /* Soft reference to SA */ + /* 0: ESP, 1: AH */ + + /* + * The selectors here are potentially different than the SPD rule's + * selectors, and we need to have both available for IKEv2. + * + * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can + * be zero, and the protocol number is needed to make the ports + * significant. + */ + uint16_t ixa_ipsec_src_port; /* Source port number of d-gram. */ + uint16_t ixa_ipsec_dst_port; /* Destination port number of d-gram. */ + uint8_t ixa_ipsec_icmp_type; /* ICMP type of d-gram */ + uint8_t ixa_ipsec_icmp_code; /* ICMP code of d-gram */ + + sa_family_t ixa_ipsec_inaf; /* Inner address family */ +#define IXA_MAX_ADDRLEN 4 /* Max addr len. (in 32-bit words) */ + uint32_t ixa_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ + uint32_t ixa_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ + uint8_t ixa_ipsec_insrcpfx; /* Inner source prefix */ + uint8_t ixa_ipsec_indstpfx; /* Inner destination prefix */ + + uint8_t ixa_ipsec_proto; /* IP protocol number for d-gram. */ + + /* Always initialized independently of ixa_flags settings */ + uint_t ixa_ifindex; /* Assumed always set */ + uint16_t ixa_ip_hdr_length; /* Points to ULP header */ + uint8_t ixa_protocol; /* Protocol number for ULP cksum */ + ts_label_t *ixa_tsl; /* Always set. 
NULL if not TX */ + ip_stack_t *ixa_ipst; /* Always set */ + uint32_t ixa_extra_ident; /* Set if LSO */ + cred_t *ixa_cred; /* For getpeerucred */ + pid_t ixa_cpid; /* For getpeerucred */ + +#ifdef DEBUG + kthread_t *ixa_curthread; /* For serialization assert */ +#endif + squeue_t *ixa_sqp; /* Set from conn_sqp as a hint */ + uintptr_t ixa_cookie; /* cookie to use for tx flow control */ + + /* + * Must be set by ULP if any of IXAF_VERIFY_LSO, IXAF_VERIFY_PMTU, + * or IXAF_VERIFY_ZCOPY is set. + */ + ixa_notify_t ixa_notify; /* Registered upcall notify function */ + void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ +}; + +/* + * Flags to indicate which transmit attributes are set. + * Split into "xxx_SET" ones which indicate that the "xxx" field it set, and + * single flags. + */ +#define IXAF_REACH_CONF 0x00000001 /* Reachability confirmation */ +#define IXAF_BROADCAST_TTL_SET 0x00000002 /* ixa_broadcast_ttl valid */ +#define IXAF_SET_SOURCE 0x00000004 /* Replace if broadcast */ +#define IXAF_USE_MIN_MTU 0x00000008 /* IPV6_USE_MIN_MTU */ + +#define IXAF_DONTFRAG 0x00000010 /* IP*_DONTFRAG */ +#define IXAF_VERIFY_PMTU 0x00000020 /* ixa_pmtu/ixa_fragsize set */ +#define IXAF_PMTU_DISCOVERY 0x00000040 /* Create/use PMTU state */ +#define IXAF_MULTICAST_LOOP 0x00000080 /* IP_MULTICAST_LOOP */ + +#define IXAF_IPSEC_SECURE 0x00000100 /* Need IPsec processing */ +#define IXAF_UCRED_TSL 0x00000200 /* ixa_tsl from SCM_UCRED */ +#define IXAF_DONTROUTE 0x00000400 /* SO_DONTROUTE */ +#define IXAF_NO_IPSEC 0x00000800 /* Ignore policy */ + +#define IXAF_PMTU_TOO_SMALL 0x00001000 /* PMTU too small */ +#define IXAF_SET_ULP_CKSUM 0x00002000 /* Calculate ULP checksum */ +#define IXAF_VERIFY_SOURCE 0x00004000 /* Check that source is ok */ +#define IXAF_NEXTHOP_SET 0x00008000 /* ixa_nexthop set */ + +#define IXAF_PMTU_IPV4_DF 0x00010000 /* Set IPv4 DF */ +#define IXAF_NO_DEV_FLOW_CTL 0x00020000 /* Protocol needs no flow ctl */ +#define IXAF_NO_TTL_CHANGE 0x00040000 /* Internal 
to IP */ +#define IXAF_IPV6_ADD_FRAGHDR 0x00080000 /* Add fragment header */ + +#define IXAF_IPSEC_TUNNEL 0x00100000 /* Tunnel mode */ +#define IXAF_NO_PFHOOK 0x00200000 /* Skip xmit pfhook */ +#define IXAF_NO_TRACE 0x00400000 /* When back from ARP/ND */ +#define IXAF_SCOPEID_SET 0x00800000 /* ixa_scopeid set */ + +#define IXAF_MULTIRT_MULTICAST 0x01000000 /* MULTIRT for multicast */ +#define IXAF_NO_HW_CKSUM 0x02000000 /* Force software cksum */ +#define IXAF_SET_RAW_CKSUM 0x04000000 /* Use ixa_raw_cksum_offset */ +#define IXAF_IPSEC_GLOBAL_POLICY 0x08000000 /* Policy came from global */ + +/* Note the following uses bits 0x10000000 through 0x80000000 */ +#define IXAF_IS_IPV4 IAF_IS_IPV4 +#define IXAF_TRUSTED_ICMP IAF_TRUSTED_ICMP +#define IXAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET +#define IXAF_LOOPBACK_COPY IAF_LOOPBACK_COPY + +/* Note: use the upper 32 bits */ +#define IXAF_VERIFY_LSO 0x100000000 /* Check LSO capability */ +#define IXAF_LSO_CAPAB 0x200000000 /* Capable of LSO */ +#define IXAF_VERIFY_ZCOPY 0x400000000 /* Check Zero Copy capability */ +#define IXAF_ZCOPY_CAPAB 0x800000000 /* Capable of ZEROCOPY */ + +/* + * The normal flags for sending packets e.g., icmp errors + */ +#define IXAF_BASIC_SIMPLE_V4 (IXAF_SET_ULP_CKSUM | IXAF_IS_IPV4) +#define IXAF_BASIC_SIMPLE_V6 (IXAF_SET_ULP_CKSUM) + +/* + * Normally these fields do not have a hold. But in some cases they do, for + * instance when we've gone through ip_*_attr_to/from_mblk. + * We use ixa_free_flags to indicate that they have a hold and need to be + * released on cleanup. + */ +#define IXA_FREE_CRED 0x00000001 /* ixa_cred needs to be rele */ +#define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ + +/* + * Simplistic way to set the ixa_xmit_hint for locally generated traffic + * and forwarded traffic. 
The shift amount are based on the size of the + * structs to discard the low order bits which don't have much if any variation + * (coloring in kmem_cache_alloc might provide some variation). + * + * Basing the locally generated hint on the address of the conn_t means that + * the packets from the same socket/connection do not get reordered. + * Basing the hint for forwarded traffic on the ill_ring_t means that + * packets from the same NIC+ring are likely to use the same outbound ring + * hence we get low contention on the ring in the transmitting driver. + */ +#define CONN_TO_XMIT_HINT(connp) ((uint32_t)(((uintptr_t)connp) >> 11)) +#define ILL_RING_TO_XMIT_HINT(ring) ((uint32_t)(((uintptr_t)ring) >> 7)) + +/* + * IP set Destination Flags used by function ip_set_destination, + * ip_attr_connect, and conn_connect. + */ +#define IPDF_ALLOW_MCBC 0x1 /* Allow multi/broadcast */ +#define IPDF_VERIFY_DST 0x2 /* Verify destination addr */ +#define IPDF_SELECT_SRC 0x4 /* Select source address */ +#define IPDF_LSO 0x8 /* Try LSO */ +#define IPDF_IPSEC 0x10 /* Set IPsec policy */ +#define IPDF_ZONE_IS_GLOBAL 0x20 /* From conn_zone_is_global */ +#define IPDF_ZCOPY 0x40 /* Try ZEROCOPY */ +#define IPDF_UNIQUE_DCE 0x80 /* Get a per-destination DCE */ + +/* + * Receive side attributes used between the transport protocols and IP as + * well as inside IP. + */ +struct ip_recv_attr_s { + iaflags_t ira_flags; /* See below */ + + uint32_t ira_free_flags; /* IRA_FREE_*. See below */ + + /* + * This is a hint for TCP SYN packets. 
+ * Always initialized independently of ira_flags settings + */ + squeue_t *ira_sqp; + ill_rx_ring_t *ira_ring; /* Internal to IP */ + + /* For ip_accept_tcp when IRAF_TARGET_SQP is set */ + squeue_t *ira_target_sqp; + mblk_t *ira_target_sqp_mp; + + /* Always initialized independently of ira_flags settings */ + uint32_t ira_xmit_hint; /* For ECMP and GLD TX ring fanout */ + zoneid_t ira_zoneid; /* ALL_ZONES unless local delivery */ + uint_t ira_pktlen; /* Always set. For frag and stats */ + uint16_t ira_ip_hdr_length; /* Points to ULP header */ + uint8_t ira_protocol; /* Protocol number for ULP cksum */ + uint_t ira_rifindex; /* Received ifindex */ + uint_t ira_ruifindex; /* Received upper ifindex */ + ts_label_t *ira_tsl; /* Always set. NULL if not TX */ + /* + * ira_rill and ira_ill is set inside IP, but not when conn_recv is + * called; ULPs should use ira_ruifindex instead. + */ + ill_t *ira_rill; /* ill where packet came */ + ill_t *ira_ill; /* ill where IP address hosted */ + cred_t *ira_cred; /* For getpeerucred */ + pid_t ira_cpid; /* For getpeerucred */ + + /* Used when IRAF_VERIFIED_SRC is set; this source was ok */ + ipaddr_t ira_verified_src; + + /* + * The following IPsec fields are only initialized when + * IRAF_IPSEC_SECURE is set. Otherwise they contain garbage. + */ + struct ipsec_action_s *ira_ipsec_action; /* how we made it in.. */ + struct ipsa_s *ira_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *ira_ipsec_esp_sa; /* SA for ESP */ + + ipaddr_t ira_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ + + zoneid_t ira_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ + + uint32_t ira_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ + + /* + * For IP_RECVSLLA and ip_ndp_conflict/find_solicitation. + * Same size as max for sockaddr_dl + */ +#define IRA_L2SRC_SIZE 244 + uint8_t ira_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ + + /* + * Local handle that we use to do lazy setting of ira_l2src. 
+ * We defer setting l2src until needed but we do before any + * ip_input pullupmsg or copymsg. + */ + struct mac_header_info_s *ira_mhip; /* Could be NULL */ +}; + +/* + * Flags to indicate which receive attributes are set. + */ +#define IRAF_SYSTEM_LABELED 0x00000001 /* is_system_labeled() */ +#define IRAF_IPV4_OPTIONS 0x00000002 /* Performance */ +#define IRAF_MULTICAST 0x00000004 /* Was multicast at L3 */ +#define IRAF_BROADCAST 0x00000008 /* Was broadcast at L3 */ +#define IRAF_MULTIBROADCAST (IRAF_MULTICAST|IRAF_BROADCAST) + +#define IRAF_LOOPBACK 0x00000010 /* Looped back by IP */ +#define IRAF_VERIFY_IP_CKSUM 0x00000020 /* Need to verify IP */ +#define IRAF_VERIFY_ULP_CKSUM 0x00000040 /* Need to verify TCP,UDP,etc */ +#define IRAF_SCTP_CSUM_ERR 0x00000080 /* sctp pkt has failed chksum */ + +#define IRAF_IPSEC_SECURE 0x00000100 /* Passed AH and/or ESP */ +#define IRAF_DHCP_UNICAST 0x00000200 +#define IRAF_IPSEC_DECAPS 0x00000400 /* Was packet decapsulated */ + /* from a matching inner packet? 
*/ +#define IRAF_TARGET_SQP 0x00000800 /* ira_target_sqp is set */ +#define IRAF_VERIFIED_SRC 0x00001000 /* ira_verified_src set */ +#define IRAF_RSVP 0x00002000 /* RSVP packet for rsvpd */ +#define IRAF_MROUTE_TUNNEL_SET 0x00004000 /* From ip_mroute_decap */ +#define IRAF_PIM_REGISTER 0x00008000 /* From register_mforward */ + +#define IRAF_TX_MAC_EXEMPTABLE 0x00010000 /* Allow MAC_EXEMPT readdown */ +#define IRAF_TX_SHARED_ADDR 0x00020000 /* Arrived on ALL_ZONES addr */ +#define IRAF_ESP_UDP_PORTS 0x00040000 /* NAT-traversal packet */ +#define IRAF_NO_HW_CKSUM 0x00080000 /* Force software cksum */ + +#define IRAF_ICMP_ERROR 0x00100000 /* Send to conn_recvicmp */ +#define IRAF_ROUTER_ALERT 0x00200000 /* IPv6 router alert */ +#define IRAF_L2SRC_SET 0x00400000 /* ira_l2src has been set */ +#define IRAF_L2SRC_LOOPBACK 0x00800000 /* Came from us */ + +#define IRAF_L2DST_MULTICAST 0x01000000 /* Multicast at L2 */ +#define IRAF_L2DST_BROADCAST 0x02000000 /* Broadcast at L2 */ +/* Unused 0x04000000 */ +/* Unused 0x08000000 */ + +/* Below starts with 0x10000000 */ +#define IRAF_IS_IPV4 IAF_IS_IPV4 +#define IRAF_TRUSTED_ICMP IAF_TRUSTED_ICMP +#define IRAF_NO_LOOP_ZONEID_SET IAF_NO_LOOP_ZONEID_SET +#define IRAF_LOOPBACK_COPY IAF_LOOPBACK_COPY + +/* + * Normally these fields do not have a hold. But in some cases they do, for + * instance when we've gone through ip_*_attr_to/from_mblk. + * We use ira_free_flags to indicate that they have a hold and need to be + * released on cleanup. + */ +#define IRA_FREE_CRED 0x00000001 /* ira_cred needs to be rele */ +#define IRA_FREE_TSL 0x00000002 /* ira_tsl needs to be rele */ + +/* + * Optional destination cache entry for path MTU information, + * and ULP metrics. + */ +struct dce_s { + uint_t dce_generation; /* Changed since cached? */ + uint_t dce_flags; /* See below */ + uint_t dce_ipversion; /* IPv4/IPv6 version */ + uint32_t dce_pmtu; /* Path MTU if DCEF_PMTU */ + uint32_t dce_ident; /* Per destination IP ident. 
*/ + iulp_t dce_uinfo; /* Metrics if DCEF_UINFO */ + + struct dce_s *dce_next; + struct dce_s **dce_ptpn; + struct dcb_s *dce_bucket; + + union { + in6_addr_t dceu_v6addr; + ipaddr_t dceu_v4addr; + } dce_u; +#define dce_v4addr dce_u.dceu_v4addr +#define dce_v6addr dce_u.dceu_v6addr + /* Note that for IPv6+IPMP we use the ifindex for the upper interface */ + uint_t dce_ifindex; /* For IPv6 link-locals */ + + kmutex_t dce_lock; + uint_t dce_refcnt; + uint64_t dce_last_change_time; /* Path MTU. In seconds */ + + ip_stack_t *dce_ipst; /* Does not have a netstack_hold */ +}; + +/* + * Values for dce_generation. + * + * If a DCE has DCE_GENERATION_CONDEMNED, the last dce_refrele should delete + * it. + * + * DCE_GENERATION_VERIFY is never stored in dce_generation but it is + * stored in places that cache DCE (such as ixa_dce_generation). + * It is used as a signal that the cache is stale and needs to be reverified. + */ +#define DCE_GENERATION_CONDEMNED 0 +#define DCE_GENERATION_VERIFY 1 +#define DCE_GENERATION_INITIAL 2 +#define DCE_IS_CONDEMNED(dce) \ + ((dce)->dce_generation == DCE_GENERATION_CONDEMNED) + + +/* + * Values for ips_src_generation. + * + * SRC_GENERATION_VERIFY is never stored in ips_src_generation but it is + * stored in places that cache IREs (ixa_src_generation). It is used as a + * signal that the cache is stale and needs to be reverified. + */ +#define SRC_GENERATION_VERIFY 0 +#define SRC_GENERATION_INITIAL 1 + /* * The kernel stores security attributes of all gateways in a database made * up of one or more tsol_gcdb_t elements. 
Each tsol_gcdb_t contains the @@ -2453,183 +2504,28 @@ extern kmutex_t gcgrp_lock; */ struct tsol_tnrhc; -typedef struct tsol_ire_gw_secattr_s { +struct tsol_ire_gw_secattr_s { kmutex_t igsa_lock; /* lock to protect following */ struct tsol_tnrhc *igsa_rhc; /* host entry for gateway */ tsol_gc_t *igsa_gc; /* for prefix IREs */ - tsol_gcgrp_t *igsa_gcgrp; /* for cache IREs */ -} tsol_ire_gw_secattr_t; - -/* - * Following are the macros to increment/decrement the reference - * count of the IREs and IRBs (ire bucket). - * - * 1) We bump up the reference count of an IRE to make sure that - * it does not get deleted and freed while we are using it. - * Typically all the lookup functions hold the bucket lock, - * and look for the IRE. If it finds an IRE, it bumps up the - * reference count before dropping the lock. Sometimes we *may* want - * to bump up the reference count after we *looked* up i.e without - * holding the bucket lock. So, the IRE_REFHOLD macro does not assert - * on the bucket lock being held. Any thread trying to delete from - * the hash bucket can still do so but cannot free the IRE if - * ire_refcnt is not 0. - * - * 2) We bump up the reference count on the bucket where the IRE resides - * (IRB), when we want to prevent the IREs getting deleted from a given - * hash bucket. This makes life easier for ire_walk type functions which - * wants to walk the IRE list, call a function, but needs to drop - * the bucket lock to prevent recursive rw_enters. While the - * lock is dropped, the list could be changed by other threads or - * the same thread could end up deleting the ire or the ire pointed by - * ire_next. IRE_REFHOLDing the ire or ire_next is not sufficient as - * a delete will still remove the ire from the bucket while we have - * dropped the lock and hence the ire_next would be NULL. Thus, we - * need a mechanism to prevent deletions from a given bucket. - * - * To prevent deletions, we bump up the reference count on the - * bucket. 
If the bucket is held, ire_delete just marks IRE_MARK_CONDEMNED - * both on the ire's ire_marks and the bucket's irb_marks. When the - * reference count on the bucket drops to zero, all the CONDEMNED ires - * are deleted. We don't have to bump up the reference count on the - * bucket if we are walking the bucket and never have to drop the bucket - * lock. Note that IRB_REFHOLD does not prevent addition of new ires - * in the list. It is okay because addition of new ires will not cause - * ire_next to point to freed memory. We do IRB_REFHOLD only when - * all of the 3 conditions are true : - * - * 1) The code needs to walk the IRE bucket from start to end. - * 2) It may have to drop the bucket lock sometimes while doing (1) - * 3) It does not want any ires to be deleted meanwhile. - */ - -/* - * Bump up the reference count on the IRE. We cannot assert that the - * bucket lock is being held as it is legal to bump up the reference - * count after the first lookup has returned the IRE without - * holding the lock. Currently ip_wput does this for caching IRE_CACHEs. - */ - -#ifdef DEBUG -#define IRE_UNTRACE_REF(ire) ire_untrace_ref(ire); -#define IRE_TRACE_REF(ire) ire_trace_ref(ire); -#else -#define IRE_UNTRACE_REF(ire) -#define IRE_TRACE_REF(ire) -#endif - -#define IRE_REFHOLD_NOTR(ire) { \ - atomic_add_32(&(ire)->ire_refcnt, 1); \ - ASSERT((ire)->ire_refcnt != 0); \ -} - -#define IRE_REFHOLD(ire) { \ - IRE_REFHOLD_NOTR(ire); \ - IRE_TRACE_REF(ire); \ -} - -#define IRE_REFHOLD_LOCKED(ire) { \ - IRE_TRACE_REF(ire); \ - (ire)->ire_refcnt++; \ -} - -/* - * Decrement the reference count on the IRE. - * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - * - * NOTE : This macro is used only in places where we want performance. 
- * To avoid bloating the code, we use the function "ire_refrele" - * which essentially calls the macro. - */ -#define IRE_REFRELE_NOTR(ire) { \ - ASSERT((ire)->ire_refcnt != 0); \ - membar_exit(); \ - if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) \ - ire_inactive(ire); \ -} - -#define IRE_REFRELE(ire) { \ - if (ire->ire_bucket != NULL) { \ - IRE_UNTRACE_REF(ire); \ - } \ - IRE_REFRELE_NOTR(ire); \ -} - -/* - * Bump up the reference count on the hash bucket - IRB to - * prevent ires from being deleted in this bucket. - */ -#define IRB_REFHOLD(irb) { \ - rw_enter(&(irb)->irb_lock, RW_WRITER); \ - (irb)->irb_refcnt++; \ - ASSERT((irb)->irb_refcnt != 0); \ - rw_exit(&(irb)->irb_lock); \ -} -#define IRB_REFHOLD_LOCKED(irb) { \ - ASSERT(RW_WRITE_HELD(&(irb)->irb_lock)); \ - (irb)->irb_refcnt++; \ - ASSERT((irb)->irb_refcnt != 0); \ -} +}; void irb_refrele_ftable(irb_t *); -/* - * Note: when IRB_MARK_FTABLE (i.e., IRE_CACHETABLE entry), the irb_t - * is statically allocated, so that when the irb_refcnt goes to 0, - * we simply clean up the ire list and continue. 
- */ -#define IRB_REFRELE(irb) { \ - if ((irb)->irb_marks & IRB_MARK_FTABLE) { \ - irb_refrele_ftable((irb)); \ - } else { \ - rw_enter(&(irb)->irb_lock, RW_WRITER); \ - ASSERT((irb)->irb_refcnt != 0); \ - if (--(irb)->irb_refcnt == 0 && \ - ((irb)->irb_marks & IRE_MARK_CONDEMNED)) { \ - ire_t *ire_list; \ - \ - ire_list = ire_unlink(irb); \ - rw_exit(&(irb)->irb_lock); \ - ASSERT(ire_list != NULL); \ - ire_cleanup(ire_list); \ - } else { \ - rw_exit(&(irb)->irb_lock); \ - } \ - } \ -} extern struct kmem_cache *rt_entry_cache; -/* - * Lock the fast path mp for access, since the fp_mp can be deleted - * due a DL_NOTE_FASTPATH_FLUSH in the case of IRE_BROADCAST - */ - -#define LOCK_IRE_FP_MP(ire) { \ - if ((ire)->ire_type == IRE_BROADCAST) \ - mutex_enter(&ire->ire_nce->nce_lock); \ - } -#define UNLOCK_IRE_FP_MP(ire) { \ - if ((ire)->ire_type == IRE_BROADCAST) \ - mutex_exit(&ire->ire_nce->nce_lock); \ - } - typedef struct ire4 { - ipaddr_t ire4_src_addr; /* Source address to use. */ ipaddr_t ire4_mask; /* Mask for matching this IRE. */ ipaddr_t ire4_addr; /* Address this IRE represents. */ - ipaddr_t ire4_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */ - ipaddr_t ire4_cmask; /* Mask from parent prefix route */ + ipaddr_t ire4_gateway_addr; /* Gateway including for IRE_ONLINK */ + ipaddr_t ire4_setsrc_addr; /* RTF_SETSRC */ } ire4_t; typedef struct ire6 { - in6_addr_t ire6_src_addr; /* Source address to use. */ in6_addr_t ire6_mask; /* Mask for matching this IRE. */ in6_addr_t ire6_addr; /* Address this IRE represents. 
*/ - in6_addr_t ire6_gateway_addr; /* Gateway if IRE_CACHE/IRE_OFFSUBNET */ - in6_addr_t ire6_cmask; /* Mask from parent prefix route */ + in6_addr_t ire6_gateway_addr; /* Gateway including for IRE_ONLINK */ + in6_addr_t ire6_setsrc_addr; /* RTF_SETSRC */ } ire6_t; typedef union ire_addr { @@ -2637,115 +2533,131 @@ typedef union ire_addr { ire4_t ire4_u; } ire_addr_u_t; -/* Internet Routing Entry */ -typedef struct ire_s { +/* + * Internet Routing Entry + * When we have multiple identical IREs we logically add them by manipulating + * ire_identical_ref and ire_delete first decrements + * that and when it reaches 1 we know it is the last IRE. + * "identical" is defined as being the same for: + * ire_addr, ire_netmask, ire_gateway, ire_ill, ire_zoneid, and ire_type + * For instance, multiple IRE_BROADCASTs for the same subnet number are + * viewed as identical, and so are the IRE_INTERFACEs when there are + * multiple logical interfaces (on the same ill) with the same subnet prefix. + */ +struct ire_s { struct ire_s *ire_next; /* The hash chain must be first. */ struct ire_s **ire_ptpn; /* Pointer to previous next. */ uint32_t ire_refcnt; /* Number of references */ - mblk_t *ire_mp; /* Non-null if allocated as mblk */ - queue_t *ire_rfq; /* recv from this queue */ - queue_t *ire_stq; /* send to this queue */ - union { - uint_t *max_fragp; /* Used only during ire creation */ - uint_t max_frag; /* MTU (next hop or path). */ - } imf_u; -#define ire_max_frag imf_u.max_frag -#define ire_max_fragp imf_u.max_fragp - uint32_t ire_frag_flag; /* IPH_DF or zero. */ - uint32_t ire_ident; /* Per IRE IP ident. */ - uint32_t ire_tire_mark; /* Used for reclaim of unused. */ + ill_t *ire_ill; + uint32_t ire_identical_ref; /* IRE_INTERFACE, IRE_BROADCAST */ uchar_t ire_ipversion; /* IPv4/IPv6 version */ - uchar_t ire_marks; /* IRE_MARK_CONDEMNED etc. 
*/ ushort_t ire_type; /* Type of IRE */ + uint_t ire_generation; /* Generation including CONDEMNED */ uint_t ire_ib_pkt_count; /* Inbound packets for ire_addr */ uint_t ire_ob_pkt_count; /* Outbound packets to ire_addr */ - uint_t ire_ll_hdr_length; /* Non-zero if we do M_DATA prepends */ time_t ire_create_time; /* Time (in secs) IRE was created. */ - uint32_t ire_phandle; /* Associate prefix IREs to cache */ - uint32_t ire_ihandle; /* Associate interface IREs to cache */ - ipif_t *ire_ipif; /* the interface that this ire uses */ uint32_t ire_flags; /* flags related to route (RTF_*) */ /* - * Neighbor Cache Entry for IPv6; arp info for IPv4 + * ire_testhidden is TRUE for INTERFACE IREs of IS_UNDER_IPMP(ill) + * interfaces */ - struct nce_s *ire_nce; + boolean_t ire_testhidden; + pfirerecv_t ire_recvfn; /* Receive side handling */ + pfiresend_t ire_sendfn; /* Send side handling */ + pfirepostfrag_t ire_postfragfn; /* Bottom end of send handling */ + uint_t ire_masklen; /* # bits in ire_mask{,_v6} */ ire_addr_u_t ire_u; /* IPv4/IPv6 address info. */ irb_t *ire_bucket; /* Hash bucket when ire_ptphn is set */ - iulp_t ire_uinfo; /* Upper layer protocol info. */ - /* - * Protects ire_uinfo, ire_max_frag, and ire_frag_flag. - */ kmutex_t ire_lock; - uint_t ire_ipif_seqid; /* ipif_seqid of ire_ipif */ - uint_t ire_ipif_ifindex; /* ifindex associated with ipif */ - clock_t ire_last_used_time; /* Last used time */ + clock_t ire_last_used_time; /* For IRE_LOCAL reception */ tsol_ire_gw_secattr_t *ire_gw_secattr; /* gateway security attributes */ - zoneid_t ire_zoneid; /* for local address discrimination */ + zoneid_t ire_zoneid; + + /* + * Cached information of where to send packets that match this route. + * The ire_dep_* information is used to determine when ire_nce_cache + * needs to be updated. 
+ * ire_nce_cache is the fastpath for the Neighbor Cache Entry + * for IPv6; arp info for IPv4 + * Since this is a cache setup and torn down independently of + * applications we need to use nce_ref{rele,hold}_notr for it. + */ + nce_t *ire_nce_cache; + + /* + * Quick check whether the ire_type and ire_masklen indicates + * that the IRE can have ire_nce_cache set i.e., whether it is + * IRE_ONLINK and for a single destination. + */ + boolean_t ire_nce_capable; + /* - * ire's that are embedded inside mblk_t and sent to the external - * resolver use the ire_stq_ifindex to track the ifindex of the - * ire_stq, so that the ill (if it exists) can be correctly recovered - * for cleanup in the esbfree routine when arp failure occurs. - * Similarly, the ire_stackid is used to recover the ip_stack_t. + * Dependency tracking so we can safely cache IRE and NCE pointers + * in offlink and onlink IREs. + * These are locked under the ips_ire_dep_lock rwlock. Write held + * when modifying the linkage. + * ire_dep_parent (Also chain towards IRE for nexthop) + * ire_dep_parent_generation: ire_generation of ire_dep_parent + * ire_dep_children (From parent to first child) + * ire_dep_sib_next (linked list of siblings) + * ire_dep_sib_ptpn (linked list of siblings) + * + * The parent has a ire_refhold on each child, and each child has + * an ire_refhold on its parent. + * Since ire_dep_parent is a cache setup and torn down independently of + * applications we need to use ire_ref{rele,hold}_notr for it. 
*/ - uint_t ire_stq_ifindex; - netstackid_t ire_stackid; + ire_t *ire_dep_parent; + ire_t *ire_dep_children; + ire_t *ire_dep_sib_next; + ire_t **ire_dep_sib_ptpn; /* Pointer to previous next */ + uint_t ire_dep_parent_generation; + + uint_t ire_badcnt; /* Number of times ND_UNREACHABLE */ + uint64_t ire_last_badcnt; /* In seconds */ + + /* ire_defense* and ire_last_used_time are only used on IRE_LOCALs */ uint_t ire_defense_count; /* number of ARP conflicts */ uint_t ire_defense_time; /* last time defended (secs) */ + boolean_t ire_trace_disable; /* True when alloc fails */ ip_stack_t *ire_ipst; /* Does not have a netstack_hold */ -} ire_t; + iulp_t ire_metrics; +}; /* IPv4 compatibility macros */ -#define ire_src_addr ire_u.ire4_u.ire4_src_addr #define ire_mask ire_u.ire4_u.ire4_mask #define ire_addr ire_u.ire4_u.ire4_addr #define ire_gateway_addr ire_u.ire4_u.ire4_gateway_addr -#define ire_cmask ire_u.ire4_u.ire4_cmask +#define ire_setsrc_addr ire_u.ire4_u.ire4_setsrc_addr -#define ire_src_addr_v6 ire_u.ire6_u.ire6_src_addr #define ire_mask_v6 ire_u.ire6_u.ire6_mask #define ire_addr_v6 ire_u.ire6_u.ire6_addr #define ire_gateway_addr_v6 ire_u.ire6_u.ire6_gateway_addr -#define ire_cmask_v6 ire_u.ire6_u.ire6_cmask - -/* Convenient typedefs for sockaddrs */ -typedef struct sockaddr_in sin_t; -typedef struct sockaddr_in6 sin6_t; - -/* Address structure used for internal bind with IP */ -typedef struct ipa_conn_s { - ipaddr_t ac_laddr; - ipaddr_t ac_faddr; - uint16_t ac_fport; - uint16_t ac_lport; -} ipa_conn_t; - -typedef struct ipa6_conn_s { - in6_addr_t ac6_laddr; - in6_addr_t ac6_faddr; - uint16_t ac6_fport; - uint16_t ac6_lport; -} ipa6_conn_t; +#define ire_setsrc_addr_v6 ire_u.ire6_u.ire6_setsrc_addr /* - * Using ipa_conn_x_t or ipa6_conn_x_t allows us to modify the behavior of IP's - * bind handler. + * Values for ire_generation. + * + * If an IRE is marked with IRE_IS_CONDEMNED, the last walker of + * the bucket should delete this IRE from this bucket. 
+ * + * IRE_GENERATION_VERIFY is never stored in ire_generation but it is + * stored in places that cache IREs (such as ixa_ire_generation and + * ire_dep_parent_generation). It is used as a signal that the cache is + * stale and needs to be reverified. */ -typedef struct ipa_conn_extended_s { - uint64_t acx_flags; - ipa_conn_t acx_conn; -} ipa_conn_x_t; +#define IRE_GENERATION_CONDEMNED 0 +#define IRE_GENERATION_VERIFY 1 +#define IRE_GENERATION_INITIAL 2 +#define IRE_IS_CONDEMNED(ire) \ + ((ire)->ire_generation == IRE_GENERATION_CONDEMNED) -typedef struct ipa6_conn_extended_s { - uint64_t ac6x_flags; - ipa6_conn_t ac6x_conn; -} ipa6_conn_x_t; - -/* flag values for ipa_conn_x_t and ipa6_conn_x_t. */ -#define ACX_VERIFY_DST 0x1ULL /* verify destination address is reachable */ +/* Convenient typedefs for sockaddrs */ +typedef struct sockaddr_in sin_t; +typedef struct sockaddr_in6 sin6_t; /* Name/Value Descriptor. */ typedef struct nv_s { @@ -2784,110 +2696,83 @@ extern uint_t ip_max_frag_dups; * to support the needs of such tools and private definitions moved to * private headers. 
*/ -struct ip6_pkt_s { +struct ip_pkt_s { uint_t ipp_fields; /* Which fields are valid */ - uint_t ipp_sticky_ignored; /* sticky fields to ignore */ - uint_t ipp_ifindex; /* pktinfo ifindex */ in6_addr_t ipp_addr; /* pktinfo src/dst addr */ - uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS */ - uint_t ipp_multicast_hops; /* IPV6_MULTICAST_HOPS */ +#define ipp_addr_v4 V4_PART_OF_V6(ipp_addr) + uint_t ipp_unicast_hops; /* IPV6_UNICAST_HOPS, IP_TTL */ uint_t ipp_hoplimit; /* IPV6_HOPLIMIT */ uint_t ipp_hopoptslen; - uint_t ipp_rtdstoptslen; + uint_t ipp_rthdrdstoptslen; uint_t ipp_rthdrlen; uint_t ipp_dstoptslen; - uint_t ipp_pathmtulen; uint_t ipp_fraghdrlen; ip6_hbh_t *ipp_hopopts; - ip6_dest_t *ipp_rtdstopts; + ip6_dest_t *ipp_rthdrdstopts; ip6_rthdr_t *ipp_rthdr; ip6_dest_t *ipp_dstopts; ip6_frag_t *ipp_fraghdr; - struct ip6_mtuinfo *ipp_pathmtu; - in6_addr_t ipp_nexthop; /* Transmit only */ - uint8_t ipp_tclass; - int8_t ipp_use_min_mtu; + uint8_t ipp_tclass; /* IPV6_TCLASS */ + uint8_t ipp_type_of_service; /* IP_TOS */ + uint_t ipp_ipv4_options_len; /* Len of IPv4 options */ + uint8_t *ipp_ipv4_options; /* Ptr to IPv4 options */ + uint_t ipp_label_len_v4; /* Len of TX label for IPv4 */ + uint8_t *ipp_label_v4; /* TX label for IPv4 */ + uint_t ipp_label_len_v6; /* Len of TX label for IPv6 */ + uint8_t *ipp_label_v6; /* TX label for IPv6 */ }; -typedef struct ip6_pkt_s ip6_pkt_t; - -extern void ip6_pkt_free(ip6_pkt_t *); /* free storage inside ip6_pkt_t */ - -/* - * This struct is used by ULP_opt_set() functions to return value of IPv4 - * ancillary options. Currently this is only used by udp and icmp and only - * IP_PKTINFO option is supported. - */ -typedef struct ip4_pkt_s { - uint_t ip4_ill_index; /* interface index */ - ipaddr_t ip4_addr; /* source address */ -} ip4_pkt_t; - -/* - * Used by ULP's to pass options info to ip_output - * currently only IP_PKTINFO is supported. 
- */ -typedef struct ip_opt_info_s { - uint_t ip_opt_ill_index; - uint_t ip_opt_flags; -} ip_opt_info_t; - -/* - * value for ip_opt_flags - */ -#define IP_VERIFY_SRC 0x1 +typedef struct ip_pkt_s ip_pkt_t; -/* - * This structure is used to convey information from IP and the ULP. - * Currently used for the IP_RECVSLLA, IP_RECVIF and IP_RECVPKTINFO options. - * The type of information field is set to IN_PKTINFO (i.e inbound pkt info) - */ -typedef struct ip_pktinfo { - uint32_t ip_pkt_ulp_type; /* type of info sent */ - uint32_t ip_pkt_flags; /* what is sent up by IP */ - uint32_t ip_pkt_ifindex; /* inbound interface index */ - struct sockaddr_dl ip_pkt_slla; /* has source link layer addr */ - struct in_addr ip_pkt_match_addr; /* matched address */ -} ip_pktinfo_t; - -/* - * flags to tell UDP what IP is sending; in_pkt_flags - */ -#define IPF_RECVIF 0x01 /* inbound interface index */ -#define IPF_RECVSLLA 0x02 /* source link layer address */ -/* - * Inbound interface index + matched address. - * Used only by IPV4. 
- */ -#define IPF_RECVADDR 0x04 +extern void ip_pkt_free(ip_pkt_t *); /* free storage inside ip_pkt_t */ +extern ipaddr_t ip_pkt_source_route_v4(const ip_pkt_t *); +extern in6_addr_t *ip_pkt_source_route_v6(const ip_pkt_t *); +extern int ip_pkt_copy(ip_pkt_t *, ip_pkt_t *, int); +extern void ip_pkt_source_route_reverse_v4(ip_pkt_t *); /* ipp_fields values */ -#define IPPF_IFINDEX 0x0001 /* Part of in6_pktinfo: ifindex */ -#define IPPF_ADDR 0x0002 /* Part of in6_pktinfo: src/dst addr */ -#define IPPF_SCOPE_ID 0x0004 /* Add xmit ip6i_t for sin6_scope_id */ -#define IPPF_NO_CKSUM 0x0008 /* Add xmit ip6i_t for IP6I_NO_*_CKSUM */ - -#define IPPF_RAW_CKSUM 0x0010 /* Add xmit ip6i_t for IP6I_RAW_CHECKSUM */ -#define IPPF_HOPLIMIT 0x0020 -#define IPPF_HOPOPTS 0x0040 -#define IPPF_RTHDR 0x0080 - -#define IPPF_RTDSTOPTS 0x0100 -#define IPPF_DSTOPTS 0x0200 -#define IPPF_NEXTHOP 0x0400 -#define IPPF_PATHMTU 0x0800 - -#define IPPF_TCLASS 0x1000 -#define IPPF_DONTFRAG 0x2000 -#define IPPF_USE_MIN_MTU 0x04000 -#define IPPF_MULTICAST_HOPS 0x08000 - -#define IPPF_UNICAST_HOPS 0x10000 -#define IPPF_FRAGHDR 0x20000 - -#define IPPF_HAS_IP6I \ - (IPPF_IFINDEX|IPPF_ADDR|IPPF_NEXTHOP|IPPF_SCOPE_ID| \ - IPPF_NO_CKSUM|IPPF_RAW_CKSUM|IPPF_HOPLIMIT|IPPF_DONTFRAG| \ - IPPF_USE_MIN_MTU|IPPF_MULTICAST_HOPS|IPPF_UNICAST_HOPS) +#define IPPF_ADDR 0x0001 /* Part of in6_pktinfo: src/dst addr */ +#define IPPF_HOPLIMIT 0x0002 /* Overrides unicast and multicast */ +#define IPPF_TCLASS 0x0004 /* Overrides class in sin6_flowinfo */ + +#define IPPF_HOPOPTS 0x0010 /* ipp_hopopts set */ +#define IPPF_RTHDR 0x0020 /* ipp_rthdr set */ +#define IPPF_RTHDRDSTOPTS 0x0040 /* ipp_rthdrdstopts set */ +#define IPPF_DSTOPTS 0x0080 /* ipp_dstopts set */ + +#define IPPF_IPV4_OPTIONS 0x0100 /* ipp_ipv4_options set */ +#define IPPF_LABEL_V4 0x0200 /* ipp_label_v4 set */ +#define IPPF_LABEL_V6 0x0400 /* ipp_label_v6 set */ + +#define IPPF_FRAGHDR 0x0800 /* Used for IPsec receive side */ + +/* + * Data structure which is 
passed to conn_opt_get/set. + * The conn_t is included even though it can be inferred from queue_t. + * setsockopt and getsockopt use conn_ixa and conn_xmit_ipp. However, + * when handling ancillary data we use separate ixa and ipps. + */ +typedef struct conn_opt_arg_s { + conn_t *coa_connp; + ip_xmit_attr_t *coa_ixa; + ip_pkt_t *coa_ipp; + boolean_t coa_ancillary; /* Ancillary data and not setsockopt */ + uint_t coa_changed; /* See below */ +} conn_opt_arg_t; + +/* + * Flags for what changed. + * If we want to be more efficient in the future we can have more fine + * grained flags e.g., a flag for just IP_TOS changing. + * For now we either call ip_set_destination (for "route changed") + * and/or conn_build_hdr_template/conn_prepend_hdr (for "header changed"). + */ +#define COA_HEADER_CHANGED 0x0001 +#define COA_ROUTE_CHANGED 0x0002 +#define COA_RCVBUF_CHANGED 0x0004 /* SO_RCVBUF */ +#define COA_SNDBUF_CHANGED 0x0008 /* SO_SNDBUF */ +#define COA_WROFF_CHANGED 0x0010 /* Header size changed */ +#define COA_ICMP_BIND_NEEDED 0x0020 +#define COA_OOBINLINE_CHANGED 0x0040 #define TCP_PORTS_OFFSET 0 #define UDP_PORTS_OFFSET 0 @@ -2902,32 +2787,21 @@ typedef struct ip_pktinfo { #define IPIF_LOOKUP_FAILED 2 /* Used as error code */ #define ILL_CAN_LOOKUP(ill) \ - (!((ill)->ill_state_flags & (ILL_CONDEMNED | ILL_CHANGING)) || \ + (!((ill)->ill_state_flags & ILL_CONDEMNED) || \ IAM_WRITER_ILL(ill)) -#define ILL_CAN_WAIT(ill, q) \ - (((q) != NULL) && !((ill)->ill_state_flags & (ILL_CONDEMNED))) +#define ILL_IS_CONDEMNED(ill) \ + ((ill)->ill_state_flags & ILL_CONDEMNED) #define IPIF_CAN_LOOKUP(ipif) \ - (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED | IPIF_CHANGING)) || \ + (!((ipif)->ipif_state_flags & IPIF_CONDEMNED) || \ IAM_WRITER_IPIF(ipif)) -/* - * If the parameter 'q' is NULL, the caller is not interested in wait and - * restart of the operation if the ILL or IPIF cannot be looked up when it is - * marked as 'CHANGING'. 
Typically a thread that tries to send out data will - * end up passing NULLs as the last 4 parameters to ill_lookup_on_ifindex and - * in this case 'q' is NULL - */ -#define IPIF_CAN_WAIT(ipif, q) \ - (((q) != NULL) && !((ipif)->ipif_state_flags & (IPIF_CONDEMNED))) - -#define IPIF_CAN_LOOKUP_WALKER(ipif) \ - (!((ipif)->ipif_state_flags & (IPIF_CONDEMNED)) || \ - IAM_WRITER_IPIF(ipif)) +#define IPIF_IS_CONDEMNED(ipif) \ + ((ipif)->ipif_state_flags & IPIF_CONDEMNED) -#define ILL_UNMARK_CHANGING(ill) \ - (ill)->ill_state_flags &= ~ILL_CHANGING; +#define IPIF_IS_CHANGING(ipif) \ + ((ipif)->ipif_state_flags & IPIF_CHANGING) /* Macros used to assert that this thread is a writer */ #define IAM_WRITER_IPSQ(ipsq) ((ipsq)->ipsq_xop->ipx_writer == curthread) @@ -2956,9 +2830,9 @@ typedef struct ip_pktinfo { #define RELEASE_ILL_LOCKS(ill_1, ill_2) \ { \ if (ill_1 != NULL) \ - mutex_exit(&(ill_1)->ill_lock); \ + mutex_exit(&(ill_1)->ill_lock); \ if (ill_2 != NULL && ill_2 != ill_1) \ - mutex_exit(&(ill_2)->ill_lock); \ + mutex_exit(&(ill_2)->ill_lock); \ } /* Get the other protocol instance ill */ @@ -2975,20 +2849,13 @@ typedef struct cmd_info_s struct lifreq *ci_lifr; /* the lifreq struct passed down */ } cmd_info_t; -/* - * List of AH and ESP IPsec acceleration capable ills - */ -typedef struct ipsec_capab_ill_s { - uint_t ill_index; - boolean_t ill_isv6; - struct ipsec_capab_ill_s *next; -} ipsec_capab_ill_t; - extern struct kmem_cache *ire_cache; extern ipaddr_t ip_g_all_ones; -extern uint_t ip_loopback_mtu; /* /etc/system */ +extern uint_t ip_loopback_mtu; /* /etc/system */ +extern uint_t ip_loopback_mtuplus; +extern uint_t ip_loopback_mtu_v6plus; extern vmem_t *ip_minor_arena_sa; extern vmem_t *ip_minor_arena_la; @@ -3014,18 +2881,18 @@ extern vmem_t *ip_minor_arena_la; #define ips_ip_g_send_redirects ips_param_arr[5].ip_param_value #define ips_ip_g_forward_directed_bcast ips_param_arr[6].ip_param_value #define ips_ip_mrtdebug ips_param_arr[7].ip_param_value -#define 
ips_ip_timer_interval ips_param_arr[8].ip_param_value -#define ips_ip_ire_arp_interval ips_param_arr[9].ip_param_value -#define ips_ip_ire_redir_interval ips_param_arr[10].ip_param_value +#define ips_ip_ire_reclaim_fraction ips_param_arr[8].ip_param_value +#define ips_ip_nce_reclaim_fraction ips_param_arr[9].ip_param_value +#define ips_ip_dce_reclaim_fraction ips_param_arr[10].ip_param_value #define ips_ip_def_ttl ips_param_arr[11].ip_param_value #define ips_ip_forward_src_routed ips_param_arr[12].ip_param_value #define ips_ip_wroff_extra ips_param_arr[13].ip_param_value -#define ips_ip_ire_pathmtu_interval ips_param_arr[14].ip_param_value +#define ips_ip_pathmtu_interval ips_param_arr[14].ip_param_value #define ips_ip_icmp_return ips_param_arr[15].ip_param_value #define ips_ip_path_mtu_discovery ips_param_arr[16].ip_param_value -#define ips_ip_ignore_delete_time ips_param_arr[17].ip_param_value +#define ips_ip_pmtu_min ips_param_arr[17].ip_param_value #define ips_ip_ignore_redirect ips_param_arr[18].ip_param_value -#define ips_ip_output_queue ips_param_arr[19].ip_param_value +#define ips_ip_arp_icmp_error ips_param_arr[19].ip_param_value #define ips_ip_broadcast_ttl ips_param_arr[20].ip_param_value #define ips_ip_icmp_err_interval ips_param_arr[21].ip_param_value #define ips_ip_icmp_err_burst ips_param_arr[22].ip_param_value @@ -3046,7 +2913,7 @@ extern vmem_t *ip_minor_arena_la; #define ips_ipv6_send_redirects ips_param_arr[35].ip_param_value #define ips_ipv6_ignore_redirect ips_param_arr[36].ip_param_value #define ips_ipv6_strict_dst_multihoming ips_param_arr[37].ip_param_value -#define ips_ip_ire_reclaim_fraction ips_param_arr[38].ip_param_value +#define ips_src_check ips_param_arr[38].ip_param_value #define ips_ipsec_policy_log_interval ips_param_arr[39].ip_param_value #define ips_pim_accept_clear_messages ips_param_arr[40].ip_param_value #define ips_ip_ndp_unsolicit_interval ips_param_arr[41].ip_param_value @@ -3055,21 +2922,37 @@ extern vmem_t 
*ip_minor_arena_la; /* Misc IP configuration knobs */ #define ips_ip_policy_mask ips_param_arr[44].ip_param_value -#define ips_ip_multirt_resolution_interval ips_param_arr[45].ip_param_value +#define ips_ip_ecmp_behavior ips_param_arr[45].ip_param_value #define ips_ip_multirt_ttl ips_param_arr[46].ip_param_value -#define ips_ip_multidata_outbound ips_param_arr[47].ip_param_value -#define ips_ip_ndp_defense_interval ips_param_arr[48].ip_param_value -#define ips_ip_max_temp_idle ips_param_arr[49].ip_param_value -#define ips_ip_max_temp_defend ips_param_arr[50].ip_param_value -#define ips_ip_max_defend ips_param_arr[51].ip_param_value -#define ips_ip_defend_interval ips_param_arr[52].ip_param_value -#define ips_ip_dup_recovery ips_param_arr[53].ip_param_value -#define ips_ip_restrict_interzone_loopback ips_param_arr[54].ip_param_value -#define ips_ip_lso_outbound ips_param_arr[55].ip_param_value -#define ips_igmp_max_version ips_param_arr[56].ip_param_value -#define ips_mld_max_version ips_param_arr[57].ip_param_value -#define ips_ip_pmtu_min ips_param_arr[58].ip_param_value -#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[59].ip_param_value +#define ips_ip_ire_badcnt_lifetime ips_param_arr[47].ip_param_value +#define ips_ip_max_temp_idle ips_param_arr[48].ip_param_value +#define ips_ip_max_temp_defend ips_param_arr[49].ip_param_value +#define ips_ip_max_defend ips_param_arr[50].ip_param_value +#define ips_ip_defend_interval ips_param_arr[51].ip_param_value +#define ips_ip_dup_recovery ips_param_arr[52].ip_param_value +#define ips_ip_restrict_interzone_loopback ips_param_arr[53].ip_param_value +#define ips_ip_lso_outbound ips_param_arr[54].ip_param_value +#define ips_igmp_max_version ips_param_arr[55].ip_param_value +#define ips_mld_max_version ips_param_arr[56].ip_param_value +#define ips_ipv6_drop_inbound_icmpv6 ips_param_arr[57].ip_param_value +#define ips_arp_probe_delay ips_param_arr[58].ip_param_value +#define ips_arp_fastprobe_delay 
ips_param_arr[59].ip_param_value +#define ips_arp_probe_interval ips_param_arr[60].ip_param_value +#define ips_arp_fastprobe_interval ips_param_arr[61].ip_param_value +#define ips_arp_probe_count ips_param_arr[62].ip_param_value +#define ips_arp_fastprobe_count ips_param_arr[63].ip_param_value +#define ips_ipv4_dad_announce_interval ips_param_arr[64].ip_param_value +#define ips_ipv6_dad_announce_interval ips_param_arr[65].ip_param_value +#define ips_arp_defend_interval ips_param_arr[66].ip_param_value +#define ips_arp_defend_rate ips_param_arr[67].ip_param_value +#define ips_ndp_defend_interval ips_param_arr[68].ip_param_value +#define ips_ndp_defend_rate ips_param_arr[69].ip_param_value +#define ips_arp_defend_period ips_param_arr[70].ip_param_value +#define ips_ndp_defend_period ips_param_arr[71].ip_param_value +#define ips_ipv4_icmp_return_pmtu ips_param_arr[72].ip_param_value +#define ips_ipv6_icmp_return_pmtu ips_param_arr[73].ip_param_value +#define ips_ip_arp_publish_count ips_param_arr[74].ip_param_value +#define ips_ip_arp_publish_interval ips_param_arr[75].ip_param_value extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST @@ -3102,13 +2985,13 @@ extern struct module_info ip_mod_info; ((ipst)->ips_ip4_loopback_out_event.he_interested) #define HOOKS6_INTERESTED_LOOPBACK_OUT(ipst) \ ((ipst)->ips_ip6_loopback_out_event.he_interested) - /* - * Hooks macros used inside of ip + * Hooks marcos used inside of ip + * The callers use the above INTERESTED macros first, hence + * the he_interested check is superflous. 
*/ -#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ - \ - if ((_hook).he_interested) { \ +#define FW_HOOKS(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \ + if ((_hook).he_interested) { \ hook_pkt_event_t info; \ \ _NOTE(CONSTCOND) \ @@ -3121,12 +3004,15 @@ extern struct module_info ip_mod_info; info.hpe_mp = &(_fm); \ info.hpe_mb = _m; \ info.hpe_flags = _llm; \ - if (hook_run(ipst->ips_ipv4_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ + _err = hook_run(ipst->ips_ipv4_net_data->netd_hooks, \ + _event, (hook_data_t)&info); \ + if (_err != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - freemsg(_fm); \ - _fm = NULL; \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3135,9 +3021,8 @@ extern struct module_info ip_mod_info; } \ } -#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst) \ - \ - if ((_hook).he_interested) { \ +#define FW_HOOKS6(_hook, _event, _ilp, _olp, _iph, _fm, _m, _llm, ipst, _err) \ + if ((_hook).he_interested) { \ hook_pkt_event_t info; \ \ _NOTE(CONSTCOND) \ @@ -3150,12 +3035,15 @@ extern struct module_info ip_mod_info; info.hpe_mp = &(_fm); \ info.hpe_mb = _m; \ info.hpe_flags = _llm; \ - if (hook_run(ipst->ips_ipv6_net_data->netd_hooks, \ - _event, (hook_data_t)&info) != 0) { \ + _err = hook_run(ipst->ips_ipv6_net_data->netd_hooks, \ + _event, (hook_data_t)&info); \ + if (_err != 0) { \ ip2dbg(("%s hook dropped mblk chain %p hdr %p\n",\ (_hook).he_name, (void *)_fm, (void *)_m)); \ - freemsg(_fm); \ - _fm = NULL; \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ _iph = NULL; \ _m = NULL; \ } else { \ @@ -3194,24 +3082,6 @@ extern struct module_info ip_mod_info; #define IP_LOOPBACK_ADDR(addr) \ (((addr) & N_IN_CLASSA_NET == N_IN_LOOPBACK_NET)) -#ifdef DEBUG -/* IPsec HW acceleration debugging support */ - -#define IPSECHW_CAPAB 0x0001 /* 
capability negotiation */ -#define IPSECHW_SADB 0x0002 /* SADB exchange */ -#define IPSECHW_PKT 0x0004 /* general packet flow */ -#define IPSECHW_PKTIN 0x0008 /* driver in pkt processing details */ -#define IPSECHW_PKTOUT 0x0010 /* driver out pkt processing details */ - -#define IPSECHW_DEBUG(f, x) if (ipsechw_debug & (f)) { (void) printf x; } -#define IPSECHW_CALL(f, r, x) if (ipsechw_debug & (f)) { (void) r x; } - -extern uint32_t ipsechw_debug; -#else -#define IPSECHW_DEBUG(f, x) {} -#define IPSECHW_CALL(f, r, x) {} -#endif - extern int ip_debug; extern uint_t ip_thread_data; extern krwlock_t ip_thread_rwlock; @@ -3235,8 +3105,6 @@ extern list_t ip_thread_list; /* Default MAC-layer address string length for mac_colon_addr */ #define MAC_STR_LEN 128 -struct ipsec_out_s; - struct mac_header_info_s; extern void ill_frag_timer(void *); @@ -3252,86 +3120,173 @@ extern char *ip_dot_addr(ipaddr_t, char *); extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); extern boolean_t icmp_err_rate_limit(ip_stack_t *); -extern void icmp_time_exceeded(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -extern void icmp_unreachable(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -extern mblk_t *ip_add_info(mblk_t *, ill_t *, uint_t, zoneid_t, ip_stack_t *); -cred_t *ip_best_cred(mblk_t *, conn_t *, pid_t *); -extern mblk_t *ip_bind_v4(queue_t *, mblk_t *, conn_t *); -extern boolean_t ip_bind_ipsec_policy_set(conn_t *, mblk_t *); -extern int ip_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, - uint16_t, boolean_t); -extern int ip_proto_bind_laddr_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t, - uint16_t, boolean_t); -extern int ip_proto_bind_connected_v4(conn_t *, mblk_t **, - uint8_t, ipaddr_t *, uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t, - cred_t *); -extern int ip_bind_connected_v4(conn_t *, mblk_t **, uint8_t, ipaddr_t *, - uint16_t, ipaddr_t, uint16_t, boolean_t, boolean_t, cred_t *); 
+extern void icmp_frag_needed(mblk_t *, int, ip_recv_attr_t *); +extern mblk_t *icmp_inbound_v4(mblk_t *, ip_recv_attr_t *); +extern void icmp_time_exceeded(mblk_t *, uint8_t, ip_recv_attr_t *); +extern void icmp_unreachable(mblk_t *, uint8_t, ip_recv_attr_t *); +extern boolean_t ip_ipsec_policy_inherit(conn_t *, conn_t *, ip_recv_attr_t *); +extern void *ip_pullup(mblk_t *, ssize_t, ip_recv_attr_t *); +extern void ip_setl2src(mblk_t *, ip_recv_attr_t *, ill_t *); +extern mblk_t *ip_check_and_align_header(mblk_t *, uint_t, ip_recv_attr_t *); +extern mblk_t *ip_check_length(mblk_t *, uchar_t *, ssize_t, uint_t, uint_t, + ip_recv_attr_t *); +extern mblk_t *ip_check_optlen(mblk_t *, ipha_t *, uint_t, uint_t, + ip_recv_attr_t *); +extern mblk_t *ip_fix_dbref(mblk_t *, ip_recv_attr_t *); extern uint_t ip_cksum(mblk_t *, int, uint32_t); extern int ip_close(queue_t *, int); extern uint16_t ip_csum_hdr(ipha_t *); -extern void ip_proto_not_sup(queue_t *, mblk_t *, uint_t, zoneid_t, - ip_stack_t *); +extern void ip_forward_xmit_v4(nce_t *, ill_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *, uint32_t, uint32_t); +extern boolean_t ip_forward_options(mblk_t *, ipha_t *, ill_t *, + ip_recv_attr_t *); +extern int ip_fragment_v4(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn, + uintptr_t *cookie); +extern void ip_proto_not_sup(mblk_t *, ip_recv_attr_t *); extern void ip_ire_g_fini(void); extern void ip_ire_g_init(void); extern void ip_ire_fini(ip_stack_t *); extern void ip_ire_init(ip_stack_t *); +extern void ip_mdata_to_mhi(ill_t *, mblk_t *, struct mac_header_info_s *); extern int ip_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); extern int ip_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); extern int ip_reassemble(mblk_t *, ipf_t *, uint_t, boolean_t, ill_t *, size_t); -extern int ip_opt_set_ill(conn_t *, int, boolean_t, boolean_t, - int, int, mblk_t *); extern void 
ip_rput(queue_t *, mblk_t *); extern void ip_input(ill_t *, ill_rx_ring_t *, mblk_t *, struct mac_header_info_s *); +extern void ip_input_v6(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *); +extern mblk_t *ip_input_common_v4(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *); +extern mblk_t *ip_input_common_v6(ill_t *, ill_rx_ring_t *, mblk_t *, + struct mac_header_info_s *, squeue_t *, mblk_t **, uint_t *); +extern void ill_input_full_v4(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_short_v4(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_full_v6(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern void ill_input_short_v6(mblk_t *, void *, void *, + ip_recv_attr_t *, rtc_t *); +extern ipaddr_t ip_input_options(ipha_t *, ipaddr_t, mblk_t *, + ip_recv_attr_t *, int *); +extern boolean_t ip_input_local_options(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern mblk_t *ip_input_fragment(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern mblk_t *ip_input_fragment_v6(mblk_t *, ip6_t *, ip6_frag_t *, uint_t, + ip_recv_attr_t *); +extern void ip_input_post_ipsec(mblk_t *, ip_recv_attr_t *); +extern void ip_fanout_v4(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern void ip_fanout_v6(mblk_t *, ip6_t *, ip_recv_attr_t *); +extern void ip_fanout_proto_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *, + ip_recv_attr_t *); +extern void ip_fanout_proto_v4(mblk_t *, ipha_t *, ip_recv_attr_t *); +extern void ip_fanout_send_icmp_v4(mblk_t *, uint_t, uint_t, + ip_recv_attr_t *); +extern void ip_fanout_udp_conn(conn_t *, mblk_t *, ipha_t *, ip6_t *, + ip_recv_attr_t *); +extern void ip_fanout_udp_multi_v4(mblk_t *, ipha_t *, uint16_t, uint16_t, + ip_recv_attr_t *); +extern mblk_t *zero_spi_check(mblk_t *, ip_recv_attr_t *); +extern void ip_build_hdrs_v4(uchar_t *, uint_t, const ip_pkt_t *, uint8_t); +extern int ip_find_hdr_v4(ipha_t *, ip_pkt_t *, boolean_t); 
+extern int ip_total_hdrs_len_v4(const ip_pkt_t *); + extern mblk_t *ip_accept_tcp(ill_t *, ill_rx_ring_t *, squeue_t *, mblk_t *, mblk_t **, uint_t *cnt); -extern void ip_rput_dlpi(queue_t *, mblk_t *); -extern void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); -extern void ip_rput_forward_multicast(ipaddr_t, mblk_t *, ipif_t *); +extern void ip_rput_dlpi(ill_t *, mblk_t *); +extern void ip_rput_notdata(ill_t *, mblk_t *); extern void ip_mib2_add_ip_stats(mib2_ipIfStatsEntry_t *, mib2_ipIfStatsEntry_t *); extern void ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *, mib2_ipv6IfIcmpEntry_t *); -extern void ip_udp_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *); -extern void ip_proto_input(queue_t *, mblk_t *, ipha_t *, ire_t *, ill_t *, - uint32_t); extern void ip_rput_other(ipsq_t *, queue_t *, mblk_t *, void *); extern ire_t *ip_check_multihome(void *, ire_t *, ill_t *); -extern void ip_setpktversion(conn_t *, boolean_t, boolean_t, ip_stack_t *); -extern void ip_trash_ire_reclaim(void *); -extern void ip_trash_timer_expire(void *); -extern void ip_wput(queue_t *, mblk_t *); -extern void ip_output(void *, mblk_t *, void *, int); -extern void ip_output_options(void *, mblk_t *, void *, int, - ip_opt_info_t *); - -extern void ip_wput_ire(queue_t *, mblk_t *, ire_t *, conn_t *, int, - zoneid_t); -extern void ip_wput_local(queue_t *, ill_t *, ipha_t *, mblk_t *, ire_t *, - int, zoneid_t); -extern void ip_wput_multicast(queue_t *, mblk_t *, ipif_t *, zoneid_t); -extern void ip_wput_nondata(ipsq_t *, queue_t *, mblk_t *, void *); +extern void ip_send_potential_redirect_v4(mblk_t *, ipha_t *, ire_t *, + ip_recv_attr_t *); +extern int ip_set_destination_v4(ipaddr_t *, ipaddr_t, ipaddr_t, + ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t); +extern int ip_set_destination_v6(in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ip_xmit_attr_t *, iulp_t *, uint32_t, uint_t); + +extern int ip_output_simple(mblk_t *, ip_xmit_attr_t *); +extern int 
ip_output_simple_v4(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_simple_v6(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_options(mblk_t *, ipha_t *, ip_xmit_attr_t *, + ill_t *); +extern void ip_output_local_options(ipha_t *, ip_stack_t *); + +extern ip_xmit_attr_t *conn_get_ixa(conn_t *, boolean_t); +extern ip_xmit_attr_t *conn_get_ixa_tryhard(conn_t *, boolean_t); +extern ip_xmit_attr_t *conn_replace_ixa(conn_t *, ip_xmit_attr_t *); +extern ip_xmit_attr_t *conn_get_ixa_exclusive(conn_t *); +extern ip_xmit_attr_t *ip_xmit_attr_duplicate(ip_xmit_attr_t *); +extern void ip_xmit_attr_replace_tsl(ip_xmit_attr_t *, ts_label_t *); +extern void ip_xmit_attr_restore_tsl(ip_xmit_attr_t *, cred_t *); +boolean_t ip_recv_attr_replace_label(ip_recv_attr_t *, ts_label_t *); +extern void ixa_inactive(ip_xmit_attr_t *); +extern void ixa_refrele(ip_xmit_attr_t *); +extern boolean_t ixa_check_drain_insert(conn_t *, ip_xmit_attr_t *); +extern void ixa_cleanup(ip_xmit_attr_t *); +extern void ira_cleanup(ip_recv_attr_t *, boolean_t); +extern void ixa_safe_copy(ip_xmit_attr_t *, ip_xmit_attr_t *); + +extern int conn_ip_output(mblk_t *, ip_xmit_attr_t *); +extern boolean_t ip_output_verify_local(ip_xmit_attr_t *); +extern mblk_t *ip_output_process_local(mblk_t *, ip_xmit_attr_t *, boolean_t, + boolean_t, conn_t *); + +extern int conn_opt_get(conn_opt_arg_t *, t_scalar_t, t_scalar_t, + uchar_t *); +extern int conn_opt_set(conn_opt_arg_t *, t_scalar_t, t_scalar_t, uint_t, + uchar_t *, boolean_t, cred_t *); +extern boolean_t conn_same_as_last_v4(conn_t *, sin_t *); +extern boolean_t conn_same_as_last_v6(conn_t *, sin6_t *); +extern int conn_update_label(const conn_t *, const ip_xmit_attr_t *, + const in6_addr_t *, ip_pkt_t *); + +extern int ip_opt_set_multicast_group(conn_t *, t_scalar_t, + uchar_t *, boolean_t, boolean_t); +extern int ip_opt_set_multicast_sources(conn_t *, t_scalar_t, + uchar_t *, boolean_t, boolean_t); +extern int conn_getsockname(conn_t *, struct sockaddr *, 
uint_t *); +extern int conn_getpeername(conn_t *, struct sockaddr *, uint_t *); + +extern int conn_build_hdr_template(conn_t *, uint_t, uint_t, + const in6_addr_t *, const in6_addr_t *, uint32_t); +extern mblk_t *conn_prepend_hdr(ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, uint8_t, uint32_t, uint_t, + mblk_t *, uint_t, uint_t, uint32_t *, int *); +extern void ip_attr_newdst(ip_xmit_attr_t *); +extern void ip_attr_nexthop(const ip_pkt_t *, const ip_xmit_attr_t *, + const in6_addr_t *, in6_addr_t *); +extern int conn_connect(conn_t *, iulp_t *, uint32_t); +extern int ip_attr_connect(const conn_t *, ip_xmit_attr_t *, + const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, in_port_t, + in6_addr_t *, iulp_t *, uint32_t); +extern int conn_inherit_parent(conn_t *, conn_t *); + +extern void conn_ixa_cleanup(conn_t *connp, void *arg); + +extern boolean_t conn_wantpacket(conn_t *, ip_recv_attr_t *, ipha_t *); +extern uint_t ip_type_v4(ipaddr_t, ip_stack_t *); +extern uint_t ip_type_v6(const in6_addr_t *, ip_stack_t *); + +extern void ip_wput_nondata(queue_t *, mblk_t *); extern void ip_wsrv(queue_t *); extern char *ip_nv_lookup(nv_t *, int); extern boolean_t ip_local_addr_ok_v6(const in6_addr_t *, const in6_addr_t *); extern boolean_t ip_remote_addr_ok_v6(const in6_addr_t *, const in6_addr_t *); extern ipaddr_t ip_massage_options(ipha_t *, netstack_t *); extern ipaddr_t ip_net_mask(ipaddr_t); -extern void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, - ip_stack_t *); -extern ipxmit_state_t ip_xmit_v4(mblk_t *, ire_t *, struct ipsec_out_s *, - boolean_t, conn_t *); -extern int ip_hdr_complete(ipha_t *, zoneid_t, ip_stack_t *); +extern void arp_bringup_done(ill_t *, int); +extern void arp_replumb_done(ill_t *, int); extern struct qinit iprinitv6; -extern struct qinit ipwinitv6; extern void ipmp_init(ip_stack_t *); extern void ipmp_destroy(ip_stack_t *); @@ -3347,12 +3302,11 @@ extern ill_t 
*ipmp_illgrp_add_ipif(ipmp_illgrp_t *, ipif_t *); extern void ipmp_illgrp_del_ipif(ipmp_illgrp_t *, ipif_t *); extern ill_t *ipmp_illgrp_next_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *); -extern ill_t *ipmp_illgrp_cast_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *); extern ill_t *ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *); extern void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *); -extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, mblk_t *, - boolean_t); +extern ipmp_arpent_t *ipmp_illgrp_create_arpent(ipmp_illgrp_t *, + boolean_t, ipaddr_t, uchar_t *, size_t, uint16_t); extern void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *, ipmp_arpent_t *); extern ipmp_arpent_t *ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *, ipaddr_t *); extern void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *); @@ -3373,19 +3327,25 @@ extern ill_t *ipmp_ipif_bound_ill(const ipif_t *); extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); +extern boolean_t ipmp_packet_is_probe(mblk_t *, ill_t *); +extern ill_t *ipmp_ill_get_xmit_ill(ill_t *, boolean_t); +extern void ipmp_ncec_flush_nce(ncec_t *); +extern void ipmp_ncec_fastpath(ncec_t *, ill_t *); extern void conn_drain_insert(conn_t *, idl_tx_list_t *); +extern void conn_setqfull(conn_t *, boolean_t *); +extern void conn_clrqfull(conn_t *, boolean_t *); extern int conn_ipsec_length(conn_t *); -extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, - ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); -extern int ipsec_out_extra_length(mblk_t *); -extern int ipsec_in_extra_length(mblk_t *); -extern mblk_t *ipsec_in_alloc(boolean_t, netstack_t *); -extern boolean_t ipsec_in_is_secure(mblk_t *); -extern void ipsec_out_process(queue_t *, mblk_t *, ire_t *, uint_t); -extern void ipsec_out_to_in(mblk_t *); -extern void ip_fanout_proto_again(mblk_t *, ill_t *, 
ill_t *, ire_t *); +extern uint_t ip_get_pmtu(ip_xmit_attr_t *); +extern uint_t ip_get_base_mtu(ill_t *, ire_t *); +extern mblk_t *ip_output_attach_policy(mblk_t *, ipha_t *, ip6_t *, + const conn_t *, ip_xmit_attr_t *); +extern int ipsec_out_extra_length(ip_xmit_attr_t *); +extern int ipsec_out_process(mblk_t *, ip_xmit_attr_t *); +extern int ip_output_post_ipsec(mblk_t *, ip_xmit_attr_t *); +extern void ipsec_out_to_in(ip_xmit_attr_t *, ill_t *ill, + ip_recv_attr_t *); extern void ire_cleanup(ire_t *); extern void ire_inactive(ire_t *); @@ -3407,14 +3367,13 @@ extern uint_t ip_srcid_find_addr(const in6_addr_t *, zoneid_t, netstack_t *); extern uint8_t ipoptp_next(ipoptp_t *); extern uint8_t ipoptp_first(ipoptp_t *, ipha_t *); -extern int ip_opt_get_user(const ipha_t *, uchar_t *); +extern int ip_opt_get_user(conn_t *, uchar_t *); extern int ipsec_req_from_conn(conn_t *, ipsec_req_t *, int); extern int ip_snmp_get(queue_t *q, mblk_t *mctl, int level); extern int ip_snmp_set(queue_t *q, int, int, uchar_t *, int); extern void ip_process_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_quiesce_conn(conn_t *); extern void ip_reprocess_ioctl(ipsq_t *, queue_t *, mblk_t *, void *); -extern void ip_restart_optmgmt(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_ioctl_finish(queue_t *, mblk_t *, int, int, ipsq_t *); extern boolean_t ip_cmpbuf(const void *, uint_t, boolean_t, const void *, @@ -3425,32 +3384,36 @@ extern void ip_savebuf(void **, uint_t *, boolean_t, const void *, uint_t); extern boolean_t ipsq_pending_mp_cleanup(ill_t *, conn_t *); extern void conn_ioctl_cleanup(conn_t *); -extern ill_t *conn_get_held_ill(conn_t *, ill_t **, int *); - -struct tcp_stack; -extern void ip_xmit_reset_serialize(mblk_t *, int, zoneid_t, struct tcp_stack *, - conn_t *); - -struct multidata_s; -struct pdesc_s; - -extern mblk_t *ip_mdinfo_alloc(ill_mdt_capab_t *); -extern mblk_t *ip_mdinfo_return(ire_t *, conn_t *, char *, ill_mdt_capab_t *); -extern mblk_t 
*ip_lsoinfo_alloc(ill_lso_capab_t *); -extern mblk_t *ip_lsoinfo_return(ire_t *, conn_t *, char *, - ill_lso_capab_t *); -extern uint_t ip_md_cksum(struct pdesc_s *, int, uint_t); -extern boolean_t ip_md_addr_attr(struct multidata_s *, struct pdesc_s *, - const mblk_t *); -extern boolean_t ip_md_hcksum_attr(struct multidata_s *, struct pdesc_s *, - uint32_t, uint32_t, uint32_t, uint32_t); -extern boolean_t ip_md_zcopy_attr(struct multidata_s *, struct pdesc_s *, - uint_t); + extern void ip_unbind(conn_t *); extern void tnet_init(void); extern void tnet_fini(void); +/* + * Hook functions to enable cluster networking + * On non-clustered systems these vectors must always be NULL. + */ +extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, void *args); +extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, uint8_t *faddrp, + void *args); +extern int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, + boolean_t is_outgoing, sa_family_t addr_family, uint8_t *laddrp, + in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args); +extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, + void *); +extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol, + uint8_t *ptr, size_t len, void *args); +extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol, + uint32_t spi, void *args); +extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol, + uint32_t spi, void *args); +extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, + sa_family_t, in6_addr_t, in6_addr_t, void *); + + /* Hooks for CGTP (multirt routes) filtering module */ #define CGTP_FILTER_REV_1 1 #define CGTP_FILTER_REV_2 2 @@ -3491,73 +3454,6 @@ extern int ip_cgtp_filter_register(netstackid_t, cgtp_filter_ops_t *); extern int ip_cgtp_filter_unregister(netstackid_t); extern int 
ip_cgtp_filter_is_registered(netstackid_t); -/* Flags for ire_multirt_lookup() */ - -#define MULTIRT_USESTAMP 0x0001 -#define MULTIRT_SETSTAMP 0x0002 -#define MULTIRT_CACHEGW 0x0004 - -/* Debug stuff for multirt route resolution. */ -#if defined(DEBUG) && !defined(__lint) -/* Our "don't send, rather drop" flag. */ -#define MULTIRT_DEBUG_FLAG 0x8000 - -#define MULTIRT_TRACE(x) ip2dbg(x) - -#define MULTIRT_DEBUG_TAG(mblk) \ - do { \ - ASSERT(mblk != NULL); \ - MULTIRT_TRACE(("%s[%d]: tagging mblk %p, tag was %d\n", \ - __FILE__, __LINE__, \ - (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \ - (mblk)->b_flag |= MULTIRT_DEBUG_FLAG; \ - } while (0) - -#define MULTIRT_DEBUG_UNTAG(mblk) \ - do { \ - ASSERT(mblk != NULL); \ - MULTIRT_TRACE(("%s[%d]: untagging mblk %p, tag was %d\n", \ - __FILE__, __LINE__, \ - (void *)(mblk), (mblk)->b_flag & MULTIRT_DEBUG_FLAG)); \ - (mblk)->b_flag &= ~MULTIRT_DEBUG_FLAG; \ - } while (0) - -#define MULTIRT_DEBUG_TAGGED(mblk) \ - (((mblk)->b_flag & MULTIRT_DEBUG_FLAG) ? B_TRUE : B_FALSE) -#else -#define MULTIRT_DEBUG_TAG(mblk) ASSERT(mblk != NULL) -#define MULTIRT_DEBUG_UNTAG(mblk) ASSERT(mblk != NULL) -#define MULTIRT_DEBUG_TAGGED(mblk) B_FALSE -#endif - -/* - * Per-ILL Multidata Transmit capabilities. 
- */ -struct ill_mdt_capab_s { - uint_t ill_mdt_version; /* interface version */ - uint_t ill_mdt_on; /* on/off switch for MDT on this ILL */ - uint_t ill_mdt_hdr_head; /* leading header fragment extra space */ - uint_t ill_mdt_hdr_tail; /* trailing header fragment extra space */ - uint_t ill_mdt_max_pld; /* maximum payload buffers per Multidata */ - uint_t ill_mdt_span_limit; /* maximum payload span per packet */ -}; - -struct ill_hcksum_capab_s { - uint_t ill_hcksum_version; /* interface version */ - uint_t ill_hcksum_txflags; /* capabilities on transmit */ -}; - -struct ill_zerocopy_capab_s { - uint_t ill_zerocopy_version; /* interface version */ - uint_t ill_zerocopy_flags; /* capabilities */ -}; - -struct ill_lso_capab_s { - uint_t ill_lso_on; /* on/off switch for LSO on this ILL */ - uint_t ill_lso_flags; /* capabilities */ - uint_t ill_lso_max; /* maximum size of payload */ -}; - /* * rr_ring_state cycles in the order shown below from RR_FREE through * RR_FREE_IN_PROG and back to RR_FREE. 
@@ -3669,18 +3565,61 @@ extern void ip_squeue_clean_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_quiesce_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_restart_ring(ill_t *, ill_rx_ring_t *); extern void ip_squeue_clean_all(ill_t *); +extern boolean_t ip_source_routed(ipha_t *, ip_stack_t *); extern void tcp_wput(queue_t *, mblk_t *); -extern int ip_fill_mtuinfo(struct in6_addr *, in_port_t, - struct ip6_mtuinfo *, netstack_t *); -extern ipif_t *conn_get_held_ipif(conn_t *, ipif_t **, int *); +extern int ip_fill_mtuinfo(conn_t *, ip_xmit_attr_t *, + struct ip6_mtuinfo *); extern hook_t *ipobs_register_hook(netstack_t *, pfv_t); extern void ipobs_unregister_hook(netstack_t *, hook_t *); extern void ipobs_hook(mblk_t *, int, zoneid_t, zoneid_t, const ill_t *, ip_stack_t *); typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); +extern void dce_g_init(void); +extern void dce_g_destroy(void); +extern void dce_stack_init(ip_stack_t *); +extern void dce_stack_destroy(ip_stack_t *); +extern void dce_cleanup(uint_t, ip_stack_t *); +extern dce_t *dce_get_default(ip_stack_t *); +extern dce_t *dce_lookup_pkt(mblk_t *, ip_xmit_attr_t *, uint_t *); +extern dce_t *dce_lookup_v4(ipaddr_t, ip_stack_t *, uint_t *); +extern dce_t *dce_lookup_v6(const in6_addr_t *, uint_t, ip_stack_t *, + uint_t *); +extern dce_t *dce_lookup_and_add_v4(ipaddr_t, ip_stack_t *); +extern dce_t *dce_lookup_and_add_v6(const in6_addr_t *, uint_t, + ip_stack_t *); +extern int dce_update_uinfo_v4(ipaddr_t, iulp_t *, ip_stack_t *); +extern int dce_update_uinfo_v6(const in6_addr_t *, uint_t, iulp_t *, + ip_stack_t *); +extern int dce_update_uinfo(const in6_addr_t *, uint_t, iulp_t *, + ip_stack_t *); +extern void dce_increment_generation(dce_t *); +extern void dce_increment_all_generations(boolean_t, ip_stack_t *); +extern void dce_refrele(dce_t *); +extern void dce_refhold(dce_t *); +extern void dce_refrele_notr(dce_t *); +extern void dce_refhold_notr(dce_t *); +mblk_t 
*ip_snmp_get_mib2_ip_dce(queue_t *, mblk_t *, ip_stack_t *ipst); + +extern ip_laddr_t ip_laddr_verify_v4(ipaddr_t, zoneid_t, + ip_stack_t *, boolean_t); +extern ip_laddr_t ip_laddr_verify_v6(const in6_addr_t *, zoneid_t, + ip_stack_t *, boolean_t, uint_t); +extern int ip_laddr_fanout_insert(conn_t *); + +extern boolean_t ip_verify_src(mblk_t *, ip_xmit_attr_t *, uint_t *); +extern int ip_verify_ire(mblk_t *, ip_xmit_attr_t *); + +extern mblk_t *ip_xmit_attr_to_mblk(ip_xmit_attr_t *); +extern boolean_t ip_xmit_attr_from_mblk(mblk_t *, ip_xmit_attr_t *); +extern mblk_t *ip_xmit_attr_free_mblk(mblk_t *); +extern mblk_t *ip_recv_attr_to_mblk(ip_recv_attr_t *); +extern boolean_t ip_recv_attr_from_mblk(mblk_t *, ip_recv_attr_t *); +extern mblk_t *ip_recv_attr_free_mblk(mblk_t *); +extern boolean_t ip_recv_attr_is_mblk(mblk_t *); + /* * Squeue tags. Tags only need to be unique when the callback function is the * same to distinguish between different calls, but we use unique tags for @@ -3729,16 +3668,8 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_CONNECT_FINISH 41 #define SQTAG_SYNCHRONOUS_OP 42 #define SQTAG_TCP_SHUTDOWN_OUTPUT 43 -#define SQTAG_XMIT_EARLY_RESET 44 - -#define NOT_OVER_IP(ip_wq) \ - (ip_wq->q_next != NULL || \ - (ip_wq->q_qinfo->qi_minfo->mi_idname) == NULL || \ - strcmp(ip_wq->q_qinfo->qi_minfo->mi_idname, \ - IP_MOD_NAME) != 0 || \ - ip_wq->q_qinfo->qi_minfo->mi_idnum != IP_MOD_ID) +#define SQTAG_TCP_IXA_CLEANUP 44 -#define PROTO_FLOW_CNTRLD(connp) (connp->conn_flow_cntrld) #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c new file mode 100644 index 0000000000..a46d7c4cd0 --- /dev/null +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -0,0 +1,2933 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ucred.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> +#include <sys/policy.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/ethernet.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_types.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include 
<inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <netinet/udp.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +static sin_t sin_null; /* Zero address for quick clears */ +static sin6_t sin6_null; /* Zero address for quick clears */ + +/* + * Return how much size is needed for the different ancillary data items + */ +uint_t +conn_recvancillary_size(conn_t *connp, crb_t recv_ancillary, + ip_recv_attr_t *ira, mblk_t *mp, ip_pkt_t *ipp) +{ + uint_t ancil_size; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + /* + * If IP_RECVDSTADDR is set we include the destination IP + * address as an option. With IP_RECVOPTS we include all + * the IP options. 
+ */ + ancil_size = 0; + if (recv_ancillary.crb_recvdstaddr && + (ira->ira_flags & IRAF_IS_IPV4)) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in_addr); + IP_STAT(ipst, conn_in_recvdstaddr); + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in_pktinfo); + IP_STAT(ipst, conn_in_recvpktinfo); + } + + if ((recv_ancillary.crb_recvopts) && + (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) { + ancil_size += sizeof (struct T_opthdr) + + ipp->ipp_ipv4_options_len; + IP_STAT(ipst, conn_in_recvopts); + } + + if (recv_ancillary.crb_recvslla) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + ill_t *ill; + + /* Make sure ira_l2src is setup if not already */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) { + ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE, + ipst); + if (ill != NULL) { + ip_setl2src(mp, ira, ill); + ill_refrele(ill); + } + } + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct sockaddr_dl); + IP_STAT(ipst, conn_in_recvslla); + } + + if (recv_ancillary.crb_recvif) { + ancil_size += sizeof (struct T_opthdr) + sizeof (uint_t); + IP_STAT(ipst, conn_in_recvif); + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET6) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (struct in6_pktinfo); + IP_STAT(ipst, conn_in_recvpktinfo); + } + + if (recv_ancillary.crb_ipv6_recvhoplimit) { + ancil_size += sizeof (struct T_opthdr) + sizeof (int); + IP_STAT(ipst, conn_in_recvhoplimit); + } + + if (recv_ancillary.crb_ipv6_recvtclass) { + ancil_size += sizeof (struct T_opthdr) + sizeof (int); + IP_STAT(ipst, conn_in_recvtclass); + } + + if (recv_ancillary.crb_ipv6_recvhopopts && + (ipp->ipp_fields & IPPF_HOPOPTS)) { + ancil_size += sizeof (struct T_opthdr) + 
ipp->ipp_hopoptslen; + IP_STAT(ipst, conn_in_recvhopopts); + } + /* + * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS + * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination + * options that appear before a routing header. + * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set. + */ + if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) { + if (recv_ancillary.crb_ipv6_recvrthdrdstopts || + (recv_ancillary.crb_ipv6_recvdstopts && + recv_ancillary.crb_ipv6_recvrthdr)) { + ancil_size += sizeof (struct T_opthdr) + + ipp->ipp_rthdrdstoptslen; + IP_STAT(ipst, conn_in_recvrthdrdstopts); + } + } + if ((recv_ancillary.crb_ipv6_recvrthdr) && + (ipp->ipp_fields & IPPF_RTHDR)) { + ancil_size += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; + IP_STAT(ipst, conn_in_recvrthdr); + } + if ((recv_ancillary.crb_ipv6_recvdstopts || + recv_ancillary.crb_old_ipv6_recvdstopts) && + (ipp->ipp_fields & IPPF_DSTOPTS)) { + ancil_size += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; + IP_STAT(ipst, conn_in_recvdstopts); + } + if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) { + ancil_size += sizeof (struct T_opthdr) + ucredsize; + IP_STAT(ipst, conn_in_recvucred); + } + + /* + * If SO_TIMESTAMP is set allocate the appropriate sized + * buffer. Since gethrestime() expects a pointer aligned + * argument, we allocate space necessary for extra + * alignment (even though it might not be used). + */ + if (recv_ancillary.crb_timestamp) { + ancil_size += sizeof (struct T_opthdr) + + sizeof (timestruc_t) + _POINTER_ALIGNMENT; + IP_STAT(ipst, conn_in_timestamp); + } + + /* + * If IP_RECVTTL is set allocate the appropriate sized buffer + */ + if (recv_ancillary.crb_recvttl && + (ira->ira_flags & IRAF_IS_IPV4)) { + ancil_size += sizeof (struct T_opthdr) + sizeof (uint8_t); + IP_STAT(ipst, conn_in_recvttl); + } + + return (ancil_size); +} + +/* + * Lay down the ancillary data items at "ancil_buf". 
+ * Assumes caller has used conn_recvancillary_size to allocate a sufficiently + * large buffer - ancil_size. + */ +void +conn_recvancillary_add(conn_t *connp, crb_t recv_ancillary, + ip_recv_attr_t *ira, ip_pkt_t *ipp, uchar_t *ancil_buf, uint_t ancil_size) +{ + /* + * Copy in destination address before options to avoid + * any padding issues. + */ + if (recv_ancillary.crb_recvdstaddr && + (ira->ira_flags & IRAF_IS_IPV4)) { + struct T_opthdr *toh; + ipaddr_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVDSTADDR; + toh->len = sizeof (struct T_opthdr) + sizeof (ipaddr_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (ipaddr_t *)ancil_buf; + *dstptr = ipp->ipp_addr_v4; + ancil_buf += sizeof (ipaddr_t); + ancil_size -= toh->len; + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct T_opthdr *toh; + struct in_pktinfo *pktinfop; + ill_t *ill; + ipif_t *ipif; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_PKTINFO; + toh->len = sizeof (struct T_opthdr) + sizeof (*pktinfop); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + pktinfop = (struct in_pktinfo *)ancil_buf; + + pktinfop->ipi_ifindex = ira->ira_ruifindex; + pktinfop->ipi_spec_dst.s_addr = INADDR_ANY; + + /* Find a good address to report */ + ill = ill_lookup_on_ifindex(ira->ira_ruifindex, B_FALSE, ipst); + if (ill != NULL) { + ipif = ipif_good_addr(ill, IPCL_ZONEID(connp)); + if (ipif != NULL) { + pktinfop->ipi_spec_dst.s_addr = + ipif->ipif_lcl_addr; + ipif_refrele(ipif); + } + ill_refrele(ill); + } + pktinfop->ipi_addr.s_addr = ipp->ipp_addr_v4; + ancil_buf += sizeof (struct in_pktinfo); + ancil_size -= toh->len; + } + + if ((recv_ancillary.crb_recvopts) && + (ipp->ipp_fields & IPPF_IPV4_OPTIONS)) { + struct 
T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_ipv4_options_len; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_ipv4_options, ancil_buf, + ipp->ipp_ipv4_options_len); + ancil_buf += ipp->ipp_ipv4_options_len; + ancil_size -= toh->len; + } + + if (recv_ancillary.crb_recvslla) { + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct T_opthdr *toh; + struct sockaddr_dl *dstptr; + ill_t *ill; + int alen = 0; + + ill = ill_lookup_on_ifindex(ira->ira_rifindex, B_FALSE, ipst); + if (ill != NULL) + alen = ill->ill_phys_addr_length; + + /* + * For loopback multicast and broadcast the packet arrives + * with ira_ruifdex being the physical interface, but + * ira_l2src is all zero since ip_postfrag_loopback doesn't + * know our l2src. We don't report the address in that case. + */ + if (ira->ira_flags & IRAF_LOOPBACK) + alen = 0; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVSLLA; + toh->len = sizeof (struct T_opthdr) + + sizeof (struct sockaddr_dl); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (struct sockaddr_dl *)ancil_buf; + dstptr->sdl_family = AF_LINK; + dstptr->sdl_index = ira->ira_ruifindex; + if (ill != NULL) + dstptr->sdl_type = ill->ill_type; + else + dstptr->sdl_type = 0; + dstptr->sdl_nlen = 0; + dstptr->sdl_alen = alen; + dstptr->sdl_slen = 0; + bcopy(ira->ira_l2src, dstptr->sdl_data, alen); + ancil_buf += sizeof (struct sockaddr_dl); + ancil_size -= toh->len; + if (ill != NULL) + ill_refrele(ill); + } + + if (recv_ancillary.crb_recvif) { + struct T_opthdr *toh; + uint_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVIF; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (uint_t *)ancil_buf; + *dstptr = ira->ira_ruifindex; 
+ ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + + /* + * ip_recvpktinfo is used for both AF_INET and AF_INET6 but + * are different + */ + if (recv_ancillary.crb_ip_recvpktinfo && + connp->conn_family == AF_INET6) { + struct T_opthdr *toh; + struct in6_pktinfo *pkti; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_PKTINFO; + toh->len = sizeof (struct T_opthdr) + sizeof (*pkti); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + pkti = (struct in6_pktinfo *)ancil_buf; + if (ira->ira_flags & IRAF_IS_IPV4) { + IN6_IPADDR_TO_V4MAPPED(ipp->ipp_addr_v4, + &pkti->ipi6_addr); + } else { + pkti->ipi6_addr = ipp->ipp_addr; + } + pkti->ipi6_ifindex = ira->ira_ruifindex; + + ancil_buf += sizeof (*pkti); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvhoplimit) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPLIMIT; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + *(uint_t *)ancil_buf = ipp->ipp_hoplimit; + ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvtclass) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_TCLASS; + toh->len = sizeof (struct T_opthdr) + sizeof (uint_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + + if (ira->ira_flags & IRAF_IS_IPV4) + *(uint_t *)ancil_buf = ipp->ipp_type_of_service; + else + *(uint_t *)ancil_buf = ipp->ipp_tclass; + ancil_buf += sizeof (uint_t); + ancil_size -= toh->len; + } + if (recv_ancillary.crb_ipv6_recvhopopts && + (ipp->ipp_fields & IPPF_HOPOPTS)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + 
bcopy(ipp->ipp_hopopts, ancil_buf, ipp->ipp_hopoptslen); + ancil_buf += ipp->ipp_hopoptslen; + ancil_size -= toh->len; + } + /* + * To honor RFC3542 when an application asks for both IPV6_RECVDSTOPTS + * and IPV6_RECVRTHDR, we pass up the item rthdrdstopts (the destination + * options that appear before a routing header. + * We also pass them up if IPV6_RECVRTHDRDSTOPTS is set. + */ + if (ipp->ipp_fields & IPPF_RTHDRDSTOPTS) { + if (recv_ancillary.crb_ipv6_recvrthdrdstopts || + (recv_ancillary.crb_ipv6_recvdstopts && + recv_ancillary.crb_ipv6_recvrthdr)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_DSTOPTS; + toh->len = sizeof (struct T_opthdr) + + ipp->ipp_rthdrdstoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_rthdrdstopts, ancil_buf, + ipp->ipp_rthdrdstoptslen); + ancil_buf += ipp->ipp_rthdrdstoptslen; + ancil_size -= toh->len; + } + } + if (recv_ancillary.crb_ipv6_recvrthdr && + (ipp->ipp_fields & IPPF_RTHDR)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_RTHDR; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_rthdr, ancil_buf, ipp->ipp_rthdrlen); + ancil_buf += ipp->ipp_rthdrlen; + ancil_size -= toh->len; + } + if ((recv_ancillary.crb_ipv6_recvdstopts || + recv_ancillary.crb_old_ipv6_recvdstopts) && + (ipp->ipp_fields & IPPF_DSTOPTS)) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_DSTOPTS; + toh->len = sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + bcopy(ipp->ipp_dstopts, ancil_buf, ipp->ipp_dstoptslen); + ancil_buf += ipp->ipp_dstoptslen; + ancil_size -= toh->len; + } + + if (recv_ancillary.crb_recvucred && ira->ira_cred != NULL) { + struct T_opthdr *toh; + cred_t *rcr = 
connp->conn_cred; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = SOL_SOCKET; + toh->name = SCM_UCRED; + toh->len = sizeof (struct T_opthdr) + ucredsize; + toh->status = 0; + (void) cred2ucred(ira->ira_cred, ira->ira_cpid, &toh[1], rcr); + ancil_buf += toh->len; + ancil_size -= toh->len; + } + if (recv_ancillary.crb_timestamp) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = SOL_SOCKET; + toh->name = SCM_TIMESTAMP; + toh->len = sizeof (struct T_opthdr) + + sizeof (timestruc_t) + _POINTER_ALIGNMENT; + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + /* Align for gethrestime() */ + ancil_buf = (uchar_t *)P2ROUNDUP((intptr_t)ancil_buf, + sizeof (intptr_t)); + gethrestime((timestruc_t *)ancil_buf); + ancil_buf = (uchar_t *)toh + toh->len; + ancil_size -= toh->len; + } + + /* + * CAUTION: + * Due to aligment issues + * Processing of IP_RECVTTL option + * should always be the last. Adding + * any option processing after this will + * cause alignment panic. + */ + if (recv_ancillary.crb_recvttl && + (ira->ira_flags & IRAF_IS_IPV4)) { + struct T_opthdr *toh; + uint8_t *dstptr; + + toh = (struct T_opthdr *)ancil_buf; + toh->level = IPPROTO_IP; + toh->name = IP_RECVTTL; + toh->len = sizeof (struct T_opthdr) + sizeof (uint8_t); + toh->status = 0; + ancil_buf += sizeof (struct T_opthdr); + dstptr = (uint8_t *)ancil_buf; + *dstptr = ipp->ipp_hoplimit; + ancil_buf += sizeof (uint8_t); + ancil_size -= toh->len; + } + + /* Consumed all of allocated space */ + ASSERT(ancil_size == 0); + +} + +/* + * This routine retrieves the current status of socket options. + * It returns the size of the option retrieved, or -1. 
+ */ +int +conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, + uchar_t *ptr) +{ + int *i1 = (int *)ptr; + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t len; + + ASSERT(MUTEX_HELD(&coa->coa_connp->conn_lock)); + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_DEBUG: + *i1 = connp->conn_debug ? SO_DEBUG : 0; + break; /* goto sizeof (int) option return */ + case SO_KEEPALIVE: + *i1 = connp->conn_keepalive ? SO_KEEPALIVE : 0; + break; + case SO_LINGER: { + struct linger *lgr = (struct linger *)ptr; + + lgr->l_onoff = connp->conn_linger ? SO_LINGER : 0; + lgr->l_linger = connp->conn_lingertime; + } + return (sizeof (struct linger)); + + case SO_OOBINLINE: + *i1 = connp->conn_oobinline ? SO_OOBINLINE : 0; + break; + case SO_REUSEADDR: + *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; + break; /* goto sizeof (int) option return */ + case SO_TYPE: + *i1 = connp->conn_so_type; + break; /* goto sizeof (int) option return */ + case SO_DONTROUTE: + *i1 = (ixa->ixa_flags & IXAF_DONTROUTE) ? + SO_DONTROUTE : 0; + break; /* goto sizeof (int) option return */ + case SO_USELOOPBACK: + *i1 = connp->conn_useloopback ? SO_USELOOPBACK : 0; + break; /* goto sizeof (int) option return */ + case SO_BROADCAST: + *i1 = connp->conn_broadcast ? SO_BROADCAST : 0; + break; /* goto sizeof (int) option return */ + + case SO_SNDBUF: + *i1 = connp->conn_sndbuf; + break; /* goto sizeof (int) option return */ + case SO_RCVBUF: + *i1 = connp->conn_rcvbuf; + break; /* goto sizeof (int) option return */ + case SO_RCVTIMEO: + case SO_SNDTIMEO: + /* + * Pass these two options in order for third part + * protocol usage. Here just return directly. + */ + *i1 = 0; + break; + case SO_DGRAM_ERRIND: + *i1 = connp->conn_dgram_errind ? 
SO_DGRAM_ERRIND : 0; + break; /* goto sizeof (int) option return */ + case SO_RECVUCRED: + *i1 = connp->conn_recv_ancillary.crb_recvucred; + break; /* goto sizeof (int) option return */ + case SO_TIMESTAMP: + *i1 = connp->conn_recv_ancillary.crb_timestamp; + break; /* goto sizeof (int) option return */ +#ifdef SO_VRRP + case SO_VRRP: + *i1 = connp->conn_isvrrp; + break; /* goto sizeof (int) option return */ +#endif + case SO_ANON_MLP: + *i1 = connp->conn_anon_mlp; + break; /* goto sizeof (int) option return */ + case SO_MAC_EXEMPT: + *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); + break; /* goto sizeof (int) option return */ + case SO_MAC_IMPLICIT: + *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); + break; /* goto sizeof (int) option return */ + case SO_ALLZONES: + *i1 = connp->conn_allzones; + break; /* goto sizeof (int) option return */ + case SO_EXCLBIND: + *i1 = connp->conn_exclbind ? SO_EXCLBIND : 0; + break; + case SO_PROTOTYPE: + *i1 = connp->conn_proto; + break; + + case SO_DOMAIN: + *i1 = connp->conn_family; + break; + default: + return (-1); + } + break; + case IPPROTO_IP: + if (connp->conn_family != AF_INET) + return (-1); + switch (name) { + case IP_OPTIONS: + case T_IP_OPTIONS: + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (0); + + len = ipp->ipp_ipv4_options_len; + if (len > 0) { + bcopy(ipp->ipp_ipv4_options, ptr, len); + } + return (len); + + case IP_PKTINFO: { + /* + * This also handles IP_RECVPKTINFO. + * IP_PKTINFO and IP_RECVPKTINFO have same value. + * Differentiation is based on the size of the + * argument passed in. + */ + struct in_pktinfo *pktinfo; + +#ifdef notdef + /* optcom doesn't provide a length with "get" */ + if (inlen == sizeof (int)) { + /* This is IP_RECVPKTINFO option. */ + *i1 = connp->conn_recv_ancillary. + crb_ip_recvpktinfo; + return (sizeof (int)); + } +#endif + /* XXX assumes that caller has room for max size! 
*/ + + pktinfo = (struct in_pktinfo *)ptr; + pktinfo->ipi_ifindex = ixa->ixa_ifindex; + if (ipp->ipp_fields & IPPF_ADDR) + pktinfo->ipi_spec_dst.s_addr = ipp->ipp_addr_v4; + else + pktinfo->ipi_spec_dst.s_addr = INADDR_ANY; + return (sizeof (struct in_pktinfo)); + } + case IP_DONTFRAG: + *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0; + return (sizeof (int)); + case IP_TOS: + case T_IP_TOS: + *i1 = (int)ipp->ipp_type_of_service; + break; /* goto sizeof (int) option return */ + case IP_TTL: + *i1 = (int)ipp->ipp_unicast_hops; + break; /* goto sizeof (int) option return */ + case IP_DHCPINIT_IF: + return (-1); + case IP_NEXTHOP: + if (ixa->ixa_flags & IXAF_NEXTHOP_SET) { + *(ipaddr_t *)ptr = ixa->ixa_nexthop_v4; + return (sizeof (ipaddr_t)); + } else { + return (0); + } + + case IP_MULTICAST_IF: + /* 0 address if not set */ + *(ipaddr_t *)ptr = ixa->ixa_multicast_ifaddr; + return (sizeof (ipaddr_t)); + case IP_MULTICAST_TTL: + *(uchar_t *)ptr = ixa->ixa_multicast_ttl; + return (sizeof (uchar_t)); + case IP_MULTICAST_LOOP: + *ptr = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 
1 : 0; + return (sizeof (uint8_t)); + case IP_RECVOPTS: + *i1 = connp->conn_recv_ancillary.crb_recvopts; + break; /* goto sizeof (int) option return */ + case IP_RECVDSTADDR: + *i1 = connp->conn_recv_ancillary.crb_recvdstaddr; + break; /* goto sizeof (int) option return */ + case IP_RECVIF: + *i1 = connp->conn_recv_ancillary.crb_recvif; + break; /* goto sizeof (int) option return */ + case IP_RECVSLLA: + *i1 = connp->conn_recv_ancillary.crb_recvslla; + break; /* goto sizeof (int) option return */ + case IP_RECVTTL: + *i1 = connp->conn_recv_ancillary.crb_recvttl; + break; /* goto sizeof (int) option return */ + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + /* cannot "get" the value for these */ + return (-1); + case MRT_VERSION: + case MRT_ASSERT: + (void) ip_mrouter_get(name, connp, ptr); + return (sizeof (int)); + case IP_SEC_OPT: + return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr, + IPSEC_AF_V4)); + case IP_BOUND_IF: + /* Zero if not set */ + *i1 = connp->conn_bound_if; + break; /* goto sizeof (int) option return */ + case IP_UNSPEC_SRC: + *i1 = connp->conn_unspec_src; + break; /* goto sizeof (int) option return */ + case IP_BROADCAST_TTL: + if (ixa->ixa_flags & IXAF_BROADCAST_TTL_SET) + *(uchar_t *)ptr = ixa->ixa_broadcast_ttl; + else + *(uchar_t *)ptr = ipst->ips_ip_broadcast_ttl; + return (sizeof (uchar_t)); + default: + return (-1); + } + break; + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) + return (-1); + switch (name) { + case IPV6_UNICAST_HOPS: + *i1 = (int)ipp->ipp_unicast_hops; + break; /* goto sizeof (int) option return */ + case 
IPV6_MULTICAST_IF: + /* 0 index if not set */ + *i1 = ixa->ixa_multicast_ifindex; + break; /* goto sizeof (int) option return */ + case IPV6_MULTICAST_HOPS: + *i1 = ixa->ixa_multicast_ttl; + break; /* goto sizeof (int) option return */ + case IPV6_MULTICAST_LOOP: + *i1 = (ixa->ixa_flags & IXAF_MULTICAST_LOOP) ? 1 : 0; + break; /* goto sizeof (int) option return */ + case IPV6_JOIN_GROUP: + case IPV6_LEAVE_GROUP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + /* cannot "get" the value for these */ + return (-1); + case IPV6_BOUND_IF: + /* Zero if not set */ + *i1 = connp->conn_bound_if; + break; /* goto sizeof (int) option return */ + case IPV6_UNSPEC_SRC: + *i1 = connp->conn_unspec_src; + break; /* goto sizeof (int) option return */ + case IPV6_RECVPKTINFO: + *i1 = connp->conn_recv_ancillary.crb_ip_recvpktinfo; + break; /* goto sizeof (int) option return */ + case IPV6_RECVTCLASS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvtclass; + break; /* goto sizeof (int) option return */ + case IPV6_RECVPATHMTU: + *i1 = connp->conn_ipv6_recvpathmtu; + break; /* goto sizeof (int) option return */ + case IPV6_RECVHOPLIMIT: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhoplimit; + break; /* goto sizeof (int) option return */ + case IPV6_RECVHOPOPTS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvhopopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVDSTOPTS: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvdstopts; + break; /* goto sizeof (int) option return */ + case _OLD_IPV6_RECVDSTOPTS: + *i1 = + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVRTHDRDSTOPTS: + *i1 = connp->conn_recv_ancillary. 
+ crb_ipv6_recvrthdrdstopts; + break; /* goto sizeof (int) option return */ + case IPV6_RECVRTHDR: + *i1 = connp->conn_recv_ancillary.crb_ipv6_recvrthdr; + break; /* goto sizeof (int) option return */ + case IPV6_PKTINFO: { + /* XXX assumes that caller has room for max size! */ + struct in6_pktinfo *pkti; + + pkti = (struct in6_pktinfo *)ptr; + pkti->ipi6_ifindex = ixa->ixa_ifindex; + if (ipp->ipp_fields & IPPF_ADDR) + pkti->ipi6_addr = ipp->ipp_addr; + else + pkti->ipi6_addr = ipv6_all_zeros; + return (sizeof (struct in6_pktinfo)); + } + case IPV6_TCLASS: + *i1 = ipp->ipp_tclass; + break; /* goto sizeof (int) option return */ + case IPV6_NEXTHOP: { + sin6_t *sin6 = (sin6_t *)ptr; + + if (ixa->ixa_flags & IXAF_NEXTHOP_SET) + return (0); + + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = ixa->ixa_nexthop_v6; + + return (sizeof (sin6_t)); + } + case IPV6_HOPOPTS: + if (!(ipp->ipp_fields & IPPF_HOPOPTS)) + return (0); + bcopy(ipp->ipp_hopopts, ptr, + ipp->ipp_hopoptslen); + return (ipp->ipp_hopoptslen); + case IPV6_RTHDRDSTOPTS: + if (!(ipp->ipp_fields & IPPF_RTHDRDSTOPTS)) + return (0); + bcopy(ipp->ipp_rthdrdstopts, ptr, + ipp->ipp_rthdrdstoptslen); + return (ipp->ipp_rthdrdstoptslen); + case IPV6_RTHDR: + if (!(ipp->ipp_fields & IPPF_RTHDR)) + return (0); + bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); + return (ipp->ipp_rthdrlen); + case IPV6_DSTOPTS: + if (!(ipp->ipp_fields & IPPF_DSTOPTS)) + return (0); + bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); + return (ipp->ipp_dstoptslen); + case IPV6_PATHMTU: + return (ip_fill_mtuinfo(connp, ixa, + (struct ip6_mtuinfo *)ptr)); + case IPV6_SEC_OPT: + return (ipsec_req_from_conn(connp, (ipsec_req_t *)ptr, + IPSEC_AF_V6)); + case IPV6_SRC_PREFERENCES: + return (ip6_get_src_preferences(ixa, (uint32_t *)ptr)); + case IPV6_DONTFRAG: + *i1 = (ixa->ixa_flags & IXAF_DONTFRAG) != 0; + return (sizeof (int)); + case IPV6_USE_MIN_MTU: + if (ixa->ixa_flags & IXAF_USE_MIN_MTU) + *i1 = 
ixa->ixa_use_min_mtu; + else + *i1 = IPV6_USE_MIN_MTU_MULTICAST; + break; + case IPV6_V6ONLY: + *i1 = connp->conn_ipv6_v6only; + return (sizeof (int)); + default: + return (-1); + } + break; + case IPPROTO_UDP: + switch (name) { + case UDP_ANONPRIVBIND: + *i1 = connp->conn_anon_priv_bind; + break; + case UDP_EXCLBIND: + *i1 = connp->conn_exclbind ? UDP_EXCLBIND : 0; + break; + default: + return (-1); + } + break; + case IPPROTO_TCP: + switch (name) { + case TCP_RECVDSTADDR: + *i1 = connp->conn_recv_ancillary.crb_recvdstaddr; + break; + case TCP_ANONPRIVBIND: + *i1 = connp->conn_anon_priv_bind; + break; + case TCP_EXCLBIND: + *i1 = connp->conn_exclbind ? TCP_EXCLBIND : 0; + break; + default: + return (-1); + } + break; + default: + return (-1); + } + return (sizeof (int)); +} + +static int conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); +static int conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr); + +/* + * This routine sets the most common socket options including some + * that are transport/ULP specific. + * It returns errno or zero. + * + * For fixed length options, there is no sanity check + * of passed in length is done. It is assumed *_optcom_req() + * routines do the right thing. 
+ */ +int +conn_opt_set(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, + uint_t inlen, uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); + + /* We have different functions for different levels */ + switch (level) { + case SOL_SOCKET: + return (conn_opt_set_socket(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_IP: + return (conn_opt_set_ip(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_IPV6: + return (conn_opt_set_ipv6(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_UDP: + return (conn_opt_set_udp(coa, name, inlen, invalp, + checkonly, cr)); + case IPPROTO_TCP: + return (conn_opt_set_tcp(coa, name, inlen, invalp, + checkonly, cr)); + default: + return (0); + } +} + +/* + * Handle SOL_SOCKET + * Note that we do not handle SO_PROTOTYPE here. The ULPs that support + * it implement their own checks and setting of conn_proto. + */ +/* ARGSUSED1 */ +static int +conn_opt_set_socket(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + + switch (name) { + case SO_ALLZONES: + if (IPCL_IS_BOUND(connp)) + return (EINVAL); + break; +#ifdef SO_VRRP + case SO_VRRP: + if (secpolicy_ip_config(cr, checkonly) != 0) + return (EACCES); + break; +#endif + case SO_MAC_EXEMPT: + if (secpolicy_net_mac_aware(cr) != 0) + return (EACCES); + if (IPCL_IS_BOUND(connp)) + return (EINVAL); + break; + case SO_MAC_IMPLICIT: + if (secpolicy_net_mac_implicit(cr) != 0) + return (EACCES); + break; + } + if (checkonly) + return (0); + + mutex_enter(&connp->conn_lock); + /* Here we set the actual option value */ + switch (name) { + case SO_DEBUG: + connp->conn_debug = onoff; + break; + case SO_KEEPALIVE: + connp->conn_keepalive = onoff; + break; + case SO_LINGER: { + struct linger *lgr = (struct linger *)invalp; + + if (lgr->l_onoff) { + connp->conn_linger = 1; + connp->conn_lingertime = lgr->l_linger; + } else { + connp->conn_linger = 0; + connp->conn_lingertime = 0; + } + break; + } + case SO_OOBINLINE: + connp->conn_oobinline = onoff; + coa->coa_changed |= COA_OOBINLINE_CHANGED; + break; + case SO_REUSEADDR: + connp->conn_reuseaddr = onoff; + break; + case SO_DONTROUTE: + if (onoff) + ixa->ixa_flags |= IXAF_DONTROUTE; + else + ixa->ixa_flags &= ~IXAF_DONTROUTE; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case SO_USELOOPBACK: + connp->conn_useloopback = onoff; + break; + case SO_BROADCAST: + connp->conn_broadcast = onoff; + break; + case SO_SNDBUF: + /* ULP has range checked the value */ + connp->conn_sndbuf = *i1; + coa->coa_changed |= COA_SNDBUF_CHANGED; + break; + case SO_RCVBUF: + /* ULP has range checked the value */ + connp->conn_rcvbuf = *i1; + coa->coa_changed |= COA_RCVBUF_CHANGED; + break; + case SO_RCVTIMEO: + case SO_SNDTIMEO: + /* + * Pass these two options in order for third part + * protocol usage. 
+ */ + break; + case SO_DGRAM_ERRIND: + connp->conn_dgram_errind = onoff; + break; + case SO_RECVUCRED: + connp->conn_recv_ancillary.crb_recvucred = onoff; + break; + case SO_ALLZONES: + connp->conn_allzones = onoff; + coa->coa_changed |= COA_ROUTE_CHANGED; + if (onoff) + ixa->ixa_zoneid = ALL_ZONES; + else + ixa->ixa_zoneid = connp->conn_zoneid; + break; + case SO_TIMESTAMP: + connp->conn_recv_ancillary.crb_timestamp = onoff; + break; +#ifdef SO_VRRP + case SO_VRRP: + connp->conn_isvrrp = onoff; + break; +#endif + case SO_ANON_MLP: + connp->conn_anon_mlp = onoff; + break; + case SO_MAC_EXEMPT: + connp->conn_mac_mode = onoff ? + CONN_MAC_AWARE : CONN_MAC_DEFAULT; + break; + case SO_MAC_IMPLICIT: + connp->conn_mac_mode = onoff ? + CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT; + break; + case SO_EXCLBIND: + connp->conn_exclbind = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +/* Handle IPPROTO_IP */ +static int +conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + ipaddr_t addr = (ipaddr_t)*i1; + uint_t ifindex; + zoneid_t zoneid = IPCL_ZONEID(connp); + ipif_t *ipif; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error; + + if (connp->conn_family != AF_INET) + return (EINVAL); + + switch (name) { + case IP_TTL: + /* Don't allow zero */ + if (*i1 < 1 || *i1 > 255) + return (EINVAL); + break; + case IP_MULTICAST_IF: + if (addr == INADDR_ANY) { + /* Clear */ + ifindex = 0; + break; + } + ipif = ipif_lookup_addr(addr, NULL, zoneid, ipst); + if (ipif == NULL) + return (EHOSTUNREACH); + /* not supported by the virtual network iface */ + if (IS_VNI(ipif->ipif_ill)) { + ipif_refrele(ipif); + return (EINVAL); + } + ifindex = ipif->ipif_ill->ill_phyint->phyint_ifindex; + ipif_refrele(ipif); + break; + case IP_NEXTHOP: { + ire_t *ire; + + if (addr == INADDR_ANY) { + /* Clear */ + break; + } + /* Verify that the next-hop is on-link */ + ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_ONLINK, NULL, zoneid, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL) + return (EHOSTUNREACH); + ire_refrele(ire); + break; + } + case IP_OPTIONS: + case T_IP_OPTIONS: { + uint_t newlen; + + if (ipp->ipp_fields & IPPF_LABEL_V4) + newlen = inlen + (ipp->ipp_label_len_v4 + 3) & ~3; + else + newlen = inlen; + if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) { + return (EINVAL); + } + break; + } + case IP_PKTINFO: { + struct in_pktinfo *pktinfo; + + /* Two different valid lengths */ + if (inlen != sizeof (int) && + inlen != sizeof (struct in_pktinfo)) + return (EINVAL); + if (inlen == sizeof (int)) + break; + + pktinfo = (struct in_pktinfo *)invalp; + if (pktinfo->ipi_spec_dst.s_addr != INADDR_ANY) { + switch (ip_laddr_verify_v4(pktinfo->ipi_spec_dst.s_addr, + zoneid, ipst, B_FALSE)) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + break; + default: + return (EADDRNOTAVAIL); + } + } + if (!ip_ifindex_valid(pktinfo->ipi_ifindex, B_FALSE, ipst)) + return (ENXIO); + break; + } + case IP_BOUND_IF: + ifindex = *(uint_t *)i1; + + /* 
Just check it is ok. */ + if (!ip_ifindex_valid(ifindex, B_FALSE, ipst)) + return (ENXIO); + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + /* + * conn_lock protects the bitfields, and is used to + * set the fields atomically. Not needed for ixa settings since + * the caller has an exclusive copy of the ixa. + * We can not hold conn_lock across the multicast options though. + */ + switch (name) { + case IP_OPTIONS: + case T_IP_OPTIONS: + /* Save options for use by IP. */ + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_ipv4_options, + &ipp->ipp_ipv4_options_len); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_ipv4_options_len == 0) { + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; + } else { + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + + case IP_TTL: + mutex_enter(&connp->conn_lock); + ipp->ipp_unicast_hops = *i1; + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IP_TOS: + case T_IP_TOS: + mutex_enter(&connp->conn_lock); + if (*i1 == -1) { + ipp->ipp_type_of_service = 0; + } else { + ipp->ipp_type_of_service = *i1; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IP_MULTICAST_IF: + ixa->ixa_multicast_ifindex = ifindex; + ixa->ixa_multicast_ifaddr = addr; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_MULTICAST_TTL: + ixa->ixa_multicast_ttl = *invalp; + /* Handled automatically by ip_output */ + break; + case IP_MULTICAST_LOOP: + if (*invalp != 0) + ixa->ixa_flags |= IXAF_MULTICAST_LOOP; + else + ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP; + /* Handled automatically by ip_output */ + break; + case IP_RECVOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvopts = onoff; + 
mutex_exit(&connp->conn_lock); + break; + case IP_RECVDSTADDR: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvdstaddr = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVIF: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvif = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVSLLA: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvslla = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_RECVTTL: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_recvttl = onoff; + mutex_exit(&connp->conn_lock); + break; + case IP_PKTINFO: { + /* + * This also handles IP_RECVPKTINFO. + * IP_PKTINFO and IP_RECVPKTINFO have same value. + * Differentiation is based on the size of the + * argument passed in. + */ + struct in_pktinfo *pktinfo; + + if (inlen == sizeof (int)) { + /* This is IP_RECVPKTINFO option. */ + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ip_recvpktinfo = + onoff; + mutex_exit(&connp->conn_lock); + break; + } + + /* This is IP_PKTINFO option. 
*/ + mutex_enter(&connp->conn_lock); + pktinfo = (struct in_pktinfo *)invalp; + if (ipp->ipp_addr_v4 != INADDR_ANY) { + ipp->ipp_fields |= IPPF_ADDR; + IN6_INADDR_TO_V4MAPPED(&pktinfo->ipi_spec_dst, + &ipp->ipp_addr); + } else { + ipp->ipp_fields &= ~IPPF_ADDR; + ipp->ipp_addr = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + ixa->ixa_ifindex = pktinfo->ipi_ifindex; + coa->coa_changed |= COA_ROUTE_CHANGED; + coa->coa_changed |= COA_HEADER_CHANGED; + break; + } + case IP_DONTFRAG: + if (onoff) { + ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; + } else { + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + } + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + return (ip_opt_set_multicast_group(connp, name, + invalp, B_FALSE, checkonly)); + + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + return (ip_opt_set_multicast_sources(connp, name, + invalp, B_FALSE, checkonly)); + + case IP_SEC_OPT: + mutex_enter(&connp->conn_lock); + error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + /* This is an IPsec policy change - redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_NEXTHOP: + ixa->ixa_nexthop_v4 = addr; + if (addr != INADDR_ANY) + ixa->ixa_flags |= IXAF_NEXTHOP_SET; + else + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IP_BOUND_IF: + ixa->ixa_ifindex = ifindex; /* Send */ + mutex_enter(&connp->conn_lock); + connp->conn_incoming_ifindex = ifindex; /* Receive */ + 
connp->conn_bound_if = ifindex; /* getsockopt */ + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IP_UNSPEC_SRC: + mutex_enter(&connp->conn_lock); + connp->conn_unspec_src = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + else + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + + mutex_exit(&connp->conn_lock); + break; + case IP_BROADCAST_TTL: + ixa->ixa_broadcast_ttl = *invalp; + ixa->ixa_flags |= IXAF_BROADCAST_TTL_SET; + /* Handled automatically by ip_output */ + break; + case MRT_INIT: + case MRT_DONE: + case MRT_ADD_VIF: + case MRT_DEL_VIF: + case MRT_ADD_MFC: + case MRT_DEL_MFC: + case MRT_ASSERT: + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + error = ip_mrouter_set((int)name, connp, checkonly, + (uchar_t *)invalp, inlen); + if (error) { + return (error); + } + return (0); + + } + return (0); +} + +/* Handle IPPROTO_IPV6 */ +static int +conn_opt_set_ipv6(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + ip_pkt_t *ipp = coa->coa_ipp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + uint_t ifindex; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error; + + if (connp->conn_family != AF_INET6) + return (EINVAL); + + switch (name) { + case IPV6_MULTICAST_IF: + /* + * The only possible error is EINVAL. + * We call this option on both V4 and V6 + * If both fail, then this call returns + * EINVAL. If at least one of them succeeds we + * return success. + */ + ifindex = *(uint_t *)i1; + + if (!ip_ifindex_valid(ifindex, B_TRUE, ipst) && + !ip_ifindex_valid(ifindex, B_FALSE, ipst)) + return (EINVAL); + break; + case IPV6_UNICAST_HOPS: + /* Don't allow zero. 
-1 means to use default */ + if (*i1 < -1 || *i1 == 0 || *i1 > IPV6_MAX_HOPS) + return (EINVAL); + break; + case IPV6_MULTICAST_HOPS: + /* -1 means use default */ + if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) + return (EINVAL); + break; + case IPV6_MULTICAST_LOOP: + if (*i1 != 0 && *i1 != 1) + return (EINVAL); + break; + case IPV6_BOUND_IF: + ifindex = *(uint_t *)i1; + + if (!ip_ifindex_valid(ifindex, B_TRUE, ipst)) + return (ENXIO); + break; + case IPV6_PKTINFO: { + struct in6_pktinfo *pkti; + boolean_t isv6; + + if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) + return (EINVAL); + if (inlen == 0) + break; /* Clear values below */ + + /* + * Verify the source address and ifindex. Privileged users + * can use any source address. + */ + pkti = (struct in6_pktinfo *)invalp; + + /* + * For link-local addresses we use the ipi6_ifindex when + * we verify the local address. + * If net_rawaccess then any source address can be used. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) && + secpolicy_net_rawaccess(cr) != 0) { + uint_t scopeid = 0; + in6_addr_t *v6src = &pkti->ipi6_addr; + ipaddr_t v4src; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; + + if (IN6_IS_ADDR_V4MAPPED(v6src)) { + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, + zoneid, ipst, B_FALSE); + } + } else { + if (IN6_IS_ADDR_LINKSCOPE(v6src)) + scopeid = pkti->ipi6_ifindex; + + laddr_type = ip_laddr_verify_v6(v6src, zoneid, + ipst, B_FALSE, scopeid); + } + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + break; + default: + return (EADDRNOTAVAIL); + } + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + } else if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) { + /* Allow any source */ + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + } + isv6 = !(IN6_IS_ADDR_V4MAPPED(&pkti->ipi6_addr)); + if (!ip_ifindex_valid(pkti->ipi6_ifindex, isv6, ipst)) + return (ENXIO); + break; + } + case IPV6_HOPLIMIT: + /* It is only allowed as ancilary data */ + 
if (!coa->coa_ancillary) + return (EINVAL); + + if (inlen != 0 && inlen != sizeof (int)) + return (EINVAL); + if (inlen == sizeof (int)) { + if (*i1 > 255 || *i1 < -1 || *i1 == 0) + return (EINVAL); + } + break; + case IPV6_TCLASS: + if (inlen != 0 && inlen != sizeof (int)) + return (EINVAL); + if (inlen == sizeof (int)) { + if (*i1 > 255 || *i1 < -1) + return (EINVAL); + } + break; + case IPV6_NEXTHOP: + if (inlen != 0 && inlen != sizeof (sin6_t)) + return (EINVAL); + if (inlen == sizeof (sin6_t)) { + sin6_t *sin6 = (sin6_t *)invalp; + ire_t *ire; + + if (sin6->sin6_family != AF_INET6) + return (EAFNOSUPPORT); + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) + return (EADDRNOTAVAIL); + + /* Verify that the next-hop is on-link */ + ire = ire_ftable_lookup_v6(&sin6->sin6_addr, + 0, 0, IRE_ONLINK, NULL, zoneid, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL) + return (EHOSTUNREACH); + ire_refrele(ire); + break; + } + break; + case IPV6_RTHDR: + case IPV6_DSTOPTS: + case IPV6_RTHDRDSTOPTS: + case IPV6_HOPOPTS: { + /* All have the length field in the same place */ + ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; + /* + * Sanity checks - minimum size, size a multiple of + * eight bytes, and matching size passed in. + */ + if (inlen != 0 && + inlen != (8 * (hopts->ip6h_len + 1))) + return (EINVAL); + break; + } + case IPV6_PATHMTU: + /* Can't be set */ + return (EINVAL); + + case IPV6_USE_MIN_MTU: + if (inlen != sizeof (int)) + return (EINVAL); + if (*i1 < -1 || *i1 > 1) + return (EINVAL); + break; + case IPV6_SRC_PREFERENCES: + if (inlen != sizeof (uint32_t)) + return (EINVAL); + break; + case IPV6_V6ONLY: + if (*i1 < 0 || *i1 > 1) { + return (EINVAL); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + /* + * conn_lock protects the bitfields, and is used to + * set the fields atomically. Not needed for ixa settings since + * the caller has an exclusive copy of the ixa. 
+ * We can not hold conn_lock across the multicast options though. + */ + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); + switch (name) { + case IPV6_MULTICAST_IF: + ixa->ixa_multicast_ifindex = ifindex; + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_UNICAST_HOPS: + /* -1 means use default */ + mutex_enter(&connp->conn_lock); + if (*i1 == -1) { + ipp->ipp_unicast_hops = connp->conn_default_ttl; + } else { + ipp->ipp_unicast_hops = (uint8_t)*i1; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_MULTICAST_HOPS: + /* -1 means use default */ + if (*i1 == -1) { + ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + } else { + ixa->ixa_multicast_ttl = (uint8_t)*i1; + } + /* Handled automatically by ip_output */ + break; + case IPV6_MULTICAST_LOOP: + if (*i1 != 0) + ixa->ixa_flags |= IXAF_MULTICAST_LOOP; + else + ixa->ixa_flags &= ~IXAF_MULTICAST_LOOP; + /* Handled automatically by ip_output */ + break; + case IPV6_JOIN_GROUP: + case IPV6_LEAVE_GROUP: + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + return (ip_opt_set_multicast_group(connp, name, + invalp, B_TRUE, checkonly)); + + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + return (ip_opt_set_multicast_sources(connp, name, + invalp, B_TRUE, checkonly)); + + case IPV6_BOUND_IF: + ixa->ixa_ifindex = ifindex; /* Send */ + mutex_enter(&connp->conn_lock); + connp->conn_incoming_ifindex = ifindex; /* Receive */ + connp->conn_bound_if = ifindex; /* getsockopt */ + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_UNSPEC_SRC: + mutex_enter(&connp->conn_lock); + connp->conn_unspec_src = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + else + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVPKTINFO: + mutex_enter(&connp->conn_lock); + 
connp->conn_recv_ancillary.crb_ip_recvpktinfo = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVTCLASS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvtclass = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVPATHMTU: + mutex_enter(&connp->conn_lock); + connp->conn_ipv6_recvpathmtu = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVHOPLIMIT: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvhoplimit = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVHOPOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvhopopts = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvdstopts = onoff; + mutex_exit(&connp->conn_lock); + break; + case _OLD_IPV6_RECVDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVRTHDRDSTOPTS: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts = + onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_RECVRTHDR: + mutex_enter(&connp->conn_lock); + connp->conn_recv_ancillary.crb_ipv6_recvrthdr = onoff; + mutex_exit(&connp->conn_lock); + break; + case IPV6_PKTINFO: + mutex_enter(&connp->conn_lock); + if (inlen == 0) { + ipp->ipp_fields &= ~IPPF_ADDR; + ipp->ipp_addr = ipv6_all_zeros; + ixa->ixa_ifindex = 0; + } else { + struct in6_pktinfo *pkti; + + pkti = (struct in6_pktinfo *)invalp; + ipp->ipp_addr = pkti->ipi6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) + ipp->ipp_fields |= IPPF_ADDR; + else + ipp->ipp_fields &= ~IPPF_ADDR; + ixa->ixa_ifindex = pkti->ipi6_ifindex; + } + mutex_exit(&connp->conn_lock); + /* Source and ifindex might have changed */ + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + 
case IPV6_HOPLIMIT: + mutex_enter(&connp->conn_lock); + if (inlen == 0 || *i1 == -1) { + /* Revert to default */ + ipp->ipp_fields &= ~IPPF_HOPLIMIT; + ixa->ixa_flags &= ~IXAF_NO_TTL_CHANGE; + } else { + ipp->ipp_hoplimit = *i1; + ipp->ipp_fields |= IPPF_HOPLIMIT; + /* Ensure that it sticks for multicast packets */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_TCLASS: + /* + * IPV6_TCLASS accepts -1 as use kernel default + * and [0, 255] as the actualy traffic class. + */ + mutex_enter(&connp->conn_lock); + if (inlen == 0 || *i1 == -1) { + ipp->ipp_tclass = 0; + ipp->ipp_fields &= ~IPPF_TCLASS; + } else { + ipp->ipp_tclass = *i1; + ipp->ipp_fields |= IPPF_TCLASS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + break; + case IPV6_NEXTHOP: + if (inlen == 0) { + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + } else { + sin6_t *sin6 = (sin6_t *)invalp; + + ixa->ixa_nexthop_v6 = sin6->sin6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&ixa->ixa_nexthop_v6)) + ixa->ixa_flags |= IXAF_NEXTHOP_SET; + else + ixa->ixa_flags &= ~IXAF_NEXTHOP_SET; + } + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_HOPOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_hopopts, &ipp->ipp_hopoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_hopoptslen == 0) { + ipp->ipp_fields &= ~IPPF_HOPOPTS; + } else { + ipp->ipp_fields |= IPPF_HOPOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_RTHDRDSTOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_rthdrdstopts, + &ipp->ipp_rthdrdstoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_rthdrdstoptslen == 0) { + ipp->ipp_fields &= 
~IPPF_RTHDRDSTOPTS; + } else { + ipp->ipp_fields |= IPPF_RTHDRDSTOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_DSTOPTS: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_dstopts, &ipp->ipp_dstoptslen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_dstoptslen == 0) { + ipp->ipp_fields &= ~IPPF_DSTOPTS; + } else { + ipp->ipp_fields |= IPPF_DSTOPTS; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + case IPV6_RTHDR: + mutex_enter(&connp->conn_lock); + error = optcom_pkt_set(invalp, inlen, + (uchar_t **)&ipp->ipp_rthdr, &ipp->ipp_rthdrlen); + if (error != 0) { + mutex_exit(&connp->conn_lock); + return (error); + } + if (ipp->ipp_rthdrlen == 0) { + ipp->ipp_fields &= ~IPPF_RTHDR; + } else { + ipp->ipp_fields |= IPPF_RTHDR; + } + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; + break; + + case IPV6_DONTFRAG: + if (onoff) { + ixa->ixa_flags |= IXAF_DONTFRAG; + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; + } else { + ixa->ixa_flags &= ~IXAF_DONTFRAG; + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + } + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IPV6_USE_MIN_MTU: + ixa->ixa_flags |= IXAF_USE_MIN_MTU; + ixa->ixa_use_min_mtu = *i1; + /* Need to redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + + case IPV6_SEC_OPT: + mutex_enter(&connp->conn_lock); + error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + /* This is an IPsec policy change - redo ip_attr_connect */ + coa->coa_changed |= COA_ROUTE_CHANGED; + break; + case IPV6_SRC_PREFERENCES: + /* + * This socket option only affects connected + * 
sockets that haven't already bound to a specific + * IPv6 address. In other words, sockets that + * don't call bind() with an address other than the + * unspecified address and that call connect(). + * ip_set_destination_v6() passes these preferences + * to the ipif_select_source_v6() function. + */ + mutex_enter(&connp->conn_lock); + error = ip6_set_src_preferences(ixa, *(uint32_t *)invalp); + mutex_exit(&connp->conn_lock); + if (error != 0) { + return (error); + } + break; + case IPV6_V6ONLY: + mutex_enter(&connp->conn_lock); + connp->conn_ipv6_v6only = onoff; + mutex_exit(&connp->conn_lock); + break; + } + return (0); +} + +/* Handle IPPROTO_UDP */ +/* ARGSUSED1 */ +static int +conn_opt_set_udp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; + + switch (name) { + case UDP_ANONPRIVBIND: + if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_UDP)) != 0) { + return (error); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + mutex_enter(&connp->conn_lock); + switch (name) { + case UDP_ANONPRIVBIND: + connp->conn_anon_priv_bind = onoff; + break; + case UDP_EXCLBIND: + connp->conn_exclbind = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +/* Handle IPPROTO_TCP */ +/* ARGSUSED1 */ +static int +conn_opt_set_tcp(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, + uchar_t *invalp, boolean_t checkonly, cred_t *cr) +{ + conn_t *connp = coa->coa_connp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 
0 : 1; + int error; + + switch (name) { + case TCP_ANONPRIVBIND: + if ((error = secpolicy_net_privaddr(cr, 0, IPPROTO_TCP)) != 0) { + return (error); + } + break; + } + if (checkonly) + return (0); + + /* Here we set the actual option value */ + mutex_enter(&connp->conn_lock); + switch (name) { + case TCP_ANONPRIVBIND: + connp->conn_anon_priv_bind = onoff; + break; + case TCP_EXCLBIND: + connp->conn_exclbind = onoff; + break; + case TCP_RECVDSTADDR: + connp->conn_recv_ancillary.crb_recvdstaddr = onoff; + break; + } + mutex_exit(&connp->conn_lock); + return (0); +} + +int +conn_getsockname(conn_t *connp, struct sockaddr *sa, uint_t *salenp) +{ + sin_t *sin; + sin6_t *sin6; + + if (connp->conn_family == AF_INET) { + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + /* Fill zeroes and then initialize non-zero fields */ + sin = (sin_t *)sa; + *sin = sin_null; + sin->sin_family = AF_INET; + if (!IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + sin->sin_addr.s_addr = connp->conn_saddr_v4; + } else { + /* + * INADDR_ANY + * conn_saddr is not set, we might be bound to + * broadcast/multicast. Use conn_bound_addr as + * local address instead (that could + * also still be INADDR_ANY) + */ + sin->sin_addr.s_addr = connp->conn_bound_addr_v4; + } + sin->sin_port = connp->conn_lport; + } else { + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + /* Fill zeroes and then initialize non-zero fields */ + sin6 = (sin6_t *)sa; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + sin6->sin6_addr = connp->conn_saddr_v6; + } else { + /* + * conn_saddr is not set, we might be bound to + * broadcast/multicast. 
Use conn_bound_addr as + * local address instead (which could + * also still be unspecified) + */ + sin6->sin6_addr = connp->conn_bound_addr_v6; + } + sin6->sin6_port = connp->conn_lport; + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } + return (0); +} + +int +conn_getpeername(conn_t *connp, struct sockaddr *sa, uint_t *salenp) +{ + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; + + if (connp->conn_family == AF_INET) { + if (*salenp < sizeof (sin_t)) + return (EINVAL); + + *salenp = sizeof (sin_t); + /* initialize */ + sin = (sin_t *)sa; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = connp->conn_faddr_v4; + sin->sin_port = connp->conn_fport; + } else { + if (*salenp < sizeof (sin6_t)) + return (EINVAL); + + *salenp = sizeof (sin6_t); + /* initialize */ + sin6 = (sin6_t *)sa; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = connp->conn_faddr_v6; + sin6->sin6_port = connp->conn_fport; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } + return (0); +} + +static uint32_t cksum_massage_options_v4(ipha_t *, netstack_t *); +static uint32_t cksum_massage_options_v6(ip6_t *, uint_t, netstack_t *); + +/* + * Allocate and fill in conn_ht_iphc based on the current information + * in the conn. + * Normally used when we bind() and connect(). + * Returns failure if can't allocate memory, or if there is a problem + * with a routing header/option. + * + * We allocate space for the transport header (ulp_hdr_len + extra) and + * indicate the offset of the ulp header by setting ixa_ip_hdr_length. + * The extra is there for transports that want some spare room for future + * options. 
conn_ht_iphc_allocated is what was allocated; conn_ht_iphc_len + * excludes the extra part. + * + * We massage an routing option/header and store the ckecksum difference + * in conn_sum. + * + * Caller needs to update conn_wroff if desired. + */ +int +conn_build_hdr_template(conn_t *connp, uint_t ulp_hdr_length, uint_t extra, + const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo) +{ + ip_xmit_attr_t *ixa = connp->conn_ixa; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ip_hdr_length = ip_total_hdrs_len_v4(ipp); + /* In case of TX label and IP options it can be too much */ + if (ip_hdr_length > IP_MAX_HDR_LENGTH) { + /* Preserves existing TX errno for this */ + return (EHOSTUNREACH); + } + } else { + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + } + ixa->ixa_ip_hdr_length = ip_hdr_length; + hdrs_len = ip_hdr_length + ulp_hdr_length + extra; + ASSERT(hdrs_len != 0); + + if (hdrs_len != connp->conn_ht_iphc_allocated) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); + if (hdrs == NULL) + return (ENOMEM); + + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, + connp->conn_ht_iphc_allocated); + } + connp->conn_ht_iphc = hdrs; + connp->conn_ht_iphc_allocated = hdrs_len; + } else { + hdrs = connp->conn_ht_iphc; + } + hdrs_len -= extra; + connp->conn_ht_iphc_len = hdrs_len; + + connp->conn_ht_ulp = hdrs + ip_hdr_length; + connp->conn_ht_ulp_len = ulp_hdr_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)hdrs; + + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst); + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto); + ipha->ipha_length = htons(hdrs_len); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + 
ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + connp->conn_sum = cksum_massage_options_v4(ipha, + connp->conn_netstack); + } else { + connp->conn_sum = 0; + } + } else { + ip6_t *ip6h = (ip6_t *)hdrs; + + ip6h->ip6_src = *v6src; + ip6h->ip6_dst = *v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto, + flowinfo); + ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + connp->conn_sum = cksum_massage_options_v6(ip6h, + ip_hdr_length, connp->conn_netstack); + + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) + return (EADDRNOTAVAIL); + + } else { + connp->conn_sum = 0; + } + } + return (0); +} + +/* + * Prepend a header template to data_mp based on the ip_pkt_t + * and the passed in source, destination and protocol. + * + * Returns failure if can't allocate memory, in which case data_mp is freed. + * We allocate space for the transport header (ulp_hdr_len) and + * indicate the offset of the ulp header by setting ixa_ip_hdr_length. + * + * We massage an routing option/header and return the ckecksum difference + * in *sump. This is in host byte order. + * + * Caller needs to update conn_wroff if desired. 
+ */ +mblk_t * +conn_prepend_hdr(ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, + uint8_t protocol, uint32_t flowinfo, uint_t ulp_hdr_length, mblk_t *data_mp, + uint_t data_length, uint_t wroff_extra, uint32_t *sump, int *errorp) +{ + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + mblk_t *mp; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ip_hdr_length = ip_total_hdrs_len_v4(ipp); + ASSERT(ip_hdr_length <= IP_MAX_HDR_LENGTH); + } else { + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + } + hdrs_len = ip_hdr_length + ulp_hdr_length; + ASSERT(hdrs_len != 0); + + ixa->ixa_ip_hdr_length = ip_hdr_length; + + /* Can we prepend to data_mp? */ + if (data_mp != NULL && + data_mp->b_rptr - data_mp->b_datap->db_base >= hdrs_len && + data_mp->b_datap->db_ref == 1) { + hdrs = data_mp->b_rptr - hdrs_len; + data_mp->b_rptr = hdrs; + mp = data_mp; + } else { + mp = allocb(hdrs_len + wroff_extra, BPRI_MED); + if (mp == NULL) { + freemsg(data_mp); + *errorp = ENOMEM; + return (NULL); + } + mp->b_wptr = mp->b_datap->db_lim; + hdrs = mp->b_rptr = mp->b_wptr - hdrs_len; + mp->b_cont = data_mp; + } + + /* + * Set the source in the header. ip_build_hdrs_v4/v6 will overwrite it + * if PKTINFO (aka IPPF_ADDR) was set. 
+ */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)hdrs; + + ASSERT(IN6_IS_ADDR_V4MAPPED(v6dst)); + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + IN6_V4MAPPED_TO_IPADDR(v6dst, ipha->ipha_dst); + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, protocol); + ipha->ipha_length = htons(hdrs_len + data_length); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + ipha->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + *sump = cksum_massage_options_v4(ipha, + ixa->ixa_ipst->ips_netstack); + } else { + *sump = 0; + } + } else { + ip6_t *ip6h = (ip6_t *)hdrs; + + ip6h->ip6_src = *v6src; + ip6h->ip6_dst = *v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, protocol, flowinfo); + ip6h->ip6_plen = htons(hdrs_len + data_length - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + *sump = cksum_massage_options_v6(ip6h, + ip_hdr_length, ixa->ixa_ipst->ips_netstack); + + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { + *errorp = EADDRNOTAVAIL; + freemsg(mp); + return (NULL); + } + } else { + *sump = 0; + } + } + return (mp); +} + +/* + * Massage a source route if any putting the first hop + * in ipha_dst. Compute a starting value for the checksum which + * takes into account that the original ipha_dst should be + * included in the checksum but that IP will include the + * first hop from the source route in the tcp checksum. 
+ */ +static uint32_t +cksum_massage_options_v4(ipha_t *ipha, netstack_t *ns) +{ + in_addr_t dst; + uint32_t cksum; + + /* Get last hop then diff against first hop */ + cksum = ip_massage_options(ipha, ns); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + dst = ipha->ipha_dst; + cksum -= ((dst >> 16) + (dst & 0xffff)); + if ((int)cksum < 0) + cksum--; + cksum = (cksum & 0xFFFF) + (cksum >> 16); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + ASSERT(cksum < 0x10000); + return (ntohs(cksum)); +} + +static uint32_t +cksum_massage_options_v6(ip6_t *ip6h, uint_t ip_hdr_len, netstack_t *ns) +{ + uint8_t *end; + ip6_rthdr_t *rth; + uint32_t cksum; + + end = (uint8_t *)ip6h + ip_hdr_len; + rth = ip_find_rthdr_v6(ip6h, end); + if (rth == NULL) + return (0); + + cksum = ip_massage_options_v6(ip6h, rth, ns); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + ASSERT(cksum < 0x10000); + return (ntohs(cksum)); +} + +/* + * ULPs that change the destination address need to call this for each + * change to discard any state about a previous destination that might + * have been multicast or multirt. + */ +void +ip_attr_newdst(ip_xmit_attr_t *ixa) +{ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM | + IXAF_NO_TTL_CHANGE | IXAF_IPV6_ADD_FRAGHDR | + IXAF_NO_LOOP_ZONEID_SET); +} + +/* + * Determine the nexthop which will be used. + * Normally this is just the destination, but if a IPv4 source route, or + * IPv6 routing header, is in the ip_pkt_t then we extract the nexthop from + * there. 
+ */ +void +ip_attr_nexthop(const ip_pkt_t *ipp, const ip_xmit_attr_t *ixa, + const in6_addr_t *dst, in6_addr_t *nexthop) +{ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipaddr_t v4dst; + ipaddr_t v4nexthop; + + IN6_V4MAPPED_TO_IPADDR(dst, v4dst); + v4nexthop = ip_pkt_source_route_v4(ipp); + if (v4nexthop == INADDR_ANY) + v4nexthop = v4dst; + + IN6_IPADDR_TO_V4MAPPED(v4nexthop, nexthop); + } else { + const in6_addr_t *v6nexthop; + + v6nexthop = ip_pkt_source_route_v6(ipp); + if (v6nexthop == NULL) + v6nexthop = dst; + + *nexthop = *v6nexthop; + } +} + +/* + * Update the ip_xmit_attr_t based the addresses, conn_xmit_ipp and conn_ixa. + * If IPDF_IPSEC is set we cache the IPsec policy to handle the unconnected + * case (connected latching is done in conn_connect). + * Note that IPsec policy lookup requires conn_proto and conn_laddr to be + * set, but doesn't otherwise use the conn_t. + * + * Caller must set/clear IXAF_IS_IPV4 as appropriately. + * Caller must use ip_attr_nexthop() to determine the nexthop argument. + * + * The caller must NOT hold conn_lock (to avoid problems with ill_refrele + * causing the squeue to run doing ipcl_walk grabbing conn_lock.) + * + * Updates laddrp and uinfo if they are non-NULL. + * + * TSOL notes: The callers if ip_attr_connect must check if the destination + * is different than before and in that case redo conn_update_label. + * The callers of conn_connect do not need that since conn_connect + * performs the conn_update_label. + */ +int +ip_attr_connect(const conn_t *connp, ip_xmit_attr_t *ixa, + const in6_addr_t *v6src, const in6_addr_t *v6dst, + const in6_addr_t *v6nexthop, in_port_t dstport, in6_addr_t *laddrp, + iulp_t *uinfo, uint32_t flags) +{ + in6_addr_t laddr = *v6src; + int error; + + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + + if (connp->conn_zone_is_global) + flags |= IPDF_ZONE_IS_GLOBAL; + else + flags &= ~IPDF_ZONE_IS_GLOBAL; + + /* + * Lookup the route to determine a source address and the uinfo. 
+ * If the ULP has a source route option then the caller will + * have set v6nexthop to be the first hop. + */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipaddr_t v4dst; + ipaddr_t v4src, v4nexthop; + + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + IN6_V4MAPPED_TO_IPADDR(v6nexthop, v4nexthop); + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); + + if (connp->conn_unspec_src || v4src != INADDR_ANY) + flags &= ~IPDF_SELECT_SRC; + else + flags |= IPDF_SELECT_SRC; + + error = ip_set_destination_v4(&v4src, v4dst, v4nexthop, ixa, + uinfo, flags, connp->conn_mac_mode); + IN6_IPADDR_TO_V4MAPPED(v4src, &laddr); + } else { + if (connp->conn_unspec_src || !IN6_IS_ADDR_UNSPECIFIED(v6src)) + flags &= ~IPDF_SELECT_SRC; + else + flags |= IPDF_SELECT_SRC; + + error = ip_set_destination_v6(&laddr, v6dst, v6nexthop, ixa, + uinfo, flags, connp->conn_mac_mode); + } + /* Pass out some address even if we hit a RTF_REJECT etc */ + if (laddrp != NULL) + *laddrp = laddr; + + if (error != 0) + return (error); + + if (flags & IPDF_IPSEC) { + /* + * Set any IPsec policy in ixa. Routine also looks at ULP + * ports. + */ + ipsec_cache_outbound_policy(connp, v6src, v6dst, dstport, ixa); + } + return (0); +} + +/* + * Connect the conn based on the addresses, conn_xmit_ipp and conn_ixa. + * Assumes that conn_faddr and conn_fport are already set. As such it is not + * usable for SCTP, since SCTP has multiple faddrs. + * + * Caller must hold conn_lock to provide atomic constency between the + * conn_t's addresses and the ixa. + * NOTE: this function drops and reaquires conn_lock since it can't be + * held across ip_attr_connect/ip_set_destination. + * + * The caller needs to handle inserting in the receive-side fanout when + * appropriate after conn_connect returns. 
+ */ +int +conn_connect(conn_t *connp, iulp_t *uinfo, uint32_t flags) +{ + ip_xmit_attr_t *ixa = connp->conn_ixa; + in6_addr_t nexthop; + in6_addr_t saddr, faddr; + in_port_t fport; + int error; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + if (connp->conn_ipversion == IPV4_VERSION) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + /* We do IPsec latching below - hence no caching in ip_attr_connect */ + flags &= ~IPDF_IPSEC; + + /* In case we had previously done an ip_attr_connect */ + ip_attr_newdst(ixa); + + /* + * Determine the nexthop and copy the addresses before dropping + * conn_lock. + */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + + mutex_exit(&connp->conn_lock); + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, fport, + &saddr, uinfo, flags | IPDF_VERIFY_DST); + mutex_enter(&connp->conn_lock); + + /* Could have changed even if an error */ + connp->conn_saddr_v6 = saddr; + if (error != 0) + return (error); + + /* + * Check whether Trusted Solaris policy allows communication with this + * host, and pretend that the destination is unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template() takes ipp_label_v4/v6 to form + * the packet. + * + * TSOL Note: Any concurrent threads would pick a different ixa + * (and ipp if they are to change the ipp) so we + * don't have to worry about concurrent threads. + */ + if (is_system_labeled()) { + if (connp->conn_mlp_type != mlptSingle) + return (ECONNREFUSED); + + /* + * conn_update_label will set ipp_label* which will later + * be used by conn_build_hdr_template. + */ + error = conn_update_label(connp, ixa, + &connp->conn_faddr_v6, &connp->conn_xmit_ipp); + if (error != 0) + return (error); + } + + /* + * Ensure that we match on the selected local address. 
+ * This overrides conn_laddr in the case we had earlier bound to a + * multicast or broadcast address. + */ + connp->conn_laddr_v6 = connp->conn_saddr_v6; + + /* + * Allow setting new policies. + * The addresses/ports are already set, thus the IPsec policy calls + * can handle their passed-in conn's. + */ + connp->conn_policy_cached = B_FALSE; + + /* + * Cache IPsec policy in this conn. If we have per-socket policy, + * we'll cache that. If we don't, we'll inherit global policy. + * + * This is done before the caller inserts in the receive-side fanout. + * Note that conn_policy_cached is set by ipsec_conn_cache_policy() even + * for connections where we don't have a policy. This is to prevent + * global policy lookups in the inbound path. + * + * If we insert before we set conn_policy_cached, + * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true + * because global policy cound be non-empty. We normally call + * ipsec_check_policy() for conn_policy_cached connections only if + * conn_in_enforce_policy is set. But in this case, + * conn_policy_cached can get set anytime since we made the + * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is + * called, which will make the above assumption false. Thus, we + * need to insert after we set conn_policy_cached. + */ + error = ipsec_conn_cache_policy(connp, + connp->conn_ipversion == IPV4_VERSION); + if (error != 0) + return (error); + + /* + * We defer to do LSO check until here since now we have better idea + * whether IPsec is present. If the underlying ill is LSO capable, + * copy its capability in so the ULP can decide whether to enable LSO + * on this connection. So far, only TCP/IPv4 is implemented, so won't + * claim LSO for IPv6. + * + * Currently, won't enable LSO for IRE_LOOPBACK or IRE_LOCAL, because + * the receiver can not handle it. Also not to enable LSO for MULTIRT. 
+ */ + ixa->ixa_flags &= ~IXAF_LSO_CAPAB; + + ASSERT(ixa->ixa_ire != NULL); + if (ixa->ixa_ipst->ips_ip_lso_outbound && (flags & IPDF_LSO) && + !(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + (ixa->ixa_nce != NULL) && + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + ILL_LSO_TCP_IPV4_USABLE(ixa->ixa_nce->nce_ill) : + ILL_LSO_TCP_IPV6_USABLE(ixa->ixa_nce->nce_ill))) { + ixa->ixa_lso_capab = *ixa->ixa_nce->nce_ill->ill_lso_capab; + ixa->ixa_flags |= IXAF_LSO_CAPAB; + } + + /* Check whether ZEROCOPY capability is usable for this connection. */ + ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; + + if ((flags & IPDF_ZCOPY) && + !(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + (ixa->ixa_nce != NULL) && + ILL_ZCOPY_USABLE(ixa->ixa_nce->nce_ill)) { + ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; + } + return (0); +} + +/* + * Predicates to check if the addresses match conn_last* + */ + +/* + * Compare the conn against an address. + * If using mapped addresses on AF_INET6 sockets, use the _v6 function + */ +boolean_t +conn_same_as_last_v4(conn_t *connp, sin_t *sin) +{ + ASSERT(connp->conn_family == AF_INET); + return (sin->sin_addr.s_addr == connp->conn_v4lastdst && + sin->sin_port == connp->conn_lastdstport); +} + +/* + * Compare, including for mapped addresses + */ +boolean_t +conn_same_as_last_v6(conn_t *connp, sin6_t *sin6) +{ + return (IN6_ARE_ADDR_EQUAL(&connp->conn_v6lastdst, &sin6->sin6_addr) && + sin6->sin6_port == connp->conn_lastdstport && + sin6->sin6_flowinfo == connp->conn_lastflowinfo && + sin6->sin6_scope_id == connp->conn_lastscopeid); +} + +/* + * Compute a label and place it in the ip_packet_t. + * Handles IPv4 and IPv6. + * The caller should have a correct ixa_tsl and ixa_zoneid and have + * already called conn_connect or ip_attr_connect to ensure that tsol_check_dest + * has been called. 
+ */ +int +conn_update_label(const conn_t *connp, const ip_xmit_attr_t *ixa, + const in6_addr_t *v6dst, ip_pkt_t *ipp) +{ + int err; + ipaddr_t v4dst; + + if (IN6_IS_ADDR_V4MAPPED(v6dst)) { + uchar_t opt_storage[IP_MAX_OPT_LENGTH]; + + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + + err = tsol_compute_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, + v4dst, opt_storage, ixa->ixa_ipst); + if (err == 0) { + /* Length contained in opt_storage[IPOPT_OLEN] */ + err = optcom_pkt_set(opt_storage, + opt_storage[IPOPT_OLEN], + (uchar_t **)&ipp->ipp_label_v4, + &ipp->ipp_label_len_v4); + } + if (err != 0) { + DTRACE_PROBE4(tx__ip__log__info__updatelabel, + char *, "conn(1) failed to update options(2) " + "on ixa(3)", + conn_t *, connp, char *, opt_storage, + ip_xmit_attr_t *, ixa); + } + if (ipp->ipp_label_len_v4 != 0) + ipp->ipp_fields |= IPPF_LABEL_V4; + else + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } else { + uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; + uint_t optlen; + + err = tsol_compute_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, + v6dst, opt_storage, ixa->ixa_ipst); + if (err == 0) { + /* + * Note that ipp_label_v6 is just the option - not + * the hopopts extension header. + * + * Length contained in opt_storage[IPOPT_OLEN], but + * that doesn't include the two byte options header. + */ + optlen = opt_storage[IPOPT_OLEN]; + if (optlen != 0) + optlen += 2; + + err = optcom_pkt_set(opt_storage, optlen, + (uchar_t **)&ipp->ipp_label_v6, + &ipp->ipp_label_len_v6); + } + if (err != 0) { + DTRACE_PROBE4(tx__ip__log__info__updatelabel, + char *, "conn(1) failed to update options(2) " + "on ixa(3)", + conn_t *, connp, char *, opt_storage, + ip_xmit_attr_t *, ixa); + } + if (ipp->ipp_label_len_v6 != 0) + ipp->ipp_fields |= IPPF_LABEL_V6; + else + ipp->ipp_fields &= ~IPPF_LABEL_V6; + } + return (err); +} + +/* + * Inherit all options settings from the parent/listener to the eager. + * Returns zero on success; ENOMEM if memory allocation failed. 
+ * + * We assume that the eager has not had any work done i.e., the conn_ixa + * and conn_xmit_ipp are all zero. + * Furthermore we assume that no other thread can access the eager (because + * it isn't inserted in any fanout list). + */ +int +conn_inherit_parent(conn_t *lconnp, conn_t *econnp) +{ + cred_t *credp; + int err; + void *notify_cookie; + + econnp->conn_family = lconnp->conn_family; + econnp->conn_ipv6_v6only = lconnp->conn_ipv6_v6only; + econnp->conn_wq = lconnp->conn_wq; + econnp->conn_rq = lconnp->conn_rq; + + /* + * Make a safe copy of the transmit attributes. + * conn_connect will later be used by the caller to setup the ire etc. + */ + ASSERT(econnp->conn_ixa->ixa_refcnt == 1); + ASSERT(econnp->conn_ixa->ixa_ire == NULL); + ASSERT(econnp->conn_ixa->ixa_dce == NULL); + ASSERT(econnp->conn_ixa->ixa_nce == NULL); + + /* Preserve ixa_notify_cookie */ + notify_cookie = econnp->conn_ixa->ixa_notify_cookie; + ixa_safe_copy(lconnp->conn_ixa, econnp->conn_ixa); + econnp->conn_ixa->ixa_notify_cookie = notify_cookie; + + econnp->conn_bound_if = lconnp->conn_bound_if; + econnp->conn_incoming_ifindex = lconnp->conn_incoming_ifindex; + + /* Inherit all RECV options */ + econnp->conn_recv_ancillary = lconnp->conn_recv_ancillary; + + err = ip_pkt_copy(&lconnp->conn_xmit_ipp, &econnp->conn_xmit_ipp, + KM_NOSLEEP); + if (err != 0) + return (err); + + econnp->conn_zoneid = lconnp->conn_zoneid; + econnp->conn_allzones = lconnp->conn_allzones; + + /* This is odd. Pick a flowlabel for each connection instead? */ + econnp->conn_flowinfo = lconnp->conn_flowinfo; + + econnp->conn_default_ttl = lconnp->conn_default_ttl; + + /* + * TSOL: tsol_input_proc() needs the eager's cred before the + * eager is accepted + */ + ASSERT(lconnp->conn_cred != NULL); + econnp->conn_cred = credp = lconnp->conn_cred; + crhold(credp); + econnp->conn_cpid = lconnp->conn_cpid; + econnp->conn_open_time = lbolt64; + + /* + * Cache things in the ixa without any refhold. 
+ * Listener might not have set up ixa_cred + */ + econnp->conn_ixa->ixa_cred = econnp->conn_cred; + econnp->conn_ixa->ixa_cpid = econnp->conn_cpid; + if (is_system_labeled()) + econnp->conn_ixa->ixa_tsl = crgetlabel(econnp->conn_cred); + + /* + * If the caller has the process-wide flag set, then default to MAC + * exempt mode. This allows read-down to unlabeled hosts. + */ + if (getpflags(NET_MAC_AWARE, credp) != 0) + econnp->conn_mac_mode = CONN_MAC_AWARE; + + econnp->conn_zone_is_global = lconnp->conn_zone_is_global; + + /* + * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ + * via soaccept()->soinheritoptions() which essentially applies + * all the listener options to the new connection. The options that we + * need to take care of are: + * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, + * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, + * SO_SNDBUF, SO_RCVBUF. + * + * SO_RCVBUF: conn_rcvbuf is set. + * SO_SNDBUF: conn_sndbuf is set. 
+ */ + + econnp->conn_sndbuf = lconnp->conn_sndbuf; + econnp->conn_rcvbuf = lconnp->conn_rcvbuf; + econnp->conn_sndlowat = lconnp->conn_sndlowat; + econnp->conn_rcvlowat = lconnp->conn_rcvlowat; + econnp->conn_dgram_errind = lconnp->conn_dgram_errind; + econnp->conn_oobinline = lconnp->conn_oobinline; + econnp->conn_debug = lconnp->conn_debug; + econnp->conn_keepalive = lconnp->conn_keepalive; + econnp->conn_linger = lconnp->conn_linger; + econnp->conn_lingertime = lconnp->conn_lingertime; + + /* Set the IP options */ + econnp->conn_broadcast = lconnp->conn_broadcast; + econnp->conn_useloopback = lconnp->conn_useloopback; + econnp->conn_reuseaddr = lconnp->conn_reuseaddr; + return (0); +} diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 7f6d4b621f..8222c866d0 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -35,65 +35,58 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/strsubr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> #include <sys/cmn_err.h> -#include <sys/debug.h> #include <sys/kmem.h> +#include <sys/cred_impl.h> #include <sys/policy.h> #include <sys/priv.h> +#include <sys/ucred.h> #include <sys/zone.h> -#include <sys/time.h> #include <sys/sockio.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/vtrace.h> +#include <sys/sdt.h> +#include <sys/debug.h> #include <sys/isa_defs.h> -#include <sys/suntpi.h> -#include <sys/xti_inet.h> -#include <sys/netstack.h> - -#include <net/route.h> -#include <net/if.h> - +#include <sys/random.h> #include <netinet/in.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <netinet/udp.h> + #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ipsec_impl.h> #include <inet/ip6.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_multi.h> +#include <inet/ip_ndp.h> #include <inet/proto_set.h> +#include <inet/mib2.h> #include <inet/nd.h> #include 
<inet/optcom.h> #include <inet/snmpcom.h> #include <inet/kstatcom.h> -#include <inet/rawip_impl.h> - -#include <netinet/ip_mroute.h> -#include <inet/tcp.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/ipclassifier.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> -#include <inet/ip_ire.h> -#include <inet/ip_if.h> +#include <inet/rawip_impl.h> -#include <inet/ip_impl.h> #include <sys/disp.h> /* * Synchronization notes: * - * RAWIP is MT and uses the usual kernel synchronization primitives. There is - * locks, which is icmp_rwlock. We also use conn_lock when updating things - * which affect the IP classifier lookup. - * The lock order is icmp_rwlock -> conn_lock. - * - * The icmp_rwlock: - * This protects most of the other fields in the icmp_t. The exact list of - * fields which are protected by each of the above locks is documented in - * the icmp_t structure definition. + * RAWIP is MT and uses the usual kernel synchronization primitives. We use + * conn_lock to protect the icmp_t. * * Plumbing notes: * ICMP is always a device driver. 
For compatibility with mibopen() code @@ -103,27 +96,29 @@ static void icmp_addr_req(queue_t *q, mblk_t *mp); static void icmp_tpi_bind(queue_t *q, mblk_t *mp); -static int icmp_bind_proto(conn_t *connp); -static int icmp_build_hdrs(icmp_t *icmp); +static void icmp_bind_proto(icmp_t *icmp); +static int icmp_build_hdr_template(conn_t *, const in6_addr_t *, + const in6_addr_t *, uint32_t); static void icmp_capability_req(queue_t *q, mblk_t *mp); static int icmp_close(queue_t *q, int flags); +static void icmp_close_free(conn_t *); static void icmp_tpi_connect(queue_t *q, mblk_t *mp); static void icmp_tpi_disconnect(queue_t *q, mblk_t *mp); static void icmp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, - int sys_error); + int sys_error); static void icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, - t_scalar_t t_error, int sys_error); -static void icmp_icmp_error(conn_t *connp, mblk_t *mp); -static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp); + t_scalar_t tlierr, int sys_error); +static void icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +static void icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, + ip_recv_attr_t *); static void icmp_info_req(queue_t *q, mblk_t *mp); -static void icmp_input(void *, mblk_t *, void *); +static void icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static conn_t *icmp_open(int family, cred_t *credp, int *err, int flags); static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); -static int icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, - int *errorp, void *thisdg_attrs); static boolean_t icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); int icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, @@ -131,25 +126,26 @@ int icmp_opt_set(conn_t *connp, uint_t optset_context, void *thisdg_attrs, cred_t *cr); int 
icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr); +static int icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, + sin6_t *sin6, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa); static int icmp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t icmp_param_register(IDP *ndp, icmpparam_t *icmppa, int cnt); static int icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); +static mblk_t *icmp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, uint32_t, mblk_t *, int *); +static mblk_t *icmp_prepend_header_template(conn_t *, ip_xmit_attr_t *, + mblk_t *, const in6_addr_t *, uint32_t, int *); static int icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr, int len); static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); static void icmp_tpi_unbind(queue_t *q, mblk_t *mp); -static int icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst); static void icmp_wput(queue_t *q, mblk_t *mp); static void icmp_wput_fallback(queue_t *q, mblk_t *mp); -static int raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, - sin6_t *sin6, ip6_pkt_t *ipp); -static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, - ipaddr_t v4dst, ip4_pkt_t *pktinfop); static void icmp_wput_other(queue_t *q, mblk_t *mp); static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); static void icmp_wput_restricted(queue_t *q, mblk_t *mp); -static void icmp_ulp_recv(conn_t *, mblk_t *); +static void icmp_ulp_recv(conn_t *, mblk_t *, uint_t); static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); static void rawip_stack_fini(netstackid_t stackid, void *arg); @@ -158,10 +154,14 @@ static void *rawip_kstat_init(netstackid_t stackid); static void rawip_kstat_fini(netstackid_t stackid, kstat_t *ksp); static int rawip_kstat_update(kstat_t *kp, int rw); static void rawip_stack_shutdown(netstackid_t stackid, void *arg); -static int 
rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, - uint_t *salenp); -static int rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, - uint_t *salenp); + +/* Common routines for TPI and socket module */ +static conn_t *rawip_do_open(int, cred_t *, int *, int); +static void rawip_do_close(conn_t *); +static int rawip_do_bind(conn_t *, struct sockaddr *, socklen_t); +static int rawip_do_unbind(conn_t *); +static int rawip_do_connect(conn_t *, const struct sockaddr *, socklen_t, + cred_t *, pid_t); int rawip_getsockname(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); @@ -185,7 +185,7 @@ static struct qinit icmprinitv6 = { }; static struct qinit icmpwinit = { - (pfi_t)icmp_wput, NULL, NULL, NULL, NULL, &icmp_mod_info + (pfi_t)icmp_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, &icmp_mod_info }; /* ICMP entry point during fallback */ @@ -236,6 +236,8 @@ static icmpparam_t icmp_param_arr[] = { { 0, 65536, 1024, "icmp_xmit_lowat"}, { 4096, 65536, 8192, "icmp_recv_hiwat"}, { 65536, 1024*1024*1024, 256*1024, "icmp_max_buf"}, + { 0, 1, 0, "icmp_pmtu_discovery" }, + { 0, 1, 0, "icmp_sendto_ignerr" }, }; #define is_wroff_extra is_param_arr[0].icmp_param_value #define is_ipv4_ttl is_param_arr[1].icmp_param_value @@ -245,18 +247,17 @@ static icmpparam_t icmp_param_arr[] = { #define is_xmit_lowat is_param_arr[5].icmp_param_value #define is_recv_hiwat is_param_arr[6].icmp_param_value #define is_max_buf is_param_arr[7].icmp_param_value +#define is_pmtu_discovery is_param_arr[8].icmp_param_value +#define is_sendto_ignerr is_param_arr[9].icmp_param_value -static int rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len); -static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa, - socklen_t len, cred_t *cr); -static void rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error); +typedef union T_primitives *t_primp_t; /* * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message * passed to icmp_wput. 
- * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the ICMP - * protocol type placed in the message following the address. A T_BIND_ACK - * message is returned by ip_bind_v4/v6. + * It calls IP to verify the local IP address, and calls IP to insert + * the conn_t in the fanout table. + * If everything is ok it then sends the T_BIND_ACK back up. */ static void icmp_tpi_bind(queue_t *q, mblk_t *mp) @@ -297,17 +298,17 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) if (icmp->icmp_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_bind: bad state, %d", icmp->icmp_state); + "icmp_bind: bad state, %u", icmp->icmp_state); icmp_err_ack(q, mp, TOUTSTATE, 0); return; } /* * Reallocate the message to make sure we have enough room for an - * address and the protocol type. + * address. */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); - if (!mp1) { + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); + if (mp1 == NULL) { icmp_err_ack(q, mp, TSYSERR, ENOMEM); return; } @@ -320,7 +321,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if (icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -329,7 +330,7 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) sa = (struct sockaddr *)sin; len = sizeof (sin_t); } else { - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -352,14 +353,12 @@ icmp_tpi_bind(queue_t *q, mblk_t *mp) default: (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "icmp_bind: bad ADDR_length %d", tbr->ADDR_length); + "icmp_bind: bad ADDR_length %u", tbr->ADDR_length); icmp_err_ack(q, mp, TBADADDR, 0); return; } error = rawip_do_bind(connp, sa, len); -done: - ASSERT(mp->b_cont == NULL); if (error != 0) { if 
(error > 0) { icmp_err_ack(q, mp, TSYSERR, error); @@ -377,225 +376,208 @@ rawip_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len) { sin_t *sin; sin6_t *sin6; - icmp_t *icmp; + icmp_t *icmp = connp->conn_icmp; int error = 0; - mblk_t *ire_mp; - - - icmp = connp->conn_icmp; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + in_port_t lport; /* Network byte order */ + ipaddr_t v4src; /* Set if AF_INET */ + in6_addr_t v6src; + uint_t scopeid = 0; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; if (sa == NULL || !OK_32PTR((char *)sa)) { return (EINVAL); } - /* - * The state must be TS_UNBND. TPI mandates that users must send - * TPI primitives only 1 at a time and wait for the response before - * sending the next primitive. - */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state != TS_UNBND || icmp->icmp_pending_op != -1) { - error = -TOUTSTATE; - goto done; - } - - ASSERT(len != 0); switch (len) { case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; if (sin->sin_family != AF_INET || - icmp->icmp_family != AF_INET) { + connp->conn_family != AF_INET) { /* TSYSERR, EAFNOSUPPORT */ - error = EAFNOSUPPORT; - goto done; + return (EAFNOSUPPORT); } + v4src = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, + B_TRUE); + } + lport = sin->sin_port; break; case sizeof (sin6_t): /* Complete IPv6 address */ sin6 = (sin6_t *)sa; if (sin6->sin6_family != AF_INET6 || - icmp->icmp_family != AF_INET6) { + connp->conn_family != AF_INET6) { /* TSYSERR, EAFNOSUPPORT */ - error = EAFNOSUPPORT; - goto done; + return (EAFNOSUPPORT); } /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { /* TSYSERR, EADDRNOTAVAIL */ - error = EADDRNOTAVAIL; - goto done; + return (EADDRNOTAVAIL); } + v6src = sin6->sin6_addr; + if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + if 
(IN6_IS_ADDR_LINKSCOPE(&v6src)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6src, zoneid, ipst, + B_TRUE, scopeid); + } + lport = sin6->sin6_port; break; default: /* TBADADDR */ - error = EADDRNOTAVAIL; - goto done; + return (EADDRNOTAVAIL); } - icmp->icmp_pending_op = T_BIND_REQ; - icmp->icmp_state = TS_IDLE; + /* Is the local address a valid unicast, multicast, or broadcast? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + + /* + * The state must be TS_UNBND. + */ + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_UNBND) { + mutex_exit(&connp->conn_lock); + return (-TOUTSTATE); + } /* * Copy the source address into our icmp structure. This address * may still be zero; if so, ip will fill in the correct address * each time an outbound packet is passed to it. * If we are binding to a broadcast or multicast address then - * rawip_post_ip_bind_connect will clear the source address. + * we just set the conn_bound_addr since we don't want to use + * that as the source address when sending. 
*/ - - if (icmp->icmp_family == AF_INET) { - ASSERT(sin != NULL); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, - &icmp->icmp_v6src); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - icmp->icmp_bound_v6src = icmp->icmp_v6src; + connp->conn_bound_addr_v6 = v6src; + connp->conn_laddr_v6 = v6src; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - int error; - - ASSERT(sin6 != NULL); - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - icmp->icmp_v6src = sin6->sin6_addr; - icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; - icmp->icmp_bound_v6src = icmp->icmp_v6src; - - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - icmp->icmp_pending_op = -1; - /* - * TSYSERR - */ - goto done; - } + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - ire_mp = NULL; - if (!(V6_OR_V4_INADDR_ANY(icmp->icmp_v6src))) { - /* - * request an IRE if src not 0 (INADDR_ANY) - */ - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - icmp->icmp_pending_op = -1; - error = ENOMEM; - goto done; - } - DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + connp->conn_saddr_v6 = v6src; + connp->conn_mcbc_bind = B_FALSE; + break; + case IPVL_MCAST: + case IPVL_BCAST: + /* ip_set_destination will pick a source address later */ + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_TRUE; + break; } -done: - rw_exit(&icmp->icmp_rwlock); - if (error != 0) - return (error); - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, - &sin6->sin6_addr, sin6->sin6_port, B_TRUE); + /* Any errors after this point should use late_error */ + + /* + * Use sin_port/sin6_port since 
applications like psh use SOCK_RAW + * with IPPROTO_TCP. + */ + connp->conn_lport = lport; + connp->conn_fport = 0; + + if (connp->conn_family == AF_INET) { + ASSERT(connp->conn_ipversion == IPV4_VERSION); } else { - error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, - sin->sin_addr.s_addr, sin->sin_port, B_TRUE); + ASSERT(connp->conn_ipversion == IPV6_VERSION); } - rawip_post_ip_bind_connect(icmp, ire_mp, error); - return (error); -} -static void -rawip_post_ip_bind_connect(icmp_t *icmp, mblk_t *ire_mp, int error) -{ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state == TS_UNBND) { - /* - * not yet bound - bind sent by icmp_bind_proto. - */ - rw_exit(&icmp->icmp_rwlock); - return; - } - ASSERT(icmp->icmp_pending_op != -1); - icmp->icmp_pending_op = -1; + icmp->icmp_state = TS_IDLE; + /* + * We create an initial header template here to make a subsequent + * sendto have a starting point. Since conn_last_dst is zero the + * first sendto will always follow the 'dst changed' code path. + * Note that we defer massaging options and the related checksum + * adjustment until we have a destination address. + */ + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); if (error != 0) { - if (icmp->icmp_state == TS_DATA_XFER) { - /* Connect failed */ - /* Revert back to the bound source */ - icmp->icmp_v6src = icmp->icmp_bound_v6src; - icmp->icmp_state = TS_IDLE; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } else { - V6_SET_ZERO(icmp->icmp_v6src); - V6_SET_ZERO(icmp->icmp_bound_v6src); - icmp->icmp_state = TS_UNBND; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } - } else { - if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { - ire_t *ire; - - ire = (ire_t *)ire_mp->b_rptr; - /* - * If a broadcast/multicast address was bound set - * the source address to 0. 
- * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * Note: we get IRE_BROADCAST for IPv6 - * to "mark" a multicast local address. - */ + mutex_exit(&connp->conn_lock); + goto late_error; + } + /* Just in case */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + error = ip_laddr_fanout_insert(connp); + if (error != 0) + goto late_error; - if (ire->ire_type == IRE_BROADCAST && - icmp->icmp_state != TS_DATA_XFER) { - /* - * This was just a local bind to a - * MC/broadcast addr - */ - V6_SET_ZERO(icmp->icmp_v6src); - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - } - } + /* Bind succeeded */ + return (0); +late_error: + mutex_enter(&connp->conn_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - rw_exit(&icmp->icmp_rwlock); - if (ire_mp != NULL) - freeb(ire_mp); + icmp->icmp_state = TS_UNBND; + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_lport = 0; + + /* Restore the header that was built above - different source address */ + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } /* - * Send message to IP to just bind to the protocol. + * Tell IP to just bind to the protocol. 
*/ -static int -icmp_bind_proto(conn_t *connp) +static void +icmp_bind_proto(icmp_t *icmp) { - icmp_t *icmp; - int error; - - icmp = connp->conn_icmp; + conn_t *connp = icmp->icmp_connp; - if (icmp->icmp_family == AF_INET6) - error = ip_proto_bind_laddr_v6(connp, NULL, icmp->icmp_proto, - &sin6_null.sin6_addr, 0, B_TRUE); - else - error = ip_proto_bind_laddr_v4(connp, NULL, icmp->icmp_proto, - sin_null.sin_addr.s_addr, 0, B_TRUE); + mutex_enter(&connp->conn_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); - rawip_post_ip_bind_connect(icmp, NULL, error); - return (error); + (void) ip_laddr_fanout_insert(connp); } +/* + * This routine handles each T_CONN_REQ message passed to icmp. It + * associates a default destination address with the stream. + * + * After various error checks are completed, icmp_connect() lays + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we send up the T_OK_ACK reply message. + */ static void icmp_tpi_connect(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); struct T_conn_req *tcr; - icmp_t *icmp; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; - + pid_t pid; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. @@ -603,14 +585,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) * like a TPI message sent by some other kernel * component, we check and return an error. 
*/ - cr = msg_getcred(mp, NULL); + cr = msg_getcred(mp, &pid); ASSERT(cr != NULL); if (cr == NULL) { icmp_err_ack(q, mp, TSYSERR, EINVAL); return; } - icmp = connp->conn_icmp; tcr = (struct T_conn_req *)mp->b_rptr; /* Sanity checks */ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_req)) { @@ -639,13 +620,13 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) break; } - error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { icmp_err_ack(q, mp, TSYSERR, error); return; } - error = rawip_do_connect(connp, sa, len, cr); + error = rawip_do_connect(connp, sa, len, cr, pid); if (error != 0) { if (error < 0) { icmp_err_ack(q, mp, -error, 0); @@ -659,11 +640,11 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) * We have to send a connection confirmation to * keep TLI happy. */ - if (icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin_t), NULL, 0); } else { - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin6_t), NULL, 0); } @@ -688,15 +669,20 @@ icmp_tpi_connect(queue_t *q, mblk_t *mp) static int rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, - cred_t *cr) + cred_t *cr, pid_t pid) { - icmp_t *icmp; - sin_t *sin; - sin6_t *sin6; - mblk_t *ire_mp; - int error; + icmp_t *icmp; + sin_t *sin; + sin6_t *sin6; + int error; + uint16_t dstport; ipaddr_t v4dst; in6_addr_t v6dst; + uint32_t flowinfo; + ip_xmit_attr_t *ixa; + uint_t scopeid = 0; + uint_t srcid = 0; + in6_addr_t v6src = connp->conn_saddr_v6; icmp = connp->conn_icmp; @@ -704,170 +690,199 @@ rawip_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, return (EINVAL); } - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) - return (ENOMEM); - DB_TYPE(ire_mp) = IRE_DB_REQ_TYPE; - - ASSERT(sa != NULL && len != 0); - rw_enter(&icmp->icmp_rwlock, 
RW_WRITER); - if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); - freeb(ire_mp); - return (-TOUTSTATE); - } - + /* + * Determine packet type based on type of address passed in + * the request should contain an IPv4 or IPv6 address. + * Make sure that address family matches the type of + * family of the address passed down. + */ switch (len) { case sizeof (sin_t): sin = (sin_t *)sa; - ASSERT(icmp->icmp_family == AF_INET); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - v4dst = sin->sin_addr.s_addr; - /* - * Interpret a zero destination to mean loopback. - * Update the T_CONN_REQ (sin/sin6) since it is used to - * generate the T_CONN_CON. - */ - if (v4dst == INADDR_ANY) { - v4dst = htonl(INADDR_LOOPBACK); - } - + dstport = sin->sin_port; IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - icmp->icmp_v6dst.sin6_addr = v6dst; - icmp->icmp_v6dst.sin6_family = AF_INET6; - icmp->icmp_v6dst.sin6_flowinfo = 0; - icmp->icmp_v6dst.sin6_port = 0; - - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * use the address of that interface as our - * source address if no source address has been set. 
- */ - if (V4_PART_OF_V6(icmp->icmp_v6src) == INADDR_ANY && - CLASSD(v4dst) && - icmp->icmp_multicast_if_addr != INADDR_ANY) { - IN6_IPADDR_TO_V4MAPPED(icmp->icmp_multicast_if_addr, - &icmp->icmp_v6src); - } + ASSERT(connp->conn_ipversion == IPV4_VERSION); break; + case sizeof (sin6_t): sin6 = (sin6_t *)sa; /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); - freeb(ire_mp); return (EADDRNOTAVAIL); } + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + ASSERT(connp->conn_ipversion == IPV6_VERSION); + flowinfo = sin6->sin6_flowinfo; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) + scopeid = sin6->sin6_scope_id; + srcid = sin6->__sin6_src_id; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + break; + } + + /* + * If there is a different thread using conn_ixa then we get a new + * copy and cut the old one loose from conn_ixa. Otherwise we use + * conn_ixa and prevent any other thread from using/changing it. + * Once connect() is done other threads can use conn_ixa since the + * refcnt will be back at one. + */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); - ASSERT(icmp->icmp_ipversion == IPV6_VERSION); - ASSERT(icmp->icmp_family == AF_INET6); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); - icmp->icmp_max_hdr_len = icmp->icmp_sticky_hdrs_len; + mutex_enter(&connp->conn_lock); + /* + * This icmp_t must have bound already before doing a connect. + * Reject if a connect is in progress (we drop conn_lock during + * rawip_do_connect). 
+ */ + if (icmp->icmp_state == TS_UNBND || icmp->icmp_state == TS_WCON_CREQ) { + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + return (-TOUTSTATE); + } - icmp->icmp_v6dst = *sin6; - icmp->icmp_v6dst.sin6_port = 0; + if (icmp->icmp_state == TS_DATA_XFER) { + /* Already connected - clear out state */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + icmp->icmp_state = TS_IDLE; + } + /* + * Use sin_port/sin6_port since applications like psh use SOCK_RAW + * with IPPROTO_TCP. + */ + connp->conn_fport = dstport; + if (connp->conn_ipversion == IPV4_VERSION) { /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. */ - if (IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6dst.sin6_addr)) { - icmp->icmp_v6dst.sin6_addr = ipv6_loopback; + if (v4dst == INADDR_ANY) { + v4dst = htonl(INADDR_LOOPBACK); + IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); + ASSERT(connp->conn_family == AF_INET); + sin->sin_addr.s_addr = v4dst; } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = 0; + } else { + ASSERT(connp->conn_ipversion == IPV6_VERSION); /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * then the ip bind logic will pick the correct source - * address (i.e. matching the outgoing multicast interface). + * Interpret a zero destination to mean loopback. + * Update the T_CONN_REQ (sin/sin6) since it is used to + * generate the T_CONN_CON. 
*/ - break; + if (IN6_IS_ADDR_UNSPECIFIED(&v6dst)) { + v6dst = ipv6_loopback; + sin6->sin6_addr = v6dst; + } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = flowinfo; } - icmp->icmp_pending_op = T_CONN_REQ; - - if (icmp->icmp_state == TS_DATA_XFER) { - /* Already connected - clear out state */ - icmp->icmp_v6src = icmp->icmp_bound_v6src; - icmp->icmp_state = TS_IDLE; + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - icmp->icmp_state = TS_DATA_XFER; - rw_exit(&icmp->icmp_rwlock); - - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_connected_v6(connp, &ire_mp, - icmp->icmp_proto, &icmp->icmp_v6src, 0, - &icmp->icmp_v6dst.sin6_addr, - NULL, sin6->sin6_port, B_TRUE, B_TRUE, cr); + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - error = ip_proto_bind_connected_v4(connp, &ire_mp, - icmp->icmp_proto, &V4_PART_OF_V6(icmp->icmp_v6src), 0, - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr), sin->sin_port, - B_TRUE, B_TRUE, cr); + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - rawip_post_ip_bind_connect(icmp, ire_mp, error); - return (error); -} -static void -icmp_close_free(conn_t *connp) -{ - icmp_t *icmp = connp->conn_icmp; - - /* If there are any options associated with the stream, free them. */ - if (icmp->icmp_ip_snd_options != NULL) { - mi_free((char *)icmp->icmp_ip_snd_options); - icmp->icmp_ip_snd_options = NULL; - icmp->icmp_ip_snd_options_len = 0; - } + /* + * conn_connect will drop conn_lock and reacquire it. + * To prevent a send* from messing with this icmp_t while the lock + * is dropped we set icmp_state and clear conn_v6lastdst. + * That will make all send* fail with EISCONN. 
+ */ + connp->conn_v6lastdst = ipv6_all_zeros; + icmp->icmp_state = TS_WCON_CREQ; - if (icmp->icmp_filter != NULL) { - kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); - icmp->icmp_filter = NULL; - } + error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); + mutex_exit(&connp->conn_lock); + if (error != 0) + goto connect_failed; - /* Free memory associated with sticky options */ - if (icmp->icmp_sticky_hdrs_len != 0) { - kmem_free(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len); - icmp->icmp_sticky_hdrs = NULL; - icmp->icmp_sticky_hdrs_len = 0; - } + /* + * The addresses have been verified. Time to insert in + * the correct fanout list. + */ + error = ipcl_conn_insert(connp); + if (error != 0) + goto connect_failed; - if (icmp->icmp_last_cred != NULL) { - crfree(icmp->icmp_last_cred); - icmp->icmp_last_cred = NULL; + mutex_enter(&connp->conn_lock); + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto connect_failed; } - if (icmp->icmp_effective_cred != NULL) { - crfree(icmp->icmp_effective_cred); - icmp->icmp_effective_cred = NULL; - } + icmp->icmp_state = TS_DATA_XFER; + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + connp->conn_lastipversion = connp->conn_ipversion; + connp->conn_lastdstport = connp->conn_fport; + connp->conn_lastflowinfo = connp->conn_flowinfo; + connp->conn_lastscopeid = scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + mutex_exit(&connp->conn_lock); - ip6_pkt_free(&icmp->icmp_sticky_ipp); + ixa_refrele(ixa); + return (0); - /* - * Clear any fields which the kmem_cache constructor clears. - * Only icmp_connp needs to be preserved. - * TBD: We should make this more efficient to avoid clearing - * everything. 
- */ - ASSERT(icmp->icmp_connp == connp); - bzero(icmp, sizeof (icmp_t)); - icmp->icmp_connp = connp; +connect_failed: + if (ixa != NULL) + ixa_refrele(ixa); + mutex_enter(&connp->conn_lock); + icmp->icmp_state = TS_IDLE; + /* In case the source address was set above */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_flowinfo = 0; + + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } -static int +static void rawip_do_close(conn_t *connp) { ASSERT(connp != NULL && IPCL_IS_RAWIP(connp)); @@ -878,8 +893,6 @@ rawip_do_close(conn_t *connp) qprocsoff(connp->conn_rq); } - ASSERT(connp->conn_icmp->icmp_fallback_queue_head == NULL && - connp->conn_icmp->icmp_fallback_queue_tail == NULL); icmp_close_free(connp); /* @@ -902,8 +915,6 @@ rawip_do_close(conn_t *connp) connp->conn_ref--; ipcl_conn_destroy(connp); - - return (0); } static int @@ -928,60 +939,63 @@ done: return (0); } +static void +icmp_close_free(conn_t *connp) +{ + icmp_t *icmp = connp->conn_icmp; + + if (icmp->icmp_filter != NULL) { + kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); + icmp->icmp_filter = NULL; + } + + /* + * Clear any fields which the kmem_cache constructor clears. + * Only icmp_connp needs to be preserved. + * TBD: We should make this more efficient to avoid clearing + * everything. + */ + ASSERT(icmp->icmp_connp == connp); + bzero(icmp, sizeof (icmp_t)); + icmp->icmp_connp = connp; +} + /* * This routine handles each T_DISCON_REQ message passed to icmp * as an indicating that ICMP is no longer connected. This results - * in sending a T_BIND_REQ to IP to restore the binding to just - * the local address. 
- * - * The disconnect completes in rawip_post_ip_bind_connect. + * in telling IP to restore the binding to just the local address. */ static int icmp_do_disconnect(conn_t *connp) { - icmp_t *icmp; - mblk_t *ire_mp; - int error; + icmp_t *icmp = connp->conn_icmp; + int error; - icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - if (icmp->icmp_state != TS_DATA_XFER || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - icmp->icmp_pending_op = T_DISCON_REQ; - icmp->icmp_v6src = icmp->icmp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; icmp->icmp_state = TS_IDLE; + connp->conn_v6lastdst = ipv6_all_zeros; + error = icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - if (icmp->icmp_family == AF_INET6) { - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - icmp->icmp_pending_op = -1; - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } - - rw_exit(&icmp->icmp_rwlock); - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - return (ENOMEM); - } - - if (icmp->icmp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, icmp->icmp_proto, - &icmp->icmp_bound_v6src, 0, B_TRUE); - } else { - - error = ip_proto_bind_laddr_v4(connp, &ire_mp, icmp->icmp_proto, - V4_PART_OF_V6(icmp->icmp_bound_v6src), 0, B_TRUE); - } - - rawip_post_ip_bind_connect(icmp, ire_mp, error); - - return (error); + /* + * Tell IP to remove the full binding and revert + * to the local address binding. 
+ */ + return (ip_laddr_fanout_insert(connp)); } static void @@ -1014,16 +1028,14 @@ icmp_tpi_disconnect(queue_t *q, mblk_t *mp) ASSERT(mp != NULL); qreply(q, mp); } - } static int icmp_disconnect(conn_t *connp) { int error; - icmp_t *icmp = connp->conn_icmp; - icmp->icmp_dgram_errind = B_FALSE; + connp->conn_dgram_errind = B_FALSE; error = icmp_do_disconnect(connp); @@ -1058,22 +1070,22 @@ icmp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, } /* - * icmp_icmp_error is called by icmp_input to process ICMP - * messages passed up by IP. - * Generates the appropriate permanent (non-transient) errors. - * Assumes that IP has pulled up everything up to and including - * the ICMP header. + * icmp_icmp_input is called as conn_recvicmp to process ICMP messages. + * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. + * Assumes that IP has pulled up everything up to and including the ICMP header. */ +/* ARGSUSED2 */ static void -icmp_icmp_error(conn_t *connp, mblk_t *mp) +icmp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - sin_t sin; - mblk_t *mp1; - int error = 0; - icmp_t *icmp = connp->conn_icmp; + conn_t *connp = (conn_t *)arg1; + icmp_t *icmp = connp->conn_icmp; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + sin_t sin; + mblk_t *mp1; + int error = 0; ipha = (ipha_t *)mp->b_rptr; @@ -1081,34 +1093,57 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - icmp_icmp_error_ipv6(connp, mp); + icmp_icmp_error_ipv6(connp, mp, ira); return; } - - /* - * icmp does not support v4 mapped addresses - * so we can never be here for a V6 socket - * i.e. 
icmp_family == AF_INET6 - */ - ASSERT((IPH_HDR_VERSION(ipha) == IPV4_VERSION) && - (icmp->icmp_family == AF_INET)); - - ASSERT(icmp->icmp_family == AF_INET); + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); - icmph = (icmph_t *)(&mp->b_rptr[iph_hdr_length]); - ipha = (ipha_t *)&icmph[1]; + ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); + iph_hdr_length = ira->ira_ip_hdr_length; + icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; + ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ + iph_hdr_length = IPH_HDR_LENGTH(ipha); switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { - case ICMP_FRAGMENTATION_NEEDED: + case ICMP_FRAGMENTATION_NEEDED: { + ipha_t *ipha; + ip_xmit_attr_t *ixa; /* * IP has already adjusted the path MTU. + * But we need to adjust DF for IPv4. */ + if (connp->conn_ipversion != IPV4_VERSION) + break; + + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL || ixa->ixa_ire == NULL) { + /* + * Some other thread holds conn_ixa. We will + * redo this on the next ICMP too big. + */ + if (ixa != NULL) + ixa_refrele(ixa); + break; + } + (void) ip_get_pmtu(ixa); + + mutex_enter(&connp->conn_lock); + ipha = (ipha_t *)connp->conn_ht_iphc; + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); break; + } case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: error = ECONNREFUSED; @@ -1131,7 +1166,7 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. 
*/ - if (!icmp->icmp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1141,11 +1176,10 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) sin.sin_addr.s_addr = ipha->ipha_dst; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (icmp->icmp_state == TS_DATA_XFER) { - if (sin.sin_addr.s_addr == - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); + if (sin.sin_addr.s_addr == connp->conn_faddr_v4) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1154,27 +1188,25 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) icmp->icmp_delayed_error = error; *((sin_t *)&icmp->icmp_delayed_addr) = sin; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); } else { - mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, - 0, error); + mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, + error); if (mp1 != NULL) putnext(connp->conn_rq, mp1); } done: - ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } /* - * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMPv6 - * for IPv6 packets. - * Send permanent (non-transient) errors upstream. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. + * icmp_icmp_error_ipv6 is called by icmp_icmp_error to process ICMP for IPv6. + * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. + * Assumes that IP has pulled up all the extension headers as well as the + * ICMPv6 header. 
*/ static void -icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) +icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1186,13 +1218,18 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) icmp_t *icmp = connp->conn_icmp; outer_ip6h = (ip6_t *)mp->b_rptr; +#ifdef DEBUG if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); else iph_hdr_length = IPV6_HDR_LEN; - + ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); +#endif + /* Skip past the outer IP and ICMP headers */ + iph_hdr_length = ira->ira_ip_hdr_length; icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; - ip6h = (ip6_t *)&icmp6[1]; + + ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { freemsg(mp); return; @@ -1229,7 +1266,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * information, send up an empty message containing an * IPV6_PATHMTU ancillary data item. */ - if (!icmp->icmp_ipv6_recvpathmtu) + if (!connp->conn_ipv6_recvpathmtu) break; udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + @@ -1255,7 +1292,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6 = (sin6_t *)&tudi[1]; bzero(sin6, sizeof (sin6_t)); sin6->sin6_family = AF_INET6; - sin6->sin6_addr = icmp->icmp_v6dst.sin6_addr; + sin6->sin6_addr = connp->conn_faddr_v6; toh = (struct T_opthdr *)&sin6[1]; toh->level = IPPROTO_IPV6; @@ -1273,8 +1310,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - icmp_ulp_recv(connp, newmp); - + icmp_ulp_recv(connp, newmp, msgdsize(newmp)); return; } case ICMP6_TIME_EXCEEDED: @@ -1299,7 +1335,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. 
*/ - if (!icmp->icmp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1308,13 +1344,12 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6.sin6_family = AF_INET6; sin6.sin6_addr = ip6h->ip6_dst; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - if (IPCL_IS_NONSTR(connp)) { - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (icmp->icmp_state == TS_DATA_XFER) { if (IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &icmp->icmp_v6dst.sin6_addr)) { - rw_exit(&icmp->icmp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1323,7 +1358,7 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) icmp->icmp_delayed_error = error; *((sin6_t *)&icmp->icmp_delayed_addr) = sin6; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); @@ -1331,7 +1366,6 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) putnext(connp->conn_rq, mp1); } done: - ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } @@ -1345,9 +1379,12 @@ done: static void icmp_addr_req(queue_t *q, mblk_t *mp) { - icmp_t *icmp = Q_TO_ICMP(q); + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; + icmp_t *icmp = Q_TO_ICMP(q); + conn_t *connp = icmp->icmp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -1363,65 +1400,39 @@ icmp_addr_req(queue_t *q, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; - rw_enter(&icmp->icmp_rwlock, RW_READER); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + mutex_enter(&connp->conn_lock); /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
*/ if (icmp->icmp_state != TS_UNBND) { /* - * Fill in local address + * Fill in local address first */ taa->LOCADDR_offset = sizeof (*taa); - if (icmp->icmp_family == AF_INET) { - sin_t *sin; - - taa->LOCADDR_length = sizeof (sin_t); - sin = (sin_t *)&taa[1]; - /* Fill zeroes and then intialize non-zero fields */ - *sin = sin_null; - sin->sin_family = AF_INET; - if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, - sin->sin_addr.s_addr); - } else { - /* - * INADDR_ANY - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. Use icmp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_bound_v6src, - sin->sin_addr.s_addr); - } - ackmp->b_wptr = (uchar_t *)&sin[1]; - } else { - sin6_t *sin6; - - ASSERT(icmp->icmp_family == AF_INET6); - taa->LOCADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&taa[1]; - /* Fill zeroes and then intialize non-zero fields */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin6->sin6_addr = icmp->icmp_v6src; - } else { - /* - * UNSPECIFIED - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use icmp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = icmp->icmp_bound_v6src; - } - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + if (icmp->icmp_state == TS_DATA_XFER) { + /* + * connected, fill remote address too + */ + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); qreply(q, ackmp); } @@ -1429,9 +1440,11 @@ icmp_addr_req(queue_t *q, mblk_t *mp) static void icmp_copy_info(struct T_info_ack *tap, icmp_t *icmp) { + conn_t *connp = icmp->icmp_connp; + *tap = icmp_g_t_info_ack; - if (icmp->icmp_family == AF_INET6) + if (connp->conn_family == AF_INET6) tap->ADDR_size = sizeof (sin6_t); else tap->ADDR_size = sizeof (sin_t); @@ -1488,6 +1501,7 @@ icmp_info_req(queue_t *q, mblk_t *mp) { icmp_t *icmp = Q_TO_ICMP(q); + /* Create a T_INFO_ACK message. */ mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, T_INFO_ACK); if (!mp) @@ -1496,18 +1510,14 @@ icmp_info_req(queue_t *q, mblk_t *mp) qreply(q, mp); } -/* For /dev/icmp aka AF_INET open */ static int icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, int family) { conn_t *connp; dev_t conn_dev; - icmp_stack_t *is; int error; - conn_dev = NULL; - /* If the stream is already open, return immediately. 
*/ if (q->q_ptr != NULL) return (0); @@ -1534,9 +1544,9 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (0); } - connp = icmp_open(family, credp, &error, KM_SLEEP); + connp = rawip_do_open(family, credp, &error, KM_SLEEP); if (connp == NULL) { - ASSERT(error != NULL); + ASSERT(error != 0); inet_minor_free(ip_minor_arena_sa, connp->conn_dev); return (error); } @@ -1545,8 +1555,6 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_dev = conn_dev; connp->conn_minor_arena = ip_minor_arena_sa; - is = connp->conn_icmp->icmp_is; - /* * Initialize the icmp_t structure for this stream. */ @@ -1555,38 +1563,25 @@ icmp_tpi_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_rq = q; connp->conn_wq = WR(q); - if (connp->conn_icmp->icmp_family == AF_INET6) { - /* Build initial header template for transmit */ - rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); - if ((error = icmp_build_hdrs(connp->conn_icmp)) != 0) { - rw_exit(&connp->conn_icmp->icmp_rwlock); - inet_minor_free(ip_minor_arena_sa, connp->conn_dev); - ipcl_conn_destroy(connp); - return (error); - } - rw_exit(&connp->conn_icmp->icmp_rwlock); - } - - - q->q_hiwat = is->is_recv_hiwat; - WR(q)->q_hiwat = is->is_xmit_hiwat; - WR(q)->q_lowat = is->is_xmit_lowat; + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; qprocson(q); /* Set the Stream head write offset. 
*/ - (void) proto_set_tx_wroff(q, connp, - connp->conn_icmp->icmp_max_hdr_len + is->is_wroff_extra); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, q->q_hiwat); + (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, connp->conn_rcvbuf); mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); + icmp_bind_proto(connp->conn_icmp); + return (0); } -/* For /dev/icmp4 aka AF_INET open */ +/* For /dev/icmp aka AF_INET open */ static int icmp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { @@ -1604,15 +1599,15 @@ icmp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * This is the open routine for icmp. It allocates a icmp_t structure for * the stream and, on the first open of the module, creates an ND table. */ -/* ARGSUSED */ static conn_t * -icmp_open(int family, cred_t *credp, int *err, int flags) +rawip_do_open(int family, cred_t *credp, int *err, int flags) { icmp_t *icmp; conn_t *connp; zoneid_t zoneid; netstack_t *ns; icmp_stack_t *is; + int len; boolean_t isv6 = B_FALSE; *err = secpolicy_net_icmpaccess(credp); @@ -1621,6 +1616,7 @@ icmp_open(int family, cred_t *credp, int *err, int flags) if (family == AF_INET6) isv6 = B_TRUE; + ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); is = ns->netstack_icmp; @@ -1639,7 +1635,6 @@ icmp_open(int family, cred_t *credp, int *err, int flags) connp = ipcl_conn_create(IPCL_RAWIPCONN, flags, ns); icmp = connp->conn_icmp; - icmp->icmp_v6dst = sin6_null; /* * ipcl_conn_create did a netstack_hold. Undo the hold that was @@ -1647,35 +1642,52 @@ icmp_open(int family, cred_t *credp, int *err, int flags) */ netstack_rele(ns); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_ICMP); + /* + * Since this conn_t/icmp_t is not yet visible to anybody else we don't + * need to lock anything. 
+ */ + ASSERT(connp->conn_proto == IPPROTO_ICMP); ASSERT(connp->conn_icmp == icmp); ASSERT(icmp->icmp_connp == connp); /* Set the initial state of the stream and the privilege status. */ icmp->icmp_state = TS_UNBND; + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; if (isv6) { - icmp->icmp_ipversion = IPV6_VERSION; - icmp->icmp_family = AF_INET6; - connp->conn_ulp = IPPROTO_ICMPV6; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_proto = IPPROTO_ICMPV6; /* May be changed by a SO_PROTOTYPE socket option. */ - icmp->icmp_proto = IPPROTO_ICMPV6; - icmp->icmp_checksum_off = 2; /* Offset for icmp6_cksum */ - icmp->icmp_max_hdr_len = IPV6_HDR_LEN; - icmp->icmp_ttl = (uint8_t)is->is_ipv6_hoplimit; - connp->conn_af_isv6 = B_TRUE; + connp->conn_proto = IPPROTO_ICMPV6; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_raw_cksum_offset = 2; + connp->conn_default_ttl = is->is_ipv6_hoplimit; + len = sizeof (ip6_t); } else { - icmp->icmp_ipversion = IPV4_VERSION; - icmp->icmp_family = AF_INET; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; /* May be changed by a SO_PROTOTYPE socket option. 
*/ - icmp->icmp_proto = IPPROTO_ICMP; - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH; - icmp->icmp_ttl = (uint8_t)is->is_ipv4_ttl; - connp->conn_af_isv6 = B_FALSE; - } - icmp->icmp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - icmp->icmp_pending_op = -1; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_proto = IPPROTO_ICMP; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_default_ttl = is->is_ipv4_ttl; + len = sizeof (ipha_t); + } + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + /* + * For the socket of protocol IPPROTO_RAW or when IP_HDRINCL is set, + * the checksum is provided in the pre-built packet. We clear + * IXAF_SET_ULP_CKSUM to tell IP that the application has sent a + * complete IP header and not to compute the transport checksum. + */ + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; + connp->conn_zoneid = zoneid; /* @@ -1685,17 +1697,35 @@ icmp_open(int family, cred_t *credp, int *err, int flags) if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_ulp_labeled = is_system_labeled(); + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); icmp->icmp_is = is; + connp->conn_rcvbuf = is->is_recv_hiwat; + connp->conn_sndbuf = is->is_xmit_hiwat; + connp->conn_sndlowat = is->is_xmit_lowat; + connp->conn_rcvlowat = icmp_mod_info.mi_lowat; + + connp->conn_wroff = len + is->is_wroff_extra; + connp->conn_so_type = SOCK_RAW; + connp->conn_recv = icmp_input; + connp->conn_recvicmp = icmp_icmp_input; crhold(credp); connp->conn_cred = credp; - - rw_exit(&icmp->icmp_rwlock); + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + 
connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); connp->conn_flow_cntrld = B_FALSE; + + if (is->is_pmtu_discovery) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; + return (connp); } @@ -1713,9 +1743,8 @@ icmp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) * This routine gets default values of certain options whose default * values are maintained by protcol specific code */ -/* ARGSUSED */ int -icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) +icmp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { icmp_t *icmp = Q_TO_ICMP(q); icmp_stack_t *is = icmp->icmp_is; @@ -1759,366 +1788,88 @@ icmp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ int icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; int *i1 = (int *)ptr; - ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp; - int ret = 0; + conn_opt_arg_t coas; + int retval; - ASSERT(RW_READ_HELD(&icmp->icmp_rwlock)); - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = icmp->icmp_debug; - break; - case SO_TYPE: - *i1 = SOCK_RAW; - break; - case SO_PROTOTYPE: - *i1 = icmp->icmp_proto; - break; - case SO_REUSEADDR: - *i1 = icmp->icmp_reuseaddr; - break; - - /* - * The following three items are available here, - * but are only meaningful to IP. 
- */ - case SO_DONTROUTE: - *i1 = icmp->icmp_dontroute; - break; - case SO_USELOOPBACK: - *i1 = icmp->icmp_useloopback; - break; - case SO_BROADCAST: - *i1 = icmp->icmp_broadcast; - break; - - case SO_SNDBUF: - ASSERT(icmp->icmp_xmit_hiwat <= INT_MAX); - *i1 = icmp->icmp_xmit_hiwat; - break; - case SO_RCVBUF: - ASSERT(icmp->icmp_recv_hiwat <= INT_MAX); - *i1 = icmp->icmp_recv_hiwat; - break; - case SO_DGRAM_ERRIND: - *i1 = icmp->icmp_dgram_errind; - break; - case SO_TIMESTAMP: - *i1 = icmp->icmp_timestamp; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_DOMAIN: - *i1 = icmp->icmp_family; - break; + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; - /* - * Following four not meaningful for icmp - * Action is same as "default" to which we fallthrough - * so we keep them in comments. - * case SO_LINGER: - * case SO_KEEPALIVE: - * case SO_OOBINLINE: - * case SO_ALLZONES: - */ - default: - ret = -1; - goto done; - } - break; + /* + * We assume that the optcom framework has checked for the set + * of levels and names that are supported, hence we don't worry + * about rejecting based on that. + * First check for ICMP specific handling, then pass to common routine. + */ + switch (level) { case IPPROTO_IP: /* * Only allow IPv4 option processing on IPv4 sockets. 
*/ - if (icmp->icmp_family != AF_INET) { - ret = -1; - goto done; - } + if (connp->conn_family != AF_INET) + return (-1); switch (name) { case IP_OPTIONS: case T_IP_OPTIONS: /* Options are passed up with each packet */ - ret = 0; - goto done; + return (0); case IP_HDRINCL: + mutex_enter(&connp->conn_lock); *i1 = (int)icmp->icmp_hdrincl; - break; - case IP_TOS: - case T_IP_TOS: - *i1 = (int)icmp->icmp_type_of_service; - break; - case IP_TTL: - *i1 = (int)icmp->icmp_ttl; - break; - case IP_MULTICAST_IF: - /* 0 address if not set */ - *(ipaddr_t *)ptr = icmp->icmp_multicast_if_addr; - ret = sizeof (ipaddr_t); - goto done; - case IP_MULTICAST_TTL: - *(uchar_t *)ptr = icmp->icmp_multicast_ttl; - ret = sizeof (uchar_t); - goto done; - case IP_MULTICAST_LOOP: - *ptr = connp->conn_multicast_loop; - ret = sizeof (uint8_t); - goto done; - case IP_BOUND_IF: - /* Zero if not set */ - *i1 = icmp->icmp_bound_if; - break; /* goto sizeof (int) option return */ - case IP_UNSPEC_SRC: - *ptr = icmp->icmp_unspec_source; - break; /* goto sizeof (int) option return */ - case IP_RECVIF: - *ptr = icmp->icmp_recvif; - break; /* goto sizeof (int) option return */ - case IP_BROADCAST_TTL: - *(uchar_t *)ptr = connp->conn_broadcast_ttl; - return (sizeof (uchar_t)); - case IP_RECVPKTINFO: - /* - * This also handles IP_PKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - * This option is handled in IP which will return an - * error for IP_PKTINFO as it's not supported as a - * sticky option. - */ - ret = -EINVAL; - goto done; - /* - * Cannot "get" the value of following options - * at this level. Action is same as "default" to - * which we fallthrough so we keep them in comments. 
- * - * case IP_ADD_MEMBERSHIP: - * case IP_DROP_MEMBERSHIP: - * case IP_BLOCK_SOURCE: - * case IP_UNBLOCK_SOURCE: - * case IP_ADD_SOURCE_MEMBERSHIP: - * case IP_DROP_SOURCE_MEMBERSHIP: - * case MCAST_JOIN_GROUP: - * case MCAST_LEAVE_GROUP: - * case MCAST_BLOCK_SOURCE: - * case MCAST_UNBLOCK_SOURCE: - * case MCAST_JOIN_SOURCE_GROUP: - * case MCAST_LEAVE_SOURCE_GROUP: - * case MRT_INIT: - * case MRT_DONE: - * case MRT_ADD_VIF: - * case MRT_DEL_VIF: - * case MRT_ADD_MFC: - * case MRT_DEL_MFC: - * case MRT_VERSION: - * case MRT_ASSERT: - * case IP_SEC_OPT: - * case IP_NEXTHOP: - */ - default: - ret = -1; - goto done; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } break; + case IPPROTO_IPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. */ - if (icmp->icmp_family != AF_INET6) { - ret = -1; - goto done; - } + if (connp->conn_family != AF_INET6) + return (-1); + switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int)icmp->icmp_ttl; - break; - case IPV6_MULTICAST_IF: - /* 0 index if not set */ - *i1 = icmp->icmp_multicast_if_index; - break; - case IPV6_MULTICAST_HOPS: - *i1 = icmp->icmp_multicast_ttl; - break; - case IPV6_MULTICAST_LOOP: - *i1 = connp->conn_multicast_loop; - break; - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = icmp->icmp_bound_if; - break; - case IPV6_UNSPEC_SRC: - *i1 = icmp->icmp_unspec_source; - break; case IPV6_CHECKSUM: /* * Return offset or -1 if no checksum offset. 
* Does not apply to IPPROTO_ICMPV6 */ - if (icmp->icmp_proto == IPPROTO_ICMPV6) { - ret = -1; - goto done; - } + if (connp->conn_proto == IPPROTO_ICMPV6) + return (-1); - if (icmp->icmp_raw_checksum) { - *i1 = icmp->icmp_checksum_off; - } else { - *i1 = -1; - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - ret = -1; - goto done; - case IPV6_RECVPKTINFO: - *i1 = icmp->icmp_ip_recvpktinfo; - break; - case IPV6_RECVTCLASS: - *i1 = icmp->icmp_ipv6_recvtclass; - break; - case IPV6_RECVPATHMTU: - *i1 = icmp->icmp_ipv6_recvpathmtu; - break; - case IPV6_V6ONLY: - *i1 = 1; - break; - case IPV6_RECVHOPLIMIT: - *i1 = icmp->icmp_ipv6_recvhoplimit; - break; - case IPV6_RECVHOPOPTS: - *i1 = icmp->icmp_ipv6_recvhopopts; - break; - case IPV6_RECVDSTOPTS: - *i1 = icmp->icmp_ipv6_recvdstopts; - break; - case _OLD_IPV6_RECVDSTOPTS: - *i1 = icmp->icmp_old_ipv6_recvdstopts; - break; - case IPV6_RECVRTHDRDSTOPTS: - *i1 = icmp->icmp_ipv6_recvrtdstopts; - break; - case IPV6_RECVRTHDR: - *i1 = icmp->icmp_ipv6_recvrthdr; - break; - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; + mutex_enter(&connp->conn_lock); + if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) + *i1 = connp->conn_ixa->ixa_raw_cksum_offset; else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - ret = sizeof (struct in6_pktinfo); - goto done; - } - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - ret = (sizeof (sin6_t)); - goto done; - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= icmp->icmp_label_len_v6) - return (0); - bcopy((char *)ipp->ipp_hopopts + - icmp->icmp_label_len_v6, ptr, - ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); - if (icmp->icmp_label_len_v6 > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - icmp->icmp_label_len_v6 + 7) / 8 - 1; - } - ret = (ipp->ipp_hopoptslen - icmp->icmp_label_len_v6); - goto done; - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - ret = ipp->ipp_rtdstoptslen; - goto done; - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - ret = ipp->ipp_rthdrlen; - goto done; - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) { - ret = 0; - goto done; - } - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - ret = ipp->ipp_dstoptslen; - goto done; - case IPV6_PATHMTU: - if (!(ipp->ipp_fields & IPPF_PATHMTU)) { - ret = 0; - } else { - ret = ip_fill_mtuinfo( - &icmp->icmp_v6dst.sin6_addr, 0, - (struct ip6_mtuinfo *)ptr, - is->is_netstack); - } - goto done; - case IPV6_TCLASS: - if 
(ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; - default: - ret = -1; - goto done; + *i1 = -1; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } break; + case IPPROTO_ICMPV6: /* * Only allow IPv6 option processing on native IPv6 sockets. */ - if (icmp->icmp_family != AF_INET6) { - ret = -1; - } + if (connp->conn_family != AF_INET6) + return (-1); - if (icmp->icmp_proto != IPPROTO_ICMPV6) { - ret = -1; - } + if (connp->conn_proto != IPPROTO_ICMPV6) + return (-1); switch (name) { case ICMP6_FILTER: + mutex_enter(&connp->conn_lock); if (icmp->icmp_filter == NULL) { /* Make it look like "pass all" */ ICMP6_FILTER_SETPASSALL((icmp6_filter_t *)ptr); @@ -2126,501 +1877,149 @@ icmp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) (void) bcopy(icmp->icmp_filter, ptr, sizeof (icmp6_filter_t)); } - ret = sizeof (icmp6_filter_t); - goto done; - default: - ret = -1; - goto done; + mutex_exit(&connp->conn_lock); + return (sizeof (icmp6_filter_t)); } - default: - ret = -1; - goto done; } - ret = sizeof (int); -done: - return (ret); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ int icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - int err; + conn_t *connp = Q_TO_CONN(q); + int err; - rw_enter(&icmp->icmp_rwlock, RW_READER); err = icmp_opt_get(connp, level, name, ptr); - rw_exit(&icmp->icmp_rwlock); return (err); } +/* + * This routine sets socket options. 
+ */ int -icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, - uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, - void *thisdg_attrs, boolean_t checkonly) +icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, + uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) { + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; - int *i1 = (int *)invalp; - boolean_t onoff = (*i1 == 0) ? 0 : 1; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int error; + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); - ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock)); /* * For fixed length options, no sanity check * of passed in length is done. It is assumed *_optcom_req() * routines do the right thing. */ + switch (level) { case SOL_SOCKET: switch (name) { - case SO_DEBUG: - if (!checkonly) - icmp->icmp_debug = onoff; - break; case SO_PROTOTYPE: if ((*i1 & 0xFF) != IPPROTO_ICMP && (*i1 & 0xFF) != IPPROTO_ICMPV6 && secpolicy_net_rawaccess(cr) != 0) { - *outlenp = 0; return (EACCES); } - /* Can't use IPPROTO_RAW with IPv6 */ - if ((*i1 & 0xFF) == IPPROTO_RAW && - icmp->icmp_family == AF_INET6) { - *outlenp = 0; - return (EPROTONOSUPPORT); - } - if (checkonly) { - /* T_CHECK case */ - *(int *)outvalp = (*i1 & 0xFF); + if (checkonly) break; - } - icmp->icmp_proto = *i1 & 0xFF; - if ((icmp->icmp_proto == IPPROTO_RAW || - icmp->icmp_proto == IPPROTO_IGMP) && - icmp->icmp_family == AF_INET) + + mutex_enter(&connp->conn_lock); + connp->conn_proto = *i1 & 0xFF; + ixa->ixa_protocol = connp->conn_proto; + if ((connp->conn_proto == IPPROTO_RAW || + connp->conn_proto == IPPROTO_IGMP) && + connp->conn_family == AF_INET) { icmp->icmp_hdrincl = 1; - else + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; + } else if (connp->conn_proto == IPPROTO_UDP || + connp->conn_proto == 
IPPROTO_TCP || + connp->conn_proto == IPPROTO_SCTP) { + /* Used by test applications like psh */ icmp->icmp_hdrincl = 0; - - if (icmp->icmp_family == AF_INET6 && - icmp->icmp_proto == IPPROTO_ICMPV6) { - /* Set offset for icmp6_cksum */ - icmp->icmp_raw_checksum = 0; - icmp->icmp_checksum_off = 2; - } - if (icmp->icmp_proto == IPPROTO_UDP || - icmp->icmp_proto == IPPROTO_TCP || - icmp->icmp_proto == IPPROTO_SCTP) { - icmp->icmp_no_tp_cksum = 1; - icmp->icmp_sticky_ipp.ipp_fields |= - IPPF_NO_CKSUM; + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; } else { - icmp->icmp_no_tp_cksum = 0; - icmp->icmp_sticky_ipp.ipp_fields &= - ~IPPF_NO_CKSUM; + icmp->icmp_hdrincl = 0; + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; } + if (connp->conn_family == AF_INET6 && + connp->conn_proto == IPPROTO_ICMPV6) { + /* Set offset for icmp6_cksum */ + ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = 2; + } if (icmp->icmp_filter != NULL && - icmp->icmp_proto != IPPROTO_ICMPV6) { + connp->conn_proto != IPPROTO_ICMPV6) { kmem_free(icmp->icmp_filter, sizeof (icmp6_filter_t)); icmp->icmp_filter = NULL; } + mutex_exit(&connp->conn_lock); - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - + coa->coa_changed |= COA_HEADER_CHANGED; /* * For SCTP, we don't use icmp_bind_proto() for - * raw socket binding. Note that we do not need - * to set *outlenp. - * FIXME: how does SCTP work? + * raw socket binding. 
*/ - if (icmp->icmp_proto == IPPROTO_SCTP) + if (connp->conn_proto == IPPROTO_SCTP) return (0); - *outlenp = sizeof (int); - *(int *)outvalp = *i1 & 0xFF; - - /* Drop lock across the bind operation */ - rw_exit(&icmp->icmp_rwlock); - (void) icmp_bind_proto(connp); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + coa->coa_changed |= COA_ICMP_BIND_NEEDED; return (0); - case SO_REUSEADDR: - if (!checkonly) { - icmp->icmp_reuseaddr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - if (!checkonly) { - icmp->icmp_dontroute = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_USELOOPBACK: - if (!checkonly) { - icmp->icmp_useloopback = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_BROADCAST: - if (!checkonly) { - icmp->icmp_broadcast = onoff; - PASS_OPT_TO_IP(connp); - } - break; case SO_SNDBUF: if (*i1 > is->is_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - if (!IPCL_IS_NONSTR(connp)) { - connp->conn_wq->q_hiwat = *i1; - } - icmp->icmp_xmit_hiwat = *i1; - } break; case SO_RCVBUF: if (*i1 > is->is_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - icmp->icmp_recv_hiwat = *i1; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - *i1); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - icmp->icmp_dgram_errind = onoff; break; - case SO_ALLZONES: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case SO_TIMESTAMP: - if (!checkonly) { - icmp->icmp_timestamp = onoff; - } - break; - case SO_MAC_EXEMPT: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. 
Here just return directly. - */ - return (0); - /* - * Following three not meaningful for icmp - * Action is same as "default" so we keep them - * in comments. - * case SO_LINGER: - * case SO_KEEPALIVE: - * case SO_OOBINLINE: - */ - default: - *outlenp = 0; - return (EINVAL); } break; + case IPPROTO_IP: /* * Only allow IPv4 option processing on IPv4 sockets. */ - if (icmp->icmp_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - /* Save options for use by IP. */ - if ((inlen & 0x3) || - inlen + icmp->icmp_label_len > IP_MAX_OPT_LENGTH) { - *outlenp = 0; - return (EINVAL); - } - if (checkonly) - break; - - if (!tsol_option_set(&icmp->icmp_ip_snd_options, - &icmp->icmp_ip_snd_options_len, - icmp->icmp_label_len, invalp, inlen)) { - *outlenp = 0; - return (ENOMEM); - } + if (connp->conn_family != AF_INET) + return (EINVAL); - icmp->icmp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - icmp->icmp_ip_snd_options_len; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_tx_wroff(connp->conn_rq == NULL ? NULL: - RD(connp->conn_rq), connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - break; + switch (name) { case IP_HDRINCL: - if (!checkonly) - icmp->icmp_hdrincl = onoff; - break; - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - icmp->icmp_type_of_service = (uint8_t)*i1; - } - break; - case IP_TTL: if (!checkonly) { - icmp->icmp_ttl = (uint8_t)*i1; - } - break; - case IP_MULTICAST_IF: - /* - * TODO should check OPTMGMT reply and undo this if - * there is an error. - */ - if (!checkonly) { - icmp->icmp_multicast_if_addr = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_MULTICAST_TTL: - if (!checkonly) - icmp->icmp_multicast_ttl = *invalp; - break; - case IP_MULTICAST_LOOP: - if (!checkonly) { - connp->conn_multicast_loop = - (*invalp == 0) ? 
0 : 1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BOUND_IF: - if (!checkonly) { - icmp->icmp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_UNSPEC_SRC: - if (!checkonly) { - icmp->icmp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BROADCAST_TTL: - if (!checkonly) - connp->conn_broadcast_ttl = *invalp; - break; - case IP_RECVIF: - if (!checkonly) { - icmp->icmp_recvif = onoff; - } - /* - * pass to ip - */ - return (-EINVAL); - case IP_PKTINFO: { - /* - * This also handles IP_RECVPKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - */ - struct in_pktinfo *pktinfop; - ip4_pkt_t *attr_pktinfop; - - if (checkonly) - break; - - if (inlen == sizeof (int)) { - /* - * This is IP_RECVPKTINFO option. - * Keep a local copy of wether this option is - * set or not and pass it down to IP for - * processing. - */ - icmp->icmp_ip_recvpktinfo = onoff; - return (-EINVAL); - } - - - if (inlen != sizeof (struct in_pktinfo)) { - return (EINVAL); - } - - if ((attr_pktinfop = (ip4_pkt_t *)thisdg_attrs) - == NULL) { - /* - * sticky option is not supported - */ - return (EINVAL); - } - - pktinfop = (struct in_pktinfo *)invalp; - - /* - * Atleast one of the values should be specified - */ - if (pktinfop->ipi_ifindex == 0 && - pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) { - return (EINVAL); + mutex_enter(&connp->conn_lock); + icmp->icmp_hdrincl = onoff; + if (onoff) + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; + else + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; + mutex_exit(&connp->conn_lock); } - - attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr; - attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex; - } break; - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case 
MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case MRT_INIT: - case MRT_DONE: - case MRT_ADD_VIF: - case MRT_DEL_VIF: - case MRT_ADD_MFC: - case MRT_DEL_MFC: - case MRT_VERSION: - case MRT_ASSERT: - case IP_SEC_OPT: - case IP_NEXTHOP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); } break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - boolean_t sticky; - if (icmp->icmp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - /* - * Deal with both sticky options and ancillary data - */ - if (thisdg_attrs == NULL) { - /* sticky options, or none */ - ipp = &icmp->icmp_sticky_ipp; - sticky = B_TRUE; - } else { - /* ancillary data */ - ipp = (ip6_pkt_t *)thisdg_attrs; - sticky = B_FALSE; - } + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) + return (EINVAL); switch (name) { - case IPV6_MULTICAST_IF: - if (!checkonly) { - icmp->icmp_multicast_if_index = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - icmp->icmp_ttl = ipp->ipp_unicast_hops = - is->is_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. 
*/ - *i1 = ipp->ipp_hoplimit; - } else { - icmp->icmp_ttl = ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - } - break; - case IPV6_MULTICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - icmp->icmp_multicast_ttl = - ipp->ipp_multicast_hops = - IP_DEFAULT_MULTICAST_TTL; - ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = icmp->icmp_multicast_ttl; - } else { - icmp->icmp_multicast_ttl = - ipp->ipp_multicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_MULTICAST_HOPS; - } - } - break; - case IPV6_MULTICAST_LOOP: - if (*i1 != 0 && *i1 != 1) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - connp->conn_multicast_loop = *i1; - PASS_OPT_TO_IP(connp); - } - break; case IPV6_CHECKSUM: /* * Integer offset into the user data of where the @@ -2628,517 +2027,93 @@ icmp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, * Offset of -1 disables option. * Does not apply to IPPROTO_ICMPV6. 
*/ - if (icmp->icmp_proto == IPPROTO_ICMPV6 || !sticky) { - *outlenp = 0; + if (connp->conn_proto == IPPROTO_ICMPV6 || + coa->coa_ancillary) { return (EINVAL); } if ((*i1 != -1) && ((*i1 < 0) || (*i1 & 0x1) != 0)) { /* Negative or not 16 bit aligned offset */ - *outlenp = 0; return (EINVAL); } if (checkonly) break; + mutex_enter(&connp->conn_lock); if (*i1 == -1) { - icmp->icmp_raw_checksum = 0; - ipp->ipp_fields &= ~IPPF_RAW_CKSUM; + ixa->ixa_flags &= ~IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = 0; + ixa->ixa_flags &= ~IXAF_SET_ULP_CKSUM; } else { - icmp->icmp_raw_checksum = 1; - icmp->icmp_checksum_off = *i1; - ipp->ipp_fields |= IPPF_RAW_CKSUM; - } - /* Rebuild the header template */ - error = icmp_build_hdrs(icmp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case IPV6_BOUND_IF: - if (!checkonly) { - icmp->icmp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNSPEC_SRC: - if (!checkonly) { - icmp->icmp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVTCLASS: - if (!checkonly) { - icmp->icmp_ipv6_recvtclass = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - icmp->icmp_ip_recvpktinfo = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - icmp->icmp_ipv6_recvpathmtu = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - icmp->icmp_ipv6_recvhoplimit = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - 
icmp->icmp_ipv6_recvhopopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - icmp->icmp_ipv6_recvdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) - icmp->icmp_old_ipv6_recvdstopts = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - icmp->icmp_ipv6_recvrtdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - icmp->icmp_ipv6_recvrthdr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set sticky options or ancillary data. - * If sticky options, (re)build any extension headers - * that might be needed as a result. - */ - case IPV6_PKTINFO: - /* - * The source address and ifindex are verified - * in ip_opt_set(). For ancillary data the - * source address is checked in ip_wput_v6. - */ - if (inlen != 0 && inlen != - sizeof (struct in6_pktinfo)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - ipp->ipp_sticky_ignored |= - (IPPF_IFINDEX|IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPLIMIT: - /* This option can only be used as ancillary data. 
*/ - if (sticky) - return (EINVAL); - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_HOPLIMIT; - ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_hoplimit = - is->is_ipv6_hoplimit; - else - ipp->ipp_hoplimit = *i1; - ipp->ipp_fields |= IPPF_HOPLIMIT; - } - break; - case IPV6_TCLASS: - /* - * IPV6_RECVTCLASS accepts -1 as use kernel default - * and [0, 255] as the actualy traffic class. - */ - if (inlen != 0 && inlen != sizeof (int)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - ipp->ipp_sticky_ignored |= IPPF_TCLASS; - } else { - if (*i1 >= 256 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) { - ipp->ipp_tclass = - IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - } else { - ipp->ipp_tclass = *i1; - } - ipp->ipp_fields |= IPPF_TCLASS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. 
- */ - if (inlen != 0 && inlen != sizeof (sin6_t)) { - return (EINVAL); - } - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - ipp->ipp_sticky_ignored |= IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) { - return (EAFNOSUPPORT); - } - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - return (EADDRNOTAVAIL); - } - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) { - return (EINVAL); - } - - if (checkonly) - break; - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, - sticky ? icmp->icmp_label_len_v6 : 0); - if (error != 0) - return (error); - if (ipp->ipp_hopoptslen == 0) { - ipp->ipp_fields &= ~IPPF_HOPOPTS; - ipp->ipp_sticky_ignored |= IPPF_HOPOPTS; - } else { - ipp->ipp_fields |= IPPF_HOPOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); + ixa->ixa_flags |= IXAF_SET_RAW_CKSUM; + ixa->ixa_raw_cksum_offset = *i1; + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; } + mutex_exit(&connp->conn_lock); break; } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) { - kmem_free(ipp->ipp_rtdstopts, - ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTDSTOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; + break; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); + case IPPROTO_ICMPV6: + /* + * Only allow IPv6 option processing on IPv6 sockets. + */ + if (connp->conn_family != AF_INET6) + return (EINVAL); + if (connp->conn_proto != IPPROTO_ICMPV6) + return (EINVAL); + switch (name) { + case ICMP6_FILTER: if (checkonly) break; - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_DSTOPTS) != 0) { - kmem_free(ipp->ipp_dstopts, - ipp->ipp_dstoptslen); - ipp->ipp_dstopts = NULL; - ipp->ipp_dstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_DSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_DSTOPTS; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) + if ((inlen != 0) && + (inlen != sizeof (icmp6_filter_t))) return (EINVAL); - if (checkonly) - break; - + mutex_enter(&connp->conn_lock); if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTHDR) != 0) { - kmem_free(ipp->ipp_rthdr, - ipp->ipp_rthdrlen); - ipp->ipp_rthdr = NULL; - ipp->ipp_rthdrlen = 0; + if (icmp->icmp_filter != NULL) { + kmem_free(icmp->icmp_filter, + sizeof (icmp6_filter_t)); + icmp->icmp_filter = NULL; } - ipp->ipp_fields &= ~IPPF_RTHDR; - ipp->ipp_sticky_ignored |= IPPF_RTHDR; } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTHDR; - } - if (sticky) { - error = icmp_build_hdrs(icmp); - if (error != 0) - return (error); - } - break; - } - - case IPV6_DONTFRAG: - if (checkonly) - break; - - if (onoff) { - ipp->ipp_fields |= IPPF_DONTFRAG; - } else { - ipp->ipp_fields &= ~IPPF_DONTFRAG; - } - break; - - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; - - /* - * This option can't be set. Its only returned via - * getsockopt() or ancillary data. - */ - case IPV6_PATHMTU: - return (EINVAL); - - case IPV6_SEC_OPT: - case IPV6_SRC_PREFERENCES: - case IPV6_V6ONLY: - /* Handled at IP level */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); - } - break; - } /* end IPPROTO_IPV6 */ - - case IPPROTO_ICMPV6: - /* - * Only allow IPv6 option processing on IPv6 sockets. 
- */ - if (icmp->icmp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - if (icmp->icmp_proto != IPPROTO_ICMPV6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case ICMP6_FILTER: - if (!checkonly) { - if ((inlen != 0) && - (inlen != sizeof (icmp6_filter_t))) - return (EINVAL); - - if (inlen == 0) { - if (icmp->icmp_filter != NULL) { - kmem_free(icmp->icmp_filter, - sizeof (icmp6_filter_t)); - icmp->icmp_filter = NULL; - } - } else { + if (icmp->icmp_filter == NULL) { + icmp->icmp_filter = kmem_alloc( + sizeof (icmp6_filter_t), + KM_NOSLEEP); if (icmp->icmp_filter == NULL) { - icmp->icmp_filter = kmem_alloc( - sizeof (icmp6_filter_t), - KM_NOSLEEP); - if (icmp->icmp_filter == NULL) { - *outlenp = 0; - return (ENOBUFS); - } + mutex_exit(&connp->conn_lock); + return (ENOBUFS); } - (void) bcopy(invalp, icmp->icmp_filter, - inlen); } + (void) bcopy(invalp, icmp->icmp_filter, inlen); } + mutex_exit(&connp->conn_lock); break; - - default: - *outlenp = 0; - return (EINVAL); } break; - default: - *outlenp = 0; - return (EINVAL); - } - /* - * Common case of OK return with outval same as inval. - */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - (void) bcopy(invalp, outvalp, inlen); } - *outlenp = inlen; - return (0); + error = conn_opt_set(coa, level, name, inlen, invalp, + checkonly, cr); + return (error); } -/* This routine sets socket options. */ -/* ARGSUSED */ +/* + * This routine sets socket options. 
+ */ int icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { - boolean_t checkonly; - int error; + icmp_t *icmp = connp->conn_icmp; + int err; + conn_opt_arg_t coas, *coa; + boolean_t checkonly; + icmp_stack_t *is = icmp->icmp_is; - error = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: checkonly = B_TRUE; @@ -3152,8 +2127,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (inlen == 0) { *outlenp = 0; - error = 0; - goto done; + return (0); } break; case SETFN_OPTCOM_NEGOTIATE: @@ -3171,8 +2145,7 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!icmp_opt_allow_udr_set(level, name)) { *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } break; default: @@ -3180,105 +2153,265 @@ icmp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * We should never get here */ *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - error = icmp_do_opt_set(connp, level, name, inlen, invalp, outlenp, - outvalp, cr, thisdg_attrs, checkonly); -done: - return (error); + if (thisdg_attrs != NULL) { + /* Options from T_UNITDATA_REQ */ + coa = (conn_opt_arg_t *)thisdg_attrs; + ASSERT(coa->coa_connp == connp); + ASSERT(coa->coa_ixa != NULL); + ASSERT(coa->coa_ipp != NULL); + ASSERT(coa->coa_ancillary); + } else { + coa = &coas; + coas.coa_connp = connp; + /* Get a reference on conn_ixa to prevent concurrent mods */ + coas.coa_ixa = conn_get_ixa(connp, B_TRUE); + if (coas.coa_ixa == NULL) { + *outlenp = 0; + return (ENOMEM); + } + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + } + + err = icmp_do_opt_set(coa, level, name, inlen, invalp, + cr, checkonly); + if (err != 0) { +errout: + if 
(!coa->coa_ancillary) + ixa_refrele(coa->coa_ixa); + *outlenp = 0; + return (err); + } + + /* + * Common case of OK return with outval same as inval. + */ + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } + *outlenp = inlen; + + /* + * If this was not ancillary data, then we rebuild the headers, + * update the IRE/NCE, and IPsec as needed. + * Since the label depends on the destination we go through + * ip_set_destination first. + */ + if (coa->coa_ancillary) { + return (0); + } + + if (coa->coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t saddr, faddr, nexthop; + in_port_t fport; + + /* + * We clear lastdst to make sure we pick up the change + * next time sending. + * If we are connected we re-cache the information. + * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + mutex_enter(&connp->conn_lock); + connp->conn_v6lastdst = ipv6_all_zeros; + + ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + mutex_exit(&connp->conn_lock); + + if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && + !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { + (void) ip_attr_connect(connp, coa->coa_ixa, + &saddr, &faddr, &nexthop, fport, NULL, NULL, + IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); + } + } + + ixa_refrele(coa->coa_ixa); + + if (coa->coa_changed & COA_HEADER_CHANGED) { + /* + * Rebuild the header template if we are connected. + * Otherwise clear conn_v6lastdst so we rebuild the header + * in the data path. 
+ */ + mutex_enter(&connp->conn_lock); + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + err = icmp_build_hdr_template(connp, + &connp->conn_saddr_v6, &connp->conn_faddr_v6, + connp->conn_flowinfo); + if (err != 0) { + mutex_exit(&connp->conn_lock); + return (err); + } + } else { + connp->conn_v6lastdst = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + } + if (coa->coa_changed & COA_RCVBUF_CHANGED) { + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + connp->conn_rcvbuf); + } + if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coa->coa_changed & COA_WROFF_CHANGED) { + /* Increase wroff if needed */ + uint_t wroff; + + mutex_enter(&connp->conn_lock); + wroff = connp->conn_ht_iphc_allocated + is->is_wroff_extra; + if (wroff > connp->conn_wroff) { + connp->conn_wroff = wroff; + mutex_exit(&connp->conn_lock); + (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); + } else { + mutex_exit(&connp->conn_lock); + } + } + if (coa->coa_changed & COA_ICMP_BIND_NEEDED) { + icmp_bind_proto(icmp); + } + return (err); } /* This routine sets socket options. */ -/* ARGSUSED */ int icmp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp; + conn_t *connp = Q_TO_CONN(q); int error; - icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); error = icmp_opt_set(connp, optset_context, level, name, inlen, invalp, outlenp, outvalp, thisdg_attrs, cr); - rw_exit(&icmp->icmp_rwlock); return (error); } /* - * Update icmp_sticky_hdrs based on icmp_sticky_ipp, icmp_v6src, icmp_ttl, - * icmp_proto, icmp_raw_checksum and icmp_no_tp_cksum. 
- * The headers include ip6i_t (if needed), ip6_t, and any sticky extension - * headers. - * Returns failure if can't allocate memory. + * Setup IP headers. + * + * Note that IP_HDRINCL has ipha_protocol that is different than conn_proto, + * but icmp_output_hdrincl restores ipha_protocol once we return. */ -static int -icmp_build_hdrs(icmp_t *icmp) +mblk_t * +icmp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, uint32_t flowinfo, + mblk_t *data_mp, int *errorp) { - icmp_stack_t *is = icmp->icmp_is; - uchar_t *hdrs; - uint_t hdrs_len; - ip6_t *ip6h; - ip6i_t *ip6i; - ip6_pkt_t *ipp = &icmp->icmp_sticky_ipp; - - ASSERT(RW_WRITE_HELD(&icmp->icmp_rwlock)); - hdrs_len = ip_total_hdrs_len_v6(ipp); - ASSERT(hdrs_len != 0); - if (hdrs_len != icmp->icmp_sticky_hdrs_len) { - /* Need to reallocate */ - if (hdrs_len != 0) { - hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); - } else { - hdrs = NULL; - } - if (icmp->icmp_sticky_hdrs_len != 0) { - kmem_free(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len); - } - icmp->icmp_sticky_hdrs = hdrs; - icmp->icmp_sticky_hdrs_len = hdrs_len; + mblk_t *mp; + icmp_stack_t *is = connp->conn_netstack->netstack_icmp; + uint_t data_len; + uint32_t cksum; + + data_len = msgdsize(data_mp); + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, connp->conn_proto, + flowinfo, 0, data_mp, data_len, is->is_wroff_extra, &cksum, errorp); + if (mp == NULL) { + ASSERT(*errorp != 0); + return (NULL); } - ip_build_hdrs_v6(icmp->icmp_sticky_hdrs, - icmp->icmp_sticky_hdrs_len, ipp, icmp->icmp_proto); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)icmp->icmp_sticky_hdrs; - ip6h = (ip6_t *)&ip6i[1]; + ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; - if (ipp->ipp_fields & IPPF_RAW_CKSUM) { - ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM; - ip6i->ip6i_checksum_off = icmp->icmp_checksum_off; + /* + * If there was 
a routing option/header then conn_prepend_hdr + * has massaged it and placed the pseudo-header checksum difference + * in the cksum argument. + * + * Prepare for ICMPv6 checksum done in IP. + * + * We make it easy for IP to include our pseudo header + * by putting our length (and any routing header adjustment) + * in the ICMPv6 checksum field. + * The IP source, destination, and length have already been set by + * conn_prepend_hdr. + */ + cksum += data_len; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint_t cksum_offset = 0; + + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); + + if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { + if (connp->conn_proto == IPPROTO_ICMPV6) { + cksum_offset = ixa->ixa_ip_hdr_length + + offsetof(icmp6_t, icmp6_cksum); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + cksum_offset = ixa->ixa_ip_hdr_length + + ixa->ixa_raw_cksum_offset; + } } - if (ipp->ipp_fields & IPPF_NO_CKSUM) { - ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM; + if (cksum_offset != 0) { + uint16_t *ptr; + + /* Make sure the checksum fits in the first mblk */ + if (cksum_offset + sizeof (short) > MBLKL(mp)) { + mblk_t *mp1; + + mp1 = msgpullup(mp, + cksum_offset + sizeof (short)); + freemsg(mp); + if (mp1 == NULL) { + *errorp = ENOMEM; + return (NULL); + } + mp = mp1; + ip6h = (ip6_t *)mp->b_rptr; + } + ptr = (uint16_t *)(mp->b_rptr + cksum_offset); + *ptr = htons(cksum); } - } else { - ip6h = (ip6_t *)icmp->icmp_sticky_hdrs; } - if (!(ipp->ipp_fields & IPPF_ADDR)) - ip6h->ip6_src = icmp->icmp_v6src; + /* Note that we don't try to update wroff due to ancillary data */ + return (mp); +} + +static int +icmp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, uint32_t flowinfo) +{ + int error; - /* Try to get everything 
in a single mblk */ - if (hdrs_len > icmp->icmp_max_hdr_len) { - icmp->icmp_max_hdr_len = hdrs_len; - rw_exit(&icmp->icmp_rwlock); - (void) proto_set_tx_wroff(icmp->icmp_connp->conn_rq, - icmp->icmp_connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - } + ASSERT(MUTEX_HELD(&connp->conn_lock)); + /* + * We clear lastdst to make sure we don't use the lastdst path + * next time sending since we might not have set v6dst yet. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + + error = conn_build_hdr_template(connp, 0, 0, v6src, v6dst, flowinfo); + if (error != 0) + return (error); + + /* + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum. + */ return (0); } @@ -3370,16 +2503,15 @@ icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) * TPI, then we'll queue the mp for later processing. */ static void -icmp_ulp_recv(conn_t *connp, mblk_t *mp) +icmp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len) { - if (IPCL_IS_NONSTR(connp)) { icmp_t *icmp = connp->conn_icmp; int error; + ASSERT(len == msgdsize(mp)); if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { + (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { mutex_enter(&icmp->icmp_recv_lock); if (error == ENOSPC) { /* @@ -3409,115 +2541,74 @@ icmp_ulp_recv(conn_t *connp, mblk_t *mp) } } -/*ARGSUSED2*/ +/* + * This is the inbound data path. + * IP has already pulled up the IP headers and verified alignment + * etc. 
+ */ +/* ARGSUSED2 */ static void -icmp_input(void *arg1, mblk_t *mp, void *arg2) +icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - conn_t *connp = (conn_t *)arg1; + conn_t *connp = (conn_t *)arg1; struct T_unitdata_ind *tudi; - uchar_t *rptr; + uchar_t *rptr; /* Pointer to IP header */ + int ip_hdr_length; + int udi_size; /* Size of T_unitdata_ind */ + int pkt_len; icmp_t *icmp; + ip_pkt_t ipps; + ip6_t *ip6h; + mblk_t *mp1; + crb_t recv_ancillary; icmp_stack_t *is; sin_t *sin; sin6_t *sin6; - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *mp1; - int hdr_len; ipha_t *ipha; - int udi_size; /* Size of T_unitdata_ind */ - uint_t ipvers; - ip6_pkt_t ipp; - uint8_t nexthdr; - ip_pktinfo_t *pinfo = NULL; - mblk_t *options_mp = NULL; - uint_t icmp_opt = 0; - boolean_t icmp_ipv6_recvhoplimit = B_FALSE; - uint_t hopstrip; ASSERT(connp->conn_flags & IPCL_RAWIPCONN); icmp = connp->conn_icmp; is = icmp->icmp_is; rptr = mp->b_rptr; - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); + + ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(OK_32PTR(rptr)); + ASSERT(ira->ira_pktlen == msgdsize(mp)); + pkt_len = ira->ira_pktlen; /* - * IP should have prepended the options data in an M_CTL - * Check M_CTL "type" to make sure are not here bcos of - * a valid ICMP message + * Get a snapshot of these and allow other threads to change + * them after that. We need the same recv_ancillary when determining + * the size as when adding the ancillary data items. */ - if (DB_TYPE(mp) == M_CTL) { - /* - * FIXME: does IP still do this? - * IP sends up the IPSEC_IN message for handling IPSEC - * policy at the TCP level. We don't need it here. - */ - if (*(uint32_t *)(mp->b_rptr) == IPSEC_IN) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } else if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - /* - * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information - * has been prepended to the packet by IP. 
We need to - * extract the mblk and adjust the rptr - */ - pinfo = (ip_pktinfo_t *)mp->b_rptr; - options_mp = mp; - mp = mp->b_cont; - rptr = mp->b_rptr; - } else { - /* - * ICMP messages. - */ - icmp_icmp_error(connp, mp); - return; - } - } + mutex_enter(&connp->conn_lock); + recv_ancillary = connp->conn_recv_ancillary; + mutex_exit(&connp->conn_lock); - /* - * Discard message if it is misaligned or smaller than the IP header. - */ - if (!OK_32PTR(rptr) || (mp->b_wptr - rptr) < sizeof (ipha_t)) { - freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); - BUMP_MIB(&is->is_rawip_mib, rawipInErrors); - return; - } - ipvers = IPH_HDR_VERSION((ipha_t *)rptr); + ip_hdr_length = ira->ira_ip_hdr_length; + ASSERT(MBLKL(mp) >= ip_hdr_length); /* IP did a pullup */ + + /* Initialize regardless of IP version */ + ipps.ipp_fields = 0; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); + ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); + + ipha = (ipha_t *)mp->b_rptr; + if (recv_ancillary.crb_all != 0) + (void) ip_find_hdr_v4(ipha, &ipps, B_FALSE); - /* Handle M_DATA messages containing IP packets messages */ - if (ipvers == IPV4_VERSION) { /* - * Special case where IP attaches - * the IRE needs to be handled so that we don't send up - * IRE to the user land. + * BSD for some reason adjusts ipha_length to exclude the + * IP header length. We do the same. 
*/ - ipha = (ipha_t *)rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - - if (ipha->ipha_protocol == IPPROTO_TCP) { - tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - - if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == - TH_SYN) && mp->b_cont != NULL) { - mp1 = mp->b_cont; - if (mp1->b_datap->db_type == IRE_DB_TYPE) { - freeb(mp1); - mp->b_cont = NULL; - } - } - } if (is->is_bsd_compat) { ushort_t len; - len = ntohs(ipha->ipha_length); + len = ntohs(ipha->ipha_length); if (mp->b_datap->db_ref > 1) { /* * Allocate a new IP header so that we can @@ -3525,70 +2616,58 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) */ mblk_t *mp1; - mp1 = allocb(hdr_len, BPRI_MED); - if (!mp1) { + mp1 = allocb(ip_hdr_length, BPRI_MED); + if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&is->is_rawip_mib, rawipInErrors); return; } - bcopy(rptr, mp1->b_rptr, hdr_len); - mp->b_rptr = rptr + hdr_len; + bcopy(rptr, mp1->b_rptr, ip_hdr_length); + mp->b_rptr = rptr + ip_hdr_length; rptr = mp1->b_rptr; ipha = (ipha_t *)rptr; mp1->b_cont = mp; - mp1->b_wptr = rptr + hdr_len; + mp1->b_wptr = rptr + ip_hdr_length; mp = mp1; } - len -= hdr_len; + len -= ip_hdr_length; ipha->ipha_length = htons(len); } - } - /* - * This is the inbound data path. Packets are passed upstream as - * T_UNITDATA_IND messages with full IP headers still attached. - */ - if (icmp->icmp_family == AF_INET) { - ASSERT(ipvers == IPV4_VERSION); - udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (icmp->icmp_recvif && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (uint_t); - } + /* + * For RAW sockets we not pass ICMP/IPv4 packets to AF_INET6 + * sockets. This is ensured by icmp_bind and the IP fanout code. 
+ */ + ASSERT(connp->conn_family == AF_INET); - if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_pktinfo); - } + /* + * This is the inbound data path. Packets are passed upstream + * as T_UNITDATA_IND messages with full IPv4 headers still + * attached. + */ /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). + * Normally only send up the source address. + * If any ancillary data items are wanted we add those. */ - if (icmp->icmp_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; + udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } + + /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&is->is_rawip_mib, rawipInErrors); return; } mp1->b_cont = mp; - mp = mp1; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_datap->db_type = M_PROTO; - mp->b_wptr = (uchar_t *)tudi + udi_size; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_datap->db_type = M_PROTO; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3596,316 +2675,110 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) *sin = sin_null; sin->sin_family = AF_INET; sin->sin_addr.s_addr = ipha->ipha_src; + *(uint32_t *)&sin->sin_zero[0] = 0; + *(uint32_t *)&sin->sin_zero[4] = 0; tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t); udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); tudi->OPT_length = udi_size; /* - * Add options if IP_RECVIF is set + * Add options if IP_RECVIF etc is set */ if (udi_size != 0) { - char *dstopt; - - dstopt = (char *)&sin[1]; - if (icmp->icmp_recvif && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - - struct T_opthdr *toh; - uint_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVIF; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint_t *)dstopt; - *dstptr = pinfo->ip_pkt_ifindex; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (char *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - 
dstopt = (char *)toh + toh->len; - udi_size -= toh->len; - } - if (icmp->icmp_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - struct T_opthdr *toh; - struct in_pktinfo *pktinfop; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (in_pktinfo_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pktinfop = (struct in_pktinfo *)dstopt; - pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex; - pktinfop->ipi_spec_dst = - pinfo->ip_pkt_match_addr; - - pktinfop->ipi_addr.s_addr = ipha->ipha_dst; - - dstopt += sizeof (struct in_pktinfo); - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin[1], udi_size); } - - if (options_mp != NULL) - freeb(options_mp); - - BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); goto deliver; } + ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); /* - * We don't need options_mp in the IPv6 path. + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. */ - if (options_mp != NULL) { - freeb(options_mp); - options_mp = NULL; - } + ASSERT(connp->conn_family == AF_INET6); /* - * Discard message if it is smaller than the IPv6 header - * or if the header is malformed. + * Handle IPv6 packets. We don't pass up the IP headers with the + * payload for IPv6. */ - if ((mp->b_wptr - rptr) < sizeof (ip6_t) || - IPH_HDR_VERSION((ipha_t *)rptr) != IPV6_VERSION || - icmp->icmp_family != AF_INET6) { - freemsg(mp); - BUMP_MIB(&is->is_rawip_mib, rawipInErrors); - return; - } - - /* Initialize */ - ipp.ipp_fields = 0; - hopstrip = 0; ip6h = (ip6_t *)rptr; - /* - * Call on ip_find_hdr_v6 which gets the total hdr len - * as well as individual lenghts of ext hdrs (and ptrs to - * them). 
- */ - if (ip6h->ip6_nxt != icmp->icmp_proto) { - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i = (ip6i_t *)ip6h; - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - ASSERT(mp->b_wptr - rptr >= IPV6_HDR_LEN); - ip6h = (ip6_t *)rptr; - } - hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdr); + if (recv_ancillary.crb_all != 0) { + /* + * Call on ip_find_hdr_v6 which gets individual lenghts of + * extension headers (and pointers to them). + */ + uint8_t nexthdr; + + /* We don't care about the length or nextheader. */ + (void) ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, &nexthdr); /* - * We need to lie a bit to the user because users inside - * labeled compartments should not see their own labels. We - * assume that in all other respects IP has checked the label, - * and that the label is always first among the options. (If - * it's not first, then this code won't see it, and the option - * will be passed along to the user.) + * We do not pass up hop-by-hop options or any other + * extension header as part of the packet. Applications + * that want to see them have to specify IPV6_RECV* socket + * options. And conn_recvancillary_size/add explicitly + * drops the TX option from IPV6_HOPOPTS as it does for UDP. * - * If we had multilevel ICMP sockets, then the following code - * should be skipped for them to allow the user to see the - * label. - * - * Alignment restrictions in the definition of IP options - * (namely, the requirement that the 4-octet DOI goes on a - * 4-octet boundary) mean that we know exactly where the option - * should start, but we're lenient for other hosts. 
- * - * Note that there are no multilevel ICMP or raw IP sockets - * yet, thus nobody ever sees the IP6OPT_LS option. + * If we had multilevel ICMP sockets, then we'd want to + * modify conn_recvancillary_size/add to + * allow the user to see the label. */ - if ((ipp.ipp_fields & IPPF_HOPOPTS) && - ipp.ipp_hopoptslen > 5 && is_system_labeled()) { - const uchar_t *ucp = - (const uchar_t *)ipp.ipp_hopopts + 2; - int remlen = ipp.ipp_hopoptslen - 2; - - while (remlen > 0) { - if (*ucp == IP6OPT_PAD1) { - remlen--; - ucp++; - } else if (*ucp == IP6OPT_PADN) { - remlen -= ucp[1] + 2; - ucp += ucp[1] + 2; - } else if (*ucp == ip6opt_ls) { - hopstrip = (ucp - - (const uchar_t *)ipp.ipp_hopopts) + - ucp[1] + 2; - hopstrip = (hopstrip + 7) & ~7; - break; - } else { - /* label option must be first */ - break; - } - } - } - } else { - hdr_len = IPV6_HDR_LEN; - ip6i = NULL; - nexthdr = ip6h->ip6_nxt; - } - /* - * One special case where IP attaches the IRE needs to - * be handled so that we don't send up IRE to the user land. - */ - if (nexthdr == IPPROTO_TCP) { - tcph_t *tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - - if (((tcph->th_flags[0] & (TH_SYN|TH_ACK)) == TH_SYN) && - mp->b_cont != NULL) { - mp1 = mp->b_cont; - if (mp1->b_datap->db_type == IRE_DB_TYPE) { - freeb(mp1); - mp->b_cont = NULL; - } - } } + /* * Check a filter for ICMPv6 types if needed. * Verify raw checksums if needed. 
*/ - if (icmp->icmp_filter != NULL || icmp->icmp_raw_checksum) { - if (icmp->icmp_filter != NULL) { - int type; + mutex_enter(&connp->conn_lock); + if (icmp->icmp_filter != NULL) { + int type; - /* Assumes that IP has done the pullupmsg */ - type = mp->b_rptr[hdr_len]; + /* Assumes that IP has done the pullupmsg */ + type = mp->b_rptr[ip_hdr_length]; - ASSERT(mp->b_rptr + hdr_len <= mp->b_wptr); - if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { - freemsg(mp); - return; - } - } else { - /* Checksum */ - uint16_t *up; - uint32_t sum; - int remlen; - - up = (uint16_t *)&ip6h->ip6_src; - - remlen = msgdsize(mp) - hdr_len; - sum = htons(icmp->icmp_proto + remlen) - + up[0] + up[1] + up[2] + up[3] - + up[4] + up[5] + up[6] + up[7] - + up[8] + up[9] + up[10] + up[11] - + up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - sum = IP_CSUM(mp, hdr_len, sum); - if (sum != 0) { - /* IPv6 RAW checksum failed */ - ip0dbg(("icmp_rput: RAW checksum " - "failed %x\n", sum)); - freemsg(mp); - BUMP_MIB(&is->is_rawip_mib, - rawipInCksumErrs); - return; - } + ASSERT(mp->b_rptr + ip_hdr_length <= mp->b_wptr); + if (ICMP6_FILTER_WILLBLOCK(type, icmp->icmp_filter)) { + mutex_exit(&connp->conn_lock); + freemsg(mp); + return; } } - /* Skip all the IPv6 headers per API */ - mp->b_rptr += hdr_len; - - udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - - /* - * We use local variables icmp_opt and icmp_ipv6_recvhoplimit to - * maintain state information, instead of relying on icmp_t - * structure, since there arent any locks protecting these members - * and there is a window where there might be a race between a - * thread setting options on the write side and a thread reading - * these options on the read size. 
- */ - if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS| - IPPF_RTHDR|IPPF_IFINDEX)) { - if (icmp->icmp_ipv6_recvhopopts && - (ipp.ipp_fields & IPPF_HOPOPTS) && - ipp.ipp_hopoptslen > hopstrip) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_hopoptslen - hopstrip; - icmp_opt |= IPPF_HOPOPTS; - } - if ((icmp->icmp_ipv6_recvdstopts || - icmp->icmp_old_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - icmp_opt |= IPPF_DSTOPTS; - } - if (((icmp->icmp_ipv6_recvdstopts && - icmp->icmp_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) || - icmp->icmp_ipv6_recvrtdstopts) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - icmp_opt |= IPPF_RTDSTOPTS; - } - if (icmp->icmp_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - icmp_opt |= IPPF_RTHDR; - } - if (icmp->icmp_ip_recvpktinfo && - (ipp.ipp_fields & IPPF_IFINDEX)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in6_pktinfo); - icmp_opt |= IPPF_IFINDEX; + if (connp->conn_ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* Checksum */ + uint16_t *up; + uint32_t sum; + int remlen; + + up = (uint16_t *)&ip6h->ip6_src; + + remlen = msgdsize(mp) - ip_hdr_length; + sum = htons(connp->conn_proto + remlen) + + up[0] + up[1] + up[2] + up[3] + + up[4] + up[5] + up[6] + up[7] + + up[8] + up[9] + up[10] + up[11] + + up[12] + up[13] + up[14] + up[15]; + sum = (sum & 0xffff) + (sum >> 16); + sum = IP_CSUM(mp, ip_hdr_length, sum); + if (sum != 0) { + /* IPv6 RAW checksum failed */ + ip0dbg(("icmp_rput: RAW checksum failed %x\n", sum)); + mutex_exit(&connp->conn_lock); + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipInCksumErrs); + return; } } - if (icmp->icmp_ipv6_recvhoplimit) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - icmp_ipv6_recvhoplimit = B_TRUE; - } + mutex_exit(&connp->conn_lock); - if 
(icmp->icmp_ipv6_recvtclass) - udi_size += sizeof (struct T_opthdr) + sizeof (int); + udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (icmp->icmp_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } mp1 = allocb(udi_size, BPRI_MED); @@ -3915,10 +2788,9 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin6_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3926,166 +2798,38 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); tudi->OPT_length = udi_size; sin6 = (sin6_t *)&tudi[1]; + *sin6 = sin6_null; sin6->sin6_port = 0; sin6->sin6_family = AF_INET6; sin6->sin6_addr = ip6h->ip6_src; /* No sin6_flowinfo per API */ sin6->sin6_flowinfo = 0; - /* For link-scope source pass up scope id */ - if ((ipp.ipp_fields & IPPF_IFINDEX) && - IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - sin6->sin6_scope_id = ipp.ipp_ifindex; + /* For link-scope pass up scope id */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) + sin6->sin6_scope_id = ira->ira_ruifindex; else sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - icmp->icmp_zoneid, is->is_netstack); + IPCL_ZONEID(connp), is->is_netstack); if (udi_size != 0) { - uchar_t *dstopt; - - dstopt = 
(uchar_t *)&sin6[1]; - if (icmp_opt & IPPF_IFINDEX) { - struct T_opthdr *toh; - struct in6_pktinfo *pkti; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pkti); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pkti = (struct in6_pktinfo *)dstopt; - pkti->ipi6_addr = ip6h->ip6_dst; - pkti->ipi6_ifindex = ipp.ipp_ifindex; - dstopt += sizeof (*pkti); - udi_size -= toh->len; - } - if (icmp_ipv6_recvhoplimit) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPLIMIT; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - *(uint_t *)dstopt = ip6h->ip6_hops; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_ipv6_recvtclass) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_TCLASS; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - *(uint_t *)dstopt = IPV6_FLOW_TCLASS(ip6h->ip6_flow); - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if (icmp->icmp_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (uchar_t *)toh + toh->len; - udi_size -= toh->len; - } - - if (icmp_opt & IPPF_HOPOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_hopoptslen - hopstrip; - toh->status = 0; - dstopt += sizeof (struct 
T_opthdr); - bcopy((char *)ipp.ipp_hopopts + hopstrip, dstopt, - ipp.ipp_hopoptslen - hopstrip); - if (hopstrip > 0) { - /* copy next header value and fake length */ - dstopt[0] = ((uchar_t *)ipp.ipp_hopopts)[0]; - dstopt[1] = ((uchar_t *)ipp.ipp_hopopts)[1] - - hopstrip / 8; - } - dstopt += ipp.ipp_hopoptslen - hopstrip; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_RTDSTOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rtdstopts, dstopt, - ipp.ipp_rtdstoptslen); - dstopt += ipp.ipp_rtdstoptslen; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_RTHDR) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDR; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen); - dstopt += ipp.ipp_rthdrlen; - udi_size -= toh->len; - } - if (icmp_opt & IPPF_DSTOPTS) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_dstopts, dstopt, - ipp.ipp_dstoptslen); - dstopt += ipp.ipp_dstoptslen; - udi_size -= toh->len; - } - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin6[1], udi_size); } - BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); -deliver: - icmp_ulp_recv(connp, mp); + /* Skip all the IPv6 headers per API */ + mp->b_rptr += ip_hdr_length; + pkt_len -= ip_hdr_length; +deliver: + BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); + icmp_ulp_recv(connp, mp1, pkt_len); } /* - * return SNMP stuff in buffer in mpdata 
+ * return SNMP stuff in buffer in mpdata. We don't hold any lock and report + * information that can be changing beneath us. */ mblk_t * icmp_snmp_get(queue_t *q, mblk_t *mpctl) @@ -4146,51 +2890,70 @@ icmp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, static void icmp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) { + struct T_unitdata_req *tudr; mblk_t *mp1; - uchar_t *rptr = mp->b_rptr; - struct T_unitdata_req *tudr = (struct T_unitdata_req *)rptr; + uchar_t *destaddr; + t_scalar_t destlen; + uchar_t *optaddr; + t_scalar_t optlen; + + if ((mp->b_wptr < mp->b_rptr) || + (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { + goto done; + } + tudr = (struct T_unitdata_req *)mp->b_rptr; + destaddr = mp->b_rptr + tudr->DEST_offset; + if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || + destaddr + tudr->DEST_length < mp->b_rptr || + destaddr + tudr->DEST_length > mp->b_wptr) { + goto done; + } + optaddr = mp->b_rptr + tudr->OPT_offset; + if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || + optaddr + tudr->OPT_length < mp->b_rptr || + optaddr + tudr->OPT_length > mp->b_wptr) { + goto done; + } + destlen = tudr->DEST_length; + optlen = tudr->OPT_length; - mp1 = mi_tpi_uderror_ind((char *)&rptr[tudr->DEST_offset], - tudr->DEST_length, (char *)&rptr[tudr->OPT_offset], - tudr->OPT_length, err); - if (mp1) + mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, + (char *)optaddr, optlen, err); + if (mp1 != NULL) qreply(q, mp1); + +done: freemsg(mp); } - static int rawip_do_unbind(conn_t *connp) { - icmp_t *icmp = connp->conn_icmp; + icmp_t *icmp = connp->conn_icmp; - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); /* If a bind has not been done, we can't unbind. 
*/ - if (icmp->icmp_state == TS_UNBND || icmp->icmp_pending_op != -1) { - rw_exit(&icmp->icmp_rwlock); + if (icmp->icmp_state == TS_UNBND) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - icmp->icmp_pending_op = T_UNBIND_REQ; - rw_exit(&icmp->icmp_rwlock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_FALSE; + connp->conn_lport = 0; + connp->conn_fport = 0; + /* In case we were also connected */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_v6lastdst = ipv6_all_zeros; - /* - * Call ip to unbind - */ + icmp->icmp_state = TS_UNBND; - ip_unbind(connp); + (void) icmp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); - /* - * Once we're unbound from IP, the pending operation may be cleared - * here. - */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - V6_SET_ZERO(icmp->icmp_v6src); - V6_SET_ZERO(icmp->icmp_bound_v6src); - icmp->icmp_pending_op = -1; - icmp->icmp_state = TS_UNBND; - if (icmp->icmp_family == AF_INET6) - (void) icmp_build_hdrs(icmp); - rw_exit(&icmp->icmp_rwlock); + ip_unbind(connp); return (0); } @@ -4230,42 +2993,86 @@ icmp_tpi_unbind(queue_t *q, mblk_t *mp) qreply(q, mp); } - /* * Process IPv4 packets that already include an IP header. * Used when IP_HDRINCL has been set (implicit for IPPROTO_RAW and * IPPROTO_IGMP). + * In this case we ignore the address and any options in the T_UNITDATA_REQ. + * + * The packet is assumed to have a base (20 byte) IP header followed + * by the upper-layer protocol. We include any IP_OPTIONS including a + * CIPSO label but otherwise preserve the base IP header. 
*/ static int -icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp, - ip4_pkt_t *pktinfop) +icmp_output_hdrincl(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) { - icmp_stack_t *is = icmp->icmp_is; - ipha_t *ipha; - int ip_hdr_length; - int tp_hdr_len; - int error; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - mblk_t *mp1; - uint_t pkt_len; - ip_opt_info_t optinfo; - pid_t cpid; - cred_t *cr; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + ipha_t iphas; + ipha_t *ipha; + int ip_hdr_length; + int tp_hdr_len; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + int error; + boolean_t do_ipsec; - rw_enter(&icmp->icmp_rwlock, RW_READER); + /* + * We need an exclusive copy of conn_ixa since the included IP + * header could have any destination. + * That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. + */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } + ASSERT(cr != NULL); + /* + * Caller has a reference on cr; from db_credp or because we + * are running in process context. 
+ */ + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + /* Get a copy of conn_xmit_ipp since the TX label might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + goto done; + } + + /* Sanity check length of packet */ ipha = (ipha_t *)mp->b_rptr; - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + icmp->icmp_ip_snd_options_len; + + ip_hdr_length = IP_SIMPLE_HDR_LENGTH; if ((mp->b_wptr - mp->b_rptr) < IP_SIMPLE_HDR_LENGTH) { if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { - ASSERT(icmp != NULL); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); + goto done; } ipha = (ipha_t *)mp->b_rptr; } @@ -4273,1285 +3080,1541 @@ icmp_wput_hdrincl(queue_t *q, conn_t *connp, mblk_t *mp, icmp_t *icmp, (IP_VERSION<<4) | (ip_hdr_length>>2); /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. 
- */ - mutex_enter(&connp->conn_lock); - if (is_system_labeled()) { - /* - * Recompute the Trusted Extensions security label if - * we're not going to the same destination as last - * time or the cred attached to the received mblk - * changed. - */ - cr = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || - V4_PART_OF_V6(icmp->icmp_v6lastdst) != ipha->ipha_dst || - cr != icmp->icmp_last_cred) { - error = icmp_update_label(icmp, mp, ipha->ipha_dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } - /* - * Apply credentials with modified security label if they - * exist. icmp_update_label() may have generated these - * credentials for packets to unlabeled remote nodes. - */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); - } - - if (icmp->icmp_ip_snd_options_len > 0) { - ip_snd_opt_len = icmp->icmp_ip_snd_options_len; - bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); - } - mutex_exit(&connp->conn_lock); - - /* - * For the socket of SOCK_RAW type, the checksum is provided in the - * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to - * tell IP that the application has sent a complete IP header and not - * to compute the transport checksum nor change the DF flag. + * We set IXAF_DONTFRAG if the application set DF which makes + * IP not fragment. */ - ipha->ipha_ident = IP_HDR_INCLUDED; - ipha->ipha_hdr_checksum = 0; ipha->ipha_fragment_offset_and_flags &= htons(IPH_DF); - /* Insert options if any */ - if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) { - /* - * Put the IP header plus any transport header that is - * checksumed by ip_wput into the first mblk. (ip_wput assumes - * that at least the checksum field is in the first mblk.) 
- */ - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - tp_hdr_len = 8; - break; - case IPPROTO_TCP: - tp_hdr_len = 20; - break; - default: - tp_hdr_len = 0; - break; - } - /* - * The code below assumes that IP_SIMPLE_HDR_LENGTH plus - * tp_hdr_len bytes will be in a single mblk. - */ - if ((mp->b_wptr - mp->b_rptr) < (IP_SIMPLE_HDR_LENGTH + - tp_hdr_len)) { - if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH + - tp_hdr_len)) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * if the length is larger then the max allowed IP packet, - * then send an error and abort the processing. - */ - pkt_len = ntohs(ipha->ipha_length) - + ip_snd_opt_len; - if (pkt_len > IP_MAXPACKET) { - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); - } - if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra + - tp_hdr_len, BPRI_LO))) { - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - mp1->b_rptr += is->is_wroff_extra; - mp1->b_wptr = mp1->b_rptr + ip_hdr_length; + if (ipha->ipha_fragment_offset_and_flags & htons(IPH_DF)) + ixa->ixa_flags |= (IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + else + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); - ipha->ipha_length = htons((uint16_t)pkt_len); - bcopy(ipha, mp1->b_rptr, IP_SIMPLE_HDR_LENGTH); + /* Even for multicast and broadcast we honor the apps ttl */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; - /* Copy transport header if any */ - bcopy(&ipha[1], mp1->b_wptr, tp_hdr_len); - mp1->b_wptr += tp_hdr_len; + if (ipha->ipha_dst == INADDR_ANY) + ipha->ipha_dst = htonl(INADDR_LOOPBACK); - /* Add options */ - ipha = (ipha_t *)mp1->b_rptr; - bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); - /* Drop IP header and transport header from original */ - (void) adjmsg(mp, IP_SIMPLE_HDR_LENGTH + tp_hdr_len); + /* Defer IPsec if it might need to look at ICMP 
type/code */ + do_ipsec = ipha->ipha_protocol != IPPROTO_ICMP; + ixa->ixa_flags |= IXAF_IS_IPV4; - mp1->b_cont = mp; - mp = mp1; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, + connp->conn_fport, &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * Massage source route putting first source - * route in ipha_dst. + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - (void) ip_massage_options(ipha, is->is_netstack); - } - - if (pktinfop != NULL) { + error = ENETUNREACH; + goto failed; + case ENETDOWN: /* - * Over write the source address provided in the header + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. */ - if (pktinfop->ip4_addr != INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; } + /* FALLTHRU */ + default: + failed: + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + goto done; } - - rw_exit(&icmp->icmp_rwlock); - - ip_output_options(connp, mp, q, IP_WPUT, &optinfo); - return (0); -} - -static int -icmp_update_label(icmp_t *icmp, mblk_t *mp, ipaddr_t dst) -{ - int err; - uchar_t opt_storage[IP_MAX_OPT_LENGTH]; - icmp_stack_t *is = icmp->icmp_is; - conn_t *connp = icmp->icmp_connp; - cred_t *cred; - cred_t *msg_cred; - cred_t *effective_cred; + if (ipha->ipha_src == INADDR_ANY) + IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ - cred = msg_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + if (is_system_labeled()) { + /* + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. + */ + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + goto done; + } + } /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred for this message - * with a modified label or label flags. 
+ * Save away a copy of the IPv4 header the application passed down + * and then prepend an IPv4 header complete with any IP options + * including label. + * We need a struct copy since icmp_prepend_hdr will reuse the available + * space in the mblk. */ - if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + iphas = *ipha; + mp->b_rptr += IP_SIMPLE_HDR_LENGTH; - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label(cred, dst, opt_storage, - is->is_netstack->netstack_ip)) != 0) + mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, 0, mp, &error); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ASSERT(error != 0); goto done; - - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_options(&icmp->icmp_ip_snd_options, - &icmp->icmp_ip_snd_options_len, &icmp->icmp_label_len, - opt_storage)) != 0) + } + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); goto done; + } + /* Restore key parts of the header that the application passed down */ + ipha = (ipha_t *)mp->b_rptr; + ipha->ipha_type_of_service = iphas.ipha_type_of_service; + ipha->ipha_ident = iphas.ipha_ident; + ipha->ipha_fragment_offset_and_flags = + iphas.ipha_fragment_offset_and_flags; + ipha->ipha_ttl = iphas.ipha_ttl; + ipha->ipha_protocol = iphas.ipha_protocol; + ipha->ipha_src = iphas.ipha_src; + ipha->ipha_dst = iphas.ipha_dst; + + ixa->ixa_protocol = ipha->ipha_protocol; /* - * Save the destination address and cred we used to generate - * the security label text. + * Make sure that the IP header plus any transport header that is + * checksumed by ip_output is in the first mblk. 
(ip_output assumes + * that at least the checksum field is in the first mblk.) */ - IN6_IPADDR_TO_V4MAPPED(dst, &icmp->icmp_v6lastdst); - if (cred != icmp->icmp_effective_cred) { - if (icmp->icmp_effective_cred != NULL) - crfree(icmp->icmp_effective_cred); - crhold(cred); - icmp->icmp_effective_cred = cred; + switch (ipha->ipha_protocol) { + case IPPROTO_UDP: + tp_hdr_len = 8; + break; + case IPPROTO_TCP: + tp_hdr_len = 20; + break; + default: + tp_hdr_len = 0; + break; + } + ip_hdr_length = IPH_HDR_LENGTH(ipha); + if (mp->b_wptr - mp->b_rptr < ip_hdr_length + tp_hdr_len) { + if (!pullupmsg(mp, ip_hdr_length + tp_hdr_len)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + if (mp->b_cont == NULL) + error = EINVAL; + else + error = ENOMEM; + freemsg(mp); + goto done; + } } - if (msg_cred != icmp->icmp_last_cred) { - if (icmp->icmp_last_cred != NULL) - crfree(icmp->icmp_last_cred); - crhold(msg_cred); - icmp->icmp_last_cred = msg_cred; + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } + mp = ip_output_attach_policy(mp, ipha, NULL, connp, ixa); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; + } } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno + */ + error = ENETUNREACH; + break; + } done: - if (effective_cred != NULL) - crfree(effective_cred); + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); +} - if (err != 0) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__icmp, - char *, "icmp(1) failed to update options(2) on mp(3)", - icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); - return (err); +static mblk_t * +icmp_output_attach_policy(mblk_t *mp, conn_t *connp, ip_xmit_attr_t *ixa) +{ + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + + if (ixa->ixa_flags & IXAF_IS_IPV4) + ipha = (ipha_t *)mp->b_rptr; + else + ip6h = (ip6_t *)mp->b_rptr; + + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } - return (0); + return (ip_output_attach_policy(mp, ipha, ip6h, connp, ixa)); } /* - * This routine handles all messages passed downstream. It either - * consumes the message or passes it downstream; it never queues a - * a message. + * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 + * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from + * the TPI options, otherwise we take them from msg_control. + * If both sin and sin6 is set it is a connected socket and we use conn_faddr. + * Always consumes mp; never consumes tudr_mp. 
*/ -static void -icmp_wput(queue_t *q, mblk_t *mp) +static int +icmp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, + mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) { - uchar_t *rptr = mp->b_rptr; - mblk_t *mp1; -#define tudr ((struct T_unitdata_req *)rptr) - size_t ip_len; - conn_t *connp = Q_TO_CONN(q); - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - sin6_t *sin6; - sin_t *sin; - ipaddr_t v4dst; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - int error; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + uint32_t flowinfo; + uint_t srcid; + int is_absreq_failure = 0; + conn_opt_arg_t coas, *coa; - ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; + ASSERT(tudr_mp != NULL || msg != NULL); - switch (mp->b_datap->db_type) { - case M_DATA: - if (icmp->icmp_hdrincl) { - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - error = icmp_wput_hdrincl(q, connp, mp, icmp, NULL); - if (error != 0) - icmp_ud_err(q, mp, error); - return; - } + /* + * Get ixa before checking state to handle a disconnect race. + * + * We need an exclusive copy of conn_ixa since the ancillary data + * options might modify it. That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. 
+ */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); freemsg(mp); - return; - case M_PROTO: - case M_PCPROTO: - ip_len = mp->b_wptr - rptr; - if (ip_len >= sizeof (struct T_unitdata_req)) { - /* Expedite valid T_UNITDATA_REQ to below the switch */ - if (((union T_primitives *)rptr)->type - == T_UNITDATA_REQ) - break; - } - /* FALLTHRU */ - default: - icmp_wput_other(q, mp); - return; + return (ENOMEM); + } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - /* Handle T_UNITDATA_REQ messages here. */ + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - mp1 = mp->b_cont; - if (mp1 == NULL) { + /* Get a copy of conn_xmit_ipp since the options might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EPROTO); - return; + freemsg(mp); + return (ENOMEM); } - - if ((rptr + tudr->DEST_offset + tudr->DEST_length) > mp->b_wptr) { + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; + freemsg(mp); + goto done; } - switch (icmp->icmp_family) { - case AF_INET6: - sin6 = (sin6_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin6) || - tudr->DEST_length != sizeof (sin6_t) || - sin6->sin6_family != AF_INET6) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } + /* + * Parse the options and update ixa and ipp as a result. 
+ */ - /* No support for mapped addresses on raw sockets */ - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } + coa = &coas; + coa->coa_connp = connp; + coa->coa_ixa = ixa; + coa->coa_ipp = ipp; + coa->coa_ancillary = B_TRUE; + coa->coa_changed = 0; + if (msg != NULL) { + error = process_auxiliary_options(connp, msg->msg_control, + msg->msg_controllen, coa, &icmp_opt_obj, icmp_opt_set, cr); + } else { + struct T_unitdata_req *tudr; + + tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; + ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); + error = tpi_optcom_buf(connp->conn_wq, tudr_mp, + &tudr->OPT_length, tudr->OPT_offset, cr, &icmp_opt_obj, + coa, &is_absreq_failure); + } + if (error != 0) { /* - * Destination is a native IPv6 address. - * Send out an IPv6 format packet. + * Note: No special action needed in this + * module for "is_absreq_failure" */ - if (tudr->OPT_length != 0) { - int error; - - error = 0; - if (icmp_unitdata_opt_process(q, mp, &error, - (void *)ipp) < 0) { - /* failure */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, error); - return; - } - ASSERT(error == 0); - } - - error = raw_ip_send_data_v6(q, connp, mp1, sin6, ipp); + freemsg(mp); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); goto done; - - case AF_INET: - sin = (sin_t *)&rptr[tudr->DEST_offset]; - if (!OK_32PTR((char *)sin) || - tudr->DEST_length != sizeof (sin_t) || - sin->sin_family != AF_INET) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, EADDRNOTAVAIL); - return; - } - /* Extract and ipaddr */ - v4dst = sin->sin_addr.s_addr; - break; - - default: - ASSERT(0); } + ASSERT(is_absreq_failure == 0); - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; - + mutex_enter(&connp->conn_lock); /* - * If options passed in, feed it for verification and handling + * If laddr is unspecified then we look at sin6_src_id. 
+ * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (tudr->OPT_length != 0) { - int error; - - error = 0; - if (icmp_unitdata_opt_process(q, mp, &error, - (void *)pktinfop) < 0) { - /* failure */ - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - icmp_ud_err(q, mp, error); - return; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else if (sin6 != NULL) { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - ASSERT(error == 0); - /* - * Note: Success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - } - - error = raw_ip_send_data_v4(q, connp, mp1, v4dst, pktinfop); -done: - if (error != 0) { - icmp_ud_err(q, mp, error); - return; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; } else { - mp->b_cont = NULL; - freeb(mp); + /* Connected case */ + v6dst = connp->conn_faddr_v6; + flowinfo = connp->conn_flowinfo; + } + mutex_exit(&connp->conn_lock); + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (ipp->ipp_fields & IPPF_ADDR)) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } } -} - - -/* ARGSUSED */ -static void -icmp_wput_fallback(queue_t *q, mblk_t *mp) -{ -#ifdef DEBUG - cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); -#endif - freemsg(mp); -} - -static int -raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, ipaddr_t v4dst, - ip4_pkt_t *pktinfop) -{ - ipha_t *ipha; - size_t ip_len; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int ip_hdr_length; - ip_opt_info_t optinfo; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - pid_t cpid; - cred_t *cr; - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); - if (icmp->icmp_state == TS_UNBND) { - /* If a port has not been bound to the stream, fail. */ + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + freemsg(mp); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - return (EPROTO); + goto done; } - if (v4dst == INADDR_ANY) - v4dst = htonl(INADDR_LOOPBACK); - - /* Protocol 255 contains full IP headers */ - if (icmp->icmp_hdrincl) - return (icmp_wput_hdrincl(q, connp, mp, icmp, pktinfop)); - - rw_enter(&icmp->icmp_rwlock, RW_READER); - /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ - mutex_enter(&connp->conn_lock); if (is_system_labeled()) { - - /* - * Recompute the Trusted Extensions security label if we're not - * going to the same destination as last time or the cred - * attached to the received mblk changed. - */ - cr = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6lastdst) || - V4_PART_OF_V6(icmp->icmp_v6lastdst) != v4dst || - cr != icmp->icmp_last_cred) { - int error = icmp_update_label(icmp, mp, v4dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); - } - } /* - * Apply credentials with modified security label if they - * exist. icmp_update_label() may have generated these - * credentials for packets to unlabeled remote nodes. 
+ * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); - } - - if (icmp->icmp_ip_snd_options_len > 0) { - ip_snd_opt_len = icmp->icmp_ip_snd_options_len; - bcopy(icmp->icmp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); - } - mutex_exit(&connp->conn_lock); - - /* Add an IP header */ - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + ip_snd_opt_len; - ipha = (ipha_t *)&mp->b_rptr[-ip_hdr_length]; - if ((uchar_t *)ipha < mp->b_datap->db_base || - mp->b_datap->db_ref != 1 || - !OK_32PTR(ipha)) { - mblk_t *mp1; - if (!(mp1 = allocb(ip_hdr_length + is->is_wroff_extra, - BPRI_LO))) { + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); + goto done; } - mp1->b_cont = mp; - ipha = (ipha_t *)mp1->b_datap->db_lim; - mp1->b_wptr = (uchar_t *)ipha; - ipha = (ipha_t *)((uchar_t *)ipha - ip_hdr_length); - mp = mp1; } -#ifdef _BIG_ENDIAN - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) | - icmp->icmp_type_of_service); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_ttl << 8) | icmp->icmp_proto; -#else - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((icmp->icmp_type_of_service << 8) | - ((IP_VERSION << 4) | (ip_hdr_length>>2))); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (icmp->icmp_proto << 8) | icmp->icmp_ttl; -#endif - if (pktinfop->ip4_addr 
!= INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } else { - - /* - * Copy our address into the packet. If this is zero, - * ip will fill in the real source address. - */ - IN6_V4MAPPED_TO_IPADDR(&icmp->icmp_v6src, ipha->ipha_src); + mp = icmp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, flowinfo, mp, + &error); + if (mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ASSERT(error != 0); + goto done; } - - ipha->ipha_fragment_offset_and_flags = 0; - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + goto done; } - - /* - * For the socket of SOCK_RAW type, the checksum is provided in the - * pre-built packet. We set the ipha_ident field to IP_HDR_INCLUDED to - * tell IP that the application has sent a complete IP header and not - * to compute the transport checksum nor change the DF flag. - */ - ipha->ipha_ident = IP_HDR_INCLUDED; - - /* Finish common formatting of the packet. */ - mp->b_rptr = (uchar_t *)ipha; - - ip_len = mp->b_wptr - (uchar_t *)ipha; - if (mp->b_cont != NULL) - ip_len += msgdsize(mp->b_cont); - - /* - * Set the length into the IP header. - * If the length is greater than the maximum allowed by IP, - * then free the message and return. Do not try and send it - * as this can cause problems in layers below. - */ - if (ip_len > IP_MAXPACKET) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; } - ipha->ipha_length = htons((uint16_t)ip_len); - /* - * Copy in the destination address request - */ - ipha->ipha_dst = v4dst; - /* - * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic. 
- */ - if (CLASSD(v4dst)) - ipha->ipha_ttl = icmp->icmp_multicast_ttl; + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - /* Copy in options if any */ - if (ip_hdr_length > IP_SIMPLE_HDR_LENGTH) { - bcopy(ip_snd_opt, - &ipha[1], ip_snd_opt_len); + /* Allow source not assigned to the system? */ + ixa->ixa_flags &= ~IXAF_VERIFY_SOURCE; + error = conn_ip_output(mp, ixa); + if (!connp->conn_unspec_src) + ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * Massage source route putting first source route in ipha_dst. - * Ignore the destination in the T_unitdata_req. + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - (void) ip_massage_options(ipha, is->is_netstack); + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } - - rw_exit(&icmp->icmp_rwlock); - BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - - ip_output_options(connp, mp, q, IP_WPUT, &optinfo); - return (0); +done: + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); } -static int -icmp_update_label_v6(icmp_t *icmp, mblk_t *mp, in6_addr_t *dst) +/* + * Handle sending an M_DATA for a connected socket. + * Handles both IPv4 and IPv6. 
+ */ +int +icmp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) { - int err; - uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; - icmp_stack_t *is = icmp->icmp_is; - conn_t *connp = icmp->icmp_connp; - cred_t *cred; - cred_t *msg_cred; - cred_t *effective_cred; + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *ixa; + boolean_t do_ipsec; /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. + * If no other thread is using conn_ixa this just gets a reference to + * conn_ixa. Otherwise we get a safe copy of conn_ixa. */ - cred = msg_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (ENOMEM); + } - /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred for this message - * with a modified label or label flags. - */ - if ((err = tsol_check_dest(cred, dst, IPV6_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label_v6(cred, dst, opt_storage, - is->is_netstack->netstack_ip)) != 0) - goto done; + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; + } - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. 
- */ - if ((err = tsol_update_sticky(&icmp->icmp_sticky_ipp, - &icmp->icmp_label_len_v6, opt_storage)) != 0) - goto done; + mutex_enter(&connp->conn_lock); + mp = icmp_prepend_header_template(connp, ixa, mp, + &connp->conn_saddr_v6, connp->conn_flowinfo, &error); + + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); + } + + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ixa_refrele(ixa); + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } /* - * Save the destination address and cred we used to generate - * the security label text. + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. */ - icmp->icmp_v6lastdst = *dst; - if (cred != icmp->icmp_effective_cred) { - if (icmp->icmp_effective_cred != NULL) - crfree(icmp->icmp_effective_cred); - crhold(cred); - icmp->icmp_effective_cred = cred; - } + if (ixa->ixa_ire == NULL) { + in6_addr_t faddr, saddr; + in6_addr_t nexthop; + in_port_t fport; + + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); + mutex_exit(&connp->conn_lock); - if (msg_cred != icmp->icmp_last_cred) { - if (icmp->icmp_last_cred != NULL) - crfree(icmp->icmp_last_cred); - crhold(msg_cred); - icmp->icmp_last_cred = msg_cred; + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, + fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); + } + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); } -done: - if (effective_cred != NULL) - crfree(effective_cred); + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - if (err != 0) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__icmp6, - char *, "icmp(1) failed to update options(2) on mp(3)", - icmp_t *, icmp, char *, opt_storage, mblk_t *, mp); - return (err); + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + break; } - return (0); + ixa_refrele(ixa); + return (error); } /* - * raw_ip_send_data_v6(): - * Assumes that icmp_wput did some sanity checking on the destination - * address, but that the label may not yet be correct. + * Handle sending an M_DATA to the last destination. + * Handles both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
*/ -static int -raw_ip_send_data_v6(queue_t *q, conn_t *connp, mblk_t *mp, sin6_t *sin6, - ip6_pkt_t *ipp) +int +icmp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, + ip_xmit_attr_t *ixa) { - ip6_t *ip6h; - ip6i_t *ip6i; /* mp->b_rptr even if no ip6i_t */ - int ip_hdr_len = IPV6_HDR_LEN; - size_t ip_len; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - ip6_pkt_t *tipp; - ip6_hbh_t *hopoptsptr = NULL; - uint_t hopoptslen = 0; - uint32_t csum = 0; - uint_t ignore = 0; - uint_t option_exists = 0, is_sticky = 0; - uint8_t *cp; - uint8_t *nxthdr_ptr; - in6_addr_t ip6_dst; - pid_t cpid; - cred_t *cr; - - rw_enter(&icmp->icmp_rwlock, RW_READER); + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + boolean_t do_ipsec; - /* - * If the local address is a mapped address return - * an error. - * It would be possible to send an IPv6 packet but the - * response would never make it back to the application - * since it is bound to a mapped address. - */ - if (IN6_IS_ADDR_V4MAPPED(&icmp->icmp_v6src)) { + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); + + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; + } + + + mp = icmp_prepend_header_template(connp, ixa, mp, + &connp->conn_v6lastsrc, connp->conn_lastflowinfo, &error); + + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EADDRNOTAVAIL); + freemsg(mp); + return (error); } - ignore = ipp->ipp_sticky_ignored; - if (sin6->sin6_scope_id != 0 && - IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { - /* - * IPPF_SCOPE_ID is special. It's neither a sticky - * option nor ancillary data. 
It needs to be - * explicitly set in options_exists. - */ - option_exists |= IPPF_SCOPE_ID; + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + mp = icmp_output_attach_policy(mp, connp, ixa); + if (mp == NULL) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + ixa_refrele(ixa); + return (EHOSTUNREACH); /* IPsec policy failure */ + } } /* - * Compute the destination address + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. */ - ip6_dst = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - ip6_dst = ipv6_loopback; + if (ixa->ixa_ire == NULL) { + in6_addr_t lastdst, lastsrc; + in6_addr_t nexthop; + in_port_t lastport; + + lastsrc = connp->conn_v6lastsrc; + lastdst = connp->conn_v6lastdst; + lastport = connp->conn_lastdstport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); + mutex_exit(&connp->conn_lock); - /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, ICMP packets - * to different destination may require different labels, - * or worse, ICMP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, sticky ipp_hopopts, - * and sticky ipp_hopoptslen are consistent for the current - * destination and are updated atomically. - */ - mutex_enter(&connp->conn_lock); - if (is_system_labeled()) { - /* - * Recompute the Trusted Extensions security label if we're - * not going to the same destination as last time or the cred - * attached to the received mblk changed. This is done in a - * separate routine to avoid blowing up our stack here. 
- */ - cr = msg_getcred(mp, &cpid); - if (!IN6_ARE_ADDR_EQUAL(&icmp->icmp_v6lastdst, &ip6_dst) || - cr != icmp->icmp_last_cred) { - int error = 0; - error = icmp_update_label_v6(icmp, mp, &ip6_dst); - if (error != 0) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (error); + error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, + &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | + IPDF_VERIFY_DST | (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (error); } + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + error = conn_ip_output(mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); /* - * Apply credentials with modified security label if they exist. - * icmp_update_label_v6() may have generated these credentials - * for MAC-Exempt connections. 
+ * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. */ - if (icmp->icmp_effective_cred != NULL) - mblk_setcred(mp, icmp->icmp_effective_cred, cpid); + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } + ixa_refrele(ixa); + return (error); +} + + +/* + * Prepend the header template and then fill in the source and + * flowinfo. The caller needs to handle the destination address since + * it's setting is different if rthdr or source route. + * + * Returns NULL is allocation failed or if the packet would exceed IP_MAXPACKET. + * When it returns NULL it sets errorp. + */ +static mblk_t * +icmp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, + const in6_addr_t *v6src, uint32_t flowinfo, int *errorp) +{ + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + uint_t pktlen; + uint_t copylen; + uint8_t *iph; + uint_t ip_hdr_length; + uint32_t cksum; + ip_pkt_t *ipp; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * If there's a security label here, then we ignore any options the - * user may try to set. We keep the peer's label as a hidden sticky - * option. + * Copy the header template. */ - if (icmp->icmp_label_len_v6 > 0) { - ignore &= ~IPPF_HOPOPTS; - ipp->ipp_fields &= ~IPPF_HOPOPTS; + copylen = connp->conn_ht_iphc_len; + pktlen = copylen + msgdsize(mp); + if (pktlen > IP_MAXPACKET) { + freemsg(mp); + *errorp = EMSGSIZE; + return (NULL); } + ixa->ixa_pktlen = pktlen; - if ((icmp->icmp_sticky_ipp.ipp_fields == 0) && - (ipp->ipp_fields == 0)) { - /* No sticky options nor ancillary data. 
*/ - mutex_exit(&connp->conn_lock); - goto no_options; + /* check/fix buffer config, setup pointers into it */ + iph = mp->b_rptr - copylen; + if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { + mblk_t *mp1; + + mp1 = allocb(copylen + is->is_wroff_extra, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + *errorp = ENOMEM; + return (NULL); + } + mp1->b_wptr = DB_LIM(mp1); + mp1->b_cont = mp; + mp = mp1; + iph = (mp->b_wptr - copylen); } + mp->b_rptr = iph; + bcopy(connp->conn_ht_iphc, iph, copylen); + ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); + + ixa->ixa_ip_hdr_length = ip_hdr_length; /* - * Go through the options figuring out where each is going to - * come from and build two masks. The first mask indicates if - * the option exists at all. The second mask indicates if the - * option is sticky or ancillary. + * Prepare for ICMPv6 checksum done in IP. + * + * icmp_build_hdr_template has already massaged any routing header + * and placed the result in conn_sum. + * + * We make it easy for IP to include our pseudo header + * by putting our length (and any routing header adjustment) + * in the ICMPv6 checksum field. 
*/ - if (!(ignore & IPPF_HOPOPTS)) { - if (ipp->ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - ip_hdr_len += ipp->ipp_hopoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - is_sticky |= IPPF_HOPOPTS; - ASSERT(icmp->icmp_sticky_ipp.ipp_hopoptslen != 0); - hopoptsptr = kmem_alloc( - icmp->icmp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP); - if (hopoptsptr == NULL) { - mutex_exit(&connp->conn_lock); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - hopoptslen = icmp->icmp_sticky_ipp.ipp_hopoptslen; - bcopy(icmp->icmp_sticky_ipp.ipp_hopopts, hopoptsptr, - hopoptslen); - ip_hdr_len += hopoptslen; - } - } - mutex_exit(&connp->conn_lock); + cksum = pktlen - ip_hdr_length; - if (!(ignore & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - ip_hdr_len += ipp->ipp_rthdrlen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - is_sticky |= IPPF_RTHDR; - ip_hdr_len += icmp->icmp_sticky_ipp.ipp_rthdrlen; - } - } + cksum += connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); - if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) { - /* - * Need to have a router header to use these. 
- */ - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - ip_hdr_len += ipp->ipp_rtdstoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - is_sticky |= IPPF_RTDSTOPTS; - ip_hdr_len += - icmp->icmp_sticky_ipp.ipp_rtdstoptslen; - } - } + ipp = &connp->conn_xmit_ipp; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)iph; - if (!(ignore & IPPF_DSTOPTS)) { - if (ipp->ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - ip_hdr_len += ipp->ipp_dstoptslen; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - is_sticky |= IPPF_DSTOPTS; - ip_hdr_len += icmp->icmp_sticky_ipp.ipp_dstoptslen; - } - } + ipha->ipha_length = htons((uint16_t)pktlen); - if (!(ignore & IPPF_IFINDEX)) { - if (ipp->ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - is_sticky |= IPPF_IFINDEX; + /* if IP_PKTINFO specified an addres it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); + ipha->ipha_src = ipp->ipp_addr_v4; + } else { + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); } - } + } else { + ip6_t *ip6h = (ip6_t *)iph; + uint_t cksum_offset = 0; - if (!(ignore & IPPF_ADDR)) { - if (ipp->ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - is_sticky |= IPPF_ADDR; - } - } + ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); - if (!(ignore & IPPF_DONTFRAG)) { - if (ipp->ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - is_sticky |= IPPF_DONTFRAG; + /* if IP_PKTINFO specified an addres it wins over bind() */ + if ((ipp->ipp_fields & 
IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); + ip6h->ip6_src = ipp->ipp_addr; + } else { + ip6h->ip6_src = *v6src; } - } - - if (!(ignore & IPPF_USE_MIN_MTU)) { - if (ipp->ipp_fields & IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - } else if (icmp->icmp_sticky_ipp.ipp_fields & - IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - is_sticky |= IPPF_USE_MIN_MTU; + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); + } + + if (ixa->ixa_flags & IXAF_SET_ULP_CKSUM) { + if (connp->conn_proto == IPPROTO_ICMPV6) { + cksum_offset = ixa->ixa_ip_hdr_length + + offsetof(icmp6_t, icmp6_cksum); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + cksum_offset = ixa->ixa_ip_hdr_length + + ixa->ixa_raw_cksum_offset; + } } - } + if (cksum_offset != 0) { + uint16_t *ptr; + + /* Make sure the checksum fits in the first mblk */ + if (cksum_offset + sizeof (short) > MBLKL(mp)) { + mblk_t *mp1; - if (!(ignore & IPPF_NEXTHOP)) { - if (ipp->ipp_fields & IPPF_NEXTHOP) { - option_exists |= IPPF_NEXTHOP; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NEXTHOP) { - option_exists |= IPPF_NEXTHOP; - is_sticky |= IPPF_NEXTHOP; + mp1 = msgpullup(mp, + cksum_offset + sizeof (short)); + freemsg(mp); + if (mp1 == NULL) { + *errorp = ENOMEM; + return (NULL); + } + mp = mp1; + iph = mp->b_rptr; + ip6h = (ip6_t *)iph; + } + ptr = (uint16_t *)(mp->b_rptr + cksum_offset); + *ptr = htons(cksum); } } - if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT)) - option_exists |= IPPF_HOPLIMIT; - /* IPV6_HOPLIMIT can never be sticky */ - ASSERT(!(icmp->icmp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT)); + return (mp); +} - if (!(ignore & IPPF_UNICAST_HOPS) && - (icmp->icmp_sticky_ipp.ipp_fields & 
IPPF_UNICAST_HOPS)) { - option_exists |= IPPF_UNICAST_HOPS; - is_sticky |= IPPF_UNICAST_HOPS; - } +/* + * This routine handles all messages passed downstream. It either + * consumes the message or passes it downstream; it never queues a + * a message. + */ +void +icmp_wput(queue_t *q, mblk_t *mp) +{ + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; + conn_t *connp = Q_TO_CONN(q); + icmp_t *icmp = connp->conn_icmp; + int error = 0; + struct sockaddr *addr = NULL; + socklen_t addrlen; + icmp_stack_t *is = icmp->icmp_is; + struct T_unitdata_req *tudr; + mblk_t *data_mp; + cred_t *cr; + pid_t pid; - if (!(ignore & IPPF_MULTICAST_HOPS) && - (icmp->icmp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) { - option_exists |= IPPF_MULTICAST_HOPS; - is_sticky |= IPPF_MULTICAST_HOPS; - } + /* + * We directly handle several cases here: T_UNITDATA_REQ message + * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected + * socket. + */ + switch (DB_TYPE(mp)) { + case M_DATA: + /* sockfs never sends down M_DATA */ + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return; - if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_NO_CKSUM) { - /* This is a sticky socket option only */ - option_exists |= IPPF_NO_CKSUM; - is_sticky |= IPPF_NO_CKSUM; - } + case M_PROTO: + case M_PCPROTO: + tudr = (struct T_unitdata_req *)mp->b_rptr; + if (MBLKL(mp) < sizeof (*tudr) || + ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { + icmp_wput_other(q, mp); + return; + } + break; - if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_RAW_CKSUM) { - /* This is a sticky socket option only */ - option_exists |= IPPF_RAW_CKSUM; - is_sticky |= IPPF_RAW_CKSUM; + default: + icmp_wput_other(q, mp); + return; } - if (!(ignore & IPPF_TCLASS)) { - if (ipp->ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - } else if (icmp->icmp_sticky_ipp.ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - is_sticky |= IPPF_TCLASS; - } + /* Handle valid T_UNITDATA_REQ here */ + data_mp = mp->b_cont; 
+ if (data_mp == NULL) { + error = EPROTO; + goto ud_error2; } + mp->b_cont = NULL; -no_options: + if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { + error = EADDRNOTAVAIL; + goto ud_error2; + } /* - * If any options carried in the ip6i_t were specified, we - * need to account for the ip6i_t in the data we'll be sending - * down. + * All Solaris components should pass a db_credp + * for this message, hence we ASSERT. + * On production kernels we return an error to be robust against + * random streams modules sitting on top of us. */ - if (option_exists & IPPF_HAS_IP6I) - ip_hdr_len += sizeof (ip6i_t); + cr = msg_getcred(mp, &pid); + ASSERT(cr != NULL); + if (cr == NULL) { + error = EINVAL; + goto ud_error2; + } - /* check/fix buffer config, setup pointers into it */ - ip6h = (ip6_t *)&mp->b_rptr[-ip_hdr_len]; - if ((mp->b_datap->db_ref != 1) || - ((unsigned char *)ip6h < mp->b_datap->db_base) || - !OK_32PTR(ip6h)) { - mblk_t *mp1; - - /* Try to get everything in a single mblk next time */ - if (ip_hdr_len > icmp->icmp_max_hdr_len) { - icmp->icmp_max_hdr_len = ip_hdr_len; - - (void) proto_set_tx_wroff(q == NULL ? NULL:RD(q), connp, - icmp->icmp_max_hdr_len + is->is_wroff_extra); - } - mp1 = allocb(ip_hdr_len + is->is_wroff_extra, BPRI_LO); - if (!mp1) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - kmem_free(hopoptsptr, hopoptslen); - rw_exit(&icmp->icmp_rwlock); - return (ENOMEM); - } - mp1->b_cont = mp; - mp1->b_wptr = mp1->b_datap->db_lim; - ip6h = (ip6_t *)(mp1->b_wptr - ip_hdr_len); - mp = mp1; + /* + * If a port has not been bound to the stream, fail. + * This is not a problem when sockfs is directly + * above us, because it will ensure that the socket + * is first bound before allowing data to be sent. + */ + if (icmp->icmp_state == TS_UNBND) { + error = EPROTO; + goto ud_error2; } - mp->b_rptr = (unsigned char *)ip6h; - ip6i = (ip6i_t *)ip6h; - -#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? 
&icmp->icmp_sticky_ipp : ipp) - if (option_exists & IPPF_HAS_IP6I) { - ip6h = (ip6_t *)&ip6i[1]; - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - - /* sin6_scope_id takes precendence over IPPF_IFINDEX */ - if (option_exists & IPPF_SCOPE_ID) { - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = sin6->sin6_scope_id; - } else if (option_exists & IPPF_IFINDEX) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX); - ASSERT(tipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = tipp->ipp_ifindex; + addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; + addrlen = tudr->DEST_length; + + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || + (sin6->sin6_family != AF_INET6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - if (option_exists & IPPF_RAW_CKSUM) { - ip6i->ip6i_flags |= IP6I_RAW_CHECKSUM; - ip6i->ip6i_checksum_off = icmp->icmp_checksum_off; + /* No support for mapped addresses on raw sockets */ + if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + error = EADDRNOTAVAIL; + goto ud_error2; } + srcid = sin6->__sin6_src_id; - if (option_exists & IPPF_NO_CKSUM) { - ip6i->ip6i_flags |= IP6I_NO_ULP_CKSUM; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. + */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - if (option_exists & IPPF_ADDR) { + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + + if (tudr->OPT_length != 0) { /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. + * If we are connected then the destination needs to be + * the same as the connected one. 
*/ - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } + if (icmp->icmp_state == TS_DATA_XFER && + !conn_same_as_last_v6(connp, sin6)) { + error = EISCONN; + goto ud_error2; + } + error = icmp_output_ancillary(connp, NULL, sin6, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (option_exists & IPPF_DONTFRAG) { - ip6i->ip6i_flags |= IP6I_DONTFRAG; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v6 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, data_mp, NULL, + sin6, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } + if (error == 0) { + freeb(mp); + return; + } + break; - if (option_exists & IPPF_USE_MIN_MTU) { - ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU( - ip6i->ip6i_flags, ipp->ipp_use_min_mtu); + case AF_INET: + sin = (sin_t *)addr; + if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || + (sin->sin_family != AF_INET)) { + error = EADDRNOTAVAIL; + goto ud_error2; } + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); - if (option_exists & IPPF_NEXTHOP) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = tipp->ipp_nexthop; + /* Protocol 255 contains full IP headers */ + /* Read without holding lock */ + if (icmp->icmp_hdrincl) { + if (MBLKL(data_mp) < IP_SIMPLE_HDR_LENGTH) { + if (!pullupmsg(data_mp, IP_SIMPLE_HDR_LENGTH)) { + error = 
EINVAL; + goto ud_error2; + } + } + error = icmp_output_hdrincl(connp, data_mp, cr, pid); + if (error == 0) { + freeb(mp); + return; + } + /* data_mp consumed above */ + data_mp = NULL; + goto ud_error2; } - /* - * tell IP this is an ip6i_t private header - */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } - - /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src)); - - /* Set the hoplimit of the outgoing packet. */ - if (option_exists & IPPF_HOPLIMIT) { - /* IPV6_HOPLIMIT ancillary data overrides all other settings. */ - ip6h->ip6_hops = ipp->ipp_hoplimit; - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { - ip6h->ip6_hops = icmp->icmp_multicast_ttl; - if (option_exists & IPPF_MULTICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else { - ip6h->ip6_hops = icmp->icmp_ttl; - if (option_exists & IPPF_UNICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. + */ + if (icmp->icmp_state == TS_DATA_XFER && + !conn_same_as_last_v4(connp, sin)) { + error = EISCONN; + goto ud_error2; + } + error = icmp_output_ancillary(connp, sin, NULL, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (option_exists & IPPF_ADDR) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr)); - ip6h->ip6_src = tipp->ipp_addr; - } else { - /* - * The source address was not set using IPV6_PKTINFO. - * First look at the bound source. - * If unspecified fallback to __sin6_src_id. 
- */ - ip6h->ip6_src = icmp->icmp_v6src; - if (sin6->__sin6_src_id != 0 && - IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ip_srcid_find_id(sin6->__sin6_src_id, - &ip6h->ip6_src, icmp->icmp_zoneid, - is->is_netstack); + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v4 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, data_mp, sin, + NULL, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + } + if (error == 0) { + freeb(mp); + return; } + break; } + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + icmp_ud_err(q, mp, (t_scalar_t)error); + return; - nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; - cp = (uint8_t *)&ip6h[1]; +ud_error2: + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(data_mp); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + icmp_ud_err(q, mp, (t_scalar_t)error); +} + +/* + * Handle the case of the IP address or flow label being different + * for both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +icmp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, + cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) +{ + icmp_t *icmp = connp->conn_icmp; + icmp_stack_t *is = icmp->icmp_is; + int error; + ip_xmit_attr_t *oldixa; + boolean_t do_ipsec; + uint_t srcid; + uint32_t flowinfo; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); /* - * Here's where we have to start stringing together - * any extension headers in the right order: - * Hop-by-hop, destination, routing, and final destination opts. + * We hold conn_lock across all the use and modifications of + * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they + * stay consistent. */ - if (option_exists & IPPF_HOPOPTS) { - /* Hop-by-hop options */ - ip6_hbh_t *hbh = (ip6_hbh_t *)cp; - - *nxthdr_ptr = IPPROTO_HOPOPTS; - nxthdr_ptr = &hbh->ip6h_nxt; - if (hopoptslen == 0) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS); - bcopy(tipp->ipp_hopopts, cp, tipp->ipp_hopoptslen); - cp += tipp->ipp_hopoptslen; - } else { - bcopy(hopoptsptr, cp, hopoptslen); - cp += hopoptslen; - kmem_free(hopoptsptr, hopoptslen); - } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } /* - * En-route destination options - * Only do them if there's a routing header as well + * If we are connected then the destination needs to be the + * same as the connected one, which is not the case here since we + * checked for that above. 
*/ - if (option_exists & IPPF_RTDSTOPTS) { - ip6_dest_t *dst = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS); + if (icmp->icmp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; + } - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dst->ip6d_nxt; + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen); - cp += tipp->ipp_rtdstoptslen; - } /* - * Routing header next + * If laddr is unspecified then we look at sin6_src_id. + * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rt = (ip6_rthdr_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR); + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + srcid = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + ixa->ixa_flags |= IXAF_IS_IPV4; + } else { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + } + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + 
} + /* Handle IPV6_PKTINFO setting source address. */ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { + ip_pkt_t *ipp = &connp->conn_xmit_ipp; - *nxthdr_ptr = IPPROTO_ROUTING; - nxthdr_ptr = &rt->ip6r_nxt; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } + } - bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen); - cp += tipp->ipp_rthdrlen; + /* Defer IPsec if it might need to look at ICMP type/code */ + switch (ixa->ixa_protocol) { + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + do_ipsec = B_FALSE; + break; + default: + do_ipsec = B_TRUE; } - /* - * Do ultimate destination options - */ - if (option_exists & IPPF_DSTOPTS) { - ip6_dest_t *dest = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS); - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dest->ip6d_nxt; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); + mutex_exit(&connp->conn_lock); - bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen); - cp += tipp->ipp_dstoptslen; + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + (do_ipsec ? IPDF_IPSEC : 0)); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + goto ud_error; } + mutex_enter(&connp->conn_lock); /* - * Now set the last header pointer to the proto passed in + * While we dropped the lock some other thread might have connected + * this socket. If so we bail out with EISCONN to ensure that the + * connecting thread is the one that updates conn_ixa, conn_ht_* + * and conn_*last*. */ - ASSERT((int)(cp - (uint8_t *)ip6i) == ip_hdr_len); - *nxthdr_ptr = icmp->icmp_proto; + if (icmp->icmp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; + } /* - * Copy in the destination address + * We need to rebuild the headers if + * - we are labeling packets (could be different for different + * destinations) + * - we have a source route (or routing header) since we need to + * massage that to get the pseudo-header checksum + * - a socket option with COA_HEADER_CHANGED has been set which + * set conn_v6lastdst to zero. + * + * Otherwise the prepend function will just update the src, dst, + * and flow label. */ - ip6h->ip6_dst = ip6_dst; - - ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - if (option_exists & IPPF_TCLASS) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS); - ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, - tipp->ipp_tclass); - } - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rth; - + if (is_system_labeled()) { + /* TX MLP requires SCM_UCRED and don't have that here */ + if (connp->conn_mlp_type != mlptSingle) { + mutex_exit(&connp->conn_lock); + error = ECONNREFUSED; + goto ud_error; + } /* - * Perform any processing needed for source routing. - * We know that all extension headers will be in the same mblk - * as the IPv6 header. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. 
+ * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: Since we hold conn_lock we know no other + * thread manipulates conn_xmit_ipp. */ - rth = ip_find_rthdr_v6(ip6h, mp->b_wptr); - if (rth != NULL && rth->ip6r_segleft != 0) { - if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) { - /* - * Drop packet - only support Type 0 routing. - * Notify the application as well. - */ - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EPROTO); - } - /* - * rth->ip6r_len is twice the number of - * addresses in the header - */ - if (rth->ip6r_len & 0x1) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EPROTO); - } - /* - * Shuffle the routing header and ip6_dst - * addresses, and get the checksum difference - * between the first hop (in ip6_dst) and - * the destination (in the last routing hdr entry). - */ - csum = ip_massage_options_v6(ip6h, rth, - is->is_netstack); - /* - * Verify that the first hop isn't a mapped address. - * Routers along the path need to do this verification - * for subsequent hops. 
- */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EADDRNOTAVAIL); + error = conn_update_label(connp, ixa, &v6dst, + &connp->conn_xmit_ipp); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + /* Rebuild the header template */ + error = icmp_build_hdr_template(connp, &v6src, &v6dst, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else if (connp->conn_xmit_ipp.ipp_fields & + (IPPF_IPV4_OPTIONS|IPPF_RTHDR) || + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { + /* Rebuild the header template */ + error = icmp_build_hdr_template(connp, &v6src, &v6dst, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else { + /* Simply update the destination address if no source route */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; + + IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; } + } else { + ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; + ip6h->ip6_dst = v6dst; } } - ip_len = mp->b_wptr - (uchar_t *)ip6h - IPV6_HDR_LEN; - if (mp->b_cont != NULL) - ip_len += msgdsize(mp->b_cont); - /* - * Set the length into the IP header. - * If the length is greater than the maximum allowed by IP, - * then free the message and return. Do not try and send it - * as this can cause problems in layers below. + * Remember the dst etc which corresponds to the built header + * template and conn_ixa. 
*/ - if (ip_len > IP_MAXPACKET) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - rw_exit(&icmp->icmp_rwlock); - return (EMSGSIZE); + oldixa = conn_replace_ixa(connp, ixa); + connp->conn_v6lastdst = v6dst; + connp->conn_lastflowinfo = flowinfo; + connp->conn_lastscopeid = ixa->ixa_scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + + data_mp = icmp_prepend_header_template(connp, ixa, data_mp, &v6src, + flowinfo, &error); + + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + ixa_refrele(oldixa); + + if (data_mp == NULL) { + ASSERT(error != 0); + goto ud_error; } - if (icmp->icmp_proto == IPPROTO_ICMPV6 || icmp->icmp_raw_checksum) { - uint_t cksum_off; /* From ip6i == mp->b_rptr */ - uint16_t *cksum_ptr; - uint_t ext_hdrs_len; - /* ICMPv6 must have an offset matching icmp6_cksum offset */ - ASSERT(icmp->icmp_proto != IPPROTO_ICMPV6 || - icmp->icmp_checksum_off == 2); + if (!do_ipsec) { + /* Policy might differ for different ICMP type/code */ + data_mp = icmp_output_attach_policy(data_mp, connp, ixa); + if (data_mp == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + error = EHOSTUNREACH; /* IPsec policy failure */ + goto done; + } + } + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); + + error = conn_ip_output(data_mp, ixa); + /* No rawipOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * We make it easy for IP to include our pseudo header - * by putting our length in uh_checksum, modified (if - * we have a routing header) by the checksum difference - * between the ultimate destination and first hop addresses. - * Note: ICMPv6 must always checksum the packet. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - cksum_off = ip_hdr_len + icmp->icmp_checksum_off; - if (cksum_off + sizeof (uint16_t) > mp->b_wptr - mp->b_rptr) { - if (!pullupmsg(mp, cksum_off + sizeof (uint16_t))) { - BUMP_MIB(&is->is_rawip_mib, - rawipOutErrors); - freemsg(mp); - rw_exit(&icmp->icmp_rwlock); - return (0); - } - ip6i = (ip6i_t *)mp->b_rptr; - if (ip6i->ip6i_nxt == IPPROTO_RAW) - ip6h = (ip6_t *)&ip6i[1]; - else - ip6h = (ip6_t *)ip6i; - } - /* Add payload length to checksum */ - ext_hdrs_len = ip_hdr_len - IPV6_HDR_LEN - - (int)((uchar_t *)ip6h - (uchar_t *)ip6i); - csum += htons(ip_len - ext_hdrs_len); - - cksum_ptr = (uint16_t *)((uchar_t *)ip6i + cksum_off); - csum = (csum & 0xFFFF) + (csum >> 16); - *cksum_ptr = (uint16_t)csum; + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } +done: + ixa_refrele(ixa); + return (error); -#ifdef _LITTLE_ENDIAN - ip_len = htons(ip_len); -#endif - ip6h->ip6_plen = (uint16_t)ip_len; +ud_error: + if (ixa != NULL) + ixa_refrele(ixa); - /* We're done. 
Pass the packet to IP */ - rw_exit(&icmp->icmp_rwlock); - BUMP_MIB(&is->is_rawip_mib, rawipOutDatagrams); - ip_output_v6(icmp->icmp_connp, mp, q, IP_WPUT); - return (0); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(data_mp); + return (error); +} + +/* ARGSUSED */ +static void +icmp_wput_fallback(queue_t *q, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "icmp_wput_fallback: Message during fallback \n"); +#endif + freemsg(mp); } static void @@ -5559,7 +4622,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp) { uchar_t *rptr = mp->b_rptr; struct iocblk *iocp; -#define tudr ((struct T_unitdata_req *)rptr) conn_t *connp = Q_TO_CONN(q); icmp_t *icmp = connp->conn_icmp; icmp_stack_t *is = icmp->icmp_is; @@ -5576,7 +4638,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) freemsg(mp); return; } - switch (((union T_primitives *)rptr)->type) { + switch (((t_primp_t)rptr)->type) { case T_ADDR_REQ: icmp_addr_req(q, mp); return; @@ -5596,15 +4658,14 @@ icmp_wput_other(queue_t *q, mblk_t *mp) case T_UNITDATA_REQ: /* * If a T_UNITDATA_REQ gets here, the address must - * be bad. Valid T_UNITDATA_REQs are found above - * and break to below this switch. + * be bad. Valid T_UNITDATA_REQs are handled + * in icmp_wput. 
*/ icmp_ud_err(q, mp, EADDRNOTAVAIL); return; case T_UNBIND_REQ: icmp_tpi_unbind(q, mp); return; - case T_SVR4_OPTMGMT_REQ: /* * All Solaris components should pass a db_credp @@ -5622,9 +4683,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) if (!snmpcom_req(q, mp, icmp_snmp_set, ip_snmp_get, cr)) { - /* Only IP can return anything meaningful */ - (void) svr4_optcom_req(q, mp, cr, - &icmp_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &icmp_opt_obj); } return; @@ -5642,8 +4701,7 @@ icmp_wput_other(queue_t *q, mblk_t *mp) icmp_err_ack(q, mp, TSYSERR, EINVAL); return; } - /* Only IP can return anything meaningful */ - (void) tpi_optcom_req(q, mp, cr, &icmp_opt_obj, B_TRUE); + tpi_optcom_req(q, mp, cr, &icmp_opt_obj); return; case T_DISCON_REQ: @@ -5660,13 +4718,16 @@ icmp_wput_other(queue_t *q, mblk_t *mp) case T_DATA_REQ: case T_EXDATA_REQ: case T_ORDREL_REQ: - freemsg(mp); - (void) putctl1(RD(q), M_ERROR, EPROTO); + icmp_err_ack(q, mp, TNOTSUPPORT, 0); return; default: break; } break; + case M_FLUSH: + if (*rptr & FLUSHW) + flushq(q, FLUSHDATA); + break; case M_IOCTL: iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { @@ -5678,7 +4739,6 @@ icmp_wput_other(queue_t *q, mblk_t *mp) * don't know the peer's name. */ iocp->ioc_error = ENOTCONN; - err_ret:; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; qreply(q, mp); @@ -5696,22 +4756,13 @@ icmp_wput_other(queue_t *q, mblk_t *mp) SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); return; case ND_SET: - /* nd_getset performs the necessary error checking */ + /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, is->is_nd, mp)) { qreply(q, mp); return; } break; - case _SIOCSOCKFALLBACK: - /* - * socket is falling back to be a - * streams socket. 
Nothing to do - */ - iocp->ioc_count = 0; - iocp->ioc_rval = 0; - qreply(q, mp); - return; default: break; } @@ -5720,23 +4771,24 @@ icmp_wput_other(queue_t *q, mblk_t *mp) icmp_wput_iocdata(q, mp); return; default: + /* Unrecognized messages are passed through without change. */ break; } - ip_wput(q, mp); + ip_wput_nondata(q, mp); } /* - * icmp_wput_iocdata is called by icmp_wput_slow to handle all M_IOCDATA + * icmp_wput_iocdata is called by icmp_wput_other to handle all M_IOCDATA * messages. */ static void icmp_wput_iocdata(queue_t *q, mblk_t *mp) { - mblk_t *mp1; + mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); - icmp_t *icmp; - uint_t addrlen; - uint_t error; + uint_t addrlen; + conn_t *connp = Q_TO_CONN(q); + icmp_t *icmp = connp->conn_icmp; /* Make sure it is one of ours. */ switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { @@ -5744,10 +4796,10 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - icmp = Q_TO_ICMP(q); - ip_output(icmp->icmp_connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } + switch (mi_copy_state(q, mp, &mp1)) { case -1: return; @@ -5776,6 +4828,7 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mi_copy_done(q, mp, EPROTO); return; } + /* * Now we have the strbuf structure for TI_GETMYNAME * and TI_GETPEERNAME. 
Next we copyout the requested @@ -5783,8 +4836,8 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) */ STRUCT_SET_HANDLE(sb, ((struct iocblk *)mp->b_rptr)->ioc_flag, (void *)mp1->b_rptr); - icmp = Q_TO_ICMP(q); - if (icmp->icmp_family == AF_INET) + + if (connp->conn_family == AF_INET) addrlen = sizeof (sin_t); else addrlen = sizeof (sin6_t); @@ -5793,72 +4846,37 @@ icmp_wput_iocdata(queue_t *q, mblk_t *mp) mi_copy_done(q, mp, EINVAL); return; } - + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + break; + case TI_GETPEERNAME: + if (icmp->icmp_state != TS_DATA_XFER) { + mi_copy_done(q, mp, ENOTCONN); + return; + } + break; + default: + mi_copy_done(q, mp, EPROTO); + return; + } mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - - if (mp1 == NULL) + if (!mp1) return; - rw_enter(&icmp->icmp_rwlock, RW_READER); + STRUCT_FSET(sb, len, addrlen); switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { case TI_GETMYNAME: - error = rawip_do_getsockname(icmp, (void *)mp1->b_rptr, + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, &addrlen); break; case TI_GETPEERNAME: - error = rawip_do_getpeername(icmp, (void *)mp1->b_rptr, + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, &addrlen); break; } - rw_exit(&icmp->icmp_rwlock); - - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); - } -} - -static int -icmp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, - void *thisdg_attrs) -{ - struct T_unitdata_req *udreqp; - int is_absreq_failure; - cred_t *cr; - - udreqp = (struct T_unitdata_req *)mp->b_rptr; - *errorp = 0; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
- */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) - return (-1); - - *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, - udreqp->OPT_offset, cr, &icmp_opt_obj, - thisdg_attrs, &is_absreq_failure); - - if (*errorp != 0) { - /* - * Note: No special action needed in this - * module for "is_absreq_failure" - */ - return (-1); /* failure */ - } - ASSERT(is_absreq_failure == 0); - return (0); /* success */ + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } void @@ -6013,7 +5031,7 @@ rawip_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t len, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - int error; + int error; /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); @@ -6042,14 +5060,14 @@ rawip_implicit_bind(conn_t *connp) socklen_t len; int error; - if (connp->conn_icmp->icmp_family == AF_INET) { + if (connp->conn_family == AF_INET) { len = sizeof (struct sockaddr_in); sin = (sin_t *)&sin6addr; *sin = sin_null; sin->sin_family = AF_INET; sin->sin_addr.s_addr = INADDR_ANY; } else { - ASSERT(connp->conn_icmp->icmp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); len = sizeof (sin6_t); sin6 = (sin6_t *)&sin6addr; *sin6 = sin6_null; @@ -6081,7 +5099,6 @@ rawip_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) return (EOPNOTSUPP); } -/* ARGSUSED */ int rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) @@ -6090,6 +5107,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, icmp_t *icmp = connp->conn_icmp; int error; boolean_t did_bind = B_FALSE; + pid_t pid = curproc->p_pid; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); @@ -6106,7 +5124,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, return (error); } - error = proto_verify_ip_addr(icmp->icmp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) return (error); @@ -6126,10 +5144,9 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* * set SO_DGRAM_ERRIND */ - icmp->icmp_dgram_errind = B_TRUE; - - error = rawip_do_connect(connp, sa, len, cr); + connp->conn_dgram_errind = B_TRUE; + error = rawip_do_connect(connp, sa, len, cr, pid); if (error != 0 && did_bind) { int unbind_err; @@ -6139,15 +5156,15 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, if (error == 0) { *id = 0; - (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, 0, NULL, -1); + (*connp->conn_upcalls->su_connected)(connp->conn_upper_handle, + 0, NULL, -1); } else if (error < 0) { error = proto_tlitosyserr(-error); } return (error); } -/* ARGSUSED */ +/* ARGSUSED2 */ int rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) @@ -6184,9 +5201,8 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, stropt_mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)stropt_mp->b_rptr; stropt->so_flags = SO_WROFF | SO_HIWAT; - stropt->so_wroff = - (ushort_t)(icmp->icmp_max_hdr_len + icmp->icmp_is->is_wroff_extra); - stropt->so_hiwat = icmp->icmp_recv_hiwat; + stropt->so_wroff = connp->conn_wroff; + stropt->so_hiwat = connp->conn_rcvbuf; putnext(RD(q), stropt_mp); /* @@ -6207,9 +5223,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (error != 0) faddrlen = 0; opts = 0; - if (icmp->icmp_dgram_errind) + if (connp->conn_dgram_errind) opts |= SO_DGRAM_ERRIND; - if (icmp->icmp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; (*quiesced_cb)(connp->conn_upper_handle, q, &tca, @@ -6218,7 
+5234,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * Attempts to send data up during fallback will result in it being - * queued in udp_t. Now we push up any queued packets. + * queued in icmp_t. Now we push up any queued packets. */ mutex_enter(&icmp->icmp_recv_lock); while (icmp->icmp_fallback_queue_head != NULL) { @@ -6236,9 +5252,9 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * No longer a streams less socket */ - rw_enter(&icmp->icmp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); connp->conn_flags &= ~IPCL_NONSTR; - rw_exit(&icmp->icmp_rwlock); + mutex_exit(&connp->conn_lock); mutex_exit(&icmp->icmp_recv_lock); @@ -6250,7 +5266,7 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ sock_lower_handle_t rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) @@ -6262,35 +5278,10 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return (NULL); } - connp = icmp_open(family, credp, errorp, flags); + connp = rawip_do_open(family, credp, errorp, flags); if (connp != NULL) { - icmp_stack_t *is; - - is = connp->conn_icmp->icmp_is; connp->conn_flags |= IPCL_NONSTR; - if (connp->conn_icmp->icmp_family == AF_INET6) { - /* Build initial header template for transmit */ - rw_enter(&connp->conn_icmp->icmp_rwlock, RW_WRITER); - if ((*errorp = - icmp_build_hdrs(connp->conn_icmp)) != 0) { - rw_exit(&connp->conn_icmp->icmp_rwlock); - ipcl_conn_destroy(connp); - return (NULL); - } - rw_exit(&connp->conn_icmp->icmp_rwlock); - } - - connp->conn_icmp->icmp_recv_hiwat = is->is_recv_hiwat; - connp->conn_icmp->icmp_xmit_hiwat = is->is_xmit_hiwat; - - if ((*errorp = ip_create_helper_stream(connp, - is->is_ldi_ident)) != 0) { - cmn_err(CE_CONT, "create of IP helper stream failed\n"); - (void) rawip_do_close(connp); - return (NULL); - } - 
mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); @@ -6303,14 +5294,13 @@ rawip_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return ((sock_lower_handle_t)connp); } -/* ARGSUSED */ +/* ARGSUSED3 */ void rawip_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_stack_t *is = connp->conn_icmp->icmp_is; struct sock_proto_props sopp; /* All Solaris components should pass a cred for this operation. */ @@ -6321,10 +5311,9 @@ rawip_activate(sock_lower_handle_t proto_handle, sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; - sopp.sopp_wroff = connp->conn_icmp->icmp_max_hdr_len + - is->is_wroff_extra; - sopp.sopp_rxhiwat = is->is_recv_hiwat; - sopp.sopp_rxlowat = icmp_mod_info.mi_lowat; + sopp.sopp_wroff = connp->conn_wroff; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxblk = INFPSZ; sopp.sopp_maxpsz = IP_MAXPACKET; sopp.sopp_minpsz = (icmp_mod_info.mi_minpsz == 1) ? 
0 : @@ -6332,113 +5321,11 @@ rawip_activate(sock_lower_handle_t proto_handle, (*connp->conn_upcalls->su_set_proto_props) (connp->conn_upper_handle, &sopp); -} - -static int -rawip_do_getsockname(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(icmp != NULL); - ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); - switch (icmp->icmp_family) { - case AF_INET: - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - if (icmp->icmp_state == TS_UNBND) { - break; - } - - if (!IN6_IS_ADDR_V4MAPPED_ANY(&icmp->icmp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(icmp->icmp_v6src); - } else { - /* - * INADDR_ANY - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. Use icmp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - sin->sin_addr.s_addr = - V4_PART_OF_V6(icmp->icmp_bound_v6src); - } - break; - case AF_INET6: - - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (icmp->icmp_state == TS_UNBND) { - break; - } - if (!IN6_IS_ADDR_UNSPECIFIED(&icmp->icmp_v6src)) { - sin6->sin6_addr = icmp->icmp_v6src; - } else { - /* - * UNSPECIFIED - * icmp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use icmp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - - sin6->sin6_addr = icmp->icmp_bound_v6src; - } - break; - } - return (0); -} - -static int -rawip_do_getpeername(icmp_t *icmp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(icmp != NULL); - ASSERT(RW_LOCK_HELD(&icmp->icmp_rwlock)); - - if (icmp->icmp_state != TS_DATA_XFER) - return (ENOTCONN); - - sa->sa_family = icmp->icmp_family; - switch (icmp->icmp_family) { - case AF_INET: - ASSERT(icmp->icmp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = - V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); - break; - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - *sin6 = icmp->icmp_v6dst; - break; - } - return (0); + icmp_bind_proto(connp->conn_icmp); } -/* ARGSUSED */ +/* ARGSUSED3 */ int rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) @@ -6450,36 +5337,29 @@ rawip_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - ASSERT(icmp != NULL); - - rw_enter(&icmp->icmp_rwlock, RW_READER); - - error = rawip_do_getpeername(icmp, sa, salenp); - - rw_exit(&icmp->icmp_rwlock); - + mutex_enter(&connp->conn_lock); + if (icmp->icmp_state != TS_DATA_XFER) + error = ENOTCONN; + else + error = conn_getpeername(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } -/* ARGSUSED */ +/* ARGSUSED3 */ int rawip_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; int error; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); - ASSERT(icmp != NULL); - rw_enter(&icmp->icmp_rwlock, RW_READER); - - error = rawip_do_getsockname(icmp, sa, salenp); - - rw_exit(&icmp->icmp_rwlock); - + mutex_enter(&connp->conn_lock); + error = conn_getsockname(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } @@ -6488,7 +5368,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, const void *optvalp, socklen_t optlen, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; int error; /* All Solaris components should pass a cred for this operation. */ @@ -6497,7 +5376,6 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, icmp_opt_obj.odb_opt_des_arr, icmp_opt_obj.odb_opt_arr_cnt, - icmp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -6510,19 +5388,9 @@ rawip_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, return (error); } - rw_enter(&icmp->icmp_rwlock, RW_WRITER); error = icmp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, NULL, cr); - rw_exit(&icmp->icmp_rwlock); - - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } ASSERT(error >= 0); @@ -6535,7 +5403,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, { int error; conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; t_uscalar_t max_optbuf_len; void *optvalp_buf; int len; @@ -6546,7 +5413,6 @@ rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, icmp_opt_obj.odb_opt_des_arr, icmp_opt_obj.odb_opt_arr_cnt, - icmp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { @@ -6557,31 +5423,25 @@ 
rawip_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, } optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); - rw_enter(&icmp->icmp_rwlock, RW_READER); len = icmp_opt_get(connp, level, option_name, optvalp_buf); - rw_exit(&icmp->icmp_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ - kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, optvalp, - optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ int rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) { @@ -6594,7 +5454,7 @@ rawip_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ int rawip_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) { @@ -6635,6 +5495,27 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + icmp_stack_t *is = connp->conn_icmp->icmp_is; + + ASSERT(is->is_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. 
+ */ + error = ip_create_helper_stream(connp, is->is_ldi_ident); + if (error != 0) { + ip0dbg(("rawip_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: @@ -6658,25 +5539,25 @@ rawip_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, return (error); } -/* ARGSUSED */ int rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { - conn_t *connp = (conn_t *)proto_handle; - icmp_t *icmp = connp->conn_icmp; - icmp_stack_t *is = icmp->icmp_is; - int error = 0; - boolean_t bypass_dgram_errind = B_FALSE; + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; + conn_t *connp = (conn_t *)proto_handle; + icmp_t *icmp = connp->conn_icmp; + int error = 0; + icmp_stack_t *is = icmp->icmp_is; + pid_t pid = curproc->p_pid; + ip_xmit_attr_t *ixa; ASSERT(DB_TYPE(mp) == M_DATA); /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - /* If labeled then sockfs should have already set db_credp */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - /* do an implicit bind if necessary */ if (icmp->icmp_state == TS_UNBND) { error = rawip_implicit_bind(connp); @@ -6691,170 +5572,191 @@ rawip_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, } } - rw_enter(&icmp->icmp_rwlock, RW_WRITER); - - if (msg->msg_name != NULL && icmp->icmp_state == TS_DATA_XFER) { - error = EISCONN; - goto done_lock; - } - - switch (icmp->icmp_family) { - case AF_INET6: { - sin6_t *sin6; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - - sin6 = (sin6_t *)msg->msg_name; - if (sin6 != NULL) { - error = proto_verify_ip_addr(icmp->icmp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - if (error != 0) { - bypass_dgram_errind = B_TRUE; - goto done_lock; + /* Protocol 255 contains full IP headers */ + /* Read without holding lock */ + if (icmp->icmp_hdrincl) { + 
ASSERT(connp->conn_ipversion == IPV4_VERSION); + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) { + if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + freemsg(mp); + return (EINVAL); } - if (icmp->icmp_delayed_error != 0) { - sin6_t *sin1 = (sin6_t *)msg->msg_name; - sin6_t *sin2 = (sin6_t *) - &icmp->icmp_delayed_addr; - - error = icmp->icmp_delayed_error; - icmp->icmp_delayed_error = 0; - - /* Compare IP address and port */ + } + error = icmp_output_hdrincl(connp, mp, cr, pid); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + } - if (sin1->sin6_port == sin2->sin6_port && - IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, - &sin2->sin6_addr)) { - goto done_lock; - } - } + /* Connected? */ + if (msg->msg_name == NULL) { + if (icmp->icmp_state != TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EDESTADDRREQ); + } + if (msg->msg_controllen != 0) { + error = icmp_output_ancillary(connp, NULL, NULL, mp, + NULL, msg, cr, pid); } else { - /* - * Use connected address - */ - if (icmp->icmp_state != TS_DATA_XFER) { - BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EDESTADDRREQ; - bypass_dgram_errind = B_TRUE; - goto done_lock; - } - sin6 = &icmp->icmp_v6dst; + error = icmp_output_connected(connp, mp, cr, pid); } + if (is->is_sendto_ignerr) + return (0); + else + return (error); + } + if (icmp->icmp_state == TS_DATA_XFER) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EISCONN); + } + error = proto_verify_ip_addr(connp->conn_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (error); + } + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)msg->msg_name; /* No support for mapped addresses on raw sockets */ if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EADDRNOTAVAIL; - goto done_lock; + return (EADDRNOTAVAIL); } - - 
ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; + srcid = sin6->__sin6_src_id; /* - * If options passed in, feed it for verification and handling + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. */ - if (msg->msg_controllen != 0) { - error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - ipp, &icmp_opt_obj, icmp_opt_set, cr); - if (error != 0) { - goto done_lock; - } + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (EADDRNOTAVAIL); } - rw_exit(&icmp->icmp_rwlock); + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; /* - * Destination is a native IPv6 address. - * Send out an IPv6 format packet. + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* calls on a + * socket. 
*/ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; + } + mutex_enter(&connp->conn_lock); + if (icmp->icmp_delayed_error != 0) { + sin6_t *sin2 = (sin6_t *)&icmp->icmp_delayed_addr; - error = raw_ip_send_data_v6(connp->conn_wq, connp, mp, sin6, - ipp); - } - break; - case AF_INET: { - sin_t *sin; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ipaddr_t v4dst; + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; - sin = (sin_t *)msg->msg_name; - if (sin != NULL) { - error = proto_verify_ip_addr(icmp->icmp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - if (error != 0) { - bypass_dgram_errind = B_TRUE; - goto done_lock; - } - v4dst = sin->sin_addr.s_addr; - if (icmp->icmp_delayed_error != 0) { - sin_t *sin1 = (sin_t *)msg->msg_name; - sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; - - error = icmp->icmp_delayed_error; - icmp->icmp_delayed_error = 0; - - /* Compare IP address and port */ - if (sin1->sin_port == sin2->sin_port && - sin1->sin_addr.s_addr == - sin2->sin_addr.s_addr) { - goto done_lock; - } + /* Compare IP address and family */ - } - } else { - /* - * Use connected address - */ - if (icmp->icmp_state != TS_DATA_XFER) { + if (IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &sin2->sin6_addr) && + sin6->sin6_family == sin2->sin6_family) { + mutex_exit(&connp->conn_lock); BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); - error = EDESTADDRREQ; - bypass_dgram_errind = B_TRUE; - goto done_lock; + if (ixa != NULL) + ixa_refrele(ixa); + return (error); } - v4dst = V4_PART_OF_V6(icmp->icmp_v6dst.sin6_addr); } + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = icmp_output_ancillary(connp, NULL, sin6, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { 
+ /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, mp, NULL, sin6, cr, + pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + case AF_INET: + sin = (sin_t *)msg->msg_name; - - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); /* - * If options passed in, feed it for verification and handling + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* on a socket. */ - if (msg->msg_controllen != 0) { - error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - pktinfop, &icmp_opt_obj, icmp_opt_set, cr); - if (error != 0) { - goto done_lock; + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + return (ENOMEM); } + } else { + ixa = NULL; } - rw_exit(&icmp->icmp_rwlock); + mutex_enter(&connp->conn_lock); + if (icmp->icmp_delayed_error != 0) { + sin_t *sin2 = (sin_t *)&icmp->icmp_delayed_addr; - error = raw_ip_send_data_v4(connp->conn_wq, connp, mp, - v4dst, pktinfop); - break; - } + error = icmp->icmp_delayed_error; + icmp->icmp_delayed_error = 0; - default: - ASSERT(0); - } + /* Compare IP address */ - goto done; + if (sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&is->is_rawip_mib, rawipOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } -done_lock: - rw_exit(&icmp->icmp_rwlock); - if (error != 0) { - ASSERT(mp != NULL); - freemsg(mp); + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = icmp_output_ancillary(connp, 
sin, NULL, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* icmp_output_lastdst drops conn_lock */ + error = icmp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* icmp_output_newdst drops conn_lock */ + error = icmp_output_newdst(connp, mp, sin, NULL, cr, + pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (is->is_sendto_ignerr) + return (0); + else + return (error); + default: + return (EINVAL); } -done: - if (bypass_dgram_errind) - return (error); - return (icmp->icmp_dgram_errind ? error : 0); } sock_downcalls_t sock_rawip_downcalls = { diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index 8bee9827db..ff0310de0c 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -36,23 +36,11 @@ #include <inet/common.h> #include <netinet/ip6.h> #include <inet/ip.h> -/* - * MK_XXX Following 2 includes temporary to import ip6_rthdr_t - * definition. May not be needed if we fix ip6_dg_snd_attrs_t - * to do all extension headers in identical manner. - */ -#include <net/if.h> -#include <inet/ip6.h> #include <netinet/tcp.h> #include <netinet/ip_mroute.h> #include <inet/optcom.h> - - -extern int icmp_opt_default(queue_t *, int, int, uchar_t *); -extern int icmp_tpi_opt_get(queue_t *, int, int, uchar_t *); -extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); +#include <inet/rawip_impl.h> /* * Table of all known options handled on a ICMP protocol stack. 
@@ -63,250 +51,252 @@ extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, */ opdes_t icmp_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, #ifdef SO_PROTOTYPE /* * icmp will only allow IPPROTO_ICMP for non-privileged streams * that check is made on an adhoc basis. 
*/ -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, #endif -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, 
IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_HDRINCL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (struct in_addr), 0 /* INADDR_ANY */ }, -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */}, -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */ }, -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), 0 }, -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct 
ip_mreq_source), -1 }, { IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, { IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t), 0 /* disabled */ }, -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in_pktinfo), -1 /* not initialized */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, { MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (int), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), 0, -1 /* not initialized */ }, + OP_NODEFAULT, 0, -1 /* not initialized */ }, -{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct vifctl), -1 /* not initialized */ 
}, -{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (vifi_t), -1 /* not initialized */ }, -{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct mfcctl), -1 /* not initialized */ }, -{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, OP_NODEFAULT, sizeof (struct mfcctl), -1 /* not initialized */ }, -{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct 
group_source_req), -1 /* not initialized */ }, -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */}, + OP_DEF_FN, sizeof (int), -1 /* not initialized */}, -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, -{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ IPV6_CHECKSUM, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), -1 }, { ICMP6_FILTER, IPPROTO_ICMPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN|OP_VARLEN, sizeof (icmp6_filter_t), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + 
(OP_NODEFAULT|OP_VARLEN), sizeof (int), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (int), -1 /* not initialized */ }, -{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (struct ip6_mtuinfo), -1 }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ 
IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, { MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), 
sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, }; @@ -342,9 +332,8 @@ uint_t icmp_max_optsize; /* initialized when ICMP driver is loaded */ optdb_obj_t icmp_opt_obj = { icmp_opt_default, /* ICMP default value function pointer */ - icmp_tpi_opt_get, /* ICMP get function pointer */ - icmp_tpi_opt_set, /* ICMP set function pointer */ - B_TRUE, /* ICMP is tpi provider */ + icmp_tpi_opt_get, /* ICMP get function pointer */ + icmp_tpi_opt_set, /* ICMP set function pointer */ ICMP_OPT_ARR_CNT, /* ICMP option database count of entries */ icmp_opt_arr, /* ICMP option database */ ICMP_VALID_LEVELS_CNT, /* ICMP valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/igmp.c b/usr/src/uts/common/inet/ip/igmp.c index 5eff11af14..9e6b552a61 100644 --- a/usr/src/uts/common/inet/ip/igmp.c +++ b/usr/src/uts/common/inet/ip/igmp.c @@ -56,6 +56,7 @@ #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> @@ -66,9 +67,8 @@ #include <inet/ip_listutils.h> #include <netinet/igmp.h> +#include <inet/ip_ndp.h> #include <inet/ip_if.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> static uint_t igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill); static uint_t igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen); @@ -76,14 +76,13 @@ static uint_t mld_query_in(mld_hdr_t *mldh, ill_t *ill); static uint_t mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen); static void igmp_sendpkt(ilm_t *ilm, 
uchar_t type, ipaddr_t addr); static void mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr); -static void igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist); +static void igmpv3_sendrpt(ill_t *ill, mrec_t *reclist); static void mldv2_sendrpt(ill_t *ill, mrec_t *reclist); static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist, mrec_t *next); static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); -static void mcast_signal_restart_thread(ip_stack_t *ipst); /* * Macros used to do timer len conversions. Timer values are always @@ -122,11 +121,12 @@ static void mcast_signal_restart_thread(ip_stack_t *ipst); * The first multicast join will trigger the igmp timers / mld timers * The unit for next is milliseconds. */ -static void +void igmp_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; int ret; + timeout_id_t id; ASSERT(next != 0 && next != INFINITY); @@ -173,9 +173,10 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) mutex_exit(&ipst->ips_igmp_timer_lock); return; } + id = ipst->ips_igmp_timeout_id; mutex_exit(&ipst->ips_igmp_timer_lock); - ret = untimeout(ipst->ips_igmp_timeout_id); + ret = untimeout(id); mutex_enter(&ipst->ips_igmp_timer_lock); /* * The timeout was cancelled, or the timeout handler @@ -207,11 +208,12 @@ igmp_start_timers(unsigned next, ip_stack_t *ipst) * mld_start_timers: * The unit for next is milliseconds. 
*/ -static void +void mld_start_timers(unsigned next, ip_stack_t *ipst) { int time_left; int ret; + timeout_id_t id; ASSERT(next != 0 && next != INFINITY); @@ -257,9 +259,10 @@ mld_start_timers(unsigned next, ip_stack_t *ipst) mutex_exit(&ipst->ips_mld_timer_lock); return; } + id = ipst->ips_mld_timeout_id; mutex_exit(&ipst->ips_mld_timer_lock); - ret = untimeout(ipst->ips_mld_timeout_id); + ret = untimeout(id); mutex_enter(&ipst->ips_mld_timer_lock); /* * The timeout was cancelled, or the timeout handler @@ -294,9 +297,8 @@ mld_start_timers(unsigned next, ip_stack_t *ipst) * Callers of igmp_input() may need to reinitialize variables that were copied * from the mblk as this calls pullupmsg(). */ -/* ARGSUSED */ mblk_t * -igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) +igmp_input(mblk_t *mp, ip_recv_attr_t *ira) { igmpa_t *igmpa; ipha_t *ipha = (ipha_t *)(mp->b_rptr); @@ -304,22 +306,22 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm_t *ilm; uint32_t src, dst; uint32_t group; + in6_addr_t v6group; uint_t next; ipif_t *ipif; - ip_stack_t *ipst; - ilm_walker_t ilw; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill != NULL); ASSERT(!ill->ill_isv6); - ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_total; mblklen = MBLKL(mp); - if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) { + iphlen = ira->ira_ip_hdr_length; + if (mblklen < 1 || mblklen < iphlen) { ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } - igmplen = ntohs(ipha->ipha_length) - iphlen; + igmplen = ira->ira_pktlen - iphlen; /* * Since msg sizes are more variable with v3, just pullup the * whole thing now. 
@@ -342,13 +344,6 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } - /* - * Validate checksum - */ - if (IP_CSUM(mp, iphlen, 0)) { - ++ipst->ips_igmpstat.igps_rcv_badsum; - goto bad_pkt; - } igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]); src = ipha->ipha_src; @@ -400,9 +395,8 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) 1, SL_TRACE, "igmp_input: we are only " - "member src 0x%x ipif_local 0x%x", - (int)ntohl(src), - (int)ntohl(ipif->ipif_lcl_addr)); + "member src 0x%x\n", + (int)ntohl(src)); } mutex_exit(&ill->ill_lock); return (mp); @@ -445,15 +439,18 @@ igmp_input(queue_t *q, mblk_t *mp, ill_t *ill) * terminology, stop our timer for that group and 'clear * flag' i.e. mark as IGMP_OTHERMEMBER. */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ilm->ilm_addr == group) { - ++ipst->ips_igmpstat.igps_rcv_ourreports; - ilm->ilm_timer = INFINITY; - ilm->ilm_state = IGMP_OTHERMEMBER; - } - } - ilm_walker_finish(&ilw); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group)) + continue; + + ++ipst->ips_igmpstat.igps_rcv_ourreports; + ilm->ilm_timer = INFINITY; + ilm->ilm_state = IGMP_OTHERMEMBER; + } /* for */ + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); break; case IGMP_V3_MEMBERSHIP_REPORT: @@ -482,11 +479,11 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) int timer; uint_t next, current; ip_stack_t *ipst; - ilm_walker_t ilw; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; + rw_enter(&ill->ill_mcast_lock, RW_WRITER); /* * In the IGMPv2 specification, there are 3 states and a flag. * @@ -506,9 +503,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * Remember that the querier on this interface is old, * and set the timer to the value in RFC 1112. 
*/ - - - mutex_enter(&ill->ill_lock); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type != IGMP_V1_ROUTER) { @@ -517,13 +511,14 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = IGMP_V1_ROUTER; } - mutex_exit(&ill->ill_lock); timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY); if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) || igmpa->igmpa_group != 0) { ++ipst->ips_igmpstat.igps_rcv_badqueries; + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (0); } @@ -537,6 +532,8 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) group = igmpa->igmpa_group; if (group != 0 && (!CLASSD(group))) { ++ipst->ips_igmpstat.igps_rcv_badqueries; + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (0); } @@ -545,7 +542,6 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) * ONLY IF current state is v3. Let things be if current * state if v1 but do reset the v2-querier-present timer. 
*/ - mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type == IGMP_V3_ROUTER) { ip1dbg(("Received IGMPv2 Query on %s, switching mode " "to IGMP_V2_ROUTER", ill->ill_name)); @@ -554,18 +550,15 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } ill->ill_mcast_v2_time = 0; ill->ill_mcast_v2_tset = 1; - mutex_exit(&ill->ill_lock); timer = DSEC_TO_MSEC((int)igmpa->igmpa_code); } if (ip_debug > 1) { - mutex_enter(&ill->ill_lock); (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "igmp_input: TIMER = igmp_code %d igmp_type 0x%x", (int)ntohs(igmpa->igmpa_code), (int)ntohs(igmpa->igmpa_type)); - mutex_exit(&ill->ill_lock); } /* @@ -582,11 +575,9 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) */ next = (unsigned)INFINITY; - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all @@ -608,8 +599,12 @@ igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) } } } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. 
+ */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -623,7 +618,6 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; - ilm_walker_t ilw; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ @@ -636,6 +630,8 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) ++ipst->ips_igmpstat.igps_rcv_queries; + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) { uint_t hdrval, mant, exp; hdrval = (uint_t)igmp3qa->igmp3qa_mxrc; @@ -669,12 +665,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) * sooner than the delay we calculated for this response, then * no action is required (RFC3376 section 5.2 rule 1) */ - mutex_enter(&ill->ill_lock); if (ill->ill_global_timer < (current + delay)) { - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_mcast_timer_start(ill->ill_ipst); return (next); } - mutex_exit(&ill->ill_lock); /* * Now take action depending upon query type: @@ -687,16 +682,11 @@ igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]). 
*/ - mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; - mutex_exit(&ill->ill_lock); next = delay; - } else { /* group or group/source specific query */ - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || @@ -750,13 +740,21 @@ group_query: next = ilm->ilm_timer; ilm->ilm_timer += current; } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); } + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. + */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_joingroup(ilm_t *ilm) { @@ -764,27 +762,21 @@ igmp_joingroup(ilm_t *ilm) ill_t *ill; ip_stack_t *ipst = ilm->ilm_ipst; - ill = ilm->ilm_ipif->ipif_ill; + ill = ilm->ilm_ill; - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6); + ASSERT(!ill->ill_isv6); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); - mutex_enter(&ill->ill_lock); if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; - mutex_exit(&ill->ill_lock); } else { ip1dbg(("Querier mode %d, sending report, group %x\n", ill->ill_mcast_type, htonl(ilm->ilm_addr))); if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V3_ROUTER) { mrec_t *rp; mcast_record_t rtype; @@ 
-802,9 +794,7 @@ igmp_joingroup(ilm_t *ilm) ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rp); /* * Set up retransmission state. Timer is set below, * for both v3 and older versions. @@ -820,35 +810,33 @@ igmp_joingroup(ilm_t *ilm) timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); /* - * We need to restart the IGMP timers, but we can't do it here - * since we're inside the IPSQ and thus igmp_start_timers() -> - * untimeout() (inside the IPSQ, waiting for a running timeout - * to finish) could deadlock with igmp_timeout_handler() -> - * ipsq_enter() (running the timeout, waiting to get inside - * the IPSQ). We also can't just delay it until after we - * ipsq_exit() since we could be inside more than one IPSQ and - * thus still have the other IPSQs pinned after we exit -- and - * igmp_start_timers() may be trying to enter one of those. - * Instead, signal a dedicated thread that will do it for us. + * We are holding ill_mcast_lock here and the timeout + * handler (igmp_timeout_handler_per_ill) acquires that + * lock. Hence we can't call igmp_start_timer since it could + * deadlock in untimeout(). + * Instead the thread which drops ill_mcast_lock will have + * to call ill_mcast_timer_start(). 
*/ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); - mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { - (void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE, + (void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE, "igmp_joingroup: multicast_type %d timer %d", - (ilm->ilm_ipif->ipif_ill->ill_mcast_type), + (ilm->ilm_ill->ill_mcast_type), (int)ntohl(timer)); } } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void mld_joingroup(ilm_t *ilm) { @@ -858,19 +846,16 @@ mld_joingroup(ilm_t *ilm) ill = ilm->ilm_ill; - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6); + ASSERT(ill->ill_isv6); + + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); - mutex_enter(&ill->ill_lock); if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; - mutex_exit(&ill->ill_lock); } else { if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); } else { mrec_t *rp; mcast_record_t rtype; @@ -888,9 +873,7 @@ mld_joingroup(ilm_t *ilm) ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); - mutex_enter(&ill->ill_lock); /* * Set up retransmission state. Timer is set below, * for both v2 and v1. @@ -909,17 +892,19 @@ mld_joingroup(ilm_t *ilm) timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); /* - * Signal another thread to restart the timers. See the - * comment in igmp_joingroup() for details. 
+ * We are holding ill_mcast_lock here and the timeout + * handler (mld_timeout_handler_per_ill) acquires that + * lock. Hence we can't call mld_start_timer since it could + * deadlock in untimeout(). + * Instead the thread which drops ill_mcast_lock will have + * to call ill_mcast_timer_start(). */ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); - mcast_signal_restart_thread(ipst); } if (ip_debug > 1) { @@ -930,23 +915,26 @@ mld_joingroup(ilm_t *ilm) } } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_leavegroup(ilm_t *ilm) { - ill_t *ill = ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; - ASSERT(ilm->ilm_ill == NULL); ASSERT(!ill->ill_isv6); - mutex_enter(&ill->ill_lock); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == IGMP_V2_ROUTER && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP, (htonl(INADDR_ALLRTRS_GROUP))); return; - } else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) && + } + if ((ill->ill_mcast_type == IGMP_V3_ROUTER) && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { mrec_t *rp; /* @@ -965,29 +953,30 @@ igmp_leavegroup(ilm_t *ilm) rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); + igmpv3_sendrpt(ill, rp); return; } - mutex_exit(&ill->ill_lock); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. 
+ */ void mld_leavegroup(ilm_t *ilm) { ill_t *ill = ilm->ilm_ill; - ASSERT(ilm->ilm_ipif == NULL); ASSERT(ill->ill_isv6); - mutex_enter(&ill->ill_lock); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == MLD_V1_ROUTER && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast); return; - } else if ((ill->ill_mcast_type == MLD_V2_ROUTER) && + } + if ((ill->ill_mcast_type == MLD_V2_ROUTER) && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { mrec_t *rp; /* @@ -1006,13 +995,15 @@ mld_leavegroup(ilm_t *ilm) rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); return; } - mutex_exit(&ill->ill_lock); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { @@ -1023,17 +1014,11 @@ igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) ASSERT(ilm != NULL); /* state change reports should only be sent if the router is v3 */ - if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER) + if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER) return; - if (ilm->ilm_ill == NULL) { - ASSERT(ilm->ilm_ipif != NULL); - ill = ilm->ilm_ipif->ipif_ill; - } else { - ill = ilm->ilm_ill; - } - - mutex_enter(&ill->ill_lock); + ill = ilm->ilm_ill; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* * Compare existing(old) state with the new state and prepare @@ -1089,8 +1074,7 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, signal a thread to restart it -- see the comment in - * igmp_joingroup() for details. 
+ * running, the caller will start it when dropping ill_mcast_lock. */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { @@ -1102,13 +1086,15 @@ send_to_in: ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); - mcast_signal_restart_thread(ipst); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rp); + igmpv3_sendrpt(ill, rp); } +/* + * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue + * and it gets sent after the lock is dropped. + */ void mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { @@ -1119,11 +1105,10 @@ mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) ASSERT(ilm != NULL); ill = ilm->ilm_ill; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* only need to send if we have an mldv2-capable router */ - mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type != MLD_V2_ROUTER) { - mutex_exit(&ill->ill_lock); return; } @@ -1179,8 +1164,7 @@ send_to_in: /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently - * running, signal a thread to restart it -- see the comment in - * igmp_joingroup() for details. + * running, the caller will start it when dropping ill_mcast_lock. 
*/ rp = mcast_merge_rtx(ilm, rp, flist); ASSERT(ilm->ilm_rtx.rtx_cnt > 0); @@ -1193,10 +1177,8 @@ send_to_in: MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_mld_timer_lock); - mcast_signal_restart_thread(ipst); } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); } @@ -1205,15 +1187,12 @@ igmp_timeout_handler_per_ill(ill_t *ill) { uint_t next = INFINITY, current; ilm_t *ilm; - ipif_t *ipif; mrec_t *rp = NULL; mrec_t *rtxrp = NULL; rtx_state_t *rtxp; mcast_record_t rtype; - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); current = CURRENT_MSTIME; /* First check the global timer on this interface */ @@ -1230,10 +1209,8 @@ igmp_timeout_handler_per_ill(ill_t *ill) for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) continue; - ASSERT(ilm->ilm_ipif != NULL); - ilm->ilm_ipif->ipif_igmp_rpt = - mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, - ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt); + rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, + ilm->ilm_filter, rp); /* * Since we're sending a report on this group, okay * to delete pending group-specific timers. Note @@ -1245,20 +1222,8 @@ igmp_timeout_handler_per_ill(ill_t *ill) FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } - /* - * We've built per-ipif mrec lists; walk the ill's ipif list - * and send a report for each ipif that has an mrec list. 
- */ - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_igmp_rpt == NULL) - continue; - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt); - mutex_enter(&ill->ill_lock); - /* mrec list was freed by igmpv3_sendrpt() */ - ipif->ipif_igmp_rpt = NULL; - } + igmpv3_sendrpt(ill, rp); + rp = NULL; } else { if ((ill->ill_global_timer - current) < next) next = ill->ill_global_timer - current; @@ -1288,13 +1253,9 @@ per_ilm_timer: ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); } else { slist_t *rsp; if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) && @@ -1325,9 +1286,7 @@ per_ilm_timer: rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr, ilm->ilm_filter, rp); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ill->ill_ipif, rp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rp); rp = NULL; } @@ -1345,14 +1304,11 @@ per_ilm_rtxtimer: rtxp->rtx_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == IGMP_V1_ROUTER) { - mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); continue; - } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { - mutex_exit(&ill->ill_lock); + } + if (ill->ill_mcast_type == IGMP_V2_ROUTER) { igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); - mutex_enter(&ill->ill_lock); continue; } @@ -1393,13 +1349,14 @@ per_ilm_rtxtimer: CLEAR_SLIST(rtxp->rtx_allow); CLEAR_SLIST(rtxp->rtx_block); } - mutex_exit(&ill->ill_lock); - igmpv3_sendrpt(ilm->ilm_ipif, rtxrp); - mutex_enter(&ill->ill_lock); + igmpv3_sendrpt(ill, rtxrp); rtxrp = NULL; } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + /* 
Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + /* Defer ill_mcast_timer_start() until the caller is done */ return (next); } @@ -1411,17 +1368,15 @@ per_ilm_rtxtimer: * * As part of multicast join and leave igmp we may need to send out an * igmp request. The igmp related state variables in the ilm are protected - * by ill_lock. A single global igmp timer is used to track igmp timeouts. + * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts. * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers * starts the igmp timer if needed. It serializes multiple threads trying to * simultaneously start the timer using the igmp_timer_setter_active flag. * * igmp_input() receives igmp queries and responds to the queries * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers(). - * Later the igmp_timer fires, the timeout handler igmp_timeout_handler() - * performs the action exclusively after entering each ill's ipsq as writer. - * (The need to enter the IPSQ is largely historical but there are still some - * fields like ilm_filter that rely on it.) + * Later the igmp_timer fires, the timeout handler igmp_timerout_handler() + * performs the action exclusively after acquiring ill_mcast_lock. * * The igmp_slowtimeo() function is called thru another timer. 
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id @@ -1433,12 +1388,12 @@ igmp_timeout_handler(void *arg) uint_t global_next = INFINITY; uint_t next; ill_walk_context_t ctx; - boolean_t success; ip_stack_t *ipst = arg; ASSERT(arg != NULL); mutex_enter(&ipst->ips_igmp_timer_lock); ASSERT(ipst->ips_igmp_timeout_id != 0); + ipst->ips_igmp_timeout_id = 0; ipst->ips_igmp_timer_scheduled_last = 0; ipst->ips_igmp_time_to_next = 0; mutex_exit(&ipst->ips_igmp_timer_lock); @@ -1447,31 +1402,17 @@ igmp_timeout_handler(void *arg) ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(!ill->ill_isv6); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - */ - if (!ill_waiter_inc(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE, NEW_OP); - if (success) { - next = igmp_timeout_handler_per_ill(ill); - if (next < global_next) - global_next = next; - ipsq_exit(ill->ill_phyint->phyint_ipsq); - } + next = igmp_timeout_handler_per_ill(ill); + if (next < global_next) + global_next = next; + ill_refrele(ill); rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_waiter_dcr(ill); } rw_exit(&ipst->ips_ill_g_lock); - - mutex_enter(&ipst->ips_igmp_timer_lock); - ASSERT(ipst->ips_igmp_timeout_id != 0); - ipst->ips_igmp_timeout_id = 0; - mutex_exit(&ipst->ips_igmp_timer_lock); - if (global_next != INFINITY) igmp_start_timers(global_next, ipst); } @@ -1481,7 +1422,6 @@ igmp_timeout_handler(void *arg) * Called when there are timeout events, every next (tick). * Returns number of ticks to next event (or 0 if none). 
*/ -/* ARGSUSED */ uint_t mld_timeout_handler_per_ill(ill_t *ill) { @@ -1491,9 +1431,7 @@ mld_timeout_handler_per_ill(ill_t *ill) rtx_state_t *rtxp; mcast_record_t rtype; - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); current = CURRENT_MSTIME; /* @@ -1528,9 +1466,7 @@ mld_timeout_handler_per_ill(ill_t *ill) FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); - mutex_enter(&ill->ill_lock); } else { if ((ill->ill_global_timer - current) < next) next = ill->ill_global_timer - current; @@ -1561,9 +1497,7 @@ per_ilm_timer: ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); } else { slist_t *rsp; if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) && @@ -1605,9 +1539,7 @@ per_ilm_rtxtimer: rtxp->rtx_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; if (ill->ill_mcast_type == MLD_V1_ROUTER) { - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); continue; } @@ -1651,13 +1583,13 @@ per_ilm_rtxtimer: } if (ill->ill_mcast_type == MLD_V2_ROUTER) { - mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); mldv2_sendrpt(ill, rtxrp); - return (next); } - - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + /* Defer ill_mcast_timer_start() until the caller is done */ return (next); } @@ -1675,12 +1607,12 @@ mld_timeout_handler(void *arg) uint_t global_next = INFINITY; uint_t next; ill_walk_context_t ctx; - boolean_t success; ip_stack_t *ipst = arg; ASSERT(arg != NULL); mutex_enter(&ipst->ips_mld_timer_lock); ASSERT(ipst->ips_mld_timeout_id != 0); + ipst->ips_mld_timeout_id = 0; ipst->ips_mld_timer_scheduled_last = 0; ipst->ips_mld_time_to_next = 0; 
mutex_exit(&ipst->ips_mld_timer_lock); @@ -1689,31 +1621,17 @@ mld_timeout_handler(void *arg) ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { ASSERT(ill->ill_isv6); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - */ - if (!ill_waiter_inc(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; rw_exit(&ipst->ips_ill_g_lock); - success = ipsq_enter(ill, B_TRUE, NEW_OP); - if (success) { - next = mld_timeout_handler_per_ill(ill); - if (next < global_next) - global_next = next; - ipsq_exit(ill->ill_phyint->phyint_ipsq); - } + next = mld_timeout_handler_per_ill(ill); + if (next < global_next) + global_next = next; + ill_refrele(ill); rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill_waiter_dcr(ill); } rw_exit(&ipst->ips_ill_g_lock); - - mutex_enter(&ipst->ips_mld_timer_lock); - ASSERT(ipst->ips_mld_timeout_id != 0); - ipst->ips_mld_timeout_id = 0; - mutex_exit(&ipst->ips_mld_timer_lock); - if (global_next != INFINITY) mld_start_timers(global_next, ipst); } @@ -1743,8 +1661,6 @@ igmp_slowtimo(void *arg) ip_stack_t *ipst = (ip_stack_t *)arg; ASSERT(arg != NULL); - /* Hold the ill_g_lock so that we can safely walk the ill list */ - rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* * The ill_if_t list is circular, hence the odd loop parameters. @@ -1754,6 +1670,7 @@ igmp_slowtimo(void *arg) * structure (allowing us to skip if none of the instances have timers * running). */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); for (ifp = IP_V4_ILL_G_LIST(ipst); ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst); ifp = ifp->illif_next) { @@ -1768,7 +1685,11 @@ igmp_slowtimo(void *arg) avl_tree = &ifp->illif_avl_by_ppa; for (ill = avl_first(avl_tree); ill != NULL; ill = avl_walk(avl_tree, ill, AVL_AFTER)) { - mutex_enter(&ill->ill_lock); + /* Make sure the ill isn't going away. 
*/ + if (!ill_check_and_refhold(ill)) + continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_mcast_v1_tset == 1) ill->ill_mcast_v1_time++; if (ill->ill_mcast_v2_tset == 1) @@ -1808,10 +1729,13 @@ igmp_slowtimo(void *arg) ill->ill_mcast_v2_tset = 0; atomic_add_16(&ifp->illif_mcast_v2, -1); } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } } rw_exit(&ipst->ips_ill_g_lock); + ill_mcast_timer_start(ipst); mutex_enter(&ipst->ips_igmp_slowtimeout_lock); ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); @@ -1826,7 +1750,6 @@ igmp_slowtimo(void *arg) * Check for ips_mld_max_version ensures that we don't revert to a higher * IGMP version than configured. */ -/* ARGSUSED */ void mld_slowtimo(void *arg) { @@ -1847,7 +1770,11 @@ mld_slowtimo(void *arg) avl_tree = &ifp->illif_avl_by_ppa; for (ill = avl_first(avl_tree); ill != NULL; ill = avl_walk(avl_tree, ill, AVL_AFTER)) { - mutex_enter(&ill->ill_lock); + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) + continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_mcast_v1_tset == 1) ill->ill_mcast_v1_time++; if ((ill->ill_mcast_type == MLD_V1_ROUTER) && @@ -1861,10 +1788,13 @@ mld_slowtimo(void *arg) ill->ill_mcast_v1_tset = 0; atomic_add_16(&ifp->illif_mcast_v1, -1); } - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } } rw_exit(&ipst->ips_ill_g_lock); + ill_mcast_timer_start(ipst); mutex_enter(&ipst->ips_mld_slowtimeout_lock); ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL)); @@ -1873,9 +1803,7 @@ mld_slowtimo(void *arg) /* * igmp_sendpkt: - * This will send to ip_wput like icmp_inbound. 
- * Note that the lower ill (on which the membership is kept) is used - * as an upper ill to pass in the multicast parameters. + * This will send to ip_output_simple just like icmp_inbound. */ static void igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) @@ -1886,51 +1814,16 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) ipha_t *ipha; int hdrlen = sizeof (ipha_t) + RTRALERT_LEN; size_t size = hdrlen + sizeof (igmpa_t); - ipif_t *ipif = ilm->ilm_ipif; - ill_t *ill = ipif->ipif_ill; - mblk_t *first_mp; - ipsec_out_t *io; - zoneid_t zoneid; + ill_t *ill = ilm->ilm_ill; ip_stack_t *ipst = ill->ill_ipst; - /* - * We need to make sure this packet goes out on an ipif. If - * there is some global policy match in ip_wput_ire, we need - * to get to the right interface after IPSEC processing. - * To make sure this multicast packet goes out on the right - * interface, we attach an ipsec_out and initialize ill_index - * like we did in ip_wput. To make sure that this packet does - * not get forwarded on other interfaces or looped back, we - * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop - * to B_FALSE. 
- */ - first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) - return; - - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - /* ipsec_out_secure is B_FALSE now */ - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_multicast_loop = B_FALSE; - io->ipsec_out_dontroute = B_TRUE; - if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - io->ipsec_out_ns = ipst->ips_netstack; /* No netstack_hold */ + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); mp = allocb(size, BPRI_HI); if (mp == NULL) { - freemsg(first_mp); return; } mp->b_wptr = mp->b_rptr + size; - first_mp->b_cont = mp; ipha = (ipha_t *)mp->b_rptr; rtralert = (uint8_t *)&(ipha[1]); @@ -1956,53 +1849,38 @@ igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr) ipha->ipha_protocol = IPPROTO_IGMP; ipha->ipha_hdr_checksum = 0; ipha->ipha_dst = addr ? addr : igmpa->igmpa_group; - ipha->ipha_src = ipif->ipif_src_addr; - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing demon can hear it. - */ - /* - * This will run multiple times for the same group if there are members - * on the same group for multiple ipif's on the same ill. The - * igmp_input code will suppress this due to the loopback thus we - * always loopback membership report. - */ - ASSERT(ill->ill_rq != NULL); - ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid); + ipha->ipha_src = INADDR_ANY; - ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid); + ill_mcast_queue(ill, mp); ++ipst->ips_igmpstat.igps_snd_reports; } /* - * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated - * with the passed-in ipif. 
The report will contain one group record + * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill. + * The report will contain one group record * for each element of reclist. If this causes packet length to - * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent. + * exceed ill->ill_mtu, multiple reports are sent. * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(), * and those buffers are freed here. */ static void -igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) +igmpv3_sendrpt(ill_t *ill, mrec_t *reclist) { - ipsec_out_t *io; igmp3ra_t *igmp3ra; grphdra_t *grphdr; - mblk_t *first_mp, *mp; + mblk_t *mp; ipha_t *ipha; uint8_t *rtralert; ipaddr_t *src_array; int i, j, numrec, more_src_cnt; size_t hdrsize, size, rsize; - ill_t *ill = ipif->ipif_ill; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; - zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; - ASSERT(IAM_WRITER_IPIF(ipif)); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* if there aren't any records, there's nothing to send */ if (reclist == NULL) @@ -2018,7 +1896,7 @@ nextpkt: for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) { rsize = sizeof (grphdra_t) + (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t)); - if (size + rsize > ill->ill_max_frag) { + if (size + rsize > ill->ill_mtu) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big @@ -2029,7 +1907,7 @@ nextpkt: * other types). */ int srcspace, srcsperpkt; - srcspace = ill->ill_max_frag - (size + + srcspace = ill->ill_mtu - (size + sizeof (grphdra_t)); /* @@ -2082,37 +1960,12 @@ nextpkt: numrec++; } - /* - * See comments in igmp_sendpkt() about initializing for ipsec and - * load balancing requirements. 
- */ - first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) - goto free_reclist; - - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - /* ipsec_out_secure is B_FALSE now */ - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_multicast_loop = B_FALSE; - io->ipsec_out_dontroute = B_TRUE; - if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - mp = allocb(size, BPRI_HI); if (mp == NULL) { - freemsg(first_mp); goto free_reclist; } bzero((char *)mp->b_rptr, size); mp->b_wptr = (uchar_t *)(mp->b_rptr + size); - first_mp->b_cont = mp; ipha = (ipha_t *)mp->b_rptr; rtralert = (uint8_t *)&(ipha[1]); @@ -2149,21 +2002,9 @@ nextpkt: ipha->ipha_ttl = IGMP_TTL; ipha->ipha_protocol = IPPROTO_IGMP; ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP); - ipha->ipha_src = ipif->ipif_src_addr; + ipha->ipha_src = INADDR_ANY; - /* - * Request loopback of the report if we are acting as a multicast - * router, so that the process-level routing daemon can hear it. - * - * This will run multiple times for the same group if there are - * members on the same group for multiple ipifs on the same ill. - * The igmp_input code will suppress this due to the loopback; - * thus we always loopback membership report. - */ - ASSERT(ill->ill_rq != NULL); - ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid); - - ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid); + ill_mcast_queue(ill, mp); ++ipst->ips_igmpstat.igps_snd_reports; @@ -2190,21 +2031,24 @@ free_reclist: /* * mld_input: + * Return NULL for a bad packet that is discarded here. + * Return mp if the message is OK and should be handed to "raw" receivers. 
+ * Callers of mld_input() may need to reinitialize variables that were copied + * from the mblk as this calls pullupmsg(). */ -/* ARGSUSED */ -void -mld_input(queue_t *q, mblk_t *mp, ill_t *ill) +mblk_t * +mld_input(mblk_t *mp, ip_recv_attr_t *ira) { ip6_t *ip6h = (ip6_t *)(mp->b_rptr); mld_hdr_t *mldh; ilm_t *ilm; ipif_t *ipif; uint16_t hdr_length, exthdr_length; - in6_addr_t *v6group_ptr, *lcladdr_ptr; + in6_addr_t *v6group_ptr; uint_t next; int mldlen; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); @@ -2212,30 +2056,26 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } if (ip6h->ip6_hlim != 1) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit); freemsg(mp); - return; + return (NULL); } /* Get to the icmp header part */ - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { - hdr_length = ip_hdr_length_v6(mp, ip6h); - exthdr_length = hdr_length - IPV6_HDR_LEN; - } else { - hdr_length = IPV6_HDR_LEN; - exthdr_length = 0; - } + hdr_length = ira->ira_ip_hdr_length; + exthdr_length = hdr_length - IPV6_HDR_LEN; + mldlen = ntohs(ip6h->ip6_plen) - exthdr_length; /* An MLD packet must at least be 24 octets to be valid */ if (mldlen < MLD_MINLEN) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]); @@ -2254,50 +2094,41 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) } else { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); - return; + return (NULL); } if (next == 0) { - freemsg(mp); - return; + return (mp); } if (next != INFINITY) mld_start_timers(next, ipst); break; - case MLD_LISTENER_REPORT: { - - ASSERT(ill->ill_ipif != NULL); + case MLD_LISTENER_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for 
this group. Reports * generated by us are looped back since we could potentially * be a multicast router, so discard reports sourced by me. */ - lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - lcladdr_ptr)) { + &ip6h->ip6_src)) { if (ip_debug > 1) { char buf1[INET6_ADDRSTRLEN]; - char buf2[INET6_ADDRSTRLEN]; (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "mld_input: we are only " - "member src %s ipif_local %s", - inet_ntop(AF_INET6, lcladdr_ptr, - buf1, sizeof (buf1)), - inet_ntop(AF_INET6, - &ipif->ipif_v6lcl_addr, - buf2, sizeof (buf2))); + "member src %s\n", + inet_ntop(AF_INET6, &ip6h->ip6_src, + buf1, sizeof (buf1))); } mutex_exit(&ill->ill_lock); - freemsg(mp); - return; + return (mp); } } mutex_exit(&ill->ill_lock); @@ -2308,9 +2139,10 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadReports); freemsg(mp); - return; + return (NULL); } + /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer @@ -2319,8 +2151,8 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, @@ -2329,23 +2161,19 @@ mld_input(queue_t *q, mblk_t *mp, ill_t *ill) ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * ill_mcast_send_queued is needed. 
+ */ + ill_mcast_timer_start(ill->ill_ipst); break; - } + case MLD_LISTENER_REDUCTION: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions); break; } - /* - * All MLD packets have already been passed up to any - * process(es) listening on a ICMP6 raw socket. This - * has been accomplished in ip_deliver_local_v6 prior to - * this function call. It is assumed that the multicast daemon - * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the - * ICMP6_FILTER socket option to only receive the MLD messages) - * Thus we can free the MLD message block here - */ - freemsg(mp); + return (mp); } /* @@ -2359,7 +2187,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) int timer; uint_t next, current; in6_addr_t *v6group; - ilm_walker_t ilw; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); @@ -2383,7 +2210,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) } /* Need to do compatibility mode checking */ - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type == MLD_V2_ROUTER) { @@ -2392,7 +2219,6 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = MLD_V1_ROUTER; } - mutex_exit(&ill->ill_lock); timer = (int)ntohs(mldh->mld_maxdelay); if (ip_debug > 1) { @@ -2415,11 +2241,8 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) */ next = INFINITY; - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; - - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || @@ -2434,9 +2257,7 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) /* Respond immediately */ ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; - mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); - mutex_enter(&ill->ill_lock); 
break; } if (ilm->ilm_timer > timer) { @@ -2448,8 +2269,10 @@ mld_query_in(mld_hdr_t *mldh, ill_t *ill) break; } } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued IP packets */ + ill_mcast_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -2466,7 +2289,6 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; - ilm_walker_t ilw; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); @@ -2514,12 +2336,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) * sooner than the delay we calculated for this response, then * no action is required (MLDv2 draft section 6.2 rule 1) */ - mutex_enter(&ill->ill_lock); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); if (ill->ill_global_timer < (current + delay)) { - mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_mcast_lock); return (next); } - mutex_exit(&ill->ill_lock); /* * Now take action depending on query type: general, @@ -2532,16 +2353,11 @@ mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]) */ - mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; - mutex_exit(&ill->ill_lock); next = delay; - } else { /* group or group/source specific query */ - ilm = ilm_walker_start(&ilw, ill); - mutex_enter(&ill->ill_lock); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { + for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || @@ -2595,9 +2411,13 @@ group_query: ilm->ilm_timer += current; break; } - mutex_exit(&ill->ill_lock); - ilm_walker_finish(&ilw); } + rw_exit(&ill->ill_mcast_lock); + /* + * No packets have been sent above - no + * 
ill_mcast_send_queued is needed. + */ + ill_mcast_timer_start(ill->ill_ipst); return (next); } @@ -2615,7 +2435,8 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); ill_t *ill = ilm->ilm_ill; - ipif_t *ipif; + + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* * We need to place a router alert option in this packet. The length @@ -2663,35 +2484,20 @@ mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) else ip6h->ip6_dst = *v6addr; - /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ - if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) { - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } else { - /* Otherwise, use IPv6 default address selection. */ - ip6h->ip6_src = ipv6_all_zeros; - } - + ip6h->ip6_src = ipv6_all_zeros; /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output. */ mldh->mld_cksum = htons(sizeof (*mldh)); - /* - * ip_wput will automatically loopback the multicast packet to - * the conn if multicast loopback is enabled. - * The MIB stats corresponding to this outgoing MLD packet - * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 - * ->icmp_update_out_mib_v6 function call. - */ - (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); + ill_mcast_queue(ill, mp); } /* * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill. The * report will contain one multicast address record for each element of - * reclist. If this causes packet length to exceed ill->ill_max_frag, + * reclist. If this causes packet length to exceed ill->ill_mtu, * multiple reports are sent. reclist is assumed to be made up of * buffers allocated by mcast_bldmrec(), and those buffers are freed here. 
*/ @@ -2706,19 +2512,17 @@ mldv2_sendrpt(ill_t *ill, mrec_t *reclist) ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; - ipif_t *ipif; int i, numrec, more_src_cnt; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; - ASSERT(IAM_WRITER_ILL(ill)); - /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; ASSERT(ill->ill_isv6); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); /* * Total option length (optlen + padlen) must be a multiple of @@ -2737,7 +2541,7 @@ nextpkt: rp = rp->mrec_next, numrec++) { rsize = sizeof (mld2mar_t) + (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t)); - if (size + rsize > ill->ill_max_frag) { + if (size + rsize > ill->ill_mtu) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big @@ -2748,7 +2552,7 @@ nextpkt: * other types). */ int srcspace, srcsperpkt; - srcspace = ill->ill_max_frag - + srcspace = ill->ill_mtu - (size + sizeof (mld2mar_t)); /* @@ -2819,14 +2623,7 @@ nextpkt: ip6h->ip6_nxt = IPPROTO_HOPOPTS; ip6h->ip6_hops = MLD_HOP_LIMIT; ip6h->ip6_dst = ipv6_all_v2rtrs_mcast; - /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ - if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) { - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } else { - /* otherwise, use IPv6 default address selection. */ - ip6h->ip6_src = ipv6_all_zeros; - } + ip6h->ip6_src = ipv6_all_zeros; ip6hbh->ip6h_nxt = IPPROTO_ICMPV6; /* @@ -2844,7 +2641,7 @@ nextpkt: mld2r->mld2r_nummar = htons(numrec); /* * Prepare for the checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output_simple. 
*/ mld2r->mld2r_cksum = htons(icmpsize); @@ -2861,14 +2658,7 @@ nextpkt: mld2mar = (mld2mar_t *)&(srcarray[i]); } - /* - * ip_wput will automatically loopback the multicast packet to - * the conn if multicast loopback is enabled. - * The MIB stats corresponding to this outgoing MLD packet - * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 - * ->icmp_update_out_mib_v6 function call. - */ - (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); + ill_mcast_queue(ill, mp); if (morepkts) { if (more_src_cnt > 0) { @@ -2997,7 +2787,7 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) mrec_t *rp, *rpnext, *rtnmrec; boolean_t ovf; - ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill); + ill = ilm->ilm_ill; if (mreclist == NULL) return (mreclist); @@ -3100,64 +2890,3 @@ mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist) return (rtnmrec); } - -/* - * Convenience routine to signal the restart-timer thread. - */ -static void -mcast_signal_restart_thread(ip_stack_t *ipst) -{ - mutex_enter(&ipst->ips_mrt_lock); - ipst->ips_mrt_flags |= IP_MRT_RUN; - cv_signal(&ipst->ips_mrt_cv); - mutex_exit(&ipst->ips_mrt_lock); -} - -/* - * Thread to restart IGMP/MLD timers. See the comment in igmp_joingroup() for - * the story behind this unfortunate thread. 
- */ -void -mcast_restart_timers_thread(ip_stack_t *ipst) -{ - int next; - char name[64]; - callb_cpr_t cprinfo; - - (void) snprintf(name, sizeof (name), "mcast_restart_timers_thread_%d", - ipst->ips_netstack->netstack_stackid); - CALLB_CPR_INIT(&cprinfo, &ipst->ips_mrt_lock, callb_generic_cpr, name); - - for (;;) { - mutex_enter(&ipst->ips_mrt_lock); - while (!(ipst->ips_mrt_flags & (IP_MRT_STOP|IP_MRT_RUN))) { - CALLB_CPR_SAFE_BEGIN(&cprinfo); - cv_wait(&ipst->ips_mrt_cv, &ipst->ips_mrt_lock); - CALLB_CPR_SAFE_END(&cprinfo, &ipst->ips_mrt_lock); - } - if (ipst->ips_mrt_flags & IP_MRT_STOP) - break; - ipst->ips_mrt_flags &= ~IP_MRT_RUN; - mutex_exit(&ipst->ips_mrt_lock); - - mutex_enter(&ipst->ips_igmp_timer_lock); - next = ipst->ips_igmp_deferred_next; - ipst->ips_igmp_deferred_next = INFINITY; - mutex_exit(&ipst->ips_igmp_timer_lock); - - if (next != INFINITY) - igmp_start_timers(next, ipst); - - mutex_enter(&ipst->ips_mld_timer_lock); - next = ipst->ips_mld_deferred_next; - ipst->ips_mld_deferred_next = INFINITY; - mutex_exit(&ipst->ips_mld_timer_lock); - if (next != INFINITY) - mld_start_timers(next, ipst); - } - - ipst->ips_mrt_flags |= IP_MRT_DONE; - cv_signal(&ipst->ips_mrt_done_cv); - CALLB_CPR_EXIT(&cprinfo); /* drops ips_mrt_lock */ - thread_exit(); -} diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index ebb89e3172..b59087e9b1 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -38,6 +38,7 @@ #include <sys/tihdr.h> #include <sys/xti_inet.h> #include <sys/ddi.h> +#include <sys/suntpi.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/kobj.h> @@ -94,10 +95,8 @@ #include <inet/ipp_common.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> -#include <sys/iphada.h> #include <inet/iptun/iptun_impl.h> #include <inet/ipdrop.h> #include <inet/ip_netinfo.h> @@ -111,9 +110,7 @@ #include <ipp/ipp_impl.h> #include 
<ipp/ipgpc/ipgpc.h> -#include <sys/multidata.h> #include <sys/pattr.h> - #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> #include <inet/sctp/sctp_impl.h> @@ -126,6 +123,7 @@ #include <rpc/pmap_prot.h> #include <sys/squeue_impl.h> +#include <inet/ip_arp.h> /* * Values for squeue switch: @@ -133,10 +131,9 @@ * IP_SQUEUE_ENTER: SQ_PROCESS * IP_SQUEUE_FILL: SQ_FILL */ -int ip_squeue_enter = 2; /* Setable in /etc/system */ +int ip_squeue_enter = IP_SQUEUE_ENTER; /* Setable in /etc/system */ int ip_squeue_flag; -#define SET_BPREV_FLAG(x) ((mblk_t *)(uintptr_t)(x)) /* * Setable in /etc/system @@ -177,7 +174,8 @@ typedef struct iproutedata_s { listptr_t ird_attrs; /* ipRouteAttributeTable */ } iproutedata_t; -#define IRD_REPORT_TESTHIDDEN 0x01 /* include IRE_MARK_TESTHIDDEN routes */ +/* Include ire_testhidden and IRE_IF_CLONE routes */ +#define IRD_REPORT_ALL 0x01 /* * Cluster specific hooks. These should be NULL when booted as a non-cluster @@ -233,29 +231,26 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * MT level protection given by STREAMS. IP uses a combination of its own * internal serialization mechanism and standard Solaris locking techniques. * The internal serialization is per phyint. This is used to serialize - * plumbing operations, certain multicast operations, most set ioctls, - * igmp/mld timers etc. + * plumbing operations, IPMP operations, most set ioctls, etc. * * Plumbing is a long sequence of operations involving message * exchanges between IP, ARP and device drivers. Many set ioctls are typically * involved in plumbing operations. A natural model is to serialize these * ioctls one per ill. For example plumbing of hme0 and qfe0 can go on in * parallel without any interference. But various set ioctls on hme0 are best - * serialized, along with multicast join/leave operations, igmp/mld timer - * operations, and processing of DLPI control messages received from drivers - * on a per phyint basis. 
This serialization is provided by the ipsq_t and - * primitives operating on this. Details can be found in ip_if.c above the - * core primitives operating on ipsq_t. + * serialized, along with IPMP operations and processing of DLPI control + * messages received from drivers on a per phyint basis. This serialization is + * provided by the ipsq_t and primitives operating on this. Details can + * be found in ip_if.c above the core primitives operating on ipsq_t. * * Lookups of an ipif or ill by a thread return a refheld ipif / ill. * Simiarly lookup of an ire by a thread also returns a refheld ire. * In addition ipif's and ill's referenced by the ire are also indirectly - * refheld. Thus no ipif or ill can vanish nor can critical parameters like - * the ipif's address or netmask change as long as an ipif is refheld + * refheld. Thus no ipif or ill can vanish as long as an ipif is refheld * directly or indirectly. For example an SIOCSLIFADDR ioctl that changes the * address of an ipif has to go through the ipsq_t. This ensures that only - * 1 such exclusive operation proceeds at any time on the ipif. It then - * deletes all ires associated with this ipif, and waits for all refcnts + * one such exclusive operation proceeds at any time on the ipif. It then + * waits for all refcnts * associated with this ipif to come down to zero. The address is changed * only after the ipif has been quiesced. Then the ipif is brought up again. * More details are described above the comment in ip_sioctl_flags. @@ -274,7 +269,7 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * - ire_lock to protect some of the fields of the ire, IRE tables * (one lock per hash bucket). Refer to ip_ire.c for details. * - * - ndp_g_lock and nce_lock for protecting NCEs. + * - ndp_g_lock and ncec_lock for protecting NCEs. * * - ill_lock protects fields of the ill and ipif. 
Details in ip.h * @@ -312,12 +307,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * This lock is held in ipif_up_done and the ipif is marked IPIF_UP and the * uniqueness check also done atomically. * - * - ipsec_capab_ills_lock: This readers/writer lock protects the global - * lists of IPsec capable ills (ipsec_capab_ills_{ah,esp}). It is taken - * as a writer when adding or deleting elements from these lists, and - * as a reader when walking these lists to send a SADB update to the - * IPsec capable ills. - * * - ill_g_usesrc_lock: This readers/writer lock protects the usesrc * group list linked by ill_usesrc_grp_next. It also protects the * ill_usesrc_ifindex field. It is taken as a writer when a member of the @@ -357,20 +346,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * * ill_g_lock -> conn_lock -> ill_lock -> ipsq_lock -> ipx_lock * ill_g_lock -> ill_lock(s) -> phyint_lock - * ill_g_lock -> ndp_g_lock -> ill_lock -> nce_lock + * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock * ill_g_lock -> ip_addr_avail_lock * conn_lock -> irb_lock -> ill_lock -> ire_lock * ill_g_lock -> ip_g_nd_lock + * ill_g_lock -> ips_ipmp_lock -> ill_lock -> nce_lock + * ill_g_lock -> ndp_g_lock -> ill_lock -> ncec_lock -> nce_lock + * arl_lock -> ill_lock + * ips_ire_dep_lock -> irb_lock * * When more than 1 ill lock is needed to be held, all ill lock addresses * are sorted on address and locked starting from highest addressed lock * downward. 
* + * Multicast scenarios + * ips_ill_g_lock -> ill_mcast_lock + * conn_ilg_lock -> ips_ill_g_lock -> ill_lock + * ill_mcast_serializer -> ill_mcast_lock -> ips_ipmp_lock -> ill_lock + * ill_mcast_serializer -> ill_mcast_lock -> connf_lock -> conn_lock + * ill_mcast_serializer -> ill_mcast_lock -> conn_ilg_lock + * ill_mcast_serializer -> ill_mcast_lock -> ips_igmp_timer_lock + * * IPsec scenarios * * ipsa_lock -> ill_g_lock -> ill_lock - * ipsec_capab_ills_lock -> ill_g_lock -> ill_lock - * ipsec_capab_ills_lock -> ipsa_lock * ill_g_usesrc_lock -> ill_g_lock -> ill_lock * * Trusted Solaris scenarios @@ -414,31 +413,30 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * Walker - Increment irb_refcnt before calling the walker callback. Hold the * global tree lock (read mode) for traversal. * + * IRE dependencies - In some cases we hold ips_ire_dep_lock across ire_refrele + * hence we will acquire irb_lock while holding ips_ire_dep_lock. + * * IPsec notes : * - * IP interacts with the IPsec code (AH/ESP) by tagging a M_CTL message - * in front of the actual packet. For outbound datagrams, the M_CTL - * contains a ipsec_out_t (defined in ipsec_info.h), which has the + * IP interacts with the IPsec code (AH/ESP) by storing IPsec attributes + * in the ip_xmit_attr_t ip_recv_attr_t. For outbound datagrams, the + * ip_xmit_attr_t has the * information used by the IPsec code for applying the right level of - * protection. The information initialized by IP in the ipsec_out_t + * protection. The information initialized by IP in the ip_xmit_attr_t * is determined by the per-socket policy or global policy in the system. - * For inbound datagrams, the M_CTL contains a ipsec_in_t (defined in - * ipsec_info.h) which starts out with nothing in it. It gets filled + * For inbound datagrams, the ip_recv_attr_t + * starts out with nothing in it. 
It gets filled * with the right information if it goes through the AH/ESP code, which * happens if the incoming packet is secure. The information initialized - * by AH/ESP, is later used by IP(during fanouts to ULP) to see whether + * by AH/ESP, is later used by IP (during fanouts to ULP) to see whether * the policy requirements needed by per-socket policy or global policy * is met or not. * - * If there is both per-socket policy (set using setsockopt) and there - * is also global policy match for the 5 tuples of the socket, - * ipsec_override_policy() makes the decision of which one to use. - * * For fully connected sockets i.e dst, src [addr, port] is known, * conn_policy_cached is set indicating that policy has been cached. * conn_in_enforce_policy may or may not be set depending on whether * there is a global policy match or per-socket policy match. - * Policy inheriting happpens in ip_bind during the ipa_conn_t bind. + * Policy inheriting happpens in ip_policy_set once the destination is known. * Once the right policy is set on the conn_t, policy cannot change for * this socket. This makes life simpler for TCP (UDP ?) where * re-transmissions go out with the same policy. For symmetry, policy @@ -513,7 +511,8 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is * called passing idl_tx_list. The connp gets inserted in a drain list * pointed to by idl_tx_list. conn_drain_list() asserts flow control for - * the sockets (non stream based) and sets QFULL condition for conn_wq. + * the sockets (non stream based) and sets QFULL condition on the conn_wq + * of streams sockets, or the su_txqfull for non-streams sockets. * connp->conn_direct_blocked will be set to indicate the blocked * condition. * @@ -521,46 +520,37 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * A cookie is passed in the call to ill_flow_enable() that identifies the * blocked Tx ring. 
This cookie is used to get to the idl_tx_list that * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t - * and goes through each of the drain list (q)enabling the conn_wq of the - * first conn in each of the drain list. This causes ip_wsrv to run for the + * and goes through each conn in the drain list and calls conn_idl_remove + * for the conn to clear the qfull condition for the conn, as well as to + * remove the conn from the idl list. In addition, streams based sockets + * will have the conn_wq enabled, causing ip_wsrv to run for the * conn. ip_wsrv drains the queued messages, and removes the conn from the - * drain list, if all messages were drained. It also qenables the next conn - * in the drain list to continue the drain process. + * drain list, if all messages were drained. It also notifies the + * conn_upcalls for the conn to signal that flow-control has opened up. * * In reality the drain list is not a single list, but a configurable number - * of lists. conn_drain_walk() in the IP module, qenables the first conn in - * each list. If the ip_wsrv of the next qenabled conn does not run, because - * the stream closes, ip_close takes responsibility to qenable the next conn - * in the drain list. conn_drain_insert and conn_drain_tail are the only + * of lists. conn_walk_drain() in the IP module, notifies the conn_upcalls for + * each conn in the list. conn_drain_insert and conn_drain_tail are the only * functions that manipulate this drain list. conn_drain_insert is called in - * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS + * from the protocol layer when conn_ip_output returns EWOULDBLOCK. + * (as opposed to from ip_wsrv context for STREAMS * case -- see below). The synchronization between drain insertion and flow * control wakeup is handled by using idl_txl->txl_lock. * * Flow control using STREAMS: * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism * is used. 
On the send side, if the packet cannot be sent down to the - * driver by IP, because of a canput failure, IP does a putq on the conn_wq. - * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts - * the conn in a list of conn's that need to be drained when the flow - * control condition subsides. The blocked connps are put in first member - * of ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv - * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0]. - * ips_idl_tx_list[0] contains the drain lists of blocked conns. The - * conn_wq of the first conn in the drain lists is (q)enabled to run. - * ip_wsrv on this conn drains the queued messages, and removes the conn - * from the drain list, if all messages were drained. It also qenables the - * next conn in the drain list to continue the drain process. - * - * If the ip_wsrv of the next qenabled conn does not run, because the - * stream closes, ip_close takes responsibility to qenable the next conn in - * the drain list. The directly called ip_wput path always does a putq, if - * it cannot putnext. Thus synchronization problems are handled between - * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only - * functions that manipulate this drain list. Furthermore conn_drain_insert - * is called only from ip_wsrv for the STREAMS case, and there can be only 1 - * instance of ip_wsrv running on a queue at any time. conn_drain_tail can - * be simultaneously called from both ip_wsrv and ip_close. + * driver by IP, because of a canput failure, ip_xmit drops the packet + * and returns EWOULDBLOCK to the caller, who may then invoke + * ixa_check_drain_insert to insert the conn on the 0'th drain list. + * When ip_wsrv runs on the ill_wq because flow control has been relieved, the + * blocked conns in the * 0'th drain list is drained as with the + * non-STREAMS case. 
+ * + * In both the STREAMS and non-STREAMS case, the sockfs upcall to set + * qfull is done when the conn is inserted into the drain list + * (conn_drain_insert()) and cleared when the conn is removed from the drain + * list (conn_idl_remove()). * * IPQOS notes: * @@ -579,14 +569,13 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * By default all the callout positions are enabled. * * Outbound (local_out) - * Hooks are placed in ip_wput_ire and ipsec_out_process. + * Hooks are placed in ire_send_wire_v4 and ire_send_wire_v6. * * Inbound (local_in) - * Hooks are placed in ip_proto_input, icmp_inbound, ip_fanout_proto and - * TCP and UDP fanout routines. + * Hooks are placed in ip_fanout_v4 and ip_fanout_v6. * * Forwarding (in and out) - * Hooks are placed in ip_rput_forward. + * Hooks are placed in ire_recv_forward_v4/v6. * * IP Policy Framework processing (IPPF processing) * Policy processing for a packet is initiated by ip_process, which ascertains @@ -596,16 +585,6 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * filters configured in ipgpc and resumes normal IP processing thereafter. * An action instance can drop a packet in course of its processing. * - * A boolean variable, ip_policy, is used in all the fanout routines that can - * invoke ip_process for a packet. This variable indicates if the packet should - * to be sent for policy processing. The variable is set to B_TRUE by default, - * i.e. when the routines are invoked in the normal ip procesing path for a - * packet. The two exceptions being ip_wput_local and icmp_inbound_error_fanout; - * ip_policy is set to B_FALSE for all the routines called in these two - * functions because, in the former case, we don't process loopback traffic - * currently while in the latter, the packets have already been processed in - * icmp_inbound. 
- * * Zones notes: * * The partitioning rules for networking are as follows: @@ -638,24 +617,18 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * IRE_LOCAL Exclusive (x) * IRE_LOOPBACK Exclusive * IRE_PREFIX (net routes) Shared (*) - * IRE_CACHE Exclusive * IRE_IF_NORESOLVER (interface routes) Exclusive * IRE_IF_RESOLVER (interface routes) Exclusive + * IRE_IF_CLONE (interface routes) Exclusive * IRE_HOST (host routes) Shared (*) * * (*) A zone can only use a default or off-subnet route if the gateway is * directly reachable from the zone, that is, if the gateway's address matches * one of the zone's logical interfaces. * - * (x) IRE_LOCAL are handled a bit differently, since for all other entries - * in ire_ctable and IRE_INTERFACE, ire_src_addr is what can be used as source - * when sending packets using the IRE. For IRE_LOCAL ire_src_addr is the IP - * address of the zone itself (the destination). Since IRE_LOCAL is used - * for communication between zones, ip_wput_ire has special logic to set - * the right source address when sending using an IRE_LOCAL. - * - * Furthermore, when ip_restrict_interzone_loopback is set (the default), - * ire_cache_lookup restricts loopback using an IRE_LOCAL + * (x) IRE_LOCAL are handled a bit differently. + * When ip_restrict_interzone_loopback is set (the default), + * ire_route_recursive restricts loopback using an IRE_LOCAL * between zone to the case when L2 would have conceptually looped the packet * back, i.e. the loopback which is required since neither Ethernet drivers * nor Ethernet hardware loops them back. This is the case when the normal @@ -669,17 +642,11 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * since some zones may not be on the 10.16.72/24 network. 
To handle this, each * zone has its own set of IRE_BROADCAST entries; then, broadcast packets are * sent to every zone that has an IRE_BROADCAST entry for the destination - * address on the input ill, see conn_wantpacket(). + * address on the input ill, see ip_input_broadcast(). * * Applications in different zones can join the same multicast group address. - * For IPv4, group memberships are per-logical interface, so they're already - * inherently part of a zone. For IPv6, group memberships are per-physical - * interface, so we distinguish IPv6 group memberships based on group address, - * interface and zoneid. In both cases, received multicast packets are sent to - * every zone for which a group membership entry exists. On IPv6 we need to - * check that the target zone still has an address on the receiving physical - * interface; it could have been removed since the application issued the - * IPV6_JOIN_GROUP. + * The same logic applies for multicast as for broadcast. ip_input_multicast + * dispatches packets to all zones that have members on the physical interface. 
*/ /* @@ -694,62 +661,37 @@ boolean_t ip_squeue_fanout = 0; */ uint_t ip_max_frag_dups = 10; -#define IS_SIMPLE_IPH(ipha) \ - ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) - /* RFC 1122 Conformance */ #define IP_FORWARD_DEFAULT IP_FORWARD_NEVER #define ILL_MAX_NAMELEN LIFNAMSIZ -static int conn_set_held_ipif(conn_t *, ipif_t **, ipif_t *); - static int ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6); -static mblk_t *ip_wput_attach_llhdr(mblk_t *, ire_t *, ip_proc_t, uint32_t, - ipha_t **); +static mblk_t *ip_xmit_attach_llhdr(mblk_t *, nce_t *); -static void icmp_frag_needed(queue_t *, mblk_t *, int, zoneid_t, - ip_stack_t *); -static void icmp_inbound(queue_t *, mblk_t *, boolean_t, ill_t *, int, - uint32_t, boolean_t, boolean_t, ill_t *, zoneid_t); -static ipaddr_t icmp_get_nexthop_addr(ipha_t *, ill_t *, zoneid_t, mblk_t *mp); -static boolean_t icmp_inbound_too_big(icmph_t *, ipha_t *, ill_t *, zoneid_t, - mblk_t *, int, ip_stack_t *); -static void icmp_inbound_error_fanout(queue_t *, ill_t *, mblk_t *, - icmph_t *, ipha_t *, int, int, boolean_t, boolean_t, - ill_t *, zoneid_t); +static boolean_t icmp_inbound_verify_v4(mblk_t *, icmph_t *, ip_recv_attr_t *); +static void icmp_inbound_too_big_v4(icmph_t *, ip_recv_attr_t *); +static void icmp_inbound_error_fanout_v4(mblk_t *, icmph_t *, + ip_recv_attr_t *); static void icmp_options_update(ipha_t *); -static void icmp_param_problem(queue_t *, mblk_t *, uint8_t, zoneid_t, - ip_stack_t *); -static void icmp_pkt(queue_t *, mblk_t *, void *, size_t, boolean_t, - zoneid_t zoneid, ip_stack_t *); -static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_stack_t *); -static void icmp_redirect(ill_t *, mblk_t *); -static void icmp_send_redirect(queue_t *, mblk_t *, ipaddr_t, - ip_stack_t *); +static void icmp_param_problem(mblk_t *, uint8_t, ip_recv_attr_t *); +static void icmp_pkt(mblk_t *, void *, size_t, ip_recv_attr_t *); +static mblk_t *icmp_pkt_err_ok(mblk_t *, ip_recv_attr_t 
*); +static void icmp_redirect_v4(mblk_t *mp, ipha_t *, icmph_t *, + ip_recv_attr_t *); +static void icmp_send_redirect(mblk_t *, ipaddr_t, ip_recv_attr_t *); +static void icmp_send_reply_v4(mblk_t *, ipha_t *, icmph_t *, + ip_recv_attr_t *); -static void ip_arp_news(queue_t *, mblk_t *); -static boolean_t ip_bind_get_ire_v4(mblk_t **, ire_t *, iulp_t *, ip_stack_t *); mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); char *ip_dot_addr(ipaddr_t, char *); mblk_t *ip_carve_mp(mblk_t **, ssize_t); int ip_close(queue_t *, int); static char *ip_dot_saddr(uchar_t *, char *); -static void ip_fanout_proto(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, - boolean_t, boolean_t, ill_t *, zoneid_t); -static void ip_fanout_tcp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint_t, - boolean_t, boolean_t, zoneid_t); -static void ip_fanout_udp(queue_t *, mblk_t *, ill_t *, ipha_t *, uint32_t, - boolean_t, uint_t, boolean_t, boolean_t, ill_t *, zoneid_t); static void ip_lrput(queue_t *, mblk_t *); ipaddr_t ip_net_mask(ipaddr_t); -void ip_newroute(queue_t *, mblk_t *, ipaddr_t, conn_t *, zoneid_t, - ip_stack_t *); -static void ip_newroute_ipif(queue_t *, mblk_t *, ipif_t *, ipaddr_t, - conn_t *, uint32_t, zoneid_t, ip_opt_info_t *); char *ip_nv_lookup(nv_t *, int); -static boolean_t ip_check_for_ipsec_opt(queue_t *, mblk_t *); static int ip_param_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_param_generic_get(queue_t *, mblk_t *, caddr_t, cred_t *); static boolean_t ip_param_register(IDP *ndp, ipparam_t *, size_t, @@ -758,17 +700,6 @@ static int ip_param_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); void ip_rput(queue_t *, mblk_t *); static void ip_rput_dlpi_writer(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg); -void ip_rput_forward(ire_t *, ipha_t *, mblk_t *, ill_t *); -static int ip_rput_forward_options(mblk_t *, ipha_t *, ire_t *, - ip_stack_t *); -static boolean_t ip_rput_local_options(queue_t *, mblk_t *, ipha_t *, - ire_t *, ip_stack_t *); -static 
boolean_t ip_rput_multimblk_ipoptions(queue_t *, ill_t *, - mblk_t *, ipha_t **, ipaddr_t *, ip_stack_t *); -static int ip_rput_options(queue_t *, mblk_t *, ipha_t *, ipaddr_t *, - ip_stack_t *); -static boolean_t ip_rput_fragment(ill_t *, ill_t *, mblk_t **, ipha_t *, - uint32_t *, uint16_t *); int ip_snmp_get(queue_t *, mblk_t *, int); static mblk_t *ip_snmp_get_mib2_ip(queue_t *, mblk_t *, mib2_ipIfStatsEntry_t *, ip_stack_t *); @@ -801,49 +732,34 @@ static mblk_t *ip_snmp_get_mib2_ip6_route_media(queue_t *, mblk_t *, int, ip_stack_t *ipst); static void ip_snmp_get2_v4(ire_t *, iproutedata_t *); static void ip_snmp_get2_v6_route(ire_t *, iproutedata_t *); -static int ip_snmp_get2_v6_media(nce_t *, iproutedata_t *); +static int ip_snmp_get2_v4_media(ncec_t *, iproutedata_t *); +static int ip_snmp_get2_v6_media(ncec_t *, iproutedata_t *); int ip_snmp_set(queue_t *, int, int, uchar_t *, int); -static boolean_t ip_source_routed(ipha_t *, ip_stack_t *); -static boolean_t ip_source_route_included(ipha_t *); -static void ip_trash_ire_reclaim_stack(ip_stack_t *); -static void ip_wput_frag(ire_t *, mblk_t *, ip_pkt_t, uint32_t, uint32_t, - zoneid_t, ip_stack_t *, conn_t *); -static mblk_t *ip_wput_frag_copyhdr(uchar_t *, int, int, ip_stack_t *, +static mblk_t *ip_fragment_copyhdr(uchar_t *, int, int, ip_stack_t *, mblk_t *); -static void ip_wput_local_options(ipha_t *, ip_stack_t *); -static int ip_wput_options(queue_t *, mblk_t *, ipha_t *, boolean_t, - zoneid_t, ip_stack_t *); static void conn_drain_init(ip_stack_t *); static void conn_drain_fini(ip_stack_t *); static void conn_drain_tail(conn_t *connp, boolean_t closing); static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *); -static void conn_setqfull(conn_t *); -static void conn_clrqfull(conn_t *); +static void conn_walk_sctp(pfv_t, void *, zoneid_t, netstack_t *); static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); static void ip_stack_shutdown(netstackid_t stackid, void *arg); static void 
ip_stack_fini(netstackid_t stackid, void *arg); -static boolean_t conn_wantpacket(conn_t *, ill_t *, ipha_t *, int, - zoneid_t); -static void ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, - void *dummy_arg); - static int ip_forward_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, - ipaddr_t, ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *, - conn_t *, boolean_t, ipaddr_t, mcast_record_t, ipaddr_t, mblk_t *); -static void ip_multirt_bad_mtu(ire_t *, uint32_t); + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *), + ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, + const in6_addr_t *); static int ip_cgtp_filter_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int ip_cgtp_filter_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); -extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, - cred_t *, boolean_t); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, @@ -859,30 +775,15 @@ static int icmp_kstat_update(kstat_t *kp, int rw); static void *ip_kstat2_init(netstackid_t, ip_stat_t *); static void ip_kstat2_fini(netstackid_t, kstat_t *); -static mblk_t *ip_tcp_input(mblk_t *, ipha_t *, ill_t *, boolean_t, - ire_t *, mblk_t *, uint_t, queue_t *, ill_rx_ring_t *); +static void ipobs_init(ip_stack_t *); +static void ipobs_fini(ip_stack_t *); -static void ip_rput_process_forward(queue_t *, mblk_t *, ire_t *, - ipha_t *, ill_t *, boolean_t, boolean_t); - -static void ipobs_init(ip_stack_t *); -static void ipobs_fini(ip_stack_t *); ipaddr_t ip_g_all_ones = IP_HOST_MASK; /* How long, in seconds, we allow frags to hang around. 
*/ #define IP_FRAG_TIMEOUT 15 #define IPV6_FRAG_TIMEOUT 60 -/* - * Threshold which determines whether MDT should be used when - * generating IP fragments; payload size must be greater than - * this threshold for MDT to take place. - */ -#define IP_WPUT_FRAG_MDT_MIN 32768 - -/* Setable in /etc/system only */ -int ip_wput_frag_mdt_min = IP_WPUT_FRAG_MDT_MIN; - static long ip_rput_pullups; int dohwcksum = 1; /* use h/w cksum if supported by the hardware */ @@ -891,24 +792,12 @@ vmem_t *ip_minor_arena_la; /* for minor nos. from 2^^18 thru 2^^32-1 */ int ip_debug; -#ifdef DEBUG -uint32_t ipsechw_debug = 0; -#endif - /* * Multirouting/CGTP stuff */ int ip_cgtp_filter_rev = CGTP_FILTER_REV; /* CGTP hooks version */ /* - * XXX following really should only be in a header. Would need more - * header and .c clean up first. - */ -extern optdb_obj_t ip_opt_obj; - -ulong_t ip_squeue_enter_unbound = 0; - -/* * Named Dispatch Parameter Table. * All of these are alterable, within the min/max values given, at run time. 
*/ @@ -922,18 +811,18 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 1, "ip_send_redirects"}, { 0, 1, 0, "ip_forward_directed_broadcasts"}, { 0, 10, 0, "ip_mrtdebug"}, - { 5000, 999999999, 60000, "ip_ire_timer_interval" }, - { 60000, 999999999, 1200000, "ip_ire_arp_interval" }, - { 60000, 999999999, 60000, "ip_ire_redirect_interval" }, + { 1, 8, 3, "ip_ire_reclaim_fraction" }, + { 1, 8, 3, "ip_nce_reclaim_fraction" }, + { 1, 8, 3, "ip_dce_reclaim_fraction" }, { 1, 255, 255, "ip_def_ttl" }, { 0, 1, 0, "ip_forward_src_routed"}, { 0, 256, 32, "ip_wroff_extra" }, - { 5000, 999999999, 600000, "ip_ire_pathmtu_interval" }, + { 2, 999999999, 60*20, "ip_pathmtu_interval" }, /* In seconds */ { 8, 65536, 64, "ip_icmp_return_data_bytes" }, { 0, 1, 1, "ip_path_mtu_discovery" }, - { 0, 240, 30, "ip_ignore_delete_time" }, + { 68, 65535, 576, "ip_pmtu_min" }, { 0, 1, 0, "ip_ignore_redirect" }, - { 0, 1, 1, "ip_output_queue" }, + { 0, 1, 0, "ip_arp_icmp_error" }, { 1, 254, 1, "ip_broadcast_ttl" }, { 0, 99999, 100, "ip_icmp_err_interval" }, { 1, 99999, 10, "ip_icmp_err_burst" }, @@ -955,7 +844,7 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 0, "ip6_ignore_redirect" }, { 0, 1, 0, "ip6_strict_dst_multihoming" }, - { 1, 8, 3, "ip_ire_reclaim_fraction" }, + { 0, 2, 2, "ip_src_check" }, { 0, 999999, 1000, "ipsec_policy_log_interval" }, @@ -964,12 +853,16 @@ static ipparam_t lcl_param_arr[] = { { 1, 20, 3, "ip_ndp_unsolicit_count" }, { 0, 1, 1, "ip6_ignore_home_address_opt" }, { 0, 15, 0, "ip_policy_mask" }, - { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, + { 0, 2, 2, "ip_ecmp_behavior" }, { 0, 255, 1, "ip_multirt_ttl" }, - { 0, 1, 1, "ip_multidata_outbound" }, - { 0, 3600000, 300000, "ip_ndp_defense_interval" }, + { 0, 3600, 60, "ip_ire_badcnt_lifetime" }, /* In seconds */ { 0, 999999, 60*60*24, "ip_max_temp_idle" }, { 0, 1000, 1, "ip_max_temp_defend" }, + /* + * when a conflict of an active address is detected, + * defend up to ip_max_defend times, within any + * 
ip_defend_interval span. + */ { 0, 1000, 3, "ip_max_defend" }, { 0, 999999, 30, "ip_defend_interval" }, { 0, 3600000, 300000, "ip_dup_recovery" }, @@ -977,12 +870,45 @@ static ipparam_t lcl_param_arr[] = { { 0, 1, 1, "ip_lso_outbound" }, { IGMP_V1_ROUTER, IGMP_V3_ROUTER, IGMP_V3_ROUTER, "igmp_max_version" }, { MLD_V1_ROUTER, MLD_V2_ROUTER, MLD_V2_ROUTER, "mld_max_version" }, - { 68, 65535, 576, "ip_pmtu_min" }, #ifdef DEBUG { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, #else { 0, 0, 0, "" }, #endif + /* delay before sending first probe: */ + { 0, 20000, 1000, "arp_probe_delay" }, + { 0, 20000, 100, "arp_fastprobe_delay" }, + /* interval at which DAD probes are sent: */ + { 10, 20000, 1500, "arp_probe_interval" }, + { 10, 20000, 150, "arp_fastprobe_interval" }, + /* setting probe count to 0 will disable ARP probing for DAD. */ + { 0, 20, 3, "arp_probe_count" }, + { 0, 20, 3, "arp_fastprobe_count" }, + + { 0, 3600000, 15000, "ipv4_dad_announce_interval"}, + { 0, 3600000, 15000, "ipv6_dad_announce_interval"}, + /* + * Rate limiting parameters for DAD defense used in + * ill_defend_rate_limit(): + * defend_rate : pkts/hour permitted + * defend_interval : time that can elapse before we send out a + * DAD defense. + * defend_period: denominator for defend_rate (in seconds). + */ + { 0, 3600000, 300000, "arp_defend_interval"}, + { 0, 20000, 100, "arp_defend_rate"}, + { 0, 3600000, 300000, "ndp_defend_interval"}, + { 0, 20000, 100, "ndp_defend_rate"}, + { 5, 86400, 3600, "arp_defend_period"}, + { 5, 86400, 3600, "ndp_defend_period"}, + { 0, 1, 1, "ipv4_icmp_return_pmtu" }, + { 0, 1, 1, "ipv6_icmp_return_pmtu" }, + /* + * publish count/interval values used to announce local addresses + * for IPv4, IPv6. 
+ */ + { 1, 20, 5, "ip_arp_publish_count" }, + { 1000, 20000, 2000, "ip_arp_publish_interval" }, }; /* @@ -1336,11 +1262,11 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { ip_sioctl_get_lifsrcof, NULL }, /* 178 */ { SIOCGMSFILTER, sizeof (struct group_filter), IPI_GET_CMD, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), IPI_WR, + /* 179 */ { SIOCSMSFILTER, sizeof (struct group_filter), 0, MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 180 */ { SIOCGIPMSFILTER, sizeof (struct ip_msfilter), IPI_GET_CMD, MSFILT_CMD, ip_sioctl_msfilter, NULL }, - /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), IPI_WR, + /* 181 */ { SIOCSIPMSFILTER, sizeof (struct ip_msfilter), 0, MSFILT_CMD, ip_sioctl_msfilter, NULL }, /* 182 */ { IPI_DONTCARE, 0, 0, 0, NULL, NULL }, /* SIOCSENABLESDP is handled by SDP */ @@ -1355,12 +1281,12 @@ ip_ioctl_cmd_t ip_ndx_ioctl_table[] = { int ip_ndx_ioctl_count = sizeof (ip_ndx_ioctl_table) / sizeof (ip_ioctl_cmd_t); ip_ioctl_cmd_t ip_misc_ioctl_table[] = { - { I_LINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_UNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_PLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { I_PUNLINK, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, - { ND_GET, 0, IPI_PASS_DOWN, 0, NULL, NULL }, - { ND_SET, 0, IPI_PRIV | IPI_WR | IPI_PASS_DOWN, 0, NULL, NULL }, + { I_LINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_UNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_PLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { I_PUNLINK, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, + { ND_GET, 0, 0, 0, NULL, NULL }, + { ND_SET, 0, IPI_PRIV | IPI_WR, 0, NULL, NULL }, { IP_IOCTL, 0, 0, 0, NULL, NULL }, { SIOCGETVIFCNT, sizeof (struct sioc_vif_req), IPI_GET_CMD, MISC_CMD, mrt_ioctl}, @@ -1384,12 +1310,14 @@ static nv_t ire_nv_arr[] = { { IRE_BROADCAST, "BROADCAST" }, { IRE_LOCAL, "LOCAL" }, { IRE_LOOPBACK, "LOOPBACK" }, - { IRE_CACHE, "CACHE" }, 
{ IRE_DEFAULT, "DEFAULT" }, { IRE_PREFIX, "PREFIX" }, { IRE_IF_NORESOLVER, "IF_NORESOL" }, { IRE_IF_RESOLVER, "IF_RESOLV" }, + { IRE_IF_CLONE, "IF_CLONE" }, { IRE_HOST, "HOST" }, + { IRE_MULTICAST, "MULTICAST" }, + { IRE_NOROUTE, "NOROUTE" }, { 0 } }; @@ -1412,7 +1340,6 @@ struct module_info ip_mod_info = { /* * Entry points for IP as a device and as a module. - * FIXME: down the road we might want a separate module and driver qinit. * We have separate open functions for the /dev/ip and /dev/ip6 devices. */ static struct qinit iprinitv4 = { @@ -1425,13 +1352,8 @@ struct qinit iprinitv6 = { &ip_mod_info }; -static struct qinit ipwinitv4 = { - (pfi_t)ip_wput, (pfi_t)ip_wsrv, NULL, NULL, NULL, - &ip_mod_info -}; - -struct qinit ipwinitv6 = { - (pfi_t)ip_wput_v6, (pfi_t)ip_wsrv, NULL, NULL, NULL, +static struct qinit ipwinit = { + (pfi_t)ip_wput_nondata, (pfi_t)ip_wsrv, NULL, NULL, NULL, &ip_mod_info }; @@ -1447,98 +1369,32 @@ static struct qinit iplwinit = { /* For AF_INET aka /dev/ip */ struct streamtab ipinfov4 = { - &iprinitv4, &ipwinitv4, &iplrinit, &iplwinit + &iprinitv4, &ipwinit, &iplrinit, &iplwinit }; /* For AF_INET6 aka /dev/ip6 */ struct streamtab ipinfov6 = { - &iprinitv6, &ipwinitv6, &iplrinit, &iplwinit + &iprinitv6, &ipwinit, &iplrinit, &iplwinit }; #ifdef DEBUG -static boolean_t skip_sctp_cksum = B_FALSE; +boolean_t skip_sctp_cksum = B_FALSE; #endif /* - * Prepend the zoneid using an ipsec_out_t for later use by functions like - * ip_rput_v6(), ip_output(), etc. If the message - * block already has a M_CTL at the front of it, then simply set the zoneid - * appropriately. 
- */ -mblk_t * -ip_prepend_zoneid(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) -{ - mblk_t *first_mp; - ipsec_out_t *io; - - ASSERT(zoneid != ALL_ZONES); - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - io->ipsec_out_zoneid = zoneid; - return (mp); - } - - first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack); - if (first_mp == NULL) - return (NULL); - io = (ipsec_out_t *)first_mp->b_rptr; - /* This is not a secure packet */ - io->ipsec_out_secure = B_FALSE; - io->ipsec_out_zoneid = zoneid; - first_mp->b_cont = mp; - return (first_mp); -} - -/* - * Copy an M_CTL-tagged message, preserving reference counts appropriately. + * Generate an ICMP fragmentation needed message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -mblk_t * -ip_copymsg(mblk_t *mp) -{ - mblk_t *nmp; - ipsec_info_t *in; - - if (mp->b_datap->db_type != M_CTL) - return (copymsg(mp)); - - in = (ipsec_info_t *)mp->b_rptr; - - /* - * Note that M_CTL is also used for delivering ICMP error messages - * upstream to transport layers. - */ - if (in->ipsec_info_type != IPSEC_OUT && - in->ipsec_info_type != IPSEC_IN) - return (copymsg(mp)); - - nmp = copymsg(mp->b_cont); - - if (in->ipsec_info_type == IPSEC_OUT) { - return (ipsec_out_tag(mp, nmp, - ((ipsec_out_t *)in)->ipsec_out_ns)); - } else { - return (ipsec_in_tag(mp, nmp, - ((ipsec_in_t *)in)->ipsec_in_ns)); - } -} - -/* Generate an ICMP fragmentation needed message. 
*/ -static void -icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, - ip_stack_t *ipst) +void +icmp_frag_needed(mblk_t *mp, int mtu, ip_recv_attr_t *ira) { icmph_t icmph; - mblk_t *first_mp; - boolean_t mctl_present; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_DEST_UNREACHABLE; @@ -1546,29 +1402,29 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, icmph.icmph_du_mtu = htons((uint16_t)mtu); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutFragNeeded); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); + + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * icmp_inbound deals with ICMP messages in the following ways. + * icmp_inbound_v4 deals with ICMP messages that are handled by IP. + * If the ICMP message is consumed by IP, i.e., it should not be delivered + * to any IPPROTO_ICMP raw sockets, then it returns NULL. + * Likewise, if the ICMP error is misformed (too short, etc), then it + * returns NULL. The caller uses this to determine whether or not to send + * to raw sockets. * + * All error messages are passed to the matching transport stream. + * + * The following cases are handled by icmp_inbound: * 1) It needs to send a reply back and possibly delivering it * to the "interested" upper clients. - * 2) It needs to send it to the upper clients only. + * 2) Return the mblk so that the caller can pass it to the RAW socket clients. * 3) It needs to change some values in IP only. - * 4) It needs to change some values in IP and upper layers e.g TCP. - * - * We need to accomodate icmp messages coming in clear until we get - * everything secure from the wire. 
If icmp_accept_clear_messages - * is zero we check with the global policy and act accordingly. If - * it is non-zero, we accept the message without any checks. But - * *this does not mean* that this will be delivered to the upper - * clients. By accepting we might send replies back, change our MTU - * value etc. but delivery to the ULP/clients depends on their policy - * dispositions. + * 4) It needs to change some values in IP and upper layers e.g TCP + * by delivering an error to the upper layers. * - * We handle the above 4 cases in the context of IPsec in the + * We handle the above three cases in the context of IPsec in the * following way : * * 1) Send the reply back in the same way as the request came in. @@ -1610,13 +1466,13 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, * come to a stop. This is solved by making similar decisions * at both levels. Currently, when we are unable to deliver * to the Upper Layer (due to policy failures) while IP has - * adjusted ire_max_frag, the next outbound datagram would + * adjusted dce_pmtu, the next outbound datagram would * generate a local ICMP_FRAGMENTATION_NEEDED message - which * will be with the right level of protection. Thus the right * value will be communicated even if we are not able to * communicate when we get from the wire initially. But this * assumes there would be at least one outbound datagram after - * IP has adjusted its ire_max_frag value. To make things + * IP has adjusted its dce_pmtu value. To make things * simpler, we accept in clear after the validation of * AH/ESP headers. * @@ -1627,105 +1483,54 @@ icmp_frag_needed(queue_t *q, mblk_t *mp, int mtu, zoneid_t zoneid, * should be accepted in clear when the Upper layer expects secure. * Thus the communication may get aborted by some bad ICMP * packets. - * - * IPQoS Notes: - * The only instance when a packet is sent for processing is when there - * isn't an ICMP client and if we are interested in it. 
- * If there is a client, IPPF processing will take place in the - * ip_fanout_proto routine. - * - * Zones notes: - * The packet is only processed in the context of the specified zone: typically - * only this zone will reply to an echo request, and only interested clients in - * this zone will receive a copy of the packet. This means that the caller must - * call icmp_inbound() for each relevant zone. */ -static void -icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, - int sum_valid, uint32_t sum, boolean_t mctl_present, boolean_t ip_policy, - ill_t *recv_ill, zoneid_t zoneid) +mblk_t * +icmp_inbound_v4(mblk_t *mp, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - int hdr_length; + icmph_t *icmph; + ipha_t *ipha; /* Outer header */ + int ip_hdr_length; /* Outer header length */ boolean_t interested; + ipif_t *ipif; uint32_t ts; - uchar_t *wptr; - ipif_t *ipif; - mblk_t *first_mp; - ipsec_in_t *ii; - timestruc_t now; - uint32_t ill_index; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } + uint32_t *tsp; + timestruc_t now; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + zoneid_t zoneid = ira->ira_zoneid; + int len_needed; + mblk_t *mp_ret = NULL; ipha = (ipha_t *)mp->b_rptr; - if (ipst->ips_icmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - - /* - * On a labeled system, we have to check whether the zone itself is - * permitted to receive raw traffic. 
- */ - if (is_system_labeled()) { - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (!tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("icmp_inbound: zone %d can't receive raw", - zoneid)); - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(first_mp); - return; - } - } - - /* - * We have accepted the ICMP message. It means that we will - * respond to the packet if needed. It may not be delivered - * to the upper client depending on the policy constraints - * and the disposition in ipsec_inbound_accept_clear. - */ - - ASSERT(ill != NULL); BUMP_MIB(&ipst->ips_icmp_mib, icmpInMsgs); - iph_hdr_length = IPH_HDR_LENGTH(ipha); - if ((mp->b_wptr - mp->b_rptr) < (iph_hdr_length + ICMPH_SIZE)) { + + ip_hdr_length = ira->ira_ip_hdr_length; + if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMPH_SIZE)) { + if (ira->ira_pktlen < (ip_hdr_length + ICMPH_SIZE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return (NULL); + } /* Last chance to get real. */ - if (!pullupmsg(mp, iph_hdr_length + ICMPH_SIZE)) { + ipha = ip_pullup(mp, ip_hdr_length + ICMPH_SIZE, ira); + if (ipha == NULL) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(first_mp); - return; + freemsg(mp); + return (NULL); } - /* Refresh iph following the pullup. */ - ipha = (ipha_t *)mp->b_rptr; - } - /* ICMP header checksum, including checksum field, should be zero. */ - if (sum_valid ? 
(sum != 0 && sum != 0xFFFF) : - IP_CSUM(mp, iph_hdr_length, 0)) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); - freemsg(first_mp); - return; } + /* The IP header will always be a multiple of four bytes */ - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ip2dbg(("icmp_inbound: type %d code %d\n", icmph->icmph_type, + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + ip2dbg(("icmp_inbound_v4: type %d code %d\n", icmph->icmph_type, icmph->icmph_code)); - wptr = (uchar_t *)icmph + ICMPH_SIZE; - /* We will set "interested" to "true" if we want a copy */ + + /* + * We will set "interested" to "true" if we should pass a copy to + * the transport or if we handle the packet locally. + */ interested = B_FALSE; switch (icmph->icmph_type) { case ICMP_ECHO_REPLY: @@ -1753,18 +1558,42 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, * (what isn't?). We aim to please, you pick it. * Default is do it. */ - if (!broadcast && !CLASSD(ipha->ipha_dst)) { - /* unicast: always respond */ - interested = B_TRUE; - } else if (CLASSD(ipha->ipha_dst)) { + if (ira->ira_flags & IRAF_MULTICAST) { /* multicast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_mcast; - } else if (broadcast) { + } else if (ira->ira_flags & IRAF_BROADCAST) { /* broadcast: respond based on tunable */ interested = ipst->ips_ip_g_resp_to_echo_bcast; + } else { + /* unicast: always respond */ + interested = B_TRUE; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInEchos); - break; + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + + /* Check db_ref to make sure we can modify the packet. 
*/ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + icmph->icmph_type = ICMP_ECHO_REPLY; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_ROUTER_ADVERTISEMENT: case ICMP_ROUTER_SOLICITATION: break; @@ -1778,28 +1607,63 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, break; case ICMP_TIME_STAMP_REQUEST: /* Response to Time Stamp Requests is local policy. */ - if (ipst->ips_ip_g_resp_to_timestamp && - /* So is whether to respond if it was an IP broadcast. */ - (!broadcast || ipst->ips_ip_g_resp_to_timestamp_bcast)) { - int tstamp_len = 3 * sizeof (uint32_t); - - if (wptr + tstamp_len > mp->b_wptr) { - if (!pullupmsg(mp, wptr + tstamp_len - - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - /* Refresh ipha following the pullup. */ - ipha = (ipha_t *)mp->b_rptr; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - wptr = (uchar_t *)icmph + ICMPH_SIZE; + if (ipst->ips_ip_g_resp_to_timestamp) { + if (ira->ira_flags & IRAF_MULTIBROADCAST) + interested = + ipst->ips_ip_g_resp_to_timestamp_bcast; + else + interested = B_TRUE; + } + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + + /* Make sure we have enough of the packet */ + len_needed = ip_hdr_length + ICMPH_SIZE + + 3 * sizeof (uint32_t); + + if (mp->b_wptr - mp->b_rptr < len_needed) { + ipha = ip_pullup(mp, len_needed, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (NULL); } - interested = B_TRUE; + /* Refresh following the pullup. 
*/ + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; } BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestamps); - break; + /* Check db_ref to make sure we can modify the packet. */ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + icmph->icmph_type = ICMP_TIME_STAMP_REPLY; + tsp = (uint32_t *)&icmph[1]; + tsp++; /* Skip past 'originate time' */ + /* Compute # of milliseconds since midnight */ + gethrestime(&now); + ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + + now.tv_nsec / (NANOSEC / MILLISEC); + *tsp++ = htonl(ts); /* Lay in 'receive time' */ + *tsp++ = htonl(ts); /* Lay in 'send time' */ + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_TIME_STAMP_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInTimestampReps); break; @@ -1808,14 +1672,68 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, case ICMP_INFO_REPLY: break; case ICMP_ADDRESS_MASK_REQUEST: - if ((ipst->ips_ip_respond_to_address_mask_broadcast || - !broadcast) && - /* TODO m_pullup of complete header? */ - (mp->b_datap->db_lim - wptr) >= IP_ADDR_LEN) { + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + interested = + ipst->ips_ip_respond_to_address_mask_broadcast; + } else { interested = B_TRUE; } + if (!interested) { + /* We never pass these to RAW sockets */ + freemsg(mp); + return (NULL); + } + len_needed = ip_hdr_length + ICMPH_SIZE + IP_ADDR_LEN; + if (mp->b_wptr - mp->b_rptr < len_needed) { + ipha = ip_pullup(mp, len_needed, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, + ill); + freemsg(mp); + return (NULL); + } + /* Refresh following the pullup. 
*/ + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMasks); - break; + /* Check db_ref to make sure we can modify the packet. */ + if (mp->b_datap->db_ref > 1) { + mblk_t *mp1; + + mp1 = copymsg(mp); + freemsg(mp); + if (!mp1) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); + return (NULL); + } + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + } + /* + * Need the ipif with the mask be the same as the source + * address of the mask reply. For unicast we have a specific + * ipif. For multicast/broadcast we only handle onlink + * senders, and use the source address to pick an ipif. + */ + ipif = ipif_lookup_addr(ipha->ipha_dst, ill, zoneid, ipst); + if (ipif == NULL) { + /* Broadcast or multicast */ + ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); + if (ipif == NULL) { + freemsg(mp); + return (NULL); + } + } + icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; + bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); + ipif_refrele(ipif); + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); + icmp_send_reply_v4(mp, ipha, icmph, ira); + return (NULL); + case ICMP_ADDRESS_MASK_REPLY: BUMP_MIB(&ipst->ips_icmp_mib, icmpInAddrMaskReps); break; @@ -1824,206 +1742,103 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, BUMP_MIB(&ipst->ips_icmp_mib, icmpInUnknowns); break; } - /* See if there is an ICMP client. */ - if (ipst->ips_ipcl_proto_fanout[IPPROTO_ICMP].connf_head != NULL) { + /* + * See if there is an ICMP client to avoid an extra copymsg/freemsg + * if there isn't one. + */ + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_ICMP].connf_head != NULL) { /* If there is an ICMP client and we want one too, copy it. 
*/ - mblk_t *first_mp1; if (!interested) { - ip_fanout_proto(q, first_mp, ill, ipha, 0, mctl_present, - ip_policy, recv_ill, zoneid); - return; + /* Caller will deliver to RAW sockets */ + return (mp); } - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) { - ip_fanout_proto(q, first_mp1, ill, ipha, - 0, mctl_present, ip_policy, recv_ill, zoneid); + mp_ret = copymsg(mp); + if (mp_ret == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); } } else if (!interested) { - freemsg(first_mp); - return; - } else { - /* - * Initiate policy processing for this packet if ip_policy - * is true. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - if (mctl_present) { - freeb(first_mp); - } - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - return; - } + /* Neither we nor raw sockets are interested. Drop packet now */ + freemsg(mp); + return (NULL); + } + + /* + * ICMP error or redirect packet. Make sure we have enough of + * the header and that db_ref == 1 since we might end up modifying + * the packet. + */ + if (mp->b_cont != NULL) { + if (ip_pullup(mp, -1, ira) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (mp_ret); } } - /* We want to do something with it. */ - /* Check db_ref to make sure we can modify the packet. 
*/ + if (mp->b_datap->db_ref > 1) { - mblk_t *first_mp1; + mblk_t *mp1; - first_mp1 = ip_copymsg(first_mp); - freemsg(first_mp); - if (!first_mp1) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); - return; - } - first_mp = first_mp1; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } else { - mp = first_mp; + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + freemsg(mp); + return (mp_ret); } - ipha = (ipha_t *)mp->b_rptr; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - wptr = (uchar_t *)icmph + ICMPH_SIZE; + freemsg(mp); + mp = mp1; } - switch (icmph->icmph_type) { - case ICMP_ADDRESS_MASK_REQUEST: - ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); - if (ipif == NULL) { - freemsg(first_mp); - return; - } - /* - * outging interface must be IPv4 - */ - ASSERT(ipif != NULL && !ipif->ipif_isv6); - icmph->icmph_type = ICMP_ADDRESS_MASK_REPLY; - bcopy(&ipif->ipif_net_mask, wptr, IP_ADDR_LEN); - ipif_refrele(ipif); - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutAddrMaskReps); - break; - case ICMP_ECHO_REQUEST: - icmph->icmph_type = ICMP_ECHO_REPLY; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutEchoReps); - break; - case ICMP_TIME_STAMP_REQUEST: { - uint32_t *tsp; - icmph->icmph_type = ICMP_TIME_STAMP_REPLY; - tsp = (uint32_t *)wptr; - tsp++; /* Skip past 'originate time' */ - /* Compute # of milliseconds since midnight */ - gethrestime(&now); - ts = (now.tv_sec % (24 * 60 * 60)) * 1000 + - now.tv_nsec / (NANOSEC / MILLISEC); - *tsp++ = htonl(ts); /* Lay in 'receive time' */ - *tsp++ = htonl(ts); /* Lay in 'send time' */ - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimestampReps); - break; + /* + * In case mp has changed, verify the message before any further + * processes. 
+ */ + ipha = (ipha_t *)mp->b_rptr; + icmph = (icmph_t *)&mp->b_rptr[ip_hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); + return (mp_ret); } - default: - ipha = (ipha_t *)&icmph[1]; - if ((uchar_t *)&ipha[1] > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)&ipha[1] - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hdr_length = IPH_HDR_LENGTH(ipha); - if (hdr_length < sizeof (ipha_t)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { - if (!pullupmsg(mp, - (uchar_t *)ipha + hdr_length - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - switch (icmph->icmph_type) { - case ICMP_REDIRECT: - /* - * As there is no upper client to deliver, we don't - * need the first_mp any more. - */ - if (mctl_present) { - freeb(first_mp); - } - icmp_redirect(ill, mp); - return; - case ICMP_DEST_UNREACHABLE: - if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { - if (!icmp_inbound_too_big(icmph, ipha, ill, - zoneid, mp, iph_hdr_length, ipst)) { - freemsg(first_mp); - return; - } - /* - * icmp_inbound_too_big() may alter mp. - * Resynch ipha and icmph accordingly. - */ - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - /* FALLTHRU */ - default : - /* - * IPQoS notes: Since we have already done IPQoS - * processing we don't want to do it again in - * the fanout routines called by - * icmp_inbound_error_fanout, hence the last - * argument, ip_policy, is B_FALSE. 
- */ - icmp_inbound_error_fanout(q, ill, first_mp, icmph, - ipha, iph_hdr_length, hdr_length, mctl_present, - B_FALSE, recv_ill, zoneid); + + switch (icmph->icmph_type) { + case ICMP_REDIRECT: + icmp_redirect_v4(mp, ipha, icmph, ira); + break; + case ICMP_DEST_UNREACHABLE: + if (icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { + /* Update DCE and adjust MTU is icmp header if needed */ + icmp_inbound_too_big_v4(icmph, ira); } - return; + /* FALLTHRU */ + default: + icmp_inbound_error_fanout_v4(mp, icmph, ira); + break; } + return (mp_ret); +} + +/* + * Send an ICMP echo, timestamp or address mask reply. + * The caller has already updated the payload part of the packet. + * We handle the ICMP checksum, IP source address selection and feed + * the packet into ip_output_simple. + */ +static void +icmp_send_reply_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, + ip_recv_attr_t *ira) +{ + uint_t ip_hdr_length = ira->ira_ip_hdr_length; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; + /* Send out an ICMP packet */ icmph->icmph_checksum = 0; - icmph->icmph_checksum = IP_CSUM(mp, iph_hdr_length, 0); - if (broadcast || CLASSD(ipha->ipha_dst)) { - ipif_t *ipif_chosen; - /* - * Make it look like it was directed to us, so we don't look - * like a fool with a broadcast or multicast source address. - */ - ipif = ipif_lookup_remote(ill, ipha->ipha_src, zoneid); - /* - * Make sure that we haven't grabbed an interface that's DOWN. 
- */ - if (ipif != NULL) { - ipif_chosen = ipif_select_source(ipif->ipif_ill, - ipha->ipha_src, zoneid); - if (ipif_chosen != NULL) { - ipif_refrele(ipif); - ipif = ipif_chosen; - } - } - if (ipif == NULL) { - ip0dbg(("icmp_inbound: " - "No source for broadcast/multicast:\n" - "\tsrc 0x%x dst 0x%x ill %p " - "ipif_lcl_addr 0x%x\n", - ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), - (void *)ill, - ill->ill_ipif->ipif_lcl_addr)); - freemsg(first_mp); - return; - } - ASSERT(ipif != NULL && !ipif->ipif_isv6); - ipha->ipha_dst = ipif->ipif_src_addr; - ipif_refrele(ipif); - } + icmph->icmph_checksum = IP_CSUM(mp, ip_hdr_length, 0); /* Reset time to live. */ ipha->ipha_ttl = ipst->ips_ip_def_ttl; { @@ -2038,138 +1853,159 @@ icmp_inbound(queue_t *q, mblk_t *mp, boolean_t broadcast, ill_t *ill, if (!IS_SIMPLE_IPH(ipha)) icmp_options_update(ipha); - if (!mctl_present) { + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * This packet should go out the same way as it - * came in i.e in clear. To make sure that global - * policy will not be applied to this in ip_wput_ire, - * we attach a IPSEC_IN mp and clear ipsec_in_secure. + * came in i.e in clear, independent of the IPsec policy + * for transmitting packets. 
*/ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (first_mp == NULL) { + ixas.ixa_flags |= IXAF_NO_IPSEC; + } else { + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); + /* Note: mp already consumed and ip_drop_packet done */ return; } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ns = ipst->ips_netstack; /* No netstack_hold */ } - if (!ipsec_in_to_out(first_mp, ipha, NULL, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + /* + * Not one or our addresses (IRE_LOCALs), thus we let + * ip_output_simple pick the source. + */ + ipha->ipha_src = INADDR_ANY; + ixas.ixa_flags |= IXAF_SET_SOURCE; + } + /* Should we send with DF and use dce_pmtu? */ + if (ipst->ips_ipv4_icmp_return_pmtu) { + ixas.ixa_flags |= IXAF_PMTU_DISCOVERY; + ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; } + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); - put(WR(q), first_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } -static ipaddr_t -icmp_get_nexthop_addr(ipha_t *ipha, ill_t *ill, zoneid_t zoneid, mblk_t *mp) +/* + * Verify the ICMP messages for either for ICMP error or redirect packet. + * The caller should have fully pulled up the message. If it's a redirect + * packet, only basic checks on IP header will be done; otherwise, verify + * the packet by looking at the included ULP header. + * + * Called before icmp_inbound_error_fanout_v4 is called. 
+ */ +static boolean_t +icmp_inbound_verify_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) { - conn_t *connp; - connf_t *connfp; - ipaddr_t nexthop_addr = INADDR_ANY; - int hdr_length = IPH_HDR_LENGTH(ipha); - uint16_t *up; - uint32_t ports; - ip_stack_t *ipst = ill->ill_ipst; + ill_t *ill = ira->ira_ill; + int hdr_length; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + conn_t *connp; + ipha_t *ipha; /* Inner IP header */ - up = (uint16_t *)((uchar_t *)ipha + hdr_length); - switch (ipha->ipha_protocol) { - case IPPROTO_TCP: - { - tcph_t *tcph; - - /* do a reverse lookup */ - tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); - connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, - TCPS_LISTEN, ipst); - break; - } - case IPPROTO_UDP: - { - uint32_t dstport, srcport; + ipha = (ipha_t *)&icmph[1]; + if ((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH > mp->b_wptr) + goto truncated; - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; + hdr_length = IPH_HDR_LENGTH(ipha); - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); + if ((IPH_HDR_VERSION(ipha) != IPV4_VERSION)) + goto discard_pkt; - connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(dstport, ipst)]; - mutex_enter(&connfp->connf_lock); - connp = connfp->connf_head; + if (hdr_length < sizeof (ipha_t)) + goto truncated; - /* do a reverse lookup */ - while ((connp != NULL) && - (!IPCL_UDP_MATCH(connp, dstport, - ipha->ipha_src, srcport, ipha->ipha_dst) || - !IPCL_ZONE_MATCH(connp, zoneid))) { - connp = connp->conn_next; - } - if (connp != NULL) - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - break; - } - case IPPROTO_SCTP: - { - in6_addr_t map_src, map_dst; + if ((uchar_t *)ipha + hdr_length > mp->b_wptr) + goto truncated; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_src); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_dst); - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; + /* + * Stop here for 
ICMP_REDIRECT. + */ + if (icmph->icmph_type == ICMP_REDIRECT) + return (B_TRUE); - connp = sctp_find_conn(&map_src, &map_dst, ports, - zoneid, ipst->ips_netstack->netstack_sctp); - if (connp == NULL) { - connp = ipcl_classify_raw(mp, IPPROTO_SCTP, - zoneid, ports, ipha, ipst); - } else { - CONN_INC_REF(connp); - SCTP_REFRELE(CONN2SCTP(connp)); - } - break; - } - default: - { - ipha_t ripha; + /* + * ICMP errors only. + */ + switch (ipha->ipha_protocol) { + case IPPROTO_UDP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_TCP: { + tcpha_t *tcpha; + + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; - ripha.ipha_src = ipha->ipha_dst; - ripha.ipha_dst = ipha->ipha_src; - ripha.ipha_protocol = ipha->ipha_protocol; + tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); + connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, + ipst); + if (connp == NULL) + goto discard_pkt; - connfp = &ipst->ips_ipcl_proto_fanout[ - ipha->ipha_protocol]; - mutex_enter(&connfp->connf_lock); - connp = connfp->connf_head; - for (connp = connfp->connf_head; connp != NULL; - connp = connp->conn_next) { - if (IPCL_PROTO_MATCH(connp, - ipha->ipha_protocol, &ripha, ill, - 0, zoneid)) { - CONN_INC_REF(connp); - break; - } - } - mutex_exit(&connfp->connf_lock); + if ((connp->conn_verifyicmp != NULL) && + !connp->conn_verifyicmp(connp, tcpha, icmph, NULL, ira)) { + CONN_DEC_REF(connp); + goto discard_pkt; } - } - if (connp != NULL) { - if (connp->conn_nexthop_set) - nexthop_addr = connp->conn_nexthop_v4; CONN_DEC_REF(connp); + break; + } + case IPPROTO_SCTP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. 
+ */ + if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_ESP: + case IPPROTO_AH: + break; + case IPPROTO_ENCAP: + if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > + mp->b_wptr) + goto truncated; + break; + default: + break; } - return (nexthop_addr); + + return (B_TRUE); + +discard_pkt: + /* Bogus ICMP error. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + return (B_FALSE); + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + return (B_FALSE); } /* Table from RFC 1191 */ @@ -2178,64 +2014,52 @@ static int icmp_frag_size_table[] = /* * Process received ICMP Packet too big. - * After updating any IRE it does the fanout to any matching transport streams. - * Assumes the message has been pulled up till the IP header that caused - * the error. + * Just handles the DCE create/update, including using the above table of + * PMTU guesses. The caller is responsible for validating the packet before + * passing it in and also to fanout the ICMP error to any matching transport + * conns. Assumes the message has been fully pulled up and verified. + * + * Before getting here, the caller has called icmp_inbound_verify_v4() + * that should have verified with ULP to prevent undoing the changes we're + * going to make to DCE. For example, TCP might have verified that the packet + * which generated error is in the send window. * - * Returns B_FALSE on failure and B_TRUE on success. + * In some cases modified this MTU in the ICMP header packet; the caller + * should pass to the matching ULP after this returns. 
*/ -static boolean_t -icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, - zoneid_t zoneid, mblk_t *mp, int iph_hdr_length, - ip_stack_t *ipst) +static void +icmp_inbound_too_big_v4(icmph_t *icmph, ip_recv_attr_t *ira) { - ire_t *ire, *first_ire; - int mtu, orig_mtu; - int hdr_length; - ipaddr_t nexthop_addr; - boolean_t disable_pmtud; + dce_t *dce; + int old_mtu; + int mtu, orig_mtu; + ipaddr_t dst; + boolean_t disable_pmtud; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t hdr_length; + ipha_t *ipha; + /* Caller already pulled up everything. */ + ipha = (ipha_t *)&icmph[1]; ASSERT(icmph->icmph_type == ICMP_DEST_UNREACHABLE && icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED); ASSERT(ill != NULL); hdr_length = IPH_HDR_LENGTH(ipha); - /* Drop if the original packet contained a source route */ - if (ip_source_route_included(ipha)) { - return (B_FALSE); - } /* - * Verify we have at least ICMP_MIN_TP_HDR_LENGTH bytes of transport - * header. + * We handle path MTU for source routed packets since the DCE + * is looked up using the final destination. 
*/ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("icmp_inbound_too_big: insufficient hdr\n")); - return (B_FALSE); - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - nexthop_addr = icmp_get_nexthop_addr(ipha, ill, zoneid, mp); - if (nexthop_addr != INADDR_ANY) { - /* nexthop set */ - first_ire = ire_ctable_lookup(ipha->ipha_dst, - nexthop_addr, 0, NULL, ALL_ZONES, msg_getlabel(mp), - MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, ipst); - } else { - /* nexthop not set */ - first_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - } + dst = ip_get_dst(ipha); - if (!first_ire) { - ip1dbg(("icmp_inbound_too_big: no route for 0x%x\n", - ntohl(ipha->ipha_dst))); - return (B_FALSE); + dce = dce_lookup_and_add_v4(dst, ipst); + if (dce == NULL) { + /* Couldn't add a unique one - ENOMEM */ + ip1dbg(("icmp_inbound_too_big_v4: no dce for 0x%x\n", + ntohl(dst))); + return; } /* Check for MTU discovery advice as described in RFC 1191 */ @@ -2243,149 +2067,112 @@ icmp_inbound_too_big(icmph_t *icmph, ipha_t *ipha, ill_t *ill, orig_mtu = mtu; disable_pmtud = B_FALSE; - rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); - for (ire = first_ire; ire != NULL && ire->ire_addr == ipha->ipha_dst; - ire = ire->ire_next) { - /* - * Look for the connection to which this ICMP message is - * directed. If it has the IP_NEXTHOP option set, then the - * search is limited to IREs with the MATCH_IRE_PRIVATE - * option. Else the search is limited to regular IREs. 
- */ - if (((ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && - (nexthop_addr != ire->ire_gateway_addr)) || - (!(ire->ire_marks & IRE_MARK_PRIVATE_ADDR) && - (nexthop_addr != INADDR_ANY))) - continue; + mutex_enter(&dce->dce_lock); + if (dce->dce_flags & DCEF_PMTU) + old_mtu = dce->dce_pmtu; + else + old_mtu = ill->ill_mtu; - mutex_enter(&ire->ire_lock); - if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { - uint32_t length; - int i; + if (icmph->icmph_du_zero != 0 || mtu < ipst->ips_ip_pmtu_min) { + uint32_t length; + int i; + /* + * Use the table from RFC 1191 to figure out + * the next "plateau" based on the length in + * the original IP packet. + */ + length = ntohs(ipha->ipha_length); + DTRACE_PROBE2(ip4__pmtu__guess, dce_t *, dce, + uint32_t, length); + if (old_mtu <= length && + old_mtu >= length - hdr_length) { /* - * Use the table from RFC 1191 to figure out - * the next "plateau" based on the length in - * the original IP packet. + * Handle broken BSD 4.2 systems that + * return the wrong ipha_length in ICMP + * errors. */ - length = ntohs(ipha->ipha_length); - DTRACE_PROBE2(ip4__pmtu__guess, ire_t *, ire, - uint32_t, length); - if (ire->ire_max_frag <= length && - ire->ire_max_frag >= length - hdr_length) { - /* - * Handle broken BSD 4.2 systems that - * return the wrong iph_length in ICMP - * errors. - */ - length -= hdr_length; - } - for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { - if (length > icmp_frag_size_table[i]) - break; - } - if (i == A_CNT(icmp_frag_size_table)) { - /* Smaller than 68! */ - disable_pmtud = B_TRUE; + ip1dbg(("Wrong mtu: sent %d, dce %d\n", + length, old_mtu)); + length -= hdr_length; + } + for (i = 0; i < A_CNT(icmp_frag_size_table); i++) { + if (length > icmp_frag_size_table[i]) + break; + } + if (i == A_CNT(icmp_frag_size_table)) { + /* Smaller than IP_MIN_MTU! 
*/ + ip1dbg(("Too big for packet size %d\n", + length)); + disable_pmtud = B_TRUE; + mtu = ipst->ips_ip_pmtu_min; + } else { + mtu = icmp_frag_size_table[i]; + ip1dbg(("Calculated mtu %d, packet size %d, " + "before %d\n", mtu, length, old_mtu)); + if (mtu < ipst->ips_ip_pmtu_min) { mtu = ipst->ips_ip_pmtu_min; - } else { - mtu = icmp_frag_size_table[i]; - if (mtu < ipst->ips_ip_pmtu_min) { - mtu = ipst->ips_ip_pmtu_min; - disable_pmtud = B_TRUE; - } + disable_pmtud = B_TRUE; } - /* Fool the ULP into believing our guessed PMTU. */ - icmph->icmph_du_zero = 0; - icmph->icmph_du_mtu = htons(mtu); - } - if (disable_pmtud) - ire->ire_frag_flag = 0; - /* Reduce the IRE max frag value as advised. */ - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; } - mutex_exit(&ire->ire_lock); - DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, ire_t *, - ire, int, orig_mtu, int, mtu); } - rw_exit(&first_ire->ire_bucket->irb_lock); - ire_refrele(first_ire); - return (B_TRUE); + if (disable_pmtud) + dce->dce_flags |= DCEF_TOO_SMALL_PMTU; + else + dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; + + dce->dce_pmtu = MIN(old_mtu, mtu); + /* Prepare to send the new max frag size for the ULP. */ + icmph->icmph_du_zero = 0; + icmph->icmph_du_mtu = htons((uint16_t)dce->dce_pmtu); + DTRACE_PROBE4(ip4__pmtu__change, icmph_t *, icmph, dce_t *, + dce, int, orig_mtu, int, mtu); + + /* We now have a PMTU for sure */ + dce->dce_flags |= DCEF_PMTU; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + /* + * After dropping the lock the new value is visible to everyone. + * Then we bump the generation number so any cached values reinspect + * the dce_t. + */ + dce_increment_generation(dce); + dce_refrele(dce); } /* - * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout + * If the packet in error is Self-Encapsulated, icmp_inbound_error_fanout_v4 * calls this function. 
*/ static mblk_t * -icmp_inbound_self_encap_error(mblk_t *mp, int iph_hdr_length, int hdr_length) +icmp_inbound_self_encap_error_v4(mblk_t *mp, ipha_t *ipha, ipha_t *in_ipha) { - ipha_t *ipha; - icmph_t *icmph; - ipha_t *in_ipha; int length; ASSERT(mp->b_datap->db_type == M_DATA); - /* - * For Self-encapsulated packets, we added an extra IP header - * without the options. Inner IP header is the one from which - * the outer IP header was formed. Thus, we need to remove the - * outer IP header. To do this, we pullup the whole message - * and overlay whatever follows the outer IP header over the - * outer IP header. - */ - - if (!pullupmsg(mp, -1)) - return (NULL); - - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); + /* icmp_inbound_v4 has already pulled up the whole error packet */ + ASSERT(mp->b_cont == NULL); /* - * The length that we want to overlay is following the inner - * IP header. Subtracting the IP header + icmp header + outer - * IP header's length should give us the length that we want to - * overlay. + * The length that we want to overlay is the inner header + * and what follows it. */ - length = msgdsize(mp) - iph_hdr_length - sizeof (icmph_t) - - hdr_length; + length = msgdsize(mp) - ((uchar_t *)in_ipha - mp->b_rptr); + /* - * Overlay whatever follows the inner header over the + * Overlay the inner header and whatever follows it over the * outer header. */ bcopy((uchar_t *)in_ipha, (uchar_t *)ipha, length); - /* Set the wptr to account for the outer header */ - mp->b_wptr -= hdr_length; + /* Adjust for what we removed */ + mp->b_wptr -= (uchar_t *)in_ipha - (uchar_t *)ipha; return (mp); } /* - * Fanout for ICMP errors containing IP-in-IPv4 packets. Returns B_TRUE if a - * tunnel consumed the message, and B_FALSE otherwise. 
- */ -static boolean_t -icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, - ip_stack_t *ipst) -{ - conn_t *connp; - - if ((connp = ipcl_iptun_classify_v4(&ripha->ipha_src, &ripha->ipha_dst, - ipst)) == NULL) - return (B_FALSE); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, first_mp, NULL); - CONN_DEC_REF(connp); - return (B_TRUE); -} - -/* * Try to pass the ICMP message upstream in case the ULP cares. * * If the packet that caused the ICMP error is secure, we send @@ -2400,25 +2187,22 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, * * IFN could have been generated locally or by some router. * - * LOCAL : *ip_wput_ire -> icmp_frag_needed could have generated this. + * LOCAL : ire_send_wire (before calling ipsec_out_process) can call + * icmp_frag_needed/icmp_pkt2big_v6 to generated a local IFN. * This happens because IP adjusted its value of MTU on an * earlier IFN message and could not tell the upper layer, * the new adjusted value of MTU e.g. Packet was encrypted * or there was not enough information to fanout to upper - * layers. Thus on the next outbound datagram, ip_wput_ire + * layers. Thus on the next outbound datagram, ire_send_wire * generates the IFN, where IPsec processing has *not* been * done. * - * *ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed - * could have generated this. This happens because ire_max_frag - * value in IP was set to a new value, while the IPsec processing - * was being done and after we made the fragmentation check in - * ip_wput_ire. Thus on return from IPsec processing, - * ip_wput_ipsec_out finds that the new length is > ire_max_frag - * and generates the IFN. As IPsec processing is over, we fanout - * to AH/ESP to remove the header. 
+ * Note that we retain ixa_fragsize across IPsec thus once + * we have picking ixa_fragsize and entered ipsec_out_process we do + * no change the fragsize even if the path MTU changes before + * we reach ip_output_post_ipsec. * - * In both these cases, ipsec_in_loopback will be set indicating + * In the local case, IRAF_LOOPBACK will be set indicating * that IFN was generated locally. * * ROUTER : IFN could be secure or non-secure. @@ -2432,45 +2216,38 @@ icmp_inbound_iptun_fanout(mblk_t *first_mp, ipha_t *ripha, ill_t *ill, * If the packet in error does not have AH/ESP, we handle it * like any other case. * - * * NON_SECURE : If the packet in error has AH/ESP headers, - * we attach a dummy ipsec_in and send it up to AH/ESP - * for validation. AH/ESP will verify whether there is a + * * NON_SECURE : If the packet in error has AH/ESP headers, we send it + * up to AH/ESP for validation. AH/ESP will verify whether there is a * valid SA or not and send it back. We will fanout again if * we have more data in the packet. * * If the packet in error does not have AH/ESP, we handle it * like any other case. + * + * The caller must have called icmp_inbound_verify_v4. 
*/ static void -icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, - icmph_t *icmph, ipha_t *ipha, int iph_hdr_length, int hdr_length, - boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, - zoneid_t zoneid) -{ - uint16_t *up; /* Pointer to ports in ULP header */ - uint32_t ports; /* reversed ports for fanout */ - ipha_t ripha; /* With reversed addresses */ - mblk_t *first_mp; - ipsec_in_t *ii; - tcph_t *tcph; - conn_t *connp; - ip_stack_t *ipst; - - ASSERT(ill != NULL); - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; +icmp_inbound_error_fanout_v4(mblk_t *mp, icmph_t *icmph, ip_recv_attr_t *ira) +{ + uint16_t *up; /* Pointer to ports in ULP header */ + uint32_t ports; /* reversed ports for fanout */ + ipha_t ripha; /* With reversed addresses */ + ipha_t *ipha; /* Inner IP header */ + uint_t hdr_length; /* Inner IP header length */ + tcpha_t *tcpha; + conn_t *connp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ill_t *rill = ira->ira_rill; - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); + /* Caller already pulled up everything. 
*/ + ipha = (ipha_t *)&icmph[1]; + ASSERT((uchar_t *)&ipha[1] <= mp->b_wptr); + ASSERT(mp->b_cont == NULL); - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - } else { - ii = NULL; - } + hdr_length = IPH_HDR_LENGTH(ipha); + ira->ira_protocol = ipha->ipha_protocol; /* * We need a separate IP header with the source and destination @@ -2482,249 +2259,223 @@ icmp_inbound_error_fanout(queue_t *q, ill_t *ill, mblk_t *mp, ripha.ipha_protocol = ipha->ipha_protocol; ripha.ipha_version_and_hdr_length = ipha->ipha_version_and_hdr_length; - ip2dbg(("icmp_inbound_error: proto %d %x to %x: %d/%d\n", + ip2dbg(("icmp_inbound_error_v4: proto %d %x to %x: %d/%d\n", ripha.ipha_protocol, ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), icmph->icmph_type, icmph->icmph_code)); switch (ipha->ipha_protocol) { case IPPROTO_UDP: - /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * transport header. - */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } up = (uint16_t *)((uchar_t *)ipha + hdr_length); /* Attempt to find a client stream based on port. */ - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; - ip2dbg(("icmp_inbound_error: UDP ports %d to %d\n", + ip2dbg(("icmp_inbound_error_v4: UDP ports %d to %d\n", ntohs(up[0]), ntohs(up[1]))); - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - - ip_fanout_udp(q, first_mp, ill, &ripha, ports, B_FALSE, 0, - mctl_present, ip_policy, recv_ill, zoneid); + /* Note that we send error to all matches. */ + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_udp_multi_v4(mp, &ripha, up[0], up[1], ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_TCP: /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * transport header. 
- */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_TP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } - /* * Find a TCP client stream for this packet. * Note that we do a reverse lookup since the header is * in the form we sent it out. */ - tcph = (tcph_t *)((uchar_t *)ipha + hdr_length); - connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcph, TCPS_LISTEN, + tcpha = (tcpha_t *)((uchar_t *)ipha + hdr_length); + connp = ipcl_tcp_lookup_reversed_ipv4(ipha, tcpha, TCPS_LISTEN, ipst); if (connp == NULL) goto discard_pkt; - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, - SQ_FILL, SQTAG_TCP_INPUT_ICMP_ERR); + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + ira->ira_flags |= IRAF_ICMP_ERROR; + ira->ira_ill = ira->ira_rill = NULL; + if (IPCL_IS_TCP(connp)) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recvicmp, connp, ira, SQ_FILL, + SQTAG_TCP_INPUT_ICMP_ERR); + } else { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_SCTP: - /* - * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of - * transport header, in the first mp. 
- */ - if ((uchar_t *)ipha + hdr_length + ICMP_MIN_SCTP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) { - goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - } up = (uint16_t *)((uchar_t *)ipha + hdr_length); /* Find a SCTP client stream for this packet. */ ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - /* Have to change db_type after any pullupmsg */ - DB_TYPE(mp) = M_CTL; - ip_fanout_sctp(first_mp, recv_ill, &ripha, ports, 0, - mctl_present, ip_policy, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_sctp(mp, &ripha, NULL, ports, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; case IPPROTO_ESP: - case IPPROTO_AH: { - int ipsec_rc; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - /* - * We need a IPSEC_IN in the front to fanout to AH/ESP. - * We will re-use the IPSEC_IN if it is already present as - * AH/ESP will not affect any fields in the IPSEC_IN for - * ICMP errors. If there is no IPSEC_IN, allocate a new - * one and attach it in the front. - */ - if (ii != NULL) { - /* - * ip_fanout_proto_again converts the ICMP errors - * that come back from AH/ESP to M_DATA so that - * if it is non-AH/ESP and we do a pullupmsg in - * this function, it would work. Convert it back - * to M_CTL before we send up as this is a ICMP - * error. This could have been generated locally or - * by some router. Validate the inner IPsec - * headers. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - DB_TYPE(first_mp->b_cont) = M_CTL; - } else { - /* - * IPSEC_IN is not present. We attach a ipsec_in - * message and send up to IPsec for validating - * and removing the IPsec headers. 
Clear - * ipsec_in_secure so that when we return - * from IPsec, we don't mistakenly think that this - * is a secure packet came from the network. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (first_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - DB_TYPE(mp) = M_CTL; - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - } - + case IPPROTO_AH: if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); + ip_proto_not_sup(mp, ira); return; } if (ipha->ipha_protocol == IPPROTO_ESP) - ipsec_rc = ipsecesp_icmp_error(first_mp); + mp = ipsecesp_icmp_error(mp, ira); else - ipsec_rc = ipsecah_icmp_error(first_mp); - if (ipsec_rc == IPSEC_STATUS_FAILED) + mp = ipsecah_icmp_error(mp, ira); + if (mp == NULL) + return; + + /* Just in case ipsec didn't preserve the NULL b_cont */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) + goto discard_pkt; + } + + /* + * Note that ira_pktlen and ira_ip_hdr_length are no longer + * correct, but we don't use them any more here. + * + * If succesful, the mp has been modified to not include + * the ESP/AH header so we can fanout to the ULP's icmp + * error handler. + */ + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) + goto truncated; + + /* Verify the modified message before any further processes. 
*/ + ipha = (ipha_t *)mp->b_rptr; + hdr_length = IPH_HDR_LENGTH(ipha); + icmph = (icmph_t *)&mp->b_rptr[hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); return; + } - ip_fanout_proto_again(first_mp, ill, recv_ill, NULL); + icmp_inbound_error_fanout_v4(mp, icmph, ira); return; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if (ipha->ipha_protocol == IPPROTO_ENCAP) { - ipha_t *in_ipha; - if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + - hdr_length + sizeof (ipha_t) - - mp->b_rptr)) { + case IPPROTO_ENCAP: { + /* Look for self-encapsulated packets that caused an error */ + ipha_t *in_ipha; + + /* + * Caller has verified that length has to be + * at least the size of IP header. + */ + ASSERT(hdr_length >= sizeof (ipha_t)); + /* + * Check the sanity of the inner IP header like + * we did for the outer header. + */ + in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); + if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION)) { + goto discard_pkt; + } + if (IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) { + goto discard_pkt; + } + /* Check for Self-encapsulated tunnels */ + if (in_ipha->ipha_src == ipha->ipha_src && + in_ipha->ipha_dst == ipha->ipha_dst) { + + mp = icmp_inbound_self_encap_error_v4(mp, ipha, + in_ipha); + if (mp == NULL) + goto discard_pkt; + + /* + * Just in case self_encap didn't preserve the NULL + * b_cont + */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) goto discard_pkt; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; } /* - * Caller has verified that length has to be - * at least the size of IP header. + * Note that ira_pktlen and ira_ip_hdr_length are no + * longer correct, but we don't use them any more here. */ - ASSERT(hdr_length >= sizeof (ipha_t)); + if (mp->b_wptr - mp->b_rptr < IP_SIMPLE_HDR_LENGTH) + goto truncated; + /* - * Check the sanity of the inner IP header like - * we did for the outer header. 
+ * Verify the modified message before any further + * processes. */ - in_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); - if ((IPH_HDR_VERSION(in_ipha) != IPV4_VERSION) || - IPH_HDR_LENGTH(in_ipha) < sizeof (ipha_t)) - goto discard_pkt; - /* Check for Self-encapsulated tunnels */ - if (in_ipha->ipha_src == ipha->ipha_src && - in_ipha->ipha_dst == ipha->ipha_dst) { - - mp = icmp_inbound_self_encap_error(mp, - iph_hdr_length, hdr_length); - if (mp == NULL) - goto discard_pkt; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - hdr_length = IPH_HDR_LENGTH(ipha); - /* - * The packet in error is self-encapsualted. - * And we are finding it further encapsulated - * which we could not have possibly generated. - */ - if (ipha->ipha_protocol == IPPROTO_ENCAP) { - goto discard_pkt; - } - icmp_inbound_error_fanout(q, ill, first_mp, - icmph, ipha, iph_hdr_length, hdr_length, - mctl_present, ip_policy, recv_ill, zoneid); + ipha = (ipha_t *)mp->b_rptr; + hdr_length = IPH_HDR_LENGTH(ipha); + icmph = (icmph_t *)&mp->b_rptr[hdr_length]; + if (!icmp_inbound_verify_v4(mp, icmph, ira)) { + freemsg(mp); return; } - } - DB_TYPE(mp) = M_CTL; - if (icmp_inbound_iptun_fanout(first_mp, &ripha, ill, ipst)) + /* + * The packet in error is self-encapsualted. + * And we are finding it further encapsulated + * which we could not have possibly generated. + */ + if (ipha->ipha_protocol == IPPROTO_ENCAP) { + goto discard_pkt; + } + icmp_inbound_error_fanout_v4(mp, icmph, ira); return; + } + /* No self-encapsulated */ + /* FALLTHRU */ + } + case IPPROTO_IPV6: + if ((connp = ipcl_iptun_classify_v4(&ripha.ipha_src, + &ripha.ipha_dst, ipst)) != NULL) { + ira->ira_flags |= IRAF_ICMP_ERROR; + connp->conn_recvicmp(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_flags &= ~IRAF_ICMP_ERROR; + return; + } /* * No IP tunnel is interested, fallthrough and see * if a raw socket will want it. 
*/ /* FALLTHRU */ default: - ip_fanout_proto(q, first_mp, ill, &ripha, 0, mctl_present, - ip_policy, recv_ill, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_proto_v4(mp, &ripha, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* NOTREACHED */ discard_pkt: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); -drop_pkt:; - ip1dbg(("icmp_inbound_error_fanout: drop pkt\n")); - freemsg(first_mp); + ip1dbg(("icmp_inbound_error_fanout_v4: drop pkt\n")); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); } /* @@ -2747,6 +2498,16 @@ ipoptp_first(ipoptp_t *optp, ipha_t *ipha) return (ipoptp_next(optp)); } +/* Like above but without an ipha_t */ +uint8_t +ipoptp_first2(ipoptp_t *optp, uint32_t totallen, uint8_t *opt) +{ + optp->ipoptp_next = opt; + optp->ipoptp_end = optp->ipoptp_next + totallen; + optp->ipoptp_flags = 0; + return (ipoptp_next(optp)); +} + /* * Common IP options parser: extract next option. */ @@ -2858,38 +2619,55 @@ ipoptp_next(ipoptp_t *optp) /* * Use the outgoing IP header to create an IP_OPTIONS option the way * it was passed down from the application. + * + * This is compatible with BSD in that it returns + * the reverse source route with the final destination + * as the last entry. The first 4 bytes of the option + * will contain the final destination. 
*/ int -ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) +ip_opt_get_user(conn_t *connp, uchar_t *buf) { ipoptp_t opts; - const uchar_t *opt; + uchar_t *opt; uint8_t optval; uint8_t optlen; uint32_t len = 0; - uchar_t *buf1 = buf; + uchar_t *buf1 = buf; + uint32_t totallen; + ipaddr_t dst; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (0); + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return (0); buf += IP_ADDR_LEN; /* Leave room for final destination */ len += IP_ADDR_LEN; bzero(buf1, IP_ADDR_LEN); - /* - * OK to cast away const here, as we don't store through the returned - * opts.ipoptp_cur pointer. - */ - for (optval = ipoptp_first(&opts, (ipha_t *)ipha); + dst = connp->conn_faddr_v4; + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { int off; opt = opts.ipoptp_cur; + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } optlen = opts.ipoptp_len; + switch (optval) { case IPOPT_SSRR: case IPOPT_LSRR: /* - * Insert ipha_dst as the first entry in the source + * Insert destination as the first entry in the source * route and move down the entries on step. * The last entry gets placed at buf1. 
*/ @@ -2902,8 +2680,9 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) /* No entries in source route */ break; } - /* Last entry in source route */ - bcopy(opt + off, buf1, IP_ADDR_LEN); + /* Last entry in source route if not already set */ + if (dst == INADDR_ANY) + bcopy(opt + off, buf1, IP_ADDR_LEN); off -= IP_ADDR_LEN; while (off > 0) { @@ -2913,19 +2692,12 @@ ip_opt_get_user(const ipha_t *ipha, uchar_t *buf) off -= IP_ADDR_LEN; } /* ipha_dst into first slot */ - bcopy(&ipha->ipha_dst, - buf + off + IP_ADDR_LEN, + bcopy(&dst, buf + off + IP_ADDR_LEN, IP_ADDR_LEN); buf += optlen; len += optlen; break; - case IPOPT_COMSEC: - case IPOPT_SECURITY: - /* if passing up a label is not ok, then remove */ - if (is_system_labeled()) - break; - /* FALLTHROUGH */ default: bcopy(opt, buf, optlen); buf += optlen; @@ -3007,57 +2779,46 @@ icmp_options_update(ipha_t *ipha) /* * Process received ICMP Redirect messages. + * Assumes the caller has verified that the headers are in the pulled up mblk. + * Consumes mp. */ static void -icmp_redirect(ill_t *ill, mblk_t *mp) +icmp_redirect_v4(mblk_t *mp, ipha_t *ipha, icmph_t *icmph, ip_recv_attr_t *ira) { - ipha_t *ipha; - int iph_hdr_length; - icmph_t *icmph; - ipha_t *ipha_err; - ire_t *ire; - ire_t *prev_ire; - ire_t *save_ire; - ipaddr_t src, dst, gateway; - iulp_t ulp_info = { 0 }; - int error; - ip_stack_t *ipst; + ire_t *ire, *nire; + ire_t *prev_ire; + ipaddr_t src, dst, gateway; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipha_t *inner_ipha; /* Inner IP header */ - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - ipha = (ipha_t *)mp->b_rptr; - iph_hdr_length = IPH_HDR_LENGTH(ipha); - if (((mp->b_wptr - mp->b_rptr) - iph_hdr_length) < - sizeof (icmph_t) + IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); - freemsg(mp); - return; - } - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha_err = (ipha_t *)&icmph[1]; + /* Caller already pulled up everything. 
*/ + inner_ipha = (ipha_t *)&icmph[1]; src = ipha->ipha_src; - dst = ipha_err->ipha_dst; + dst = inner_ipha->ipha_dst; gateway = icmph->icmph_rd_gateway; /* Make sure the new gateway is reachable somehow. */ - ire = ire_route_lookup(gateway, 0, 0, IRE_INTERFACE, NULL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire = ire_ftable_lookup_v4(gateway, 0, 0, IRE_ONLINK, NULL, + ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, NULL); /* * Make sure we had a route for the dest in question and that * that route was pointing to the old gateway (the source of the * redirect packet.) + * Note: this merely says that there is some IRE which matches that + * gateway; not that the longest match matches that gateway. */ - prev_ire = ire_route_lookup(dst, 0, src, 0, NULL, NULL, ALL_ZONES, - NULL, MATCH_IRE_GW, ipst); + prev_ire = ire_ftable_lookup_v4(dst, 0, src, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_GW, 0, ipst, NULL); /* * Check that * the redirect was not from ourselves * the new gateway and the old gateway are directly reachable */ - if (!prev_ire || - !ire || - ire->ire_type == IRE_LOCAL) { + if (prev_ire == NULL || ire == NULL || + (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || + (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_type & IRE_IF_ALL)) { BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); + ip_drop_input("icmpInBadRedirects - ire", mp, ira->ira_ill); freemsg(mp); if (ire != NULL) ire_refrele(ire); @@ -3066,49 +2827,9 @@ icmp_redirect(ill_t *ill, mblk_t *mp) return; } - /* - * Should we use the old ULP info to create the new gateway? From - * a user's perspective, we should inherit the info so that it - * is a "smooth" transition. If we do not do that, then new - * connections going thru the new gateway will have no route metrics, - * which is counter-intuitive to user. From a network point of - * view, this may or may not make sense even though the new gateway - * is still directly connected to us so the route metrics should not - * change much. 
- * - * But if the old ire_uinfo is not initialized, we do another - * recursive lookup on the dest using the new gateway. There may - * be a route to that. If so, use it to initialize the redirect - * route. - */ - if (prev_ire->ire_uinfo.iulp_set) { - bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - } else { - ire_t *tmp_ire; - ire_t *sire; - - tmp_ire = ire_ftable_lookup(dst, 0, gateway, 0, NULL, &sire, - ALL_ZONES, 0, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT), - ipst); - if (sire != NULL) { - bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - /* - * If sire != NULL, ire_ftable_lookup() should not - * return a NULL value. - */ - ASSERT(tmp_ire != NULL); - ire_refrele(tmp_ire); - ire_refrele(sire); - } else if (tmp_ire != NULL) { - bcopy(&tmp_ire->ire_uinfo, &ulp_info, - sizeof (iulp_t)); - ire_refrele(tmp_ire); - } - } - if (prev_ire->ire_type == IRE_CACHE) - ire_delete(prev_ire); ire_refrele(prev_ire); + ire_refrele(ire); + /* * TODO: more precise handling for cases 0, 2, 3, the latter two * require TOS routing @@ -3121,47 +2842,42 @@ icmp_redirect(ill_t *ill, mblk_t *mp) case 3: break; default: - freemsg(mp); BUMP_MIB(&ipst->ips_icmp_mib, icmpInBadRedirects); - ire_refrele(ire); + ip_drop_input("icmpInBadRedirects - code", mp, ira->ira_ill); + freemsg(mp); return; } /* * Create a Route Association. This will allow us to remember that * someone we believe told us to use the particular gateway. 
*/ - save_ire = ire; ire = ire_create( (uchar_t *)&dst, /* dest addr */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&save_ire->ire_src_addr, /* source addr */ (uchar_t *)&gateway, /* gateway addr */ - &save_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* no rfq */ - NULL, /* no stq */ IRE_HOST, - NULL, /* ipif */ - 0, /* cmask */ - 0, /* phandle */ - 0, /* ihandle */ + NULL, /* ill */ + ALL_ZONES, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), - &ulp_info, NULL, /* tsol_gc_t */ - NULL, /* gcgrp */ ipst); if (ire == NULL) { freemsg(mp); - ire_refrele(save_ire); return; } - error = ire_add(&ire, NULL, NULL, NULL, B_FALSE); - ire_refrele(save_ire); - atomic_inc_32(&ipst->ips_ip_redirect_cnt); + nire = ire_add(ire); + /* Check if it was a duplicate entry */ + if (nire != NULL && nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + nire = NULL; + } + ire = nire; + if (ire != NULL) { + ire_refrele(ire); /* Held in ire_add */ - if (error == 0) { - ire_refrele(ire); /* Held in ire_add_v4 */ /* tell routing sockets that we received a redirect */ ip_rts_change(RTM_REDIRECT, dst, gateway, IP_HOST_MASK, 0, src, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, @@ -3173,8 +2889,8 @@ icmp_redirect(ill_t *ill, mblk_t *mp) * This together with the added IRE has the effect of * modifying an existing redirect. */ - prev_ire = ire_ftable_lookup(dst, 0, src, IRE_HOST, NULL, NULL, - ALL_ZONES, 0, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), ipst); + prev_ire = ire_ftable_lookup_v4(dst, 0, src, IRE_HOST, NULL, + ALL_ZONES, NULL, (MATCH_IRE_GW | MATCH_IRE_TYPE), 0, ipst, NULL); if (prev_ire != NULL) { if (prev_ire ->ire_flags & RTF_DYNAMIC) ire_delete(prev_ire); @@ -3186,29 +2902,24 @@ icmp_redirect(ill_t *ill, mblk_t *mp) /* * Generate an ICMP parameter problem message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. 
*/ static void -icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_param_problem(mblk_t *mp, uint8_t ptr, ip_recv_attr_t *ira) { icmph_t icmph; - boolean_t mctl_present; - mblk_t *first_mp; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } bzero(&icmph, sizeof (icmph_t)); icmph.icmph_type = ICMP_PARAM_PROBLEM; icmph.icmph_pp_ptr = ptr; BUMP_MIB(&ipst->ips_icmp_mib, icmpOutParmProbs); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* @@ -3217,15 +2928,11 @@ icmp_param_problem(queue_t *q, mblk_t *mp, uint8_t ptr, zoneid_t zoneid, * Note: assumes that icmp_pkt_err_ok has been called to verify that * an icmp error packet can be sent. * Assigns an appropriate source address to the packet. If ipha_dst is - * one of our addresses use it for source. Otherwise pick a source based - * on a route lookup back to ipha_src. - * Note that ipha_src must be set here since the - * packet is likely to arrive on an ill queue in ip_wput() which will - * not set a source address. + * one of our addresses use it for source. Otherwise let ip_output_simple + * pick the source address. */ static void -icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +icmp_pkt(mblk_t *mp, void *stuff, size_t len, ip_recv_attr_t *ira) { ipaddr_t dst; icmph_t *icmph; @@ -3235,115 +2942,62 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, mblk_t *mp1; ipaddr_t src; ire_t *ire; - mblk_t *ipsec_mp; - ipsec_out_t *io = NULL; - - if (mctl_present) { - /* - * If it is : - * - * 1) a IPSEC_OUT, then this is caused by outbound - * datagram originating on this host. IPsec processing - * may or may not have been done. 
Refer to comments above - * icmp_inbound_error_fanout for details. - * - * 2) a IPSEC_IN if we are generating a icmp_message - * for an incoming datagram destined for us i.e called - * from ip_fanout_send_icmp. - */ - ipsec_info_t *in; - ipsec_mp = mp; - mp = ipsec_mp->b_cont; + ip_xmit_attr_t ixas; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - in = (ipsec_info_t *)ipsec_mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; + ipha = (ipha_t *)mp->b_rptr; - ASSERT(in->ipsec_info_type == IPSEC_OUT || - in->ipsec_info_type == IPSEC_IN); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - if (in->ipsec_info_type == IPSEC_IN) { - /* - * Convert the IPSEC_IN to IPSEC_OUT. - */ - if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - return; - } - io = (ipsec_out_t *)ipsec_mp->b_rptr; - } else { - ASSERT(in->ipsec_info_type == IPSEC_OUT); - io = (ipsec_out_t *)in; - /* - * Clear out ipsec_out_proc_begin, so we do a fresh - * ire lookup. - */ - io->ipsec_out_proc_begin = B_FALSE; - } - ASSERT(zoneid != ALL_ZONES); - /* - * The IPSEC_IN (now an IPSEC_OUT) didn't have its zoneid - * initialized. We need to do that now. - */ - io->ipsec_out_zoneid = zoneid; - } else { + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* - * This is in clear. The icmp message we are building - * here should go out in clear. + * Apply IPsec based on how IPsec was applied to + * the packet that had the error. * - * Pardon the convolution of it all, but it's easier to - * allocate a "use cleartext" IPSEC_IN message and convert - * it than it is to allocate a new one. 
+ * If it was an outbound packet that caused the ICMP + * error, then the caller will have setup the IRA + * appropriately. */ - ipsec_in_t *ii; - ASSERT(DB_TYPE(mp) == M_DATA); - ipsec_mp = ipsec_in_alloc(B_TRUE, ipst->ips_netstack); - if (ipsec_mp == NULL) { - freemsg(mp); + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, NULL)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ return; } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - ipsec_mp->b_cont = mp; - ipha = (ipha_t *)mp->b_rptr; + } else { /* - * Convert the IPSEC_IN to IPSEC_OUT. + * This is in clear. The icmp message we are building + * here should go out in clear, independent of our policy. */ - if (!ipsec_in_to_out(ipsec_mp, ipha, NULL, zoneid)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - return; - } - io = (ipsec_out_t *)ipsec_mp->b_rptr; + ixas.ixa_flags |= IXAF_NO_IPSEC; } /* Remember our eventual destination */ dst = ipha->ipha_src; - ire = ire_route_lookup(ipha->ipha_dst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), - NULL, NULL, zoneid, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL && - (ire->ire_zoneid == zoneid || ire->ire_zoneid == ALL_ZONES)) { + /* + * If the packet was for one of our unicast addresses, make + * sure we respond with that as the source. Otherwise + * have ip_output_simple pick the source address. 
+ */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, + (IRE_LOCAL|IRE_LOOPBACK), NULL, ira->ira_zoneid, NULL, + MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY, 0, ipst, NULL); + if (ire != NULL) { + ire_refrele(ire); src = ipha->ipha_dst; } else { - if (ire != NULL) - ire_refrele(ire); - ire = ire_route_lookup(dst, 0, 0, 0, NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE|MATCH_IRE_ZONEONLY), - ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - freemsg(ipsec_mp); - return; - } - src = ire->ire_src_addr; + src = INADDR_ANY; + ixas.ixa_flags |= IXAF_SET_SOURCE; } - if (ire != NULL) - ire_refrele(ire); - /* * Check if we can send back more then 8 bytes in addition to * the IP header. We try to send 64 bytes of data and the internal @@ -3352,10 +3006,10 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, len_needed = IPH_HDR_LENGTH(ipha); if (ipha->ipha_protocol == IPPROTO_ENCAP || ipha->ipha_protocol == IPPROTO_IPV6) { - if (!pullupmsg(mp, -1)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - freemsg(ipsec_mp); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); return; } ipha = (ipha_t *)mp->b_rptr; @@ -3376,28 +3030,23 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, (void) adjmsg(mp, len_needed - msg_len); msg_len = len_needed; } - /* Make sure we propagate the cred/label for TX */ - mp1 = allocb_tmpl(sizeof (icmp_ipha) + len, mp); + mp1 = allocb(sizeof (icmp_ipha) + len, BPRI_MED); if (mp1 == NULL) { BUMP_MIB(&ipst->ips_icmp_mib, icmpOutErrors); - freemsg(ipsec_mp); + freemsg(mp); return; } mp1->b_cont = mp; mp = mp1; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL && - ipsec_mp->b_rptr == (uint8_t *)io && - io->ipsec_out_type == IPSEC_OUT); - ipsec_mp->b_cont = mp; /* - * Set ipsec_out_icmp_loopback so we can let the ICMP messages this + * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this * node generates be accepted in peace by all on-host destinations. 
* If we do NOT assume that all on-host destinations trust * self-generated ICMP messages, then rework here, ip6.c, and spd.c. - * (Look for ipsec_out_icmp_loopback). + * (Look for IXAF_TRUSTED_ICMP). */ - io->ipsec_out_icmp_loopback = B_TRUE; + ixas.ixa_flags |= IXAF_TRUSTED_ICMP; ipha = (ipha_t *)mp->b_rptr; mp1->b_wptr = (uchar_t *)ipha + (sizeof (icmp_ipha) + len); @@ -3416,7 +3065,9 @@ icmp_pkt(queue_t *q, mblk_t *mp, void *stuff, size_t len, icmph->icmph_checksum = 0; icmph->icmph_checksum = IP_CSUM(mp, (int32_t)sizeof (ipha_t), 0); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutMsgs); - put(q, ipsec_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /* @@ -3480,37 +3131,30 @@ icmp_err_rate_limit(ip_stack_t *ipst) * ICMP error packet should be sent. */ static mblk_t * -icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) +icmp_pkt_err_ok(mblk_t *mp, ip_recv_attr_t *ira) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; icmph_t *icmph; ipha_t *ipha; uint_t len_needed; - ire_t *src_ire; - ire_t *dst_ire; if (!mp) return (NULL); ipha = (ipha_t *)mp->b_rptr; if (ip_csum_hdr(ipha)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, NULL); freemsg(mp); return (NULL); } - src_ire = ire_ctable_lookup(ipha->ipha_dst, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - dst_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire != NULL || dst_ire != NULL || + if (ip_type_v4(ipha->ipha_dst, ipst) == IRE_BROADCAST || + ip_type_v4(ipha->ipha_src, ipst) == IRE_BROADCAST || CLASSD(ipha->ipha_dst) || CLASSD(ipha->ipha_src) || (ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET)) { /* Note: only errors to the fragment with offset 0 */ BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); freemsg(mp); - if (src_ire != NULL) - ire_refrele(src_ire); - if (dst_ire != NULL) - ire_refrele(dst_ire); return (NULL); } if (ipha->ipha_protocol == IPPROTO_ICMP) { @@ 
-3546,7 +3190,7 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) * If this is a labeled system, then check to see if we're allowed to * send a response to this particular sender. If not, then just drop. */ - if (is_system_labeled() && !tsol_can_reply_error(mp)) { + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { ip2dbg(("icmp_pkt_err_ok: can't respond to packet\n")); BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDrops); freemsg(mp); @@ -3565,956 +3209,178 @@ icmp_pkt_err_ok(mblk_t *mp, ip_stack_t *ipst) } /* - * Generate an ICMP redirect message. + * Called when a packet was sent out the same link that it arrived on. + * Check if it is ok to send a redirect and then send it. */ -static void -icmp_send_redirect(queue_t *q, mblk_t *mp, ipaddr_t gateway, ip_stack_t *ipst) +void +ip_send_potential_redirect_v4(mblk_t *mp, ipha_t *ipha, ire_t *ire, + ip_recv_attr_t *ira) { - icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipaddr_t src, nhop; + mblk_t *mp1; + ire_t *nhop_ire; /* - * We are called from ip_rput where we could - * not have attached an IPSEC_IN. - */ - ASSERT(mp->b_datap->db_type == M_DATA); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { + * Check the source address to see if it originated + * on the same logical subnet it is going back out on. + * If so, we should be able to send it a redirect. + * Avoid sending a redirect if the destination + * is directly connected (i.e., we matched an IRE_ONLINK), + * or if the packet was source routed out this interface. + * + * We avoid sending a redirect if the + * destination is directly connected + * because it is possible that multiple + * IP subnets may have been configured on + * the link, and the source may not + * be on the same subnet as ip destination, + * even though they are on the same + * physical link. 
+ */ + if ((ire->ire_type & IRE_ONLINK) || + ip_source_routed(ipha, ipst)) return; - } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_REDIRECT; - icmph.icmph_code = 1; - icmph.icmph_rd_gateway = gateway; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); - /* Redirects sent by router, and router is global zone */ - icmp_pkt(q, mp, &icmph, sizeof (icmph_t), B_FALSE, GLOBAL_ZONEID, ipst); -} -/* - * Generate an ICMP time exceeded message. - */ -void -icmp_time_exceeded(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, - ip_stack_t *ipst) -{ - icmph_t icmph; - boolean_t mctl_present; - mblk_t *first_mp; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); + nhop_ire = ire_nexthop(ire); + if (nhop_ire == NULL) return; - } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_TIME_EXCEEDED; - icmph.icmph_code = code; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); - icmp_pkt(q, first_mp, &icmph, sizeof (icmph_t), mctl_present, zoneid, - ipst); -} -/* - * Generate an ICMP unreachable message. 
- */ -void -icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code, zoneid_t zoneid, - ip_stack_t *ipst) -{ - icmph_t icmph; - mblk_t *first_mp; - boolean_t mctl_present; + nhop = nhop_ire->ire_addr; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); + if (nhop_ire->ire_type & IRE_IF_CLONE) { + ire_t *ire2; - if (!(mp = icmp_pkt_err_ok(mp, ipst))) { - if (mctl_present) - freeb(first_mp); - return; + /* Follow ire_dep_parent to find non-clone IRE_INTERFACE */ + mutex_enter(&nhop_ire->ire_lock); + ire2 = nhop_ire->ire_dep_parent; + if (ire2 != NULL) + ire_refhold(ire2); + mutex_exit(&nhop_ire->ire_lock); + ire_refrele(nhop_ire); + nhop_ire = ire2; } - - bzero(&icmph, sizeof (icmph_t)); - icmph.icmph_type = ICMP_DEST_UNREACHABLE; - icmph.icmph_code = code; - BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); - ip2dbg(("send icmp destination unreachable code %d\n", code)); - icmp_pkt(q, first_mp, (char *)&icmph, sizeof (icmph_t), mctl_present, - zoneid, ipst); -} - -/* - * Attempt to start recovery of an IPv4 interface that's been shut down as a - * duplicate. As long as someone else holds the address, the interface will - * stay down. When that conflict goes away, the interface is brought back up. - * This is done so that accidental shutdowns of addresses aren't made - * permanent. Your server will recover from a failure. - * - * For DHCP, recovery is not done in the kernel. Instead, it's handled by a - * user space process (dhcpagent). - * - * Recovery completes if ARP reports that the address is now ours (via - * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. - * - * This function is entered on a timer expiry; the ID is in ipif_recovery_id. - */ -static void -ipif_dup_recovery(void *arg) -{ - ipif_t *ipif = arg; - ill_t *ill = ipif->ipif_ill; - mblk_t *arp_add_mp; - mblk_t *arp_del_mp; - ip_stack_t *ipst = ill->ill_ipst; - - ipif->ipif_recovery_id = 0; - - /* - * No lock needed for moving or condemned check, as this is just an - * optimization. 
- */ - if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || - (ipif->ipif_flags & IPIF_POINTOPOINT) || - (ipif->ipif_state_flags & (IPIF_CONDEMNED))) { - /* No reason to try to bring this address back. */ + if (nhop_ire == NULL) return; - } - /* ACE_F_UNVERIFIED restarts DAD */ - if ((arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) - goto alloc_fail; - - if (ipif->ipif_arp_del_mp == NULL) { - if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) - goto alloc_fail; - ipif->ipif_arp_del_mp = arp_del_mp; - } + ASSERT(!(nhop_ire->ire_type & IRE_IF_CLONE)); - putnext(ill->ill_rq, arp_add_mp); - return; + src = ipha->ipha_src; -alloc_fail: /* - * On allocation failure, just restart the timer. Note that the ipif - * is down here, so no other thread could be trying to start a recovery - * timer. The ill_lock protects the condemned flag and the recovery - * timer ID. + * We look at the interface ire for the nexthop, + * to see if ipha_src is in the same subnet + * as the nexthop. */ - freemsg(arp_add_mp); - mutex_enter(&ill->ill_lock); - if (ipst->ips_ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0 && - !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, - MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - } - mutex_exit(&ill->ill_lock); -} - -/* - * This is for exclusive changes due to ARP. Either tear down an interface due - * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. 
- */ -/* ARGSUSED */ -static void -ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) -{ - ill_t *ill = rq->q_ptr; - arh_t *arh; - ipaddr_t src; - ipif_t *ipif; - char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ - char hbuf[MAC_STR_LEN]; - char sbuf[INET_ADDRSTRLEN]; - const char *failtype; - boolean_t bring_up; - ip_stack_t *ipst = ill->ill_ipst; - - switch (((arcn_t *)mp->b_rptr)->arcn_code) { - case AR_CN_READY: - failtype = NULL; - bring_up = B_TRUE; - break; - case AR_CN_FAILED: - failtype = "in use"; - bring_up = B_FALSE; - break; - default: - failtype = "claimed"; - bring_up = B_FALSE; - break; - } - - arh = (arh_t *)mp->b_cont->b_rptr; - bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); - - (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, - sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - ipif->ipif_lcl_addr != src) { - continue; - } - - /* - * If we failed on a recovery probe, then restart the timer to - * try again later. - */ - if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && - !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & IPIF_CONDEMNED) && - ipst->ips_ip_dup_recovery > 0 && - ipif->ipif_recovery_id == 0) { - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); - continue; - } - - /* - * If what we're trying to do has already been done, then do - * nothing. 
- */ - if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) - continue; - - ipif_get_name(ipif, ibuf, sizeof (ibuf)); - - if (failtype == NULL) { - cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, - ibuf); - } else { - cmn_err(CE_WARN, "%s has duplicate address %s (%s " - "by %s); disabled", ibuf, sbuf, failtype, hbuf); - } - - if (bring_up) { - ASSERT(ill->ill_dl_up); - /* - * Free up the ARP delete message so we can allocate - * a fresh one through the normal path. - */ - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - if (ipif_resolver_up(ipif, Res_act_initial) != - EINPROGRESS) { - ipif->ipif_addr_ready = 1; - (void) ipif_up_done(ipif); - ASSERT(ill->ill_move_ipif == NULL); - } - continue; - } - - mutex_enter(&ill->ill_lock); - ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); - ipif->ipif_flags |= IPIF_DUPLICATE; - ill->ill_ipif_dup_count++; - mutex_exit(&ill->ill_lock); + if ((src & nhop_ire->ire_mask) == (nhop & nhop_ire->ire_mask)) { /* - * Already exclusive on the ill; no need to handle deferred - * processing here. + * The source is directly connected. 
*/ - (void) ipif_down(ipif, NULL, NULL); - ipif_down_tail(ipif); - mutex_enter(&ill->ill_lock); - if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && - ill->ill_net_type == IRE_IF_RESOLVER && - !(ipif->ipif_state_flags & IPIF_CONDEMNED) && - ipst->ips_ip_dup_recovery > 0) { - ASSERT(ipif->ipif_recovery_id == 0); - ipif->ipif_recovery_id = timeout(ipif_dup_recovery, - ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + mp1 = copymsg(mp); + if (mp1 != NULL) { + icmp_send_redirect(mp1, nhop, ira); } - mutex_exit(&ill->ill_lock); } - freemsg(mp); -} - -/* ARGSUSED */ -static void -ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) -{ - ill_t *ill = rq->q_ptr; - arh_t *arh; - ipaddr_t src; - ipif_t *ipif; - - arh = (arh_t *)mp->b_cont->b_rptr; - bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src) - (void) ipif_resolver_up(ipif, Res_act_defend); - } - freemsg(mp); + ire_refrele(nhop_ire); } /* - * News from ARP. ARP sends notification of interesting events down - * to its clients using M_CTL messages with the interesting ARP packet - * attached via b_cont. - * The interesting event from a device comes up the corresponding ARP-IP-DEV - * queue as opposed to ARP sending the message to all the clients, i.e. all - * its ARP-IP-DEV instances. Thus, for AR_CN_ANNOUNCE, we must walk the cache - * table if a cache IRE is found to delete all the entries for the address in - * the packet. + * Generate an ICMP redirect message. 
*/ static void -ip_arp_news(queue_t *q, mblk_t *mp) +icmp_send_redirect(mblk_t *mp, ipaddr_t gateway, ip_recv_attr_t *ira) { - arcn_t *arcn; - arh_t *arh; - ire_t *ire = NULL; - char hbuf[MAC_STR_LEN]; - char sbuf[INET_ADDRSTRLEN]; - ipaddr_t src; - in6_addr_t v6src; - boolean_t isv6 = B_FALSE; - ipif_t *ipif; - ill_t *ill; - ip_stack_t *ipst; - - if (CONN_Q(q)) { - conn_t *connp = Q_TO_CONN(q); - - ipst = connp->conn_netstack->netstack_ip; - } else { - ill_t *ill = (ill_t *)q->q_ptr; - - ipst = ill->ill_ipst; - } + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { - if (q->q_next) { - putnext(q, mp); - } else - freemsg(mp); - return; - } - arh = (arh_t *)mp->b_cont->b_rptr; - /* Is it one we are interested in? */ - if (BE16_TO_U16(arh->arh_proto) == ETHERTYPE_IPV6) { - isv6 = B_TRUE; - bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &v6src, - IPV6_ADDR_LEN); - } else if (BE16_TO_U16(arh->arh_proto) == IP_ARP_PROTO_TYPE) { - bcopy((char *)&arh[1] + (arh->arh_hlen & 0xFF), &src, - IP_ADDR_LEN); - } else { - freemsg(mp); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) return; - } - - ill = q->q_ptr; - arcn = (arcn_t *)mp->b_rptr; - switch (arcn->arcn_code) { - case AR_CN_BOGON: - /* - * Someone is sending ARP packets with a source protocol - * address that we have published and for which we believe our - * entry is authoritative and (when ill_arp_extend is set) - * verified to be unique on the network. - * - * The ARP module internally handles the cases where the sender - * is just probing (for DAD) and where the hardware address of - * a non-authoritative entry has changed. Thus, these are the - * real conflicts, and we have to do resolution. - * - * We back away quickly from the address if it's from DHCP or - * otherwise temporary and hasn't been used recently (or at - * all). 
We'd like to include "deprecated" addresses here as - * well (as there's no real reason to defend something we're - * discarding), but IPMP "reuses" this flag to mean something - * other than the standard meaning. - * - * If the ARP module above is not extended (meaning that it - * doesn't know how to defend the address), then we just log - * the problem as we always did and continue on. It's not - * right, but there's little else we can do, and those old ATM - * users are going away anyway. - */ - (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, - hbuf, sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - if (isv6) { - ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL, - ipst); - } else { - ire = ire_cache_lookup(src, ALL_ZONES, NULL, ipst); - } - if (ire != NULL && IRE_IS_LOCAL(ire)) { - uint32_t now; - uint32_t maxage; - clock_t lused; - uint_t maxdefense; - uint_t defs; - - /* - * First, figure out if this address hasn't been used - * in a while. If it hasn't, then it's a better - * candidate for abandoning. - */ - ipif = ire->ire_ipif; - ASSERT(ipif != NULL); - now = gethrestime_sec(); - maxage = now - ire->ire_create_time; - if (maxage > ipst->ips_ip_max_temp_idle) - maxage = ipst->ips_ip_max_temp_idle; - lused = drv_hztousec(ddi_get_lbolt() - - ire->ire_last_used_time) / MICROSEC + 1; - if (lused >= maxage && (ipif->ipif_flags & - (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) - maxdefense = ipst->ips_ip_max_temp_defend; - else - maxdefense = ipst->ips_ip_max_defend; - - /* - * Now figure out how many times we've defended - * ourselves. Ignore defenses that happened long in - * the past. 
- */ - mutex_enter(&ire->ire_lock); - if ((defs = ire->ire_defense_count) > 0 && - now - ire->ire_defense_time > - ipst->ips_ip_defend_interval) { - ire->ire_defense_count = defs = 0; - } - ire->ire_defense_count++; - ire->ire_defense_time = now; - mutex_exit(&ire->ire_lock); - ill_refhold(ill); - ire_refrele(ire); - - /* - * If we've defended ourselves too many times already, - * then give up and tear down the interface(s) using - * this address. Otherwise, defend by sending out a - * gratuitous ARP. - */ - if (defs >= maxdefense && ill->ill_arp_extend) { - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, - B_FALSE); - } else { - cmn_err(CE_WARN, - "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - /* - * If this is an old (ATM) ARP module, then - * don't try to defend the address. Remain - * compatible with the old behavior. Defend - * only with new ARP. - */ - if (ill->ill_arp_extend) { - qwriter_ip(ill, q, mp, ip_arp_defend, - NEW_OP, B_FALSE); - } else { - ill_refrele(ill); - } - } - return; - } - cmn_err(CE_WARN, - "proxy ARP problem? Node '%s' is using %s on %s", - hbuf, sbuf, ill->ill_name); - if (ire != NULL) - ire_refrele(ire); - break; - case AR_CN_ANNOUNCE: - if (isv6) { - /* - * For XRESOLV interfaces. - * Delete the IRE cache entry and NCE for this - * v6 address - */ - ip_ire_clookup_and_delete_v6(&v6src, ipst); - /* - * If v6src is a non-zero, it's a router address - * as below. Do the same sort of thing to clean - * out off-net IRE_CACHE entries that go through - * the router. - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { - ire_walk_v6(ire_delete_cache_gw_v6, - (char *)&v6src, ALL_ZONES, ipst); - } - } else { - nce_hw_map_t hwm; - - /* - * ARP gives us a copy of any packet where it thinks - * the address has changed, so that we can update our - * caches. We're responsible for caching known answers - * in the current design. 
We check whether the - * hardware address really has changed in all of our - * entries that have cached this mapping, and if so, we - * blow them away. This way we will immediately pick - * up the rare case of a host changing hardware - * address. - */ - if (src == 0) - break; - hwm.hwm_addr = src; - hwm.hwm_hwlen = arh->arh_hlen; - hwm.hwm_hwaddr = (uchar_t *)(arh + 1); - NDP_HW_CHANGE_INCR(ipst->ips_ndp4); - ndp_walk_common(ipst->ips_ndp4, NULL, - (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES); - NDP_HW_CHANGE_DECR(ipst->ips_ndp4); - } - break; - case AR_CN_READY: - /* No external v6 resolver has a contract to use this */ - if (isv6) - break; - /* If the link is down, we'll retry this later */ - if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING)) - break; - ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL, - NULL, NULL, ipst); - if (ipif != NULL) { - /* - * If this is a duplicate recovery, then we now need to - * go exclusive to bring this thing back up. - */ - if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) == - IPIF_DUPLICATE) { - ipif_refrele(ipif); - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, - B_FALSE); - return; - } - /* - * If this is the first notice that this address is - * ready, then let the user know now. 
- */ - if ((ipif->ipif_flags & IPIF_UP) && - !ipif->ipif_addr_ready) { - ipif_mask_reply(ipif); - ipif_up_notify(ipif); - } - ipif->ipif_addr_ready = 1; - ipif_refrele(ipif); - } - ire = ire_cache_lookup(src, ALL_ZONES, msg_getlabel(mp), ipst); - if (ire != NULL) { - ire->ire_defense_count = 0; - ire_refrele(ire); - } - break; - case AR_CN_FAILED: - /* No external v6 resolver has a contract to use this */ - if (isv6) - break; - if (!ill->ill_arp_extend) { - (void) mac_colon_addr((uint8_t *)(arh + 1), - arh->arh_hlen, hbuf, sizeof (hbuf)); - (void) ip_dot_addr(src, sbuf); - - cmn_err(CE_WARN, - "node %s is using our IP address %s on %s", - hbuf, sbuf, ill->ill_name); - break; - } - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_excl, NEW_OP, B_FALSE); - return; - } - freemsg(mp); + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_REDIRECT; + icmph.icmph_code = 1; + icmph.icmph_rd_gateway = gateway; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutRedirects); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Create a mblk suitable for carrying the interface index and/or source link - * address. This mblk is tagged as an M_CTL and is sent to ULP. This is used - * when the IP_RECVIF and/or IP_RECVSLLA socket option is set by the user - * application. + * Generate an ICMP time exceeded message. 
*/ -mblk_t * -ip_add_info(mblk_t *data_mp, ill_t *ill, uint_t flags, zoneid_t zoneid, - ip_stack_t *ipst) +void +icmp_time_exceeded(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira) { - mblk_t *mp; - ip_pktinfo_t *pinfo; - ipha_t *ipha; - struct ether_header *pether; - boolean_t ipmp_ill_held = B_FALSE; - - mp = allocb(sizeof (ip_pktinfo_t), BPRI_MED); - if (mp == NULL) { - ip1dbg(("ip_add_info: allocation failure.\n")); - return (data_mp); - } - - ipha = (ipha_t *)data_mp->b_rptr; - pinfo = (ip_pktinfo_t *)mp->b_rptr; - bzero(pinfo, sizeof (ip_pktinfo_t)); - pinfo->ip_pkt_flags = (uchar_t)flags; - pinfo->ip_pkt_ulp_type = IN_PKTINFO; /* Tell ULP what type of info */ - - pether = (struct ether_header *)((char *)ipha - - sizeof (struct ether_header)); - - /* - * Make sure the interface is an ethernet type, since this option - * is currently supported only on this type of interface. Also make - * sure we are pointing correctly above db_base. - */ - if ((flags & IPF_RECVSLLA) && - ((uchar_t *)pether >= data_mp->b_datap->db_base) && - (ill->ill_type == IFT_ETHER) && - (ill->ill_net_type == IRE_IF_RESOLVER)) { - pinfo->ip_pkt_slla.sdl_type = IFT_ETHER; - bcopy(pether->ether_shost.ether_addr_octet, - pinfo->ip_pkt_slla.sdl_data, ETHERADDRL); - } else { - /* - * Clear the bit. Indicate to upper layer that IP is not - * sending this ancillary info. - */ - pinfo->ip_pkt_flags = pinfo->ip_pkt_flags & ~IPF_RECVSLLA; - } - - /* - * If `ill' is in an IPMP group, use the IPMP ill to determine - * IPF_RECVIF and IPF_RECVADDR. (This currently assumes that - * IPF_RECVADDR support on test addresses is not needed.) - * - * Note that `ill' may already be an IPMP ill if e.g. we're - * processing a packet looped back to an IPMP data address - * (since those IRE_LOCALs are tied to IPMP ills). 
- */ - if (IS_UNDER_IPMP(ill)) { - if ((ill = ipmp_ill_hold_ipmp_ill(ill)) == NULL) { - ip1dbg(("ip_add_info: cannot hold IPMP ill.\n")); - freemsg(mp); - return (data_mp); - } - ipmp_ill_held = B_TRUE; - } - - if (flags & (IPF_RECVIF | IPF_RECVADDR)) - pinfo->ip_pkt_ifindex = ill->ill_phyint->phyint_ifindex; - if (flags & IPF_RECVADDR) { - ipif_t *ipif; - ire_t *ire; - - /* - * Only valid for V4 - */ - ASSERT((ipha->ipha_version_and_hdr_length & 0xf0) == - (IPV4_VERSION << 4)); - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif != NULL) { - /* - * Since a decision has already been made to deliver the - * packet, there is no need to test for SECATTR and - * ZONEONLY. - * When a multicast packet is transmitted - * a cache entry is created for the multicast address. - * When delivering a copy of the packet or when new - * packets are received we do not want to match on the - * cached entry so explicitly match on - * IRE_LOCAL and IRE_LOOPBACK - */ - ire = ire_ctable_lookup(ipha->ipha_dst, 0, - IRE_LOCAL | IRE_LOOPBACK, - ipif, zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - if (ire == NULL) { - /* - * packet must have come on a different - * interface. - * Since a decision has already been made to - * deliver the packet, there is no need to test - * for SECATTR and ZONEONLY. - * Only match on local and broadcast ire's. - * See detailed comment above. - */ - ire = ire_ctable_lookup(ipha->ipha_dst, 0, - IRE_LOCAL | IRE_LOOPBACK, ipif, zoneid, - NULL, MATCH_IRE_TYPE, ipst); - } - - if (ire == NULL) { - /* - * This is either a multicast packet or - * the address has been removed since - * the packet was received. - * Return INADDR_ANY so that normal source - * selection occurs for the response. 
- */ - - pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; - } else { - pinfo->ip_pkt_match_addr.s_addr = - ire->ire_src_addr; - ire_refrele(ire); - } - ipif_refrele(ipif); - } else { - pinfo->ip_pkt_match_addr.s_addr = INADDR_ANY; - } - } - - if (ipmp_ill_held) - ill_refrele(ill); + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - mp->b_datap->db_type = M_CTL; - mp->b_wptr += sizeof (ip_pktinfo_t); - mp->b_cont = data_mp; + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) + return; - return (mp); + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_TIME_EXCEEDED; + icmph.icmph_code = code; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutTimeExcds); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Used to determine the most accurate cred_t to use for TX. - * First priority is SCM_UCRED having set the label in the message, - * which is used for MLP on UDP. Second priority is the open credentials - * with the peer's label (aka conn_effective_cred), which is needed for - * MLP on TCP/SCTP and for MAC-Exempt. Last priority is the open credentials. + * Generate an ICMP unreachable message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -cred_t * -ip_best_cred(mblk_t *mp, conn_t *connp, pid_t *pidp) +void +icmp_unreachable(mblk_t *mp, uint8_t code, ip_recv_attr_t *ira) { - cred_t *cr; + icmph_t icmph; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - cr = msg_getcred(mp, pidp); - if (cr != NULL && crgetlabel(cr) != NULL) - return (cr); - *pidp = NOPID; - return (CONN_CRED(connp)); + mp = icmp_pkt_err_ok(mp, ira); + if (mp == NULL) + return; + + bzero(&icmph, sizeof (icmph_t)); + icmph.icmph_type = ICMP_DEST_UNREACHABLE; + icmph.icmph_code = code; + BUMP_MIB(&ipst->ips_icmp_mib, icmpOutDestUnreachs); + icmp_pkt(mp, &icmph, sizeof (icmph_t), ira); } /* - * Latch in the IPsec state for a stream based on the ipsec_in_t passed in as - * part of the bind request. 
+ * Latch in the IPsec state for a stream based the policy in the listener + * and the actions in the ip_recv_attr_t. + * Called directly from TCP and SCTP. */ - boolean_t -ip_bind_ipsec_policy_set(conn_t *connp, mblk_t *policy_mp) +ip_ipsec_policy_inherit(conn_t *connp, conn_t *lconnp, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - - ASSERT(policy_mp != NULL); - ASSERT(policy_mp->b_datap->db_type == IPSEC_POLICY_SET); + ASSERT(lconnp->conn_policy != NULL); + ASSERT(connp->conn_policy == NULL); - ii = (ipsec_in_t *)policy_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + IPPH_REFHOLD(lconnp->conn_policy); + connp->conn_policy = lconnp->conn_policy; - connp->conn_policy = ii->ipsec_in_policy; - ii->ipsec_in_policy = NULL; - - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { if (connp->conn_latch == NULL) { connp->conn_latch = iplatch_create(); if (connp->conn_latch == NULL) return (B_FALSE); } - ipsec_latch_inbound(connp->conn_latch, ii); + ipsec_latch_inbound(connp, ira); } return (B_TRUE); } /* - * Upper level protocols (ULP) pass through bind requests to IP for inspection - * and to arrange for power-fanout assist. The ULP is identified by - * adding a single byte at the end of the original bind message. - * A ULP other than UDP or TCP that wishes to be recognized passes - * down a bind with a zero length address. - * - * The binding works as follows: - * - A zero byte address means just bind to the protocol. - * - A four byte address is treated as a request to validate - * that the address is a valid local address, appropriate for - * an application to bind to. This does not affect any fanout - * information in IP. - * - A sizeof sin_t byte address is used to bind to only the local address - * and port. - * - A sizeof ipa_conn_t byte address contains complete fanout information - * consisting of local and remote addresses and ports. 
In - * this case, the addresses are both validated as appropriate - * for this operation, and, if so, the information is retained - * for use in the inbound fanout. + * Verify whether or not the IP address is a valid local address. + * Could be a unicast, including one for a down interface. + * If allow_mcbc then a multicast or broadcast address is also + * acceptable. * - * The ULP (except in the zero-length bind) can append an - * additional mblk of db_type IRE_DB_REQ_TYPE or IPSEC_POLICY_SET to the - * T_BIND_REQ/O_T_BIND_REQ. IRE_DB_REQ_TYPE indicates that the ULP wants - * a copy of the source or destination IRE (source for local bind; - * destination for complete bind). IPSEC_POLICY_SET indicates that the - * policy information contained should be copied on to the conn. - * - * NOTE : Only one of IRE_DB_REQ_TYPE or IPSEC_POLICY_SET can be present. - */ -mblk_t * -ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) -{ - ssize_t len; - struct T_bind_req *tbr; - sin_t *sin; - ipa_conn_t *ac; - uchar_t *ucp; - int error = 0; - int protocol; - ipa_conn_x_t *acx; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - error = EINVAL; - goto bad_addr; - } - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - - len = MBLKL(mp); - if (len < (sizeof (*tbr) + 1)) { - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_bind: bogus msg, len %ld", len); - /* XXX: Need to return something better */ - goto bad_addr; - } - /* Back up and extract the protocol identifier. */ - mp->b_wptr--; - protocol = *mp->b_wptr & 0xFF; - tbr = (struct T_bind_req *)mp->b_rptr; - /* Reset the message type in preparation for shipping it back. 
*/ - DB_TYPE(mp) = M_PCPROTO; - - connp->conn_ulp = (uint8_t)protocol; - - /* - * Check for a zero length address. This is from a protocol that - * wants to register to receive all packets of its type. - */ - if (tbr->ADDR_length == 0) { - /* - * These protocols are now intercepted in ip_bind_v6(). - * Reject protocol-level binds here for now. - * - * For SCTP raw socket, ICMP sends down a bind with sin_t - * so that the protocol type cannot be SCTP. - */ - if (protocol == IPPROTO_TCP || protocol == IPPROTO_AH || - protocol == IPPROTO_ESP || protocol == IPPROTO_SCTP) { - goto bad_addr; - } - - /* - * - * The udp module never sends down a zero-length address, - * and allowing this on a labeled system will break MLP - * functionality. - */ - if (is_system_labeled() && protocol == IPPROTO_UDP) - goto bad_addr; - - if (connp->conn_mac_mode != CONN_MAC_DEFAULT) - goto bad_addr; - - /* No hash here really. The table is big enough. */ - connp->conn_srcv6 = ipv6_all_zeros; - - ipcl_proto_insert(connp, protocol); - - tbr->PRIM_type = T_BIND_ACK; - return (mp); - } - - /* Extract the address pointer from the message. */ - ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, - tbr->ADDR_length); - if (ucp == NULL) { - ip1dbg(("ip_bind: no address\n")); - goto bad_addr; - } - if (!OK_32PTR(ucp)) { - ip1dbg(("ip_bind: unaligned address\n")); - goto bad_addr; - } - - switch (tbr->ADDR_length) { - default: - ip1dbg(("ip_bind: bad address length %d\n", - (int)tbr->ADDR_length)); - goto bad_addr; - - case IP_ADDR_LEN: - /* Verification of local address only */ - error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol, - *(ipaddr_t *)ucp, 0, B_FALSE); - break; - - case sizeof (sin_t): - sin = (sin_t *)ucp; - error = ip_bind_laddr_v4(connp, &mp->b_cont, protocol, - sin->sin_addr.s_addr, sin->sin_port, B_TRUE); - break; - - case sizeof (ipa_conn_t): - ac = (ipa_conn_t *)ucp; - /* For raw socket, the local port is not set. 
*/ - if (ac->ac_lport == 0) - ac->ac_lport = connp->conn_lport; - /* Always verify destination reachability. */ - error = ip_bind_connected_v4(connp, &mp->b_cont, protocol, - &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport, - B_TRUE, B_TRUE, cr); - break; - - case sizeof (ipa_conn_x_t): - acx = (ipa_conn_x_t *)ucp; - /* - * Whether or not to verify destination reachability depends - * on the setting of the ACX_VERIFY_DST flag in acx->acx_flags. - */ - error = ip_bind_connected_v4(connp, &mp->b_cont, protocol, - &acx->acx_conn.ac_laddr, acx->acx_conn.ac_lport, - acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport, - B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0, cr); - break; - } - ASSERT(error != EINPROGRESS); - if (error != 0) - goto bad_addr; - - /* Send it home. */ - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - return (mp); - -bad_addr: - /* - * If error = -1 then we generate a TBADADDR - otherwise error is - * a unix errno. - */ - if (error > 0) - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - else - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - return (mp); -} - -/* - * Here address is verified to be a valid local address. - * If the IRE_DB_REQ_TYPE mp is present, a broadcast/multicast - * address is also considered a valid local address. * In the case of a broadcast/multicast address, however, the * upper protocol is expected to reset the src address - * to 0 if it sees a IRE_BROADCAST type returned so that + * to zero when we return IPVL_MCAST/IPVL_BCAST so that * no packets are emitted with broadcast/multicast address as * source address (that violates hosts requirements RFC 1122) * The addresses valid for bind are: @@ -4530,323 +3396,189 @@ bad_addr: * application still has to issue an * IP_ADD_MEMBERSHIP socket option. * - * On error, return -1 for TBADADDR otherwise pass the - * errno with TSYSERR reply. - * * In all the above cases, the bound address must be valid in the current zone. 
* When the address is loopback, multicast or broadcast, there might be many * matching IREs so bind has to look up based on the zone. - * - * Note: lport is in network byte order. - * */ -int -ip_bind_laddr_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, - ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) +ip_laddr_t +ip_laddr_verify_v4(ipaddr_t src_addr, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t allow_mcbc) { - int error = 0; - ire_t *src_ire; - zoneid_t zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; + ire_t *src_ire; - if (mpp) - mp = *mpp; + ASSERT(src_addr != INADDR_ANY); - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } + src_ire = ire_ftable_lookup_v4(src_addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, 0, ipst, NULL); /* - * If it was previously connected, conn_fully_bound would have - * been set. + * If an address other than in6addr_any is requested, + * we verify that it is a valid address for bind + * Note: Following code is in if-else-if form for + * readability compared to a condition check. */ - connp->conn_fully_bound = B_FALSE; - - src_ire = NULL; + if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) { + /* + * (2) Bind to address of local UP interface + */ + ire_refrele(src_ire); + return (IPVL_UNICAST_UP); + } else if (src_ire != NULL && src_ire->ire_type & IRE_BROADCAST) { + /* + * (4) Bind to broadcast address + */ + ire_refrele(src_ire); + if (allow_mcbc) + return (IPVL_BCAST); + else + return (IPVL_BAD); + } else if (CLASSD(src_addr)) { + /* (5) bind to multicast address. 
*/ + if (src_ire != NULL) + ire_refrele(src_ire); - zoneid = IPCL_ZONEID(connp); + if (allow_mcbc) + return (IPVL_MCAST); + else + return (IPVL_BAD); + } else { + ipif_t *ipif; - if (src_addr) { - src_ire = ire_route_lookup(src_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); /* - * If an address other than 0.0.0.0 is requested, - * we verify that it is a valid address for bind - * Note: Following code is in if-else-if form for - * readability compared to a condition check. + * (3) Bind to address of local DOWN interface? + * (ipif_lookup_addr() looks up all interfaces + * but we do not get here for UP interfaces + * - case (2) above) */ - /* LINTED - statement has no consequence */ - if (IRE_IS_LOCAL(src_ire)) { - /* - * (2) Bind to address of local UP interface - */ - } else if (src_ire && src_ire->ire_type == IRE_BROADCAST) { - /* - * (4) Bind to broadcast address - * Note: permitted only from transports that - * request IRE - */ - if (!ire_requested) - error = EADDRNOTAVAIL; - } else { - /* - * (3) Bind to address of local DOWN interface - * (ipif_lookup_addr() looks up all interfaces - * but we do not get here for UP interfaces - * - case (2) above) - */ - /* LINTED - statement has no consequent */ - if (ip_addr_exists(src_addr, zoneid, ipst)) { - /* The address exists */ - } else if (CLASSD(src_addr)) { - error = 0; - if (src_ire != NULL) - ire_refrele(src_ire); - /* - * (5) bind to multicast address. - * Fake out the IRE returned to upper - * layer to be a broadcast IRE. - */ - src_ire = ire_ctable_lookup( - INADDR_BROADCAST, INADDR_ANY, - IRE_BROADCAST, NULL, zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY), - ipst); - if (src_ire == NULL || !ire_requested) - error = EADDRNOTAVAIL; - } else { - /* - * Not a valid address for bind - */ - error = EADDRNOTAVAIL; - } - } - if (error) { - /* Red Alert! Attempting to be a bogon! 
*/ - ip1dbg(("ip_bind_laddr_v4: bad src address 0x%x\n", - ntohl(src_addr))); - goto bad_addr; + if (src_ire != NULL) + ire_refrele(src_ire); + + ipif = ipif_lookup_addr(src_addr, NULL, zoneid, ipst); + if (ipif == NULL) + return (IPVL_BAD); + + /* Not a useful source? */ + if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) { + ipif_refrele(ipif); + return (IPVL_BAD); } + ipif_refrele(ipif); + return (IPVL_UNICAST_DOWN); } +} + +/* + * Insert in the bind fanout for IPv4 and IPv6. + * The caller should already have used ip_laddr_verify_v*() before calling + * this. + */ +int +ip_laddr_fanout_insert(conn_t *connp) +{ + int error; /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached + * Allow setting new policies. For example, disconnects result + * in us being called. As we would have set conn_policy_cached * to B_TRUE before, we should set it to B_FALSE, so that policy * can change after the disconnect. */ connp->conn_policy_cached = B_FALSE; - /* - * If not fanout_insert this was just an address verification - */ - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. 
- */ - IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_remv6); - connp->conn_lport = lport; - connp->conn_fport = 0; - /* - * Do we need to add a check to reject Multicast packets - */ - error = ipcl_bind_insert(connp, protocol, src_addr, lport); - } - - if (error == 0) { - if (ire_requested) { - if (!ip_bind_get_ire_v4(mpp, src_ire, NULL, ipst)) { - error = -1; - /* Falls through to bad_addr */ - } - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - /* Falls through to bad_addr */ - } - } - } -bad_addr: + error = ipcl_bind_insert(connp); if (error != 0) { if (connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), - B_FALSE); + connp->conn_mlp_type, connp->conn_proto, + ntohs(connp->conn_lport), B_FALSE); } connp->conn_mlp_type = mlptSingle; } - if (src_ire != NULL) - IRE_REFRELE(src_ire); - return (error); -} - -int -ip_proto_bind_laddr_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, - ipaddr_t src_addr, uint16_t lport, boolean_t fanout_insert) -{ - int error; - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - connp->conn_ulp = protocol; - - error = ip_bind_laddr_v4(connp, ire_mpp, protocol, src_addr, lport, - fanout_insert); - if (error < 0) - error = -TBADADDR; return (error); } /* - * Verify that both the source and destination addresses - * are valid. If verify_dst is false, then the destination address may be - * unreachable, i.e. have no route to it. Protocols like TCP want to verify - * destination reachability, while tunnels do not. - * Note that we allow connect to broadcast and multicast - * addresses when ire_requested is set. Thus the ULP - * has to check for IRE_BROADCAST and multicast. + * Verify that both the source and destination addresses are valid. If + * IPDF_VERIFY_DST is not set, then the destination address may be unreachable, + * i.e. 
have no route to it. Protocols like TCP want to verify destination + * reachability, while tunnels do not. * - * Returns zero if ok. - * On error: returns -1 to mean TBADADDR otherwise returns an errno - * (for use with TSYSERR reply). + * Determine the route, the interface, and (optionally) the source address + * to use to reach a given destination. + * Note that we allow connect to broadcast and multicast addresses when + * IPDF_ALLOW_MCBC is set. + * first_hop and dst_addr are normally the same, but if source routing + * they will differ; in that case the first_hop is what we'll use for the + * routing lookup but the dce and label checks will be done on dst_addr, * - * Note: lport and fport are in network byte order. + * If uinfo is set, then we fill in the best available information + * we have for the destination. This is based on (in priority order) any + * metrics and path MTU stored in a dce_t, route metrics, and finally the + * ill_mtu. + * + * Tsol note: If we have a source route then dst_addr != firsthop. But we + * always do the label check on dst_addr. 
*/ int -ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, - ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, - boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr) +ip_set_destination_v4(ipaddr_t *src_addrp, ipaddr_t dst_addr, ipaddr_t firsthop, + ip_xmit_attr_t *ixa, iulp_t *uinfo, uint32_t flags, uint_t mac_mode) { - - ire_t *src_ire; - ire_t *dst_ire; + ire_t *ire = NULL; int error = 0; - ire_t *sire = NULL; - ire_t *md_dst_ire = NULL; - ire_t *lso_dst_ire = NULL; + ipaddr_t setsrc; /* RTF_SETSRC */ + zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */ + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + uint_t pmtu; + uint_t generation; + nce_t *nce; ill_t *ill = NULL; - zoneid_t zoneid; - ipaddr_t src_addr = *src_addrp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; - ts_label_t *tsl = NULL; - cred_t *effective_cred = NULL; - - if (mpp) - mp = *mpp; - - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } + boolean_t multirt = B_FALSE; - src_ire = dst_ire = NULL; + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); /* - * If we never got a disconnect before, clear it now. + * We never send to zero; the ULPs map it to the loopback address. + * We can't allow it since we use zero to mean unitialized in some + * places. */ - connp->conn_fully_bound = B_FALSE; + ASSERT(dst_addr != INADDR_ANY); - zoneid = IPCL_ZONEID(connp); - - /* - * Check whether Trusted Solaris policy allows communication with this - * host, and pretend that the destination is unreachable if not. - * - * This is never a problem for TCP, since that transport is known to - * compute the label properly as part of the tcp_rput_other T_BIND_ACK - * handling. If the remote is unreachable, it will be detected at that - * point, so there's no reason to check it here. 
- * - * Note that for sendto (and other datagram-oriented friends), this - * check is done as part of the data path label computation instead. - * The check here is just to make non-TCP connect() report the right - * error. - */ - if (is_system_labeled() && !IPCL_IS_TCP(connp)) { - if ((error = tsol_check_dest(cr, &dst_addr, IPV4_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) { - if (ip_debug > 2) { - pr_addr_dbg( - "ip_bind_connected_v4:" - " no label for dst %s\n", - AF_INET, &dst_addr); - } - goto bad_addr; - } + if (is_system_labeled()) { + ts_label_t *tsl = NULL; - /* - * tsol_check_dest() may have created a new cred with - * a modified security label. Use that cred if it exists - * for ire lookups. - */ - if (effective_cred == NULL) { - tsl = crgetlabel(cr); - } else { - tsl = crgetlabel(effective_cred); + error = tsol_check_dest(ixa->ixa_tsl, &dst_addr, IPV4_VERSION, + mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl); + if (error != 0) + return (error); + if (tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, tsl); } } - if (CLASSD(dst_addr)) { - /* Pick up an IRE_BROADCAST */ - dst_ire = ire_route_lookup(ip_g_all_ones, 0, 0, 0, NULL, - NULL, zoneid, tsl, - (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE | - MATCH_IRE_SECATTR), ipst); - } else { - /* - * If conn_dontroute is set or if conn_nexthop_set is set, - * and onlink ipif is not found set ENETUNREACH error. - */ - if (connp->conn_dontroute || connp->conn_nexthop_set) { - ipif_t *ipif; - - ipif = ipif_lookup_onlink_addr(connp->conn_dontroute ? - dst_addr : connp->conn_nexthop_v4, zoneid, ipst); - if (ipif == NULL) { - error = ENETUNREACH; - goto bad_addr; - } - ipif_refrele(ipif); - } + setsrc = INADDR_ANY; + /* + * Select a route; For IPMP interfaces, we would only select + * a "hidden" route (i.e., going through a specific under_ill) + * if ixa_ifindex has been specified. 
+ */ + ire = ip_select_route_v4(firsthop, ixa, &generation, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) + goto bad_addr; - if (connp->conn_nexthop_set) { - dst_ire = ire_route_lookup(connp->conn_nexthop_v4, 0, - 0, 0, NULL, NULL, zoneid, tsl, - MATCH_IRE_SECATTR, ipst); - } else { - dst_ire = ire_route_lookup(dst_addr, 0, 0, 0, NULL, - &sire, zoneid, tsl, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | - MATCH_IRE_SECATTR), ipst); - } - } /* - * dst_ire can't be a broadcast when not ire_requested. - * We also prevent ire's with src address INADDR_ANY to - * be used, which are created temporarily for - * sending out packets from endpoints that have - * conn_unspec_src set. If verify_dst is true, the destination must be - * reachable. If verify_dst is false, the destination needn't be - * reachable. + * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set. + * If IPDF_VERIFY_DST is set, the destination must be reachable; + * Otherwise the destination needn't be reachable. * * If we match on a reject or black hole, then we've got a * local failure. May as well fail out the connect() attempt, * since it's never going to succeed. */ - if (dst_ire == NULL || dst_ire->ire_src_addr == INADDR_ANY || - (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || - ((dst_ire->ire_type & IRE_BROADCAST) && !ire_requested)) { + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { /* * If we're verifying destination reachability, we always want * to complain here. @@ -4854,425 +3586,435 @@ ip_bind_connected_v4(conn_t *connp, mblk_t **mpp, uint8_t protocol, * If we're not verifying destination reachability but the * destination has a route, we still want to fail on the * temporary address and broadcast address tests. + * + * In both cases do we let the code continue so some reasonable + * information is returned to the caller. 
That enables the + * caller to use (and even cache) the IRE. conn_ip_ouput will + * use the generation mismatch path to check for the unreachable + * case thereby avoiding any specific check in the main path. */ - if (verify_dst || (dst_ire != NULL)) { - if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected_v4:" - "bad connected dst %s\n", - AF_INET, &dst_addr); - } - if (dst_ire == NULL || !(dst_ire->ire_type & IRE_HOST)) + ASSERT(generation == IRE_GENERATION_VERIFY); + if (flags & IPDF_VERIFY_DST) { + /* + * Set errno but continue to set up ixa_ire to be + * the RTF_REJECT|RTF_BLACKHOLE IRE. + * That allows callers to use ip_output to get an + * ICMP error back. + */ + if (!(ire->ire_type & IRE_HOST)) error = ENETUNREACH; else error = EHOSTUNREACH; - goto bad_addr; + } + } + + if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) && + !(flags & IPDF_ALLOW_MCBC)) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + generation = IRE_GENERATION_VERIFY; + error = ENETUNREACH; + } + + /* Cache things */ + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = generation; + + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v4 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Get an nce to cache. */ + nce = ire_to_nce(ire, firsthop, NULL); + if (nce == NULL) { + /* Allocation failure? 
*/ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } else { + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; } } /* - * If the app does a connect(), it means that it will most likely - * send more than 1 packet to the destination. It makes sense - * to clear the temporary flag. + * We use use ire_nexthop_ill to avoid the under ipmp + * interface for source address selection. Note that for ipmp + * probe packets, ixa_ifindex would have been specified, and + * the ip_select_route() invocation would have picked an ire + * will ire_ill pointing at an under interface. */ - if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb_t *irb = dst_ire->ire_bucket; + ill = ire_nexthop_ill(ire); - rw_enter(&irb->irb_lock, RW_WRITER); + /* + * If the source address is a loopback address, the + * destination had best be local or multicast. + * If we are sending to an IRE_LOCAL using a loopback source then + * it had better be the same zoneid. + */ + if (*src_addrp == htonl(INADDR_LOOPBACK)) { + if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + } + if (ire->ire_type & IRE_BROADCAST) { /* - * We need to recheck for IRE_MARK_TEMPORARY after acquiring - * the lock to guarantee irb_tmp_ire_cnt. + * If the ULP didn't have a specified source, then we + * make sure we reselect the source when sending + * broadcasts out different interfaces. 
*/ - if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { - dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + if (flags & IPDF_SELECT_SRC) + ixa->ixa_flags |= IXAF_SET_SOURCE; + else + ixa->ixa_flags &= ~IXAF_SET_SOURCE; } /* - * See if we should notify ULP about LSO/MDT; we do this whether or not - * ire_requested is TRUE, in order to handle active connects; LSO/MDT - * eligibility tests for passive connects are handled separately - * through tcp_adapt_ire(). We do this before the source address - * selection, because dst_ire may change after a call to - * ipif_select_source(). This is a best-effort check, as the - * packet for this connection may not actually go through - * dst_ire->ire_stq, and the exact IRE can only be known after - * calling ip_newroute(). This is why we further check on the - * IRE during LSO/Multidata packet transmission in - * tcp_lsosend()/tcp_multisend(). + * Does the caller want us to pick a source address? */ - if (!ipsec_policy_set && dst_ire != NULL && - !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && - (ill = ire_to_ill(dst_ire), ill != NULL)) { - if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { - lso_dst_ire = dst_ire; - IRE_REFHOLD(lso_dst_ire); - } else if (ipst->ips_ip_multidata_outbound && - ILL_MDT_CAPABLE(ill)) { - md_dst_ire = dst_ire; - IRE_REFHOLD(md_dst_ire); + if (flags & IPDF_SELECT_SRC) { + ipaddr_t src_addr; + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src_addr = htonl(INADDR_LOOPBACK); + /* Make sure we look for a better source address */ + generation = SRC_GENERATION_VERIFY; + } else { + error = ip_select_source_v4(ill, setsrc, dst_addr, + ixa->ixa_multicast_ifaddr, zoneid, + ipst, &src_addr, &generation, NULL); + if (error != 0) { + ire = NULL; /* Stored in ixa_ire */ + goto bad_addr; + } } - } - if (dst_ire != NULL && dst_ire->ire_type == IRE_LOCAL && - dst_ire->ire_zoneid != zoneid && 
dst_ire->ire_zoneid != ALL_ZONES) { /* - * If the IRE belongs to a different zone, look for a matching - * route in the forwarding table and use the source address from - * that route. + * We allow the source address to to down. + * However, we check that we don't use the loopback address + * as a source when sending out on the wire. */ - src_ire = ire_ftable_lookup(dst_addr, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - if (src_ire == NULL) { - error = EHOSTUNREACH; - goto bad_addr; - } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - if (!(src_ire->ire_type & IRE_HOST)) - error = ENETUNREACH; - else - error = EHOSTUNREACH; + if ((src_addr == htonl(INADDR_LOOPBACK)) && + !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) && + !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; goto bad_addr; } - if (src_addr == INADDR_ANY) - src_addr = src_ire->ire_src_addr; - ire_refrele(src_ire); - src_ire = NULL; - } else if ((src_addr == INADDR_ANY) && (dst_ire != NULL)) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - src_addr = sire->ire_src_addr; - ire_refrele(dst_ire); - dst_ire = sire; - sire = NULL; - } else { - /* - * Pick a source address so that a proper inbound - * load spreading would happen. - */ - ill_t *ire_ill = dst_ire->ire_ipif->ipif_ill; - ipif_t *src_ipif = NULL; - ire_t *ipif_ire; - /* - * Supply a local source address such that inbound - * load spreading happens. - * - * Determine the best source address on this ill for - * the destination. - * - * 1) For broadcast, we should return a broadcast ire - * found above so that upper layers know that the - * destination address is a broadcast address. - * - * 2) If the ipif is DEPRECATED, select a better - * source address. Similarly, if the ipif is on - * the IPMP meta-interface, pick a source address - * at random to improve inbound load spreading. 
- * - * 3) If the outgoing interface is part of a usesrc - * group, then try selecting a source address from - * the usesrc ILL. - */ - if ((dst_ire->ire_zoneid != zoneid && - dst_ire->ire_zoneid != ALL_ZONES) || - (!(dst_ire->ire_flags & RTF_SETSRC)) && - (!(dst_ire->ire_type & IRE_BROADCAST) && - (IS_IPMP(ire_ill) || - (dst_ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (ire_ill->ill_usesrc_ifindex != 0)))) { - /* - * If the destination is reachable via a - * given gateway, the selected source address - * should be in the same subnet as the gateway. - * Otherwise, the destination is not reachable. - * - * If there are no interfaces on the same subnet - * as the destination, ipif_select_source gives - * first non-deprecated interface which might be - * on a different subnet than the gateway. - * This is not desirable. Hence pass the dst_ire - * source address to ipif_select_source. - * It is sure that the destination is reachable - * with the dst_ire source address subnet. - * So passing dst_ire source address to - * ipif_select_source will make sure that the - * selected source will be on the same subnet - * as dst_ire source address. 
- */ - ipaddr_t saddr = - dst_ire->ire_ipif->ipif_src_addr; - src_ipif = ipif_select_source(ire_ill, - saddr, zoneid); - if (src_ipif != NULL) { - if (IS_VNI(src_ipif->ipif_ill)) { - /* - * For VNI there is no - * interface route - */ - src_addr = - src_ipif->ipif_src_addr; - } else { - ipif_ire = - ipif_to_ire(src_ipif); - if (ipif_ire != NULL) { - IRE_REFRELE(dst_ire); - dst_ire = ipif_ire; - } - src_addr = - dst_ire->ire_src_addr; - } - ipif_refrele(src_ipif); - } else { - src_addr = dst_ire->ire_src_addr; - } - } else { - src_addr = dst_ire->ire_src_addr; - } - } + *src_addrp = src_addr; + ixa->ixa_src_generation = generation; } + if (flags & IPDF_UNIQUE_DCE) { + /* Fallback to the default dce if allocation fails */ + dce = dce_lookup_and_add_v4(dst_addr, ipst); + if (dce != NULL) + generation = dce->dce_generation; + else + dce = dce_lookup_v4(dst_addr, ipst, &generation); + } else { + dce = dce_lookup_v4(dst_addr, ipst, &generation); + } + ASSERT(dce != NULL); + if (ixa->ixa_dce != NULL) + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = generation; + /* - * We do ire_route_lookup() here (and not - * interface lookup as we assert that - * src_addr should only come from an - * UP interface for hard binding. + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. 
*/ - ASSERT(src_ire == NULL); - src_ire = ire_route_lookup(src_addr, 0, 0, 0, NULL, - NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - /* src_ire must be a local|loopback */ - if (!IRE_IS_LOCAL(src_ire)) { - if (ip_debug > 2) { - pr_addr_dbg("ip_bind_connected_v4: bad connected " - "src %s\n", AF_INET, &src_addr); - } - error = EADDRNOTAVAIL; - goto bad_addr; + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } /* - * If the source address is a loopback address, the - * destination had best be local or multicast. - * The transports that can't handle multicast will reject - * those addresses. + * The caller has set IXAF_PMTU_DISCOVERY if path MTU is desired. + * However, we can't do it for IPv4 multicast or broadcast. */ - if (src_ire->ire_type == IRE_LOOPBACK && - !(IRE_IS_LOCAL(dst_ire) || CLASSD(dst_addr))) { - ip1dbg(("ip_bind_connected_v4: bad connected loopback\n")); - error = -1; - goto bad_addr; - } + if (ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. + * Set initial value for fragmentation limit. Either conn_ip_output + * or ULP might updates it when there are routing changes. + * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT. */ - connp->conn_policy_cached = B_FALSE; + pmtu = ip_get_pmtu(ixa); + ixa->ixa_fragsize = pmtu; + /* Make sure ixa_fragsize and ixa_pmtu remain identical */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) + ixa->ixa_pmtu = pmtu; /* - * Set the conn addresses/ports immediately, so the IPsec policy calls - * can handle their passed-in conn's. + * Extract information useful for some transports. + * First we look for DCE metrics. 
Then we take what we have in + * the metrics in the route, where the offlink is used if we have + * one. */ + if (uinfo != NULL) { + bzero(uinfo, sizeof (*uinfo)); - IN6_IPADDR_TO_V4MAPPED(src_addr, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(dst_addr, &connp->conn_remv6); - connp->conn_lport = lport; - connp->conn_fport = fport; - *src_addrp = src_addr; + if (dce->dce_flags & DCEF_UINFO) + *uinfo = dce->dce_uinfo; - ASSERT(!(ipsec_policy_set && ire_requested)); - if (ire_requested) { - iulp_t *ulp_info = NULL; + rts_merge_metrics(uinfo, &ire->ire_metrics); - /* - * Note that sire will not be NULL if this is an off-link - * connection and there is not cache for that dest yet. - * - * XXX Because of an existing bug, if there are multiple - * default routes, the IRE returned now may not be the actual - * default route used (default routes are chosen in a - * round robin fashion). So if the metrics for different - * default routes are different, we may return the wrong - * metrics. This will not be a problem if the existing - * bug is fixed. - */ - if (sire != NULL) { - ulp_info = &(sire->ire_uinfo); - } - if (!ip_bind_get_ire_v4(mpp, dst_ire, ulp_info, ipst)) { - error = -1; - goto bad_addr; - } - mp = *mpp; - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - goto bad_addr; - } + /* Allow ire_metrics to decrease the path MTU from above */ + if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu) + uinfo->iulp_mtu = pmtu; + + uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0; + uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0; + uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0; } - /* - * Cache IPsec policy in this conn. If we have per-socket policy, - * we'll cache that. If we don't, we'll inherit global policy. - * - * We can't insert until the conn reflects the policy. Note that - * conn_policy_cached is set by ipsec_conn_cache_policy() even for - * connections where we don't have a policy. 
This is to prevent - * global policy lookups in the inbound path. - * - * If we insert before we set conn_policy_cached, - * CONN_INBOUND_POLICY_PRESENT() check can still evaluate true - * because global policy cound be non-empty. We normally call - * ipsec_check_policy() for conn_policy_cached connections only if - * ipc_in_enforce_policy is set. But in this case, - * conn_policy_cached can get set anytime since we made the - * CONN_INBOUND_POLICY_PRESENT() check and ipsec_check_policy() is - * called, which will make the above assumption false. Thus, we - * need to insert after we set conn_policy_cached. - */ - if ((error = ipsec_conn_cache_policy(connp, B_TRUE)) != 0) - goto bad_addr; + if (ill != NULL) + ill_refrele(ill); - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. - */ - error = ipcl_conn_insert(connp, protocol, src_addr, - dst_addr, connp->conn_ports); - } + return (error); - if (error == 0) { - connp->conn_fully_bound = B_TRUE; - /* - * Our initial checks for LSO/MDT have passed; the IRE is not - * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to - * be supporting LSO/MDT. Pass the IRE, IPC and ILL into - * ip_xxinfo_return(), which performs further checks - * against them and upon success, returns the LSO/MDT info - * mblk which we will attach to the bind acknowledgment. 
- */ - if (lso_dst_ire != NULL) { - mblk_t *lsoinfo_mp; - - ASSERT(ill->ill_lso_capab != NULL); - if ((lsoinfo_mp = ip_lsoinfo_return(lso_dst_ire, connp, - ill->ill_name, ill->ill_lso_capab)) != NULL) { - if (mp == NULL) { - *mpp = lsoinfo_mp; - } else { - linkb(mp, lsoinfo_mp); - } - } - } else if (md_dst_ire != NULL) { - mblk_t *mdinfo_mp; - - ASSERT(ill->ill_mdt_capab != NULL); - if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - ill->ill_name, ill->ill_mdt_capab)) != NULL) { - if (mp == NULL) { - *mpp = mdinfo_mp; - } else { - linkb(mp, mdinfo_mp); - } - } - } - } bad_addr: - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); - /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. - */ - *mpp = NULL; + if (ire != NULL) + ire_refrele(ire); + + if (ill != NULL) + ill_refrele(ill); + + /* + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. + */ + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } - if (src_ire != NULL) - IRE_REFRELE(src_ire); - if (dst_ire != NULL) - IRE_REFRELE(dst_ire); - if (sire != NULL) - IRE_REFRELE(sire); - if (md_dst_ire != NULL) - IRE_REFRELE(md_dst_ire); - if (lso_dst_ire != NULL) - IRE_REFRELE(lso_dst_ire); - if (effective_cred != NULL) - crfree(effective_cred); + return (error); } -int -ip_proto_bind_connected_v4(conn_t *connp, mblk_t **ire_mpp, uint8_t protocol, - ipaddr_t *src_addrp, uint16_t lport, ipaddr_t dst_addr, uint16_t fport, - boolean_t fanout_insert, boolean_t verify_dst, cred_t *cr) + +/* + * Get the base MTU for the case when path MTU discovery is not used. + * Takes the MTU of the IRE into account. 
+ */ +uint_t +ip_get_base_mtu(ill_t *ill, ire_t *ire) { - int error; - - ASSERT(!connp->conn_af_isv6); - connp->conn_pkt_isv6 = B_FALSE; - connp->conn_ulp = protocol; - - /* For raw socket, the local port is not set. */ - if (lport == 0) - lport = connp->conn_lport; - error = ip_bind_connected_v4(connp, ire_mpp, protocol, - src_addrp, lport, dst_addr, fport, fanout_insert, verify_dst, cr); - if (error < 0) - error = -TBADADDR; - return (error); + uint_t mtu = ill->ill_mtu; + uint_t iremtu = ire->ire_metrics.iulp_mtu; + + if (iremtu != 0 && iremtu < mtu) + mtu = iremtu; + + return (mtu); } /* - * Get the ire in *mpp. Returns false if it fails (due to lack of space). - * Prefers dst_ire over src_ire. + * Get the PMTU for the attributes. Handles both IPv4 and IPv6. + * Assumes that ixa_ire, dce, and nce have already been set up. + * + * The caller has set IXAF_PMTU_DISCOVERY if path MTU discovery is desired. + * We avoid path MTU discovery if it is disabled with ndd. + * Furtermore, if the path MTU is too small, then we don't set DF for IPv4. + * + * NOTE: We also used to turn it off for source routed packets. That + * is no longer required since the dce is per final destination. */ -static boolean_t -ip_bind_get_ire_v4(mblk_t **mpp, ire_t *ire, iulp_t *ulp_info, ip_stack_t *ipst) +uint_t +ip_get_pmtu(ip_xmit_attr_t *ixa) { - mblk_t *mp = *mpp; - ire_t *ret_ire; + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + nce_t *nce; + ire_t *ire; + uint_t pmtu; - ASSERT(mp != NULL); + ire = ixa->ixa_ire; + dce = ixa->ixa_dce; + nce = ixa->ixa_nce; - if (ire != NULL) { - /* - * mp initialized above to IRE_DB_REQ_TYPE - * appended mblk. Its <upper protocol>'s - * job to make sure there is room. - */ - if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) - return (B_FALSE); + /* + * If path MTU discovery has been turned off by ndd, then we ignore + * any dce_pmtu and for IPv4 we will not set DF. 
+ */ + if (!ipst->ips_ip_path_mtu_discovery) + ixa->ixa_flags &= ~IXAF_PMTU_DISCOVERY; - mp->b_datap->db_type = IRE_DB_TYPE; - mp->b_wptr = mp->b_rptr + sizeof (ire_t); - bcopy(ire, mp->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp->b_rptr; + pmtu = IP_MAXPACKET; + /* + * Decide whether whether IPv4 sets DF + * For IPv6 "no DF" means to use the 1280 mtu + */ + if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) { + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; + } else { + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) + pmtu = IPV6_MIN_MTU; + } + + /* Check if the PMTU is to old before we use it */ + if ((dce->dce_flags & DCEF_PMTU) && + TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { /* - * Pass the latest setting of the ip_path_mtu_discovery and - * copy the ulp info if any. + * Older than 20 minutes. Drop the path MTU information. */ - ret_ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? - IPH_DF : 0; - if (ulp_info != NULL) { - bcopy(ulp_info, &(ret_ire->ire_uinfo), - sizeof (iulp_t)); + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + } + + /* The metrics on the route can lower the path MTU */ + if (ire->ire_metrics.iulp_mtu != 0 && + ire->ire_metrics.iulp_mtu < pmtu) + pmtu = ire->ire_metrics.iulp_mtu; + + /* + * If the path MTU is smaller than some minimum, we still use dce_pmtu + * above (would be 576 for IPv4 and 1280 for IPv6), but we clear + * IXAF_PMTU_IPV4_DF so that we avoid setting DF for IPv4. 
+ */ + if (ixa->ixa_flags & IXAF_PMTU_DISCOVERY) { + if (dce->dce_flags & DCEF_PMTU) { + if (dce->dce_pmtu < pmtu) + pmtu = dce->dce_pmtu; + + if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) { + ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } else { + ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; + } + } else { + ixa->ixa_flags &= ~IXAF_PMTU_TOO_SMALL; + ixa->ixa_flags |= IXAF_PMTU_IPV4_DF; } - ret_ire->ire_mp = mp; - } else { + } + + /* + * If we have an IRE_LOCAL we use the loopback mtu instead of + * the ill for going out the wire i.e., IRE_LOCAL gets the same + * mtu as IRE_LOOPBACK. + */ + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { + uint_t loopback_mtu; + + loopback_mtu = (ire->ire_ipversion == IPV6_VERSION) ? + ip_loopback_mtu_v6plus : ip_loopback_mtuplus; + + if (loopback_mtu < pmtu) + pmtu = loopback_mtu; + } else if (nce != NULL) { /* - * No IRE was found. Remove IRE mblk. + * Make sure we don't exceed the interface MTU. + * In the case of RTF_REJECT or RTF_BLACKHOLE we might not have + * an ill. We'd use the above IP_MAXPACKET in that case just + * to tell the transport something larger than zero. */ - *mpp = mp->b_cont; - freeb(mp); + if (nce->nce_common->ncec_ill->ill_mtu < pmtu) + pmtu = nce->nce_common->ncec_ill->ill_mtu; + if (nce->nce_common->ncec_ill != nce->nce_ill && + nce->nce_ill->ill_mtu < pmtu) { + /* + * for interfaces in an IPMP group, the mtu of + * the nce_ill (under_ill) could be different + * from the mtu of the ncec_ill, so we take the + * min of the two. + */ + pmtu = nce->nce_ill->ill_mtu; + } } - return (B_TRUE); + + /* + * Handle the IPV6_USE_MIN_MTU socket option or ancillary data. + * Only applies to IPv6. 
+ */ + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) { + if (ixa->ixa_flags & IXAF_USE_MIN_MTU) { + switch (ixa->ixa_use_min_mtu) { + case IPV6_USE_MIN_MTU_MULTICAST: + if (ire->ire_type & IRE_MULTICAST) + pmtu = IPV6_MIN_MTU; + break; + case IPV6_USE_MIN_MTU_ALWAYS: + pmtu = IPV6_MIN_MTU; + break; + case IPV6_USE_MIN_MTU_NEVER: + break; + } + } else { + /* Default is IPV6_USE_MIN_MTU_MULTICAST */ + if (ire->ire_type & IRE_MULTICAST) + pmtu = IPV6_MIN_MTU; + } + } + + /* + * After receiving an ICMPv6 "packet too big" message with a + * MTU < 1280, and for multirouted IPv6 packets, the IP layer + * will insert a 8-byte fragment header in every packet. We compensate + * for those cases by returning a smaller path MTU to the ULP. + * + * In the case of CGTP then ip_output will add a fragment header. + * Make sure there is room for it by telling a smaller number + * to the transport. + * + * When IXAF_IPV6_ADDR_FRAGHDR we subtract the frag hdr here + * so the ULPs consistently see a iulp_pmtu and ip_get_pmtu() + * which is the size of the packets it can send. + */ + if (!(ixa->ixa_flags & IXAF_IS_IPV4)) { + if ((dce->dce_flags & DCEF_TOO_SMALL_PMTU) || + (ire->ire_flags & RTF_MULTIRT) || + (ixa->ixa_flags & IXAF_MULTIRT_MULTICAST)) { + pmtu -= sizeof (ip6_frag_t); + ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR; + } + } + + return (pmtu); } /* @@ -5386,6 +4128,7 @@ ip_modclose(ill_t *ill) queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; int i; + arl_ill_common_t *ai = ill->ill_common; /* * The punlink prior to this may have initiated a capability @@ -5452,6 +4195,7 @@ ip_modclose(ill_t *ill) mutex_enter(&ill->ill_lock); while (!ill_is_freeable(ill)) cv_wait(&ill->ill_cv, &ill->ill_lock); + while (ill->ill_waiters) cv_wait(&ill->ill_cv, &ill->ill_lock); @@ -5466,12 +4210,16 @@ ip_modclose(ill_t *ill) /* qprocsoff is done via ill_delete_tail */ ill_delete_tail(ill); + /* + * synchronously wait for arp stream to unbind. 
After this, we + * cannot get any data packets up from the driver. + */ + arp_unbind_complete(ill); ASSERT(ill->ill_ipst == NULL); /* - * Walk through all upper (conn) streams and qenable - * those that have queued data. - * close synchronization needs this to + * Walk through all conns and qenable those that have queued data. + * Close synchronization needs this to * be done to ensure that all upper layers blocked * due to flow control to the closing device * get unblocked. @@ -5481,6 +4229,25 @@ ip_modclose(ill_t *ill) conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]); } + /* + * ai can be null if this is an IPv6 ill, or if the IPv4 + * stream is being torn down before ARP was plumbed (e.g., + * /sbin/ifconfig plumbing a stream twice, and encountering + * an error + */ + if (ai != NULL) { + ASSERT(!ill->ill_isv6); + mutex_enter(&ai->ai_lock); + ai->ai_ill = NULL; + if (ai->ai_arl == NULL) { + mutex_destroy(&ai->ai_lock); + kmem_free(ai, sizeof (*ai)); + } else { + cv_signal(&ai->ai_ill_unplumb_done); + mutex_exit(&ai->ai_lock); + } + } + mutex_enter(&ipst->ips_ip_mi_lock); mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); mutex_exit(&ipst->ips_ip_mi_lock); @@ -5492,6 +4259,12 @@ ip_modclose(ill_t *ill) if (ill->ill_credp != NULL) crfree(ill->ill_credp); + mutex_destroy(&ill->ill_saved_ire_lock); + mutex_destroy(&ill->ill_lock); + rw_destroy(&ill->ill_mcast_lock); + mutex_destroy(&ill->ill_mcast_serializer); + list_destroy(&ill->ill_nce); + /* * Now we are done with the module close pieces that * need the netstack_t. @@ -5525,11 +4298,8 @@ ip_quiesce_conn(conn_t *connp) * Mark the conn as closing, and this conn must not be * inserted in future into any list. Eg. conn_drain_insert(), * won't insert this conn into the conn_drain_list. - * Similarly ill_pending_mp_add() will not add any mp to - * the pending mp list, after this conn has started closing. * - * conn_idl, conn_pending_ill, conn_down_pending_ill, conn_ilg - * cannot get set henceforth. 
+ * conn_idl, and conn_ilg cannot get set henceforth. */ mutex_enter(&connp->conn_lock); ASSERT(!(connp->conn_state_flags & CONN_QUIESCED)); @@ -5541,9 +4311,10 @@ ip_quiesce_conn(conn_t *connp) if (connp->conn_dhcpinit_ill != NULL) { ASSERT(connp->conn_dhcpinit_ill->ill_dhcpinit != 0); atomic_dec_32(&connp->conn_dhcpinit_ill->ill_dhcpinit); + ill_set_inputfn(connp->conn_dhcpinit_ill); connp->conn_dhcpinit_ill = NULL; } - if (connp->conn_ilg_inuse != 0) + if (connp->conn_ilg != NULL) ilg_cleanup_reqd = B_TRUE; mutex_exit(&connp->conn_lock); @@ -5552,7 +4323,7 @@ ip_quiesce_conn(conn_t *connp) if (is_system_labeled() && connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, + connp->conn_mlp_type, connp->conn_proto, ntohs(connp->conn_lport), B_FALSE); connp->conn_anon_port = 0; } @@ -5568,21 +4339,22 @@ ip_quiesce_conn(conn_t *connp) /* * Remove this conn from the drain list, and do * any other cleanup that may be required. - * (Only non-tcp streams may have a non-null conn_idl. - * TCP streams are never flow controlled, and + * (Only non-tcp conns may have a non-null conn_idl. + * TCP conns are never flow controlled, and * conn_idl will be null) */ - if (drain_cleanup_reqd) + if (drain_cleanup_reqd && connp->conn_idl != NULL) { + mutex_enter(&connp->conn_idl->idl_lock); conn_drain_tail(connp, B_TRUE); + mutex_exit(&connp->conn_idl->idl_lock); + } if (connp == ipst->ips_ip_g_mrouter) - (void) ip_mrouter_done(NULL, ipst); + (void) ip_mrouter_done(ipst); if (ilg_cleanup_reqd) ilg_delete_all(connp); - conn_delete_ire(connp, NULL); - /* * Now conn refcnt can increase only thru CONN_INC_REF_LOCKED. * callers from write side can't be there now because close @@ -5603,8 +4375,6 @@ ip_close(queue_t *q, int flags) { conn_t *connp; - TRACE_1(TR_FAC_IP, TR_IP_CLOSE, "ip_close: q %p", q); - /* * Call the appropriate delete routine depending on whether this is * a module or device. 
@@ -5646,13 +4416,21 @@ ip_close(queue_t *q, int flags) */ /*ARGSUSED2*/ static void -ip_conn_input(void *arg1, mblk_t *mp, void *arg2) +ip_conn_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg1; putnext(connp->conn_rq, mp); } +/* Dummy in case ICMP error delivery is attempted to a /dev/ip instance */ +/* ARGSUSED */ +static void +ip_conn_input_icmp(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + freemsg(mp); +} + /* * Called when the module is about to be unloaded */ @@ -5667,6 +4445,7 @@ ip_ddi_destroy(void) sctp_ddi_g_destroy(); tcp_ddi_g_destroy(); ilb_ddi_g_destroy(); + dce_g_destroy(); ipsec_policy_g_destroy(); ipcl_g_destroy(); ip_net_g_destroy(); @@ -5709,16 +4488,12 @@ ip_stack_shutdown(netstackid_t stackid, void *arg) */ ipv4_hook_shutdown(ipst); ipv6_hook_shutdown(ipst); + arp_hook_shutdown(ipst); mutex_enter(&ipst->ips_capab_taskq_lock); ipst->ips_capab_taskq_quit = B_TRUE; cv_signal(&ipst->ips_capab_taskq_cv); mutex_exit(&ipst->ips_capab_taskq_lock); - - mutex_enter(&ipst->ips_mrt_lock); - ipst->ips_mrt_flags |= IP_MRT_STOP; - cv_signal(&ipst->ips_mrt_cv); - mutex_exit(&ipst->ips_mrt_lock); } /* @@ -5741,18 +4516,12 @@ ip_stack_fini(netstackid_t stackid, void *arg) ipobs_fini(ipst); ipv4_hook_destroy(ipst); ipv6_hook_destroy(ipst); + arp_hook_destroy(ipst); ip_net_destroy(ipst); mutex_destroy(&ipst->ips_capab_taskq_lock); cv_destroy(&ipst->ips_capab_taskq_cv); - mutex_enter(&ipst->ips_mrt_lock); - while (!(ipst->ips_mrt_flags & IP_MRT_DONE)) - cv_wait(&ipst->ips_mrt_done_cv, &ipst->ips_mrt_lock); - mutex_destroy(&ipst->ips_mrt_lock); - cv_destroy(&ipst->ips_mrt_cv); - cv_destroy(&ipst->ips_mrt_done_cv); - ipmp_destroy(ipst); rw_destroy(&ipst->ips_srcid_lock); @@ -5773,10 +4542,10 @@ ip_stack_fini(netstackid_t stackid, void *arg) kmem_free(ipst->ips_ndp_arr, sizeof (lcl_ndp_arr)); ipst->ips_ndp_arr = NULL; + dce_stack_destroy(ipst); ip_mrouter_stack_destroy(ipst); 
mutex_destroy(&ipst->ips_ip_mi_lock); - rw_destroy(&ipst->ips_ipsec_capab_ills_lock); rw_destroy(&ipst->ips_ill_g_usesrc_lock); rw_destroy(&ipst->ips_ip_g_nd_lock); @@ -5808,13 +4577,6 @@ ip_stack_fini(netstackid_t stackid, void *arg) ASSERT(ipst->ips_mld_slowtimeout_id != 0); ipst->ips_mld_slowtimeout_id = 0; } - ret = untimeout(ipst->ips_ip_ire_expire_id); - if (ret == -1) { - ASSERT(ipst->ips_ip_ire_expire_id == 0); - } else { - ASSERT(ipst->ips_ip_ire_expire_id != 0); - ipst->ips_ip_ire_expire_id = 0; - } mutex_destroy(&ipst->ips_igmp_timer_lock); mutex_destroy(&ipst->ips_mld_timer_lock); @@ -5915,6 +4677,10 @@ ip_ddi_init(void) list_create(&ip_thread_list, sizeof (th_hash_t), offsetof(th_hash_t, thh_link)); #endif + ipsec_policy_g_init(); + tcp_ddi_g_init(); + sctp_ddi_g_init(); + dce_g_init(); /* * We want to be informed each time a stack is created or @@ -5924,10 +4690,6 @@ ip_ddi_init(void) netstack_register(NS_IP, ip_stack_init, ip_stack_shutdown, ip_stack_fini); - ipsec_policy_g_init(); - tcp_ddi_g_init(); - sctp_ddi_g_init(); - tnet_init(); udp_ddi_g_init(); @@ -5973,7 +4735,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) mutex_init(&ipst->ips_ip_mi_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ipst->ips_ip_addr_avail_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&ipst->ips_ill_g_lock, NULL, RW_DEFAULT, NULL); - rw_init(&ipst->ips_ipsec_capab_ills_lock, NULL, RW_DEFAULT, NULL); rw_init(&ipst->ips_ill_g_usesrc_lock, NULL, RW_DEFAULT, NULL); ipcl_init(ipst); @@ -5982,6 +4743,7 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipif_init(ipst); conn_drain_init(ipst); ip_mrouter_stack_init(ipst); + dce_stack_init(ipst); ipst->ips_ip_g_frag_timeout = IP_FRAG_TIMEOUT; ipst->ips_ip_g_frag_timo_ms = IP_FRAG_TIMEOUT * 1000; @@ -6026,9 +4788,12 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) ipst->ips_ip_src_id = 1; rw_init(&ipst->ips_srcid_lock, NULL, RW_DEFAULT, NULL); + ipst->ips_src_generation = SRC_GENERATION_INITIAL; + ip_net_init(ipst, ns); 
ipv4_hook_init(ipst); ipv6_hook_init(ipst); + arp_hook_init(ipst); ipmp_init(ipst); ipobs_init(ipst); @@ -6040,15 +4805,6 @@ ip_stack_init(netstackid_t stackid, netstack_t *ns) mutex_init(&ipst->ips_capab_taskq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ipst->ips_capab_taskq_cv, NULL, CV_DEFAULT, NULL); - /* - * Create the mcast_restart_timers_thread() worker thread. - */ - mutex_init(&ipst->ips_mrt_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&ipst->ips_mrt_cv, NULL, CV_DEFAULT, NULL); - cv_init(&ipst->ips_mrt_done_cv, NULL, CV_DEFAULT, NULL); - ipst->ips_mrt_thread = thread_create(NULL, 0, - mcast_restart_timers_thread, ipst, 0, &p0, TS_RUN, minclsyspri); - major = mod_name_to_major(INET_NAME); (void) ldi_ident_from_major(major, &ipst->ips_ldi_ident); return (ipst); @@ -6161,37 +4917,26 @@ mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) } /* - * Send an ICMP error after patching up the packet appropriately. Returns - * non-zero if the appropriate MIB should be bumped; zero otherwise. + * Called when it is conceptually a ULP that would sent the packet + * e.g., port unreachable and protocol unreachable. Check that the packet + * would have passed the IPsec global policy before sending the error. + * + * Send an ICMP error after patching up the packet appropriately. + * Uses ip_drop_input and bumps the appropriate MIB. 
*/ -static boolean_t -ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, - uint_t icmp_type, uint_t icmp_code, boolean_t mctl_present, - zoneid_t zoneid, ip_stack_t *ipst) +void +ip_fanout_send_icmp_v4(mblk_t *mp, uint_t icmp_type, uint_t icmp_code, + ip_recv_attr_t *ira) { - ipha_t *ipha; - mblk_t *first_mp; - boolean_t secure; - unsigned char db_type; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ipha_t *ipha; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - /* - * If this is an ICMP error being reported - which goes - * up as M_CTLs, we need to convert them to M_DATA till - * we finish checking with global policy because - * ipsec_check_global_policy() assumes M_DATA as clear - * and M_CTL as secure. - */ - db_type = DB_TYPE(mp); - DB_TYPE(mp) = M_DATA; - secure = B_FALSE; - } /* * We are generating an icmp error for some inbound packet. * Called from all ip_fanout_(udp, tcp, proto) functions. 
@@ -6201,47 +4946,52 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, */ ipha = (ipha_t *)mp->b_rptr; if (secure || ipss->ipsec_inbound_v4_policy_present) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return (B_FALSE); + mp = ipsec_check_global_policy(mp, NULL, ipha, NULL, ira, ns); + if (mp == NULL) + return; } - if (!mctl_present) - DB_TYPE(mp) = db_type; + /* We never send errors for protocols that we do implement */ + if (ira->ira_protocol == IPPROTO_ICMP || + ira->ira_protocol == IPPROTO_IGMP) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_fanout_send_icmp_v4", mp, ill); + freemsg(mp); + return; + } + /* + * Have to correct checksum since + * the packet might have been + * fragmented and the reassembly code in ip_rput + * does not restore the IP checksum. + */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - if (flags & IP_FF_SEND_ICMP) { - if (flags & IP_FF_HDR_COMPLETE) { - if (ip_hdr_complete(ipha, zoneid, ipst)) { - freemsg(first_mp); - return (B_TRUE); - } - } - if (flags & IP_FF_CKSUM) { - /* - * Have to correct checksum since - * the packet might have been - * fragmented and the reassembly code in ip_rput - * does not restore the IP checksum. 
- */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - } - switch (icmp_type) { - case ICMP_DEST_UNREACHABLE: - icmp_unreachable(WR(q), first_mp, icmp_code, zoneid, - ipst); + switch (icmp_type) { + case ICMP_DEST_UNREACHABLE: + switch (icmp_code) { + case ICMP_PROTOCOL_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); + ip_drop_input("ipIfStatsInUnknownProtos", mp, ill); break; - default: - freemsg(first_mp); + case ICMP_PORT_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); + ip_drop_input("ipIfStatsNoPorts", mp, ill); break; } - } else { - freemsg(first_mp); - return (B_FALSE); - } - return (B_TRUE); + icmp_unreachable(mp, icmp_code, ira); + break; + default: +#ifdef DEBUG + panic("ip_fanout_send_icmp_v4: wrong type"); + /*NOTREACHED*/ +#else + freemsg(mp); + break; +#endif + } } /* @@ -6250,66 +5000,86 @@ ip_fanout_send_icmp(queue_t *q, mblk_t *mp, uint_t flags, * is consumed by this function. */ void -ip_proto_not_sup(queue_t *q, mblk_t *ipsec_mp, uint_t flags, zoneid_t zoneid, - ip_stack_t *ipst) +ip_proto_not_sup(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp; - ipha_t *ipha; - ill_t *ill; - ipsec_in_t *ii; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ipha_t *ipha; - mp = ipsec_mp->b_cont; - ipsec_mp->b_cont = NULL; ipha = (ipha_t *)mp->b_rptr; - /* Get ill from index in ipsec_in_t. 
*/ - ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, - (IPH_HDR_VERSION(ipha) == IPV6_VERSION), NULL, NULL, NULL, NULL, - ipst); - if (ill != NULL) { - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - if (ip_fanout_send_icmp(q, mp, flags, - ICMP_DEST_UNREACHABLE, - ICMP_PROTOCOL_UNREACHABLE, B_FALSE, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInUnknownProtos); - } - } else { - if (ip_fanout_send_icmp_v6(q, mp, flags, - ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, - 0, B_FALSE, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInUnknownProtos); - } - } - ill_refrele(ill); - } else { /* re-link for the freemsg() below. */ - ipsec_mp->b_cont = mp; + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(ipha) == IP_VERSION); + ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE, + ICMP_PROTOCOL_UNREACHABLE, ira); + } else { + ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); + ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB, + ICMP6_PARAMPROB_NEXTHEADER, ira); } - - /* If ICMP delivered, ipsec_mp will be a singleton (b_cont == NULL). */ - freemsg(ipsec_mp); } /* - * See if the inbound datagram has had IPsec processing applied to it. + * Deliver a rawip packet to the given conn, possibly applying ipsec policy. + * Handles IPv4 and IPv6. + * We are responsible for disposing of mp, such as by freemsg() or putnext() + * Caller is responsible for dropping references to the conn. */ -boolean_t -ipsec_in_is_secure(mblk_t *ipsec_mp) +void +ip_fanout_proto_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + ip_recv_attr_t *ira) { - ipsec_in_t *ii; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + boolean_t secure; + uint_t protocol = ira->ira_protocol; + iaflags_t iraflags = ira->ira_flags; + queue_t *rq; + + secure = iraflags & IRAF_IPSEC_SECURE; + + rq = connp->conn_rq; + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : !canputnext(rq)) { + switch (protocol) { + case IPPROTO_ICMPV6: + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows); + break; + case IPPROTO_ICMP: + BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); + break; + default: + BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); + break; + } + freemsg(mp); + return; + } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ASSERT(!(IPCL_IS_IPTUN(connp))); - if (ii->ipsec_in_loopback) { - return (ii->ipsec_in_secure); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + return; + } + } + + if (iraflags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); } else { - return (ii->ipsec_in_ah_sa != NULL || - ii->ipsec_in_esp_sa != NULL || - ii->ipsec_in_decaps); + ill_t *rill = ira->ira_rill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + /* Send it upstream */ + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } } @@ -6336,65 +5106,33 @@ ipsec_in_is_secure(mblk_t *ipsec_mp) * is used to negotiate SAs as SAs will be added only after * verifying the policy. * - * IPQoS Notes: - * Once we have determined the client, invoke IPPF processing. - * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, - * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local - * ip_policy will be false. - * * Zones notes: - * Currently only applications in the global zone can create raw sockets for - * protocols other than ICMP. 
So unlike the broadcast / multicast case of - * ip_fanout_udp(), we only send a copy of the packet to streams in the - * specified zone. For ICMP, this is handled by the callers of icmp_inbound(). + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. + * This means that here we can match the zoneid with SO_ALLZONES being special. */ -static void -ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, - boolean_t mctl_present, boolean_t ip_policy, ill_t *recv_ill, - zoneid_t zoneid) +void +ip_fanout_proto_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { - queue_t *rq; - mblk_t *mp1, *first_mp1; - uint_t protocol = ipha->ipha_protocol; - ipaddr_t dst; - mblk_t *first_mp = mp; - boolean_t secure; - uint32_t ill_index; - conn_t *connp, *first_connp, *next_connp; - connf_t *connfp; - boolean_t shared_addr; - mib2_ipIfStatsEntry_t *mibptr; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + mblk_t *mp1; + ipaddr_t laddr; + conn_t *connp, *first_connp, *next_connp; + connf_t *connfp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - dst = ipha->ipha_dst; - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * We don't allow multilevel ports for raw IP, so no need to - * check for that here. 
- */ - zoneid = tsol_packet_to_zoneid(mp); - } + laddr = ipha->ipha_dst; - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[ira->ira_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, flags, - zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) { + /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */ + if (IPCL_PROTO_MATCH(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) { break; } } @@ -6406,40 +5144,12 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, * unclaimed datagrams? */ mutex_exit(&connfp->connf_lock); - /* - * Check for IPPROTO_ENCAP... - */ - if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { - /* - * If an IPsec mblk is here on a multicast - * tunnel (using ip_mroute stuff), check policy here, - * THEN ship off to ip_mroute_decap(). - * - * BTW, If I match a configured IP-in-IP - * tunnel, this path will not be reached, and - * ip_mroute_decap will never be called. - */ - first_mp = ipsec_check_global_policy(first_mp, connp, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp != NULL) { - if (mctl_present) - freeb(first_mp); - ip_mroute_decap(q, mp, ill); - } /* Else we already freed everything! */ - } else { - /* - * Otherwise send an ICMP protocol unreachable. 
- */ - if (ip_fanout_send_icmp(q, first_mp, flags, - ICMP_DEST_UNREACHABLE, ICMP_PROTOCOL_UNREACHABLE, - mctl_present, zoneid, ipst)) { - BUMP_MIB(mibptr, ipIfStatsInUnknownProtos); - } - } + ip_fanout_send_icmp_v4(mp, ICMP_DEST_UNREACHABLE, + ICMP_PROTOCOL_UNREACHABLE, ira); return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); CONN_INC_REF(connp); first_connp = connp; @@ -6447,111 +5157,35 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, for (;;) { while (connp != NULL) { - if (IPCL_PROTO_MATCH(connp, protocol, ipha, ill, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + /* Note: IPCL_PROTO_MATCH includes conn_wantpacket */ + if (IPCL_PROTO_MATCH(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) break; connp = connp->conn_next; } - /* - * Copy the packet. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ + if (connp == NULL) { + /* No more interested clients */ connp = first_connp; break; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + connp = first_connp; + break; + } + CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; - /* - * Check flow control - */ - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(mibptr, rawipIfStatsInOverflows); - } else { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); - } + ip_fanout_proto_conn(connp, mp1, (ipha_t *)mp1->b_rptr, NULL, + ira); - freemsg(first_mp1); - } else { - /* - * Enforce policy like any other conn_t. Note that - * IP-in-IP packets don't come through here, but - * through ip_iptun_input() or - * icmp_inbound_iptun_fanout(). IPsec policy for such - * packets is enforced in the iptun module. - */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || - secure) { - first_mp1 = ipsec_check_inbound_policy - (first_mp1, connp, ipha, NULL, - mctl_present); - } - if (first_mp1 != NULL) { - int in_flags = 0; - /* - * ip_fanout_proto also gets called from - * icmp_inbound_error_fanout, in which case - * the msg type is M_CTL. Don't add info - * in this case for the time being. In future - * when there is a need for knowing the - * inbound iface index for ICMP error msgs, - * then this can be changed. - */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; - /* - * The ULP may support IP_RECVPKTINFO for both - * IP v4 and v6 so pass the appropriate argument - * based on conn IP version. - */ - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + - * matching address. 
- */ - in_flags |= IPF_RECVADDR; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { - /* - * the actual data will be - * contained in b_cont upon - * successful return of the - * following call else - * original mblk is returned - */ - ASSERT(recv_ill != NULL); - mp1 = ip_add_info(mp1, recv_ill, - in_flags, IPCL_ZONEID(connp), ipst); - } - BUMP_MIB(mibptr, ipIfStatsHCInDelivers); - if (mctl_present) - freeb(first_mp1); - (connp->conn_recv)(connp, mp1, NULL); - } - } mutex_enter(&connfp->connf_lock); /* Follow the next pointer before releasing the conn. */ next_connp = connp->conn_next; @@ -6562,363 +5196,27 @@ ip_fanout_proto(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, uint_t flags, /* Last one. Send it upstream. */ mutex_exit(&connfp->connf_lock); - /* - * If this packet is coming from icmp_inbound_error_fanout ip_policy - * will be set to false. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; - } - } - - rq = connp->conn_rq; - /* - * Check flow control - */ - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(mibptr, rawipIfStatsInOverflows); - } else { - BUMP_MIB(&ipst->ips_icmp_mib, icmpInOverflows); - } - - freemsg(first_mp); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); - - if ((CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure)) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - } - - if (first_mp != NULL) { - int in_flags = 0; - - /* - * ip_fanout_proto also gets called - * from icmp_inbound_error_fanout, in - * which case the msg type is M_CTL. - * Don't add info in this case for time - * being. 
In future when there is a - * need for knowing the inbound iface - * index for ICMP error msgs, then this - * can be changed - */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + - * matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { + ip_fanout_proto_conn(connp, mp, ipha, NULL, ira); - /* - * the actual data will be contained in - * b_cont upon successful return - * of the following call else original - * mblk is returned - */ - ASSERT(recv_ill != NULL); - mp = ip_add_info(mp, recv_ill, - in_flags, IPCL_ZONEID(connp), ipst); - } - BUMP_MIB(mibptr, ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp, NULL); - if (mctl_present) - freeb(first_mp); - } - } CONN_DEC_REF(connp); } /* - * Serialize tcp resets by calling tcp_xmit_reset_serialize through - * SQUEUE_ENTER_ONE(SQ_FILL). We do this to ensure the reset is handled on - * the correct squeue, in this case the same squeue as a valid listener with - * no current connection state for the packet we are processing. The function - * is called for synchronizing both IPv4 and IPv6. - */ -void -ip_xmit_reset_serialize(mblk_t *mp, int hdrlen, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) -{ - mblk_t *rst_mp; - tcp_xmit_reset_event_t *eventp; - - rst_mp = allocb(sizeof (tcp_xmit_reset_event_t), BPRI_HI); - - if (rst_mp == NULL) { - freemsg(mp); - return; - } - - rst_mp->b_datap->db_type = M_PROTO; - rst_mp->b_wptr += sizeof (tcp_xmit_reset_event_t); - - eventp = (tcp_xmit_reset_event_t *)rst_mp->b_rptr; - eventp->tcp_xre_event = TCP_XRE_EVENT_IP_FANOUT_TCP; - eventp->tcp_xre_iphdrlen = hdrlen; - eventp->tcp_xre_zoneid = zoneid; - eventp->tcp_xre_tcps = tcps; - - rst_mp->b_cont = mp; - mp = rst_mp; - - /* - * Increment the connref, this ref will be released by the squeue - * framework. 
- */ - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_xmit_reset, connp, - SQ_FILL, SQTAG_XMIT_EARLY_RESET); -} - -/* - * Fanout for TCP packets - * The caller puts <fport, lport> in the ports parameter. - * - * IPQoS Notes - * Before sending it to the client, invoke IPPF processing. - * Policy processing takes place only if the callout_position, IPP_LOCAL_IN, - * is enabled. If we get here from icmp_inbound_error_fanout or ip_wput_local - * ip_policy is false. - */ -static void -ip_fanout_tcp(queue_t *q, mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, - uint_t flags, boolean_t mctl_present, boolean_t ip_policy, zoneid_t zoneid) -{ - mblk_t *first_mp; - boolean_t secure; - uint32_t ill_index; - int ip_hdr_len; - tcph_t *tcph; - boolean_t syn_present = B_FALSE; - conn_t *connp; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - ASSERT(recv_ill != NULL); - - first_mp = mp; - if (mctl_present) { - ASSERT(first_mp->b_datap->db_type == M_CTL); - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - ip_hdr_len = IPH_HDR_LENGTH(mp->b_rptr); - - if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, - zoneid, ipst)) == NULL) { - /* - * No connected connection or listener. Send a - * TH_RST via tcp_xmit_listeners_reset. - */ - - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) - return; - } - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - ip2dbg(("ip_fanout_tcp: no listener; send reset to zone %d\n", - zoneid)); - tcp_xmit_listeners_reset(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - return; - } - - /* - * Allocate the SYN for the TCP connection here itself - */ - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - squeue_t *sqp; - - /* - * If the queue belongs to a conn, and fused tcp - * loopback is enabled, assign the eager's squeue - * to be that of the active connect's. Note that - * we don't check for IP_FF_LOOPBACK here since this - * routine gets called only for loopback (unlike the - * IPv6 counterpart). - */ - if (do_tcp_fusion && - CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss) && - !secure && - !IPP_ENABLED(IPP_LOCAL_IN, ipst) && !ip_policy) { - ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); - sqp = Q_TO_CONN(q)->conn_sqp; - } else { - sqp = IP_SQUEUE_GET(lbolt); - } - - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - return; - } - - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = 
ipsec_check_inbound_policy(first_mp, connp, ipha, - NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= - ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. - */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } - - /* - * Initiate policy processing here if needed. If we get here from - * icmp_inbound_error_fanout, ip_policy is false. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - - /* Handle socket options. */ - if (!syn_present && - connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { - /* Add header */ - ASSERT(recv_ill != NULL); - /* - * Since tcp does not support IP_RECVPKTINFO for V4, only pass - * IPF_RECVIF. - */ - mp = ip_add_info(mp, recv_ill, IPF_RECVIF, IPCL_ZONEID(connp), - ipst); - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - /* do not drain, certain use cases can blow the stack */ - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, - connp, SQ_NODRAIN, SQTAG_IP_FANOUT_TCP); - } else { - /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); - } -} - -/* * If we have a IPsec NAT-Traversal packet, strip the zero-SPI or - * pass it along to ESP if the SPI is non-zero. Returns TRUE if the mblk + * pass it along to ESP if the SPI is non-zero. Returns the mblk if the mblk * is not consumed. * - * One of four things can happen, all of which affect the passed-in mblk: - * - * 1.) ICMP messages that go through here just get returned TRUE. + * One of three things can happen, all of which affect the passed-in mblk: * - * 2.) The packet is stock UDP and gets its zero-SPI stripped. Return TRUE. + * 1.) The packet is stock UDP and gets its zero-SPI stripped. Return mblk.. * - * 3.) The packet is ESP-in-UDP, gets transformed into an equivalent - * ESP packet, and is passed along to ESP for consumption. Return FALSE. + * 2.) The packet is ESP-in-UDP, gets transformed into an equivalent + * ESP packet, and is passed along to ESP for consumption. Return NULL. * - * 4.) The packet is an ESP-in-UDP Keepalive. Drop it and return FALSE. + * 3.) The packet is an ESP-in-UDP Keepalive. Drop it and return NULL. */ -static boolean_t -zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, - ipsec_stack_t *ipss) +mblk_t * +zero_spi_check(mblk_t *mp, ip_recv_attr_t *ira) { int shift, plen, iph_len; ipha_t *ipha; @@ -6926,28 +5224,12 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, uint32_t *spi; uint32_t esp_ports; uint8_t *orptr; - boolean_t free_ire; - - if (DB_TYPE(mp) == M_CTL) { - /* - * ICMP message with UDP inside. 
Don't bother stripping, just - * send it up. - * - * NOTE: Any app with UDP_NAT_T_ENDPOINT set is probably going - * to ignore errors set by ICMP anyway ('cause they might be - * forged), but that's the app's decision, not ours. - */ - - /* Bunch of reality checks for DEBUG kernels... */ - ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); - ASSERT(((ipha_t *)mp->b_rptr)->ipha_protocol == IPPROTO_ICMP); - - return (B_TRUE); - } + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; ipha = (ipha_t *)mp->b_rptr; - iph_len = IPH_HDR_LENGTH(ipha); - plen = ntohs(ipha->ipha_length); + iph_len = ira->ira_ip_hdr_length; + plen = ira->ira_pktlen; if (plen - iph_len - sizeof (udpha_t) < sizeof (uint32_t)) { /* @@ -6958,18 +5240,18 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, * byte packets (keepalives are 1-byte), but we'll drop them * also. */ - ip_drop_packet(mp, B_TRUE, recv_ill, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nat_t_ka), &ipss->ipsec_dropper); - return (B_FALSE); + return (NULL); } if (MBLKL(mp) < iph_len + sizeof (udpha_t) + sizeof (*spi)) { /* might as well pull it all up - it might be ESP. */ if (!pullupmsg(mp, -1)) { - ip_drop_packet(mp, B_TRUE, recv_ill, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &ipss->ipsec_dropper); - return (B_FALSE); + return (NULL); } ipha = (ipha_t *)mp->b_rptr; @@ -6985,7 +5267,8 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, } /* Fix IP header */ - ipha->ipha_length = htons(plen - shift); + ira->ira_pktlen = (plen - shift); + ipha->ipha_length = htons(ira->ira_pktlen); ipha->ipha_hdr_checksum = 0; orptr = mp->b_rptr; @@ -7005,388 +5288,185 @@ zero_spi_check(queue_t *q, mblk_t *mp, ire_t *ire, ill_t *recv_ill, if (esp_ports != 0) /* Punt up for ESP processing. */ { ipha = (ipha_t *)(orptr + shift); - free_ire = (ire == NULL); - if (free_ire) { - /* Re-acquire ire. 
*/ - ire = ire_cache_lookup(ipha->ipha_dst, ALL_ZONES, NULL, - ipss->ipsec_netstack->netstack_ip); - if (ire == NULL || !(ire->ire_type & IRE_LOCAL)) { - if (ire != NULL) - ire_refrele(ire); - /* - * Do a regular freemsg(), as this is an IP - * error (no local route) not an IPsec one. - */ - freemsg(mp); - } - } - - ip_proto_input(q, mp, ipha, ire, recv_ill, esp_ports); - if (free_ire) - ire_refrele(ire); + ira->ira_flags |= IRAF_ESP_UDP_PORTS; + ira->ira_esp_udp_ports = esp_ports; + ip_fanout_v4(mp, ipha, ira); + return (NULL); } - - return (esp_ports == 0); + return (mp); } /* * Deliver a udp packet to the given conn, possibly applying ipsec policy. + * Handles IPv4 and IPv6. * We are responsible for disposing of mp, such as by freemsg() or putnext() - * Caller is responsible for dropping references to the conn, and freeing - * first_mp. - * - * IPQoS Notes - * Before sending it to the client, invoke IPPF processing. Policy processing - * takes place only if the callout_position, IPP_LOCAL_IN, is enabled and - * ip_policy is true. If we get here from icmp_inbound_error_fanout or - * ip_wput_local, ip_policy is false. + * Caller is responsible for dropping references to the conn. 
*/ -static void -ip_fanout_udp_conn(conn_t *connp, mblk_t *first_mp, mblk_t *mp, - boolean_t secure, ill_t *ill, ipha_t *ipha, uint_t flags, ill_t *recv_ill, - boolean_t ip_policy) +void +ip_fanout_udp_conn(conn_t *connp, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + ip_recv_attr_t *ira) { - boolean_t mctl_present = (first_mp != NULL); - uint32_t in_flags = 0; /* set to IP_RECVSLLA and/or IP_RECVIF */ - uint32_t ill_index; - ip_stack_t *ipst = recv_ill->ill_ipst; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + boolean_t secure; + iaflags_t iraflags = ira->ira_flags; - ASSERT(ill != NULL); + secure = iraflags & IRAF_IPSEC_SECURE; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { + if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(first_mp); + freemsg(mp); return; } - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, ipha, - NULL, mctl_present); - /* Freed by ipsec_check_inbound_policy(). */ - if (first_mp == NULL) { + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); return; } } - if (mctl_present) - freeb(first_mp); - - /* Let's hope the compilers utter "branch, predict-not-taken..." ;) */ - if (connp->conn_udp->udp_nat_t_endpoint) { - if (mctl_present) { - /* mctl_present *shouldn't* happen. 
*/ - ip_drop_packet(mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_esp_nat_t_ipsec), - &ipss->ipsec_dropper); - return; - } - - if (!zero_spi_check(ill->ill_rq, mp, NULL, recv_ill, ipss)) - return; - } - /* Handle options. */ - if (connp->conn_recvif) - in_flags = IPF_RECVIF; /* - * UDP supports IP_RECVPKTINFO option for both v4 and v6 so the flag - * passed to ip_add_info is based on IP version of connp. + * Since this code is not used for UDP unicast we don't need a NAT_T + * check. Only ip_fanout_v4 has that check. */ - if (connp->conn_ip_recvpktinfo && (flags & IP_FF_IPINFO)) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - - if (connp->conn_recvslla && !(flags & IP_FF_SEND_SLLA)) - in_flags |= IPF_RECVSLLA; + if (ira->ira_flags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); + } else { + ill_t *rill = ira->ira_rill; - /* - * Initiate IPPF processing here, if needed. Note first_mp won't be - * freed if the packet is dropped. The caller will do so. - */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) { - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - return; - } - } - if ((in_flags != 0) && - (mp->b_datap->db_type != M_CTL)) { - /* - * The actual data will be contained in b_cont - * upon successful return of the following call - * else original mblk is returned - */ - ASSERT(recv_ill != NULL); - mp = ip_add_info(mp, recv_ill, in_flags, IPCL_ZONEID(connp), - ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + /* Send it upstream */ + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); } /* - * Fanout for UDP packets. 
- * The caller puts <fport, lport> in the ports parameter. + * Fanout for UDP packets that are multicast or broadcast, and ICMP errors. + * (Unicast fanout is handled in ip_input_v4.) * * If SO_REUSEADDR is set all multicast and broadcast packets - * will be delivered to all streams bound to the same port. + * will be delivered to all conns bound to the same port. * - * Zones notes: - * Multicast and broadcast packets will be distributed to streams in all zones. + * If there is at least one matching AF_INET receiver, then we will + * ignore any AF_INET6 receivers. * In the special case where an AF_INET socket binds to 0.0.0.0/<port> and an * AF_INET6 socket binds to ::/<port>, only the AF_INET socket receives the IPv4 - * packets. To maintain this behavior with multiple zones, the conns are grouped - * by zone and the SO_REUSEADDR flag is checked for the first matching conn in - * each zone. If unset, all the following conns in the same zone are skipped. + * packets. + * + * Zones notes: + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. + * This means that here we can match the zoneid with SO_ALLZONES being special. 
*/ -static void -ip_fanout_udp(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, - uint32_t ports, boolean_t broadcast, uint_t flags, boolean_t mctl_present, - boolean_t ip_policy, ill_t *recv_ill, zoneid_t zoneid) +void +ip_fanout_udp_multi_v4(mblk_t *mp, ipha_t *ipha, uint16_t lport, uint16_t fport, + ip_recv_attr_t *ira) { - uint32_t dstport, srcport; - ipaddr_t dst; - mblk_t *first_mp; - boolean_t secure; - in6_addr_t v6src; + ipaddr_t laddr; + in6_addr_t v6faddr; conn_t *connp; connf_t *connfp; - conn_t *first_connp; - conn_t *next_connp; - mblk_t *mp1, *first_mp1; - ipaddr_t src; - zoneid_t last_zoneid; - boolean_t reuseaddr; - boolean_t shared_addr; - boolean_t unlabeled; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - first_mp->b_cont = NULL; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - first_mp = NULL; - secure = B_FALSE; - } + ipaddr_t faddr; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); - dst = ipha->ipha_dst; - src = ipha->ipha_src; + ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR)); - unlabeled = B_FALSE; - if (is_system_labeled()) - /* Cred cannot be null on IPv4 */ - unlabeled = (msg_getlabel(mp)->tsl_flags & - TSLF_UNLABELED) != 0; - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. 
Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } + laddr = ipha->ipha_dst; + faddr = ipha->ipha_src; - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!broadcast && !CLASSD(dst)) { - /* - * Not broadcast or multicast. Send to the one (first) - * client we find. No need to check conn_wantpacket() - * since IP_BOUND_IF/conn_incoming_ill does not apply to - * IPv4 unicast packets. - */ - while ((connp != NULL) && - (!IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) || - (!IPCL_ZONE_MATCH(connp, zoneid) && - !(unlabeled && (connp->conn_mac_mode != CONN_MAC_DEFAULT) && - shared_addr)))) { - /* - * We keep searching since the conn did not match, - * or its zone did not match and it is not either - * an allzones conn or a mac exempt conn (if the - * sender is unlabeled.) - */ - connp = connp->conn_next; - } - - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) - goto notfound; - - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - - if (is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp)) - goto notfound; - - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, - flags, recv_ill, ip_policy); - IP_STAT(ipst, ip_udp_fannorm); - CONN_DEC_REF(connp); - return; - } /* - * Broadcast and multicast case - * - * Need to check conn_wantpacket(). * If SO_REUSEADDR has been set on the first we send the * packet to all clients that have joined the group and * match the port. 
*/ - while (connp != NULL) { - if ((IPCL_UDP_MATCH(connp, dstport, dst, srcport, src)) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) + if ((IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr)) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) + if (connp == NULL) goto notfound; - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + CONN_INC_REF(connp); - first_connp = connp; - /* - * When SO_REUSEADDR is not set, send the packet only to the first - * matching connection in its zone by keeping track of the zoneid. - */ - reuseaddr = first_connp->conn_reuseaddr; - last_zoneid = first_connp->conn_zoneid; + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH(connp, dstport, dst, srcport, src) && - (reuseaddr || connp->conn_zoneid != last_zoneid) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH(connp, lport, laddr, + fport, faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. 
- */ - if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && - ((mp1 = copymsg(mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ - connp = first_connp; - break; - } - if (connp->conn_zoneid != last_zoneid) { - /* - * Update the zoneid so that the packet isn't sent to - * any more conns in the same zone unless SO_REUSEADDR - * is set. - */ - reuseaddr = connp->conn_reuseaddr; - last_zoneid = connp->conn_zoneid; - } - if (first_mp != NULL) { - ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> - ipsec_info_type == IPSEC_IN); - first_mp1 = ipsec_in_tag(first_mp, NULL, - ipst->ips_netstack); - if (first_mp1 == NULL) { - freemsg(mp1); + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - } else { - first_mp1 = NULL; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); + + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr, + NULL, ira); + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn */ + next_connp = connp->conn_next; + CONN_DEC_REF(connp); + connp = next_connp; } - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * IPQoS notes: We don't send the packet for policy - * processing here, will do it for the last one (below). - * i.e. we do it per-packet now, but if we do policy - * processing per-conn, then we would need to do it - * here too. - */ - ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, - ipha, flags, recv_ill, B_FALSE); - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. */ - next_connp = connp->conn_next; - IP_STAT(ipst, ip_udp_fanmb); - CONN_DEC_REF(connp); - connp = next_connp; } /* Last one. Send it upstream. 
*/ mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, - recv_ill, ip_policy); IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp, ipha, NULL, ira); CONN_DEC_REF(connp); return; notfound: - mutex_exit(&connfp->connf_lock); - IP_STAT(ipst, ip_udp_fanothers); /* - * IPv6 endpoints bound to unicast or multicast IPv4-mapped addresses + * IPv6 endpoints bound to multicast IPv4-mapped addresses * have already been matched above, since they live in the IPv4 * fanout tables. This implies we only need to * check for IPv6 in6addr_any endpoints here. @@ -7394,85 +5474,28 @@ notfound: * address, except for the multicast group membership lookup which * uses the IPv4 destination. */ - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6faddr); + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!broadcast && !CLASSD(dst)) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, - srcport, v6src) && IPCL_ZONE_MATCH(connp, zoneid) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - !connp->conn_ipv6_v6only) - break; - connp = connp->conn_next; - } - - if (connp != NULL && is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp)) - connp = NULL; - - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { - /* - * No one bound to this port. Is - * there a client that wants all - * unclaimed datagrams? - */ - mutex_exit(&connfp->connf_lock); - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP]. 
- connf_head != NULL) { - ip_fanout_proto(q, first_mp, ill, ipha, - flags | IP_FF_RAWIP, mctl_present, - ip_policy, recv_ill, zoneid); - } else { - if (ip_fanout_send_icmp(q, first_mp, flags, - ICMP_DEST_UNREACHABLE, - ICMP_PORT_UNREACHABLE, - mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsNoPorts); - } - } - return; - } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, - flags, recv_ill, ip_policy); - CONN_DEC_REF(connp); - return; - } /* * IPv4 multicast packet being delivered to an AF_INET6 * in6addr_any endpoint. * Need to check conn_wantpacket(). Note that we use conn_wantpacket() * and not conn_wantpacket_v6() since any multicast membership is * for an IPv4-mapped multicast address. - * The packet is sent to all clients in all zones that have joined the - * group and match the port. */ while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, ipv6_all_zeros, - srcport, v6src) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, shared_addr, - connp))) + if (IPCL_UDP_MATCH_V6(connp, lport, ipv6_all_zeros, + fport, v6faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || - !IPCL_IS_NONSTR(connp) && connp->conn_upq == NULL) { + if (connp == NULL) { /* * No one bound to this port. 
Is * there a client that wants all @@ -7480,15 +5503,10 @@ notfound: */ mutex_exit(&connfp->connf_lock); - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[IPPROTO_UDP].connf_head != + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP].connf_head != NULL) { - ip_fanout_proto(q, first_mp, ill, ipha, - flags | IP_FF_RAWIP, mctl_present, ip_policy, - recv_ill, zoneid); + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v4(mp, ipha, ira); } else { /* * We used to attempt to send an icmp error here, but @@ -7497,102 +5515,263 @@ notfound: * multicast, just drop the packet and give up sooner. */ BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); - freemsg(first_mp); + freemsg(mp); } return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - first_connp = connp; + /* + * If SO_REUSEADDR has been set on the first we send the + * packet to all clients that have joined the group and + * match the port. + */ + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, - ipv6_all_zeros, srcport, v6src) && - conn_wantpacket(connp, ill, ipha, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV4_VERSION, - shared_addr, connp))) + CONN_INC_REF(connp); + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH_V6(connp, lport, + ipv6_all_zeros, fport, v6faddr) && + conn_wantpacket(connp, ira, ipha) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV4_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. 
The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || (((mp1 = dupmsg(mp)) == NULL) && - ((mp1 = copymsg(mp)) == NULL))) { - /* - * No more intested clients or memory - * allocation failed - */ - connp = first_connp; - break; - } - if (first_mp != NULL) { - ASSERT(((ipsec_info_t *)first_mp->b_rptr)-> - ipsec_info_type == IPSEC_IN); - first_mp1 = ipsec_in_tag(first_mp, NULL, - ipst->ips_netstack); - if (first_mp1 == NULL) { - freemsg(mp1); + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - } else { - first_mp1 = NULL; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); + + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, (ipha_t *)mp1->b_rptr, + NULL, ira); + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn */ + next_connp = connp->conn_next; + CONN_DEC_REF(connp); + connp = next_connp; } - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * IPQoS notes: We don't send the packet for policy - * processing here, will do it for the last one (below). - * i.e. we do it per-packet now, but if we do policy - * processing per-conn, then we would need to do it - * here too. - */ - ip_fanout_udp_conn(connp, first_mp1, mp1, secure, ill, - ipha, flags, recv_ill, B_FALSE); - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. */ - next_connp = connp->conn_next; - CONN_DEC_REF(connp); - connp = next_connp; } /* Last one. Send it upstream. 
*/ mutex_exit(&connfp->connf_lock); - ip_fanout_udp_conn(connp, first_mp, mp, secure, ill, ipha, flags, - recv_ill, ip_policy); + IP_STAT(ipst, ip_udp_fanmb); + ip_fanout_udp_conn(connp, mp, ipha, NULL, ira); CONN_DEC_REF(connp); } /* - * Complete the ip_wput header so that it - * is possible to generate ICMP - * errors. + * Split an incoming packet's IPv4 options into the label and the other options. + * If 'allocate' is set it does memory allocation for the ip_pkt_t, including + * clearing out any leftover label or options. + * Otherwise it just makes ipp point into the packet. + * + * Returns zero if ok; ENOMEM if the buffer couldn't be allocated. */ int -ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) +ip_find_hdr_v4(ipha_t *ipha, ip_pkt_t *ipp, boolean_t allocate) { - ire_t *ire; + uchar_t *opt; + uint32_t totallen; + uint32_t optval; + uint32_t optlen; - if (ipha->ipha_src == INADDR_ANY) { - ire = ire_lookup_local(zoneid, ipst); - if (ire == NULL) { - ip1dbg(("ip_hdr_complete: no source IRE\n")); - return (1); + ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR; + ipp->ipp_hoplimit = ipha->ipha_ttl; + ipp->ipp_type_of_service = ipha->ipha_type_of_service; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &ipp->ipp_addr); + + /* + * Get length (in 4 byte octets) of IP header options. 
+ */ + totallen = ipha->ipha_version_and_hdr_length - + (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); + + if (totallen == 0) { + if (!allocate) + return (0); + + /* Clear out anything from a previous packet */ + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + kmem_free(ipp->ipp_ipv4_options, + ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; } - ipha->ipha_src = ire->ire_addr; - ire_refrele(ire); + if (ipp->ipp_fields & IPPF_LABEL_V4) { + kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } + return (0); } - ipha->ipha_ttl = ipst->ips_ip_def_ttl; - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - return (0); + + totallen <<= 2; + opt = (uchar_t *)&ipha[1]; + if (!is_system_labeled()) { + + copyall: + if (!allocate) { + if (totallen != 0) { + ipp->ipp_ipv4_options = opt; + ipp->ipp_ipv4_options_len = totallen; + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + } + return (0); + } + /* Just copy all of options */ + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + if (totallen == ipp->ipp_ipv4_options_len) { + bcopy(opt, ipp->ipp_ipv4_options, totallen); + return (0); + } + kmem_free(ipp->ipp_ipv4_options, + ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + ipp->ipp_fields &= ~IPPF_IPV4_OPTIONS; + } + if (totallen == 0) + return (0); + + ipp->ipp_ipv4_options = kmem_alloc(totallen, KM_NOSLEEP); + if (ipp->ipp_ipv4_options == NULL) + return (ENOMEM); + ipp->ipp_ipv4_options_len = totallen; + ipp->ipp_fields |= IPPF_IPV4_OPTIONS; + bcopy(opt, ipp->ipp_ipv4_options, totallen); + return (0); + } + + if (allocate && (ipp->ipp_fields & IPPF_LABEL_V4)) { + kmem_free(ipp->ipp_label_v4, ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + ipp->ipp_fields &= ~IPPF_LABEL_V4; + } + + /* + * Search 
for CIPSO option. + * We assume CIPSO is first in options if it is present. + * If it isn't, then ipp_opt_ipv4_options will not include the options + * prior to the CIPSO option. + */ + while (totallen != 0) { + switch (optval = opt[IPOPT_OPTVAL]) { + case IPOPT_EOL: + return (0); + case IPOPT_NOP: + optlen = 1; + break; + default: + if (totallen <= IPOPT_OLEN) + return (EINVAL); + optlen = opt[IPOPT_OLEN]; + if (optlen < 2) + return (EINVAL); + } + if (optlen > totallen) + return (EINVAL); + + switch (optval) { + case IPOPT_COMSEC: + if (!allocate) { + ipp->ipp_label_v4 = opt; + ipp->ipp_label_len_v4 = optlen; + ipp->ipp_fields |= IPPF_LABEL_V4; + } else { + ipp->ipp_label_v4 = kmem_alloc(optlen, + KM_NOSLEEP); + if (ipp->ipp_label_v4 == NULL) + return (ENOMEM); + ipp->ipp_label_len_v4 = optlen; + ipp->ipp_fields |= IPPF_LABEL_V4; + bcopy(opt, ipp->ipp_label_v4, optlen); + } + totallen -= optlen; + opt += optlen; + + /* Skip padding bytes until we get to a multiple of 4 */ + while ((totallen & 3) != 0 && opt[0] == IPOPT_NOP) { + totallen--; + opt++; + } + /* Remaining as ipp_ipv4_options */ + goto copyall; + } + totallen -= optlen; + opt += optlen; + } + /* No CIPSO found; return everything as ipp_ipv4_options */ + totallen = ipha->ipha_version_and_hdr_length - + (uint8_t)((IP_VERSION << 4) + IP_SIMPLE_HDR_LENGTH_IN_WORDS); + totallen <<= 2; + opt = (uchar_t *)&ipha[1]; + goto copyall; +} + +/* + * Efficient versions of lookup for an IRE when we only + * match the address. + * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE. + * Does not handle multicast addresses. 
+ */ +uint_t +ip_type_v4(ipaddr_t addr, ip_stack_t *ipst) +{ + ire_t *ire; + uint_t result; + + ire = ire_ftable_lookup_simple_v4(addr, 0, ipst, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + result = IRE_NOROUTE; + else + result = ire->ire_type; + ire_refrele(ire); + return (result); +} + +/* + * Efficient versions of lookup for an IRE when we only + * match the address. + * For RTF_REJECT or BLACKHOLE we return IRE_NOROUTE. + * Does not handle multicast addresses. + */ +uint_t +ip_type_v6(const in6_addr_t *addr, ip_stack_t *ipst) +{ + ire_t *ire; + uint_t result; + + ire = ire_ftable_lookup_simple_v6(addr, 0, ipst, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + result = IRE_NOROUTE; + else + result = ire->ire_type; + ire_refrele(ire); + return (result); } /* @@ -7602,8 +5781,6 @@ ip_hdr_complete(ipha_t *ipha, zoneid_t zoneid, ip_stack_t *ipst) static void ip_lrput(queue_t *q, mblk_t *mp) { - mblk_t *mp1; - switch (mp->b_datap->db_type) { case M_FLUSH: /* Turn around */ @@ -7614,9 +5791,6 @@ ip_lrput(queue_t *q, mblk_t *mp) } break; } - /* Could receive messages that passed through ar_rput */ - for (mp1 = mp; mp1; mp1 = mp1->b_cont) - mp1->b_prev = mp1->b_next = NULL; freemsg(mp); } @@ -7631,7 +5805,7 @@ ip_lwput(queue_t *q, mblk_t *mp) /* * Move the first hop in any source route to ipha_dst and remove that part of * the source route. Called by other protocols. Errors in option formatting - * are ignored - will be handled by ip_wput_options Return the final + * are ignored - will be handled by ip_output_options. Return the final * destination (either ipha_dst or the last entry in a source route.) 
*/ ipaddr_t @@ -7643,7 +5817,6 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns) uint8_t optlen; ipaddr_t dst; int i; - ire_t *ire; ip_stack_t *ipst = ns->netstack_ip; ip2dbg(("ip_massage_options\n")); @@ -7679,10 +5852,7 @@ ip_massage_options(ipha_t *ipha, netstack_t *ns) * XXX verify per-interface ip_forwarding * for source route? */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; goto redo_srr; } @@ -7760,1843 +5930,41 @@ ip_net_mask(ipaddr_t addr) return ((ipaddr_t)0); } -/* - * Helper ill lookup function used by IPsec. - */ -ill_t * -ip_grab_ill(mblk_t *first_mp, int ifindex, boolean_t isv6, ip_stack_t *ipst) +/* Name/Value Table Lookup Routine */ +char * +ip_nv_lookup(nv_t *nv, int value) { - ill_t *ret_ill; - - ASSERT(ifindex != 0); - - ret_ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); - if (ret_ill == NULL) { - if (isv6) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - ip1dbg(("ip_grab_ill (IPv6): bad ifindex %d.\n", - ifindex)); - } else { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - ip1dbg(("ip_grab_ill (IPv4): bad ifindex %d.\n", - ifindex)); - } - freemsg(first_mp); + if (!nv) return (NULL); + for (; nv->nv_name; nv++) { + if (nv->nv_value == value) + return (nv->nv_name); } - return (ret_ill); -} - -/* - * IPv4 - - * ip_newroute is called by ip_rput or ip_wput whenever we need to send - * out a packet to a destination address for which we do not have specific - * (or sufficient) routing information. - * - * NOTE : These are the scopes of some of the variables that point at IRE, - * which needs to be followed while making any future modifications - * to avoid memory leaks. - * - * - ire and sire are the entries looked up initially by - * ire_ftable_lookup. - * - ipif_ire is used to hold the interface ire associated with - * the new cache ire. 
But it's scope is limited, so we always REFRELE - * it before branching out to error paths. - * - save_ire is initialized before ire_create, so that ire returned - * by ire_create will not over-write the ire. We REFRELE save_ire - * before breaking out of the switch. - * - * Thus on failures, we have to REFRELE only ire and sire, if they - * are not NULL. - */ -void -ip_newroute(queue_t *q, mblk_t *mp, ipaddr_t dst, conn_t *connp, - zoneid_t zoneid, ip_stack_t *ipst) -{ - areq_t *areq; - ipaddr_t gw = 0; - ire_t *ire = NULL; - mblk_t *res_mp; - ipaddr_t *addrp; - ipaddr_t nexthop_addr; - ipif_t *src_ipif = NULL; - ill_t *dst_ill = NULL; - ipha_t *ipha; - ire_t *sire = NULL; - mblk_t *first_mp; - ire_t *save_ire; - ushort_t ire_marks = 0; - boolean_t mctl_present; - ipsec_out_t *io; - mblk_t *saved_mp; - mblk_t *copy_mp = NULL; - mblk_t *xmit_mp = NULL; - ipaddr_t save_dst; - uint32_t multirt_flags = - MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; - boolean_t multirt_is_resolvable; - boolean_t multirt_resolve_next; - boolean_t unspec_src; - boolean_t ip_nexthop = B_FALSE; - tsol_ire_gw_secattr_t *attrp = NULL; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - int multirt_res_failures = 0; - int multirt_res_attempts = 0; - int multirt_already_resolved = 0; - boolean_t multirt_no_icmp_error = B_FALSE; - - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute: dst %s\n", AF_INET, &dst); - } - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - } - - ipha = (ipha_t *)mp->b_rptr; - - /* All multicast lookups come through ip_newroute_ipif() */ - if (CLASSD(dst)) { - ip0dbg(("ip_newroute: CLASSD 0x%x (b_prev %p, b_next %p)\n", - ntohl(dst), (void *)mp->b_prev, (void *)mp->b_next)); - freemsg(first_mp); - return; - } - - if (mctl_present && io->ipsec_out_ip_nexthop) { - ip_nexthop 
= B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; - } - /* - * If this IRE is created for forwarding or it is not for - * traffic for congestion controlled protocols, mark it as temporary. - */ - if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ipha->ipha_protocol)) - ire_marks |= IRE_MARK_TEMPORARY; - - /* - * Get what we can from ire_ftable_lookup which will follow an IRE - * chain until it gets the most specific information available. - * For example, we know that there is no IRE_CACHE for this dest, - * but there may be an IRE_OFFSUBNET which specifies a gateway. - * ire_ftable_lookup will look up the gateway, etc. - * Otherwise, given ire_ftable_lookup algorithm, only one among routes - * to the destination, of equal netmask length in the forward table, - * will be recursively explored. If no information is available - * for the final gateway of that route, we force the returned ire - * to be equal to sire using MATCH_IRE_PARENT. - * At least, in this case we have a starting point (in the buckets) - * to look for other routes to the destination in the forward table. - * This is actually used only for multirouting, where a list - * of routes has to be processed in sequence. - * - * In the process of coming up with the most specific information, - * ire_ftable_lookup may end up with an incomplete IRE_CACHE entry - * for the gateway (i.e., one for which the ire_nce->nce_state is - * not yet ND_REACHABLE, and is in the middle of arp resolution). - * Two caveats when handling incomplete ire's in ip_newroute: - * - we should be careful when accessing its ire_nce (specifically - * the nce_res_mp) ast it might change underneath our feet, and, - * - not all legacy code path callers are prepared to handle - * incomplete ire's, so we should not create/add incomplete - * ire_cache entries here. (See discussion about temporary solution - * further below). 
- * - * In order to minimize packet dropping, and to preserve existing - * behavior, we treat this case as if there were no IRE_CACHE for the - * gateway, and instead use the IF_RESOLVER ire to send out - * another request to ARP (this is achieved by passing the - * MATCH_IRE_COMPLETE flag to ire_ftable_lookup). When the - * arp response comes back in ip_wput_nondata, we will create - * a per-dst ire_cache that has an ND_COMPLETE ire. - * - * Note that this is a temporary solution; the correct solution is - * to create an incomplete per-dst ire_cache entry, and send the - * packet out when the gw's nce is resolved. In order to achieve this, - * all packet processing must have been completed prior to calling - * ire_add_then_send. Some legacy code paths (e.g. cgtp) would need - * to be modified to accomodate this solution. - */ - if (ip_nexthop) { - /* - * The first time we come here, we look for an IRE_INTERFACE - * entry for the specified nexthop, set the dst to be the - * nexthop address and create an IRE_CACHE entry for the - * nexthop. The next time around, we are able to find an - * IRE_CACHE entry for the nexthop, set the gateway to be the - * nexthop address and create an IRE_CACHE entry for the - * destination address via the specified nexthop. 
- */ - ire = ire_cache_lookup(nexthop_addr, zoneid, - msg_getlabel(mp), ipst); - if (ire != NULL) { - gw = nexthop_addr; - ire_marks |= IRE_MARK_PRIVATE_ADDR; - } else { - ire = ire_ftable_lookup(nexthop_addr, 0, 0, - IRE_INTERFACE, NULL, NULL, zoneid, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, - ipst); - if (ire != NULL) { - dst = nexthop_addr; - } - } - } else { - ire = ire_ftable_lookup(dst, 0, 0, 0, - NULL, &sire, zoneid, 0, msg_getlabel(mp), - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT | - MATCH_IRE_SECATTR | MATCH_IRE_COMPLETE, - ipst); - } - - ip3dbg(("ip_newroute: ire_ftable_lookup() " - "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. - */ - do { - /* Clear the previous iteration's values */ - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - - multirt_resolve_next = B_FALSE; - /* - * We check if packets have to be multirouted. - * In this case, given the current <ire, sire> couple, - * we look for the next suitable <ire, sire>. - * This check is done in ire_multirt_lookup(), - * which applies various criteria to find the next route - * to resolve. ire_multirt_lookup() leaves <ire, sire> - * unchanged if it detects it has not been tried yet. 
- */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - ip3dbg(("ip_newroute: starting next_resolution " - "with first_mp %p, tag %d\n", - (void *)first_mp, - MULTIRT_DEBUG_TAGGED(first_mp))); - - ASSERT(sire != NULL); - multirt_is_resolvable = - ire_multirt_lookup(&ire, &sire, multirt_flags, - &multirt_already_resolved, msg_getlabel(mp), ipst); - - ip3dbg(("ip_newroute: multirt_is_resolvable %d, " - "multirt_already_resolved %d, " - "multirt_res_attempts %d, multirt_res_failures %d, " - "ire %p, sire %p\n", multirt_is_resolvable, - multirt_already_resolved, multirt_res_attempts, - multirt_res_failures, (void *)ire, (void *)sire)); - - if (!multirt_is_resolvable) { - /* - * No more multirt route to resolve; give up - * (all routes resolved or no more - * resolvable routes). - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - /* - * Generate ICMP error only if all attempts to - * resolve multirt route failed and there is no - * already resolved one. Don't generate ICMP - * error when: - * - * 1) there was no attempt to resolve - * 2) at least one attempt passed - * 3) a multirt route is already resolved - * - * Case 1) may occur due to multiple - * resolution attempts during single - * ip_multirt_resolution_interval. - * - * Case 2-3) means that CGTP destination is - * reachable via one link so we don't want to - * generate ICMP host unreachable error. 
- */ - if (multirt_res_attempts == 0 || - multirt_res_failures < - multirt_res_attempts || - multirt_already_resolved > 0) - multirt_no_icmp_error = B_TRUE; - } else { - ASSERT(sire != NULL); - ASSERT(ire != NULL); - - multirt_res_attempts++; - } - } - - if (ire == NULL) { - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute: " - "can't resolve %s\n", AF_INET, &dst); - } - ip3dbg(("ip_newroute: " - "ire %p, sire %p, multirt_no_icmp_error %d\n", - (void *)ire, (void *)sire, - (int)multirt_no_icmp_error)); - - if (sire != NULL) { - ire_refrele(sire); - sire = NULL; - } - - if (multirt_no_icmp_error) { - /* There is no need to report an ICMP error. */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, - RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - goto icmp_err_ret; - } - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) - */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & (IRE_CACHETABLE | - IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } - /* - * We have a route to reach the destination. Find the - * appropriate ill, then get a source address using - * ipif_select_source(). 
- * - * If we are here trying to create an IRE_CACHE for an offlink - * destination and have an IRE_CACHE entry for VNI, then use - * ire_stq instead since VNI's queue is a black hole. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) { - dst_ill = ire->ire_stq->q_ptr; - ill_refhold(dst_ill); - } else { - ill_t *ill = ire->ire_ipif->ipif_ill; - - if (IS_IPMP(ill)) { - dst_ill = - ipmp_illgrp_hold_next_ill(ill->ill_grp); - } else { - dst_ill = ill; - ill_refhold(dst_ill); - } - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: no dst " - "ill for dst %s\n", AF_INET, &dst); - } - goto icmp_err_ret; - } - ip2dbg(("ip_newroute: dst_ill %s\n", dst_ill->ill_name)); - - /* - * Pick the best source address from dst_ill. - * - * 1) Try to pick the source address from the destination - * route. Clustering assumes that when we have multiple - * prefixes hosted on an interface, the prefix of the - * source address matches the prefix of the destination - * route. We do this only if the address is not - * DEPRECATED. - * - * 2) If the conn is in a different zone than the ire, we - * need to pick a source address from the right zone. - */ - ASSERT(src_ipif == NULL); - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * The RTF_SETSRC flag is set in the parent ire (sire). - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - } - - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (src_ipif == NULL && - (!unspec_src || ipha->ipha_src != INADDR_ANY)) { - ire_marks |= IRE_MARK_USESRC_CHECK; - if (!IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && - IS_IPMP(ire->ire_ipif->ipif_ill) || - (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (connp != NULL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) || - (dst_ill->ill_usesrc_ifindex != 0)) { - /* - * If the destination is reachable via a - * given gateway, the selected source address - * should be in the same subnet as the gateway. - * Otherwise, the destination is not reachable. - * - * If there are no interfaces on the same subnet - * as the destination, ipif_select_source gives - * first non-deprecated interface which might be - * on a different subnet than the gateway. - * This is not desirable. Hence pass the dst_ire - * source address to ipif_select_source. - * It is sure that the destination is reachable - * with the dst_ire source address subnet. - * So passing dst_ire source address to - * ipif_select_source will make sure that the - * selected source will be on the same subnet - * as dst_ire source address. - */ - ipaddr_t saddr = ire->ire_ipif->ipif_src_addr; - - src_ipif = ipif_select_source(dst_ill, saddr, - zoneid); - if (src_ipif == NULL) { - /* - * In the case of multirouting, it may - * happen that ipif_select_source fails - * as DAD may disallow use of the - * particular source interface. Anyway, - * we need to continue and attempt to - * resolve other multirt routes. 
- */ - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - ire_refrele(ire); - ire = NULL; - multirt_resolve_next = B_TRUE; - multirt_res_failures++; - continue; - } - - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute: " - "no src for dst %s ", - AF_INET, &dst); - printf("on interface %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - } else { - src_ipif = ire->ire_ipif; - ASSERT(src_ipif != NULL); - /* hold src_ipif for uniformity */ - ipif_refhold(src_ipif); - } - } - - /* - * Assign a source address while we have the conn. - * We can't have ip_wput_ire pick a source address when the - * packet returns from arp since we need to look at - * conn_unspec_src and conn_zoneid, and we lose the conn when - * going through arp. - * - * NOTE : ip_newroute_v6 does not have this piece of code as - * it uses ip6i to store this information. - */ - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ipif->ipif_src_addr; - - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute: first hop %s\n", - AF_INET, &gw); - } - ip2dbg(("\tire type %s (%d)\n", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - - /* - * The TTL of multirouted packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)) { - /* Force TTL of multirouted packets */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_newroute: forcing multirt TTL " - "to %d (was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(sire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - } - /* - * At this point in ip_newroute(), ire is either the - * IRE_CACHE of the next-hop gateway for an off-subnet - * destination or an IRE_INTERFACE type that should be used - * to resolve an on-subnet destination or an on-subnet - * next-hop gateway. 
- * - * In the IRE_CACHE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using this IRE_CACHE will go out on - * dst_ill. - * - * 3) The IRE sire will point to the prefix that is the - * longest matching route for the destination. These - * prefix types include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. - * - * The newly created IRE_CACHE entry for the off-subnet - * destination is tied to both the prefix route and the - * interface route used to resolve the next-hop gateway - * via the ire_phandle and ire_ihandle fields, - * respectively. - * - * In the IRE_INTERFACE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. Next time through ip_newroute, we will create - * the IRE_CACHE for the final destination as described - * above. - * - * In both cases, after the current resolution has been - * completed (or possibly initialised, in the IRE_INTERFACE - * case), the loop may be re-entered to attempt the resolution - * of another RTF_MULTIRT route. - * - * When an IRE_CACHE entry for the off-subnet destination is - * created, RTF_SETSRC and RTF_MULTIRT are inherited from sire, - * for further processing in emission loops. 
- */ - save_ire = ire; - switch (ire->ire_type) { - case IRE_CACHE: { - ire_t *ipif_ire; - - ASSERT(save_ire->ire_nce->nce_state == ND_REACHABLE); - if (gw == 0) - gw = ire->ire_gateway_addr; - /* - * We need 3 ire's to create a new cache ire for an - * off-link destination from the cache ire of the - * gateway. - * - * 1. The prefix ire 'sire' (Note that this does - * not apply to the conn_nexthop_set case) - * 2. The cache ire of the gateway 'ire' - * 3. The interface ire 'ipif_ire' - * - * We have (1) and (2). We lookup (3) below. - * - * If there is no interface route to the gateway, - * it is a race condition, where we found the cache - * but the interface route has been deleted. - */ - if (ip_nexthop) { - ipif_ire = ire_ihandle_lookup_onlink(ire); - } else { - ipif_ire = - ire_ihandle_lookup_offlink(ire, sire); - } - if (ipif_ire == NULL) { - ip1dbg(("ip_newroute: " - "ire_ihandle_lookup_offlink failed\n")); - goto icmp_err_ret; - } - - /* - * Check cached gateway IRE for any security - * attributes; if found, associate the gateway - * credentials group to the destination IRE. - */ - if ((attrp = save_ire->ire_gw_secattr) != NULL) { - mutex_enter(&attrp->igsa_lock); - if ((gcgrp = attrp->igsa_gcgrp) != NULL) - GCGRP_REFHOLD(gcgrp); - mutex_exit(&attrp->igsa_lock); - } - - /* - * XXX For the source of the resolver mp, - * we are using the same DL_UNITDATA_REQ - * (from save_ire->ire_nce->nce_res_mp) - * though the save_ire is not pointing at the same ill. - * This is incorrect. We need to send it up to the - * resolver to get the right res_mp. For ethernets - * this may be okay (ill_type == DL_ETHER). 
- */ - - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - &save_ire->ire_max_frag, - save_ire->ire_nce, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - (sire != NULL) ? - sire->ire_mask : 0, /* Parent mask */ - (sire != NULL) ? - sire->ire_phandle : 0, /* Parent handle */ - ipif_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? (sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, /* flags */ - (sire != NULL) ? - &(sire->ire_uinfo) : &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* - * Prevent sire and ipif_ire from getting deleted. - * The newly created ire is tied to both of them via - * the phandle and ihandle respectively. - */ - if (sire != NULL) { - IRB_REFHOLD(sire->ire_bucket); - /* Has it been removed already ? */ - if (sire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - } - - IRB_REFHOLD(ipif_ire->ire_bucket); - /* Has it been removed already ? */ - if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(ipif_ire->ire_bucket); - if (sire != NULL) - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - /* - * In the case of multirouting, a copy - * of the packet is done before its sending. - * The copy is used to attempt another - * route resolution, in a next loop. 
- */ - if (ire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - - ire_add_then_send(q, ire, xmit_mp); - ire_refrele(save_ire); - - /* Assert that sire is not deleted yet. */ - if (sire != NULL) { - ASSERT(sire->ire_ptpn != NULL); - IRB_REFRELE(sire->ire_bucket); - } - - /* Assert that ipif_ire is not deleted yet. */ - ASSERT(ipif_ire->ire_ptpn != NULL); - IRB_REFRELE(ipif_ire->ire_bucket); - ire_refrele(ipif_ire); - - /* - * If copy_mp is not NULL, multirouting was - * requested. We loop to initiate a next - * route resolution attempt, starting from sire. - */ - if (copy_mp != NULL) { - /* - * Search for the next unresolved - * multirt route. - */ - copy_mp = NULL; - ipif_ire = NULL; - ire = NULL; - multirt_resolve_next = B_TRUE; - continue; - } - if (sire != NULL) - ire_refrele(sire); - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } - case IRE_IF_NORESOLVER: { - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ip_newroute: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - break; - } - - /* - * TSol note: We are creating the ire cache for the - * destination 'dst'. If 'dst' is offlink, going - * through the first hop 'gw', the security attributes - * of 'dst' must be set to point to the gateway - * credentials of gateway 'gw'. If 'dst' is onlink, it - * is possible that 'dst' is a potential gateway that is - * referenced by some route that has some security - * attributes. Thus in the former case, we need to do a - * gcgrp_lookup of 'gw' while in the latter case we - * need to do gcgrp_lookup of 'dst' itself. - */ - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(gw != INADDR_ANY ? 
gw : dst, - &ga.ga_addr); - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - save_ire->ire_mask, /* Parent mask */ - (sire != NULL) ? /* Parent handle */ - sire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT) : 0, /* flags */ - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - /* - * In the case of multirouting, a copy - * of the packet is made before it is sent. - * The copy is used in the next - * loop to attempt another resolution. - */ - xmit_mp = first_mp; - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - - if (copy_mp != NULL) { - /* - * If we found a (no)resolver, we ignore any - * trailing top priority IRE_CACHE in further - * loops. This ensures that we do not omit any - * (no)resolver. - * This IRE_CACHE, if any, will be processed - * by another thread entering ip_newroute(). 
- * IRE_CACHE entries, if any, will be processed - * by another thread entering ip_newroute(), - * (upon resolver response, for instance). - * This aims to force parallel multirt - * resolutions as soon as a packet must be sent. - * In the best case, after the tx of only one - * packet, all reachable routes are resolved. - * Otherwise, the resolution of all RTF_MULTIRT - * routes would require several emissions. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - - /* - * Search for the next unresolved multirt - * route. - */ - copy_mp = NULL; - save_ire = NULL; - ire = NULL; - multirt_resolve_next = B_TRUE; - continue; - } - - /* - * Don't need sire anymore - */ - if (sire != NULL) - ire_refrele(sire); - - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least we - * found a resolver that can help. - */ - res_mp = dst_ill->ill_resolver_mp; - if (!OK_RESOLVER_MP(res_mp)) - break; - - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. - * When ire_add_then_send() tries to put the IP dg - * to dst, it will reenter ip_newroute() at which - * time we will find the IRE_CACHE for the gw and - * create another IRE_CACHE in case IRE_CACHE above. - */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - /* - * In the case of multirouting, it may - * happen that ipif_select_source fails - * as DAD may disallow use of the - * particular source interface. 
Anyway, - * we need to continue and attempt to - * resolve other multirt routes. - */ - if (src_ipif == NULL) { - if (sire != NULL && - (sire->ire_flags & - RTF_MULTIRT)) { - ire_refrele(ire); - ire = NULL; - multirt_resolve_next = - B_TRUE; - multirt_res_failures++; - continue; - } - if (ip_debug > 2) { - pr_addr_dbg( - "ip_newroute: no " - "src for gw %s ", - AF_INET, &gw); - printf("on " - "interface %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - } - save_dst = dst; - dst = gw; - gw = INADDR_ANY; - } - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - * The ire_max_frag is atomically set under the - * irebucket lock in ire_add_v[46]. - */ - - ire = ire_create_mp( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - NULL, /* ire_max_frag */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, /* Interface ipif */ - save_ire->ire_mask, /* Parent mask */ - 0, - save_ire->ire_ihandle, /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - - ire->ire_marks |= ire_marks; - - /* - * Construct message chain for the resolver - * of the form: - * ARP_REQ_MBLK-->IRE_MBLK-->Packet - * Packet could contain a IPSEC_OUT mp. - * - * NOTE : ire will be added later when the response - * comes back from ARP. If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. 
We may not be able to REFRELE the bucket - * if the response never comes back. Thus, before - * adding the ire, ire_add_v4 will make sure that the - * interface route does not get deleted. This is the - * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 - * where we can always prevent deletions because of - * the synchronous nature of adding IRES i.e - * ire_add_then_send is called after creating the IRE. - */ - ASSERT(ire->ire_mp != NULL); - ire->ire_mp->b_cont = first_mp; - /* Have saved_mp handy, for cleanup if canput fails */ - saved_mp = mp; - mp = copyb(res_mp); - if (mp == NULL) { - /* Prepare for cleanup */ - mp = saved_mp; /* pkt */ - ire_delete(ire); /* ire_mp */ - ire = NULL; - ire_refrele(save_ire); - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - linkb(mp, ire->ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. - * NOTE: this depends on memory layouts imposed by - * ill_init(). - */ - areq = (areq_t *)mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_sender_addr_offset); - *addrp = save_ire->ire_src_addr; - - ire_refrele(save_ire); - addrp = (ipaddr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - /* Up to the resolver. */ - if (canputnext(dst_ill->ill_rq) && - !(dst_ill->ill_arp_closing)) { - putnext(dst_ill->ill_rq, mp); - ire = NULL; - if (copy_mp != NULL) { - /* - * If we found a resolver, we ignore - * any trailing top priority IRE_CACHE - * in the further loops. This ensures - * that we do not omit any resolver. - * IRE_CACHE entries, if any, will be - * processed next time we enter - * ip_newroute(). - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - /* Prepare the next resolution loop. 
*/ - mp = first_mp; - EXTRACT_PKT_MP(mp, first_mp, - mctl_present); - if (mctl_present) - io = (ipsec_out_t *) - first_mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; - - ASSERT(sire != NULL); - - dst = save_dst; - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - - /* - * The response will come back in ip_wput - * with db_type IRE_DB_TYPE. - */ - ipif_refrele(src_ipif); - ill_refrele(dst_ill); - return; - } else { - /* Prepare for cleanup */ - DTRACE_PROBE1(ip__newroute__drop, mblk_t *, - mp); - mp->b_cont = NULL; - freeb(mp); /* areq */ - /* - * this is an ire that is not added to the - * cache. ire_freemblk will handle the release - * of any resources associated with the ire. - */ - ire_delete(ire); /* ire_mp */ - mp = saved_mp; /* pkt */ - ire = NULL; - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - default: - break; - } - } while (multirt_resolve_next); - - ip1dbg(("ip_newroute: dropped\n")); - /* Did this packet originate externally? */ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - } else { - if (dst_ill != NULL) { - BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } - } - ASSERT(copy_mp == NULL); - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - return; - -icmp_err_ret: - ip1dbg(("ip_newroute: no route\n")); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (sire != NULL) - ire_refrele(sire); - /* Did this packet originate externally? 
*/ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInNoRoutes); - q = WR(q); - } else { - /* - * There is no outgoing ill, so just increment the - * system MIB. - */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. - */ - if (ip_hdr_complete(ipha, zoneid, ipst)) { - /* Failed */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - return; - } - } - - /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. - */ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(q, first_mp, ICMP_SOURCE_ROUTE_FAILED, - zoneid, ipst); - return; - } - icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); + return ("unknown"); } -ip_opt_info_t zero_info; - -/* - * IPv4 - - * ip_newroute_ipif is called by ip_wput_multicast and - * ip_rput_forward_multicast whenever we need to send - * out a packet to a destination address for which we do not have specific - * routing information. It is used when the packet will be sent out - * on a specific interface. It is also called by ip_wput() when IP_BOUND_IF - * socket option is set or icmp error message wants to go out on a particular - * interface for a unicast packet. - * - * In most cases, the destination address is resolved thanks to the ipif - * intrinsic resolver. However, there are some cases where the call to - * ip_newroute_ipif must take into account the potential presence of - * RTF_SETSRC and/or RTF_MULITRT flags in an IRE_OFFSUBNET ire - * that uses the interface. 
This is specified through flags, - * which can be a combination of: - * - RTF_SETSRC: if an IRE_OFFSUBNET ire exists that has the RTF_SETSRC - * flag, the resulting ire will inherit the IRE_OFFSUBNET source address - * and flags. Additionally, the packet source address has to be set to - * the specified address. The caller is thus expected to set this flag - * if the packet has no specific source address yet. - * - RTF_MULTIRT: if an IRE_OFFSUBNET ire exists that has the RTF_MULTIRT - * flag, the resulting ire will inherit the flag. All unresolved routes - * to the destination must be explored in the same call to - * ip_newroute_ipif(). - */ -static void -ip_newroute_ipif(queue_t *q, mblk_t *mp, ipif_t *ipif, ipaddr_t dst, - conn_t *connp, uint32_t flags, zoneid_t zoneid, ip_opt_info_t *infop) +static int +ip_wait_for_info_ack(ill_t *ill) { - areq_t *areq; - ire_t *ire = NULL; - mblk_t *res_mp; - ipaddr_t *addrp; - mblk_t *first_mp; - ire_t *save_ire = NULL; - ipif_t *src_ipif = NULL; - ushort_t ire_marks = 0; - ill_t *dst_ill = NULL; - ipha_t *ipha; - mblk_t *saved_mp; - ire_t *fire = NULL; - mblk_t *copy_mp = NULL; - boolean_t multirt_resolve_next; - boolean_t unspec_src; - ipaddr_t ipha_dst; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* - * CGTP goes in a loop which looks up a new ipif, do an ipif_refhold - * here for uniformity - */ - ipif_refhold(ipif); - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. 
- */ - do { - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - multirt_resolve_next = B_FALSE; - - ip1dbg(("ip_newroute_ipif: dst 0x%x, if %s\n", ntohl(dst), - ipif->ipif_ill->ill_name)); - - first_mp = mp; - if (DB_TYPE(mp) == M_CTL) - mp = mp->b_cont; - ipha = (ipha_t *)mp->b_rptr; - - /* - * Save the packet destination address, we may need it after - * the packet has been consumed. - */ - ipha_dst = ipha->ipha_dst; - - /* - * If the interface is a pt-pt interface we look for an - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the - * local_address and the pt-pt destination address. Otherwise - * we just match the local address. - * NOTE: dst could be different than ipha->ipha_dst in case - * of sending igmp multicast packets over a point-to-point - * connection. - * Thus we must be careful enough to check ipha_dst to be a - * multicast address, otherwise it will take xmit_if path for - * multicast packets resulting into kernel stack overflow by - * repeated calls to ip_newroute_ipif from ire_send(). - */ - if (CLASSD(ipha_dst) && - !(ipif->ipif_ill->ill_flags & ILLF_MULTICAST)) { - goto err_ret; - } - - /* - * We check if an IRE_OFFSUBNET for the addr that goes through - * ipif exists. We need it to determine if the RTF_SETSRC and/or - * RTF_MULTIRT flags must be honored. This IRE_OFFSUBNET ire may - * propagate its flags to the new ire. - */ - if (CLASSD(ipha_dst) && (flags & (RTF_MULTIRT | RTF_SETSRC))) { - fire = ipif_lookup_multi_ire(ipif, ipha_dst); - ip2dbg(("ip_newroute_ipif: " - "ipif_lookup_multi_ire(" - "ipif %p, dst %08x) = fire %p\n", - (void *)ipif, ntohl(dst), (void *)fire)); - } - - /* - * Note: While we pick a dst_ill we are really only - * interested in the ill for load spreading. The source - * ipif is determined by source address selection below. 
- */ - if (IS_IPMP(ipif->ipif_ill)) { - ipmp_illgrp_t *illg = ipif->ipif_ill->ill_grp; - - if (CLASSD(ipha_dst)) - dst_ill = ipmp_illgrp_hold_cast_ill(illg); - else - dst_ill = ipmp_illgrp_hold_next_ill(illg); - } else { - dst_ill = ipif->ipif_ill; - ill_refhold(dst_ill); - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif: no dst ill " - "for dst %s\n", AF_INET, &dst); - } - goto err_ret; - } - - /* - * Pick a source address preferring non-deprecated ones. - * Unlike ip_newroute, we don't do any source address - * selection here since for multicast it really does not help - * in inbound load spreading as in the unicast case. - */ - if ((flags & RTF_SETSRC) && (fire != NULL) && - (fire->ire_flags & RTF_SETSRC)) { - /* - * As requested by flags, an IRE_OFFSUBNET was looked up - * on that interface. This ire has RTF_SETSRC flag, so - * the source address of the packet must be changed. - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(fire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - } - - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (!IS_UNDER_IPMP(ipif->ipif_ill) && - (IS_IPMP(ipif->ipif_ill) || - (!ipif->ipif_isv6 && ipif->ipif_lcl_addr == INADDR_ANY) || - (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_UP)) != IPIF_UP || - (connp != NULL && ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES)) && - (src_ipif == NULL) && - (!unspec_src || ipha->ipha_src != INADDR_ANY)) { - src_ipif = ipif_select_source(dst_ill, dst, zoneid); - if (src_ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif: " - "no src for dst %s", - AF_INET, &dst); - } - ip1dbg((" on interface %s\n", - dst_ill->ill_name)); - goto err_ret; - } - ipif_refrele(ipif); - ipif = src_ipif; - ipif_refhold(ipif); - } - if (src_ipif == NULL) { - src_ipif = ipif; - ipif_refhold(src_ipif); - } - - /* - * Assign a source address while we have the conn. - * We can't have ip_wput_ire pick a source address when the - * packet returns from arp since conn_unspec_src might be set - * and we lose the conn when going through arp. - */ - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ipif->ipif_src_addr; - - /* - * In the case of IP_BOUND_IF and IP_PKTINFO, it is possible - * that the outgoing interface does not have an interface ire. - */ - if (CLASSD(ipha_dst) && (connp == NULL || - connp->conn_outgoing_ill == NULL) && - infop->ip_opt_ill_index == 0) { - /* ipif_to_ire returns an held ire */ - ire = ipif_to_ire(ipif); - if (ire == NULL) - goto err_ret; - if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) - goto err_ret; - save_ire = ire; - - ip2dbg(("ip_newroute_ipif: ire %p, ipif %p, " - "flags %04x\n", - (void *)ire, (void *)ipif, flags)); - if ((flags & RTF_MULTIRT) && (fire != NULL) && - (fire->ire_flags & RTF_MULTIRT)) { - /* - * As requested by flags, an IRE_OFFSUBNET was - * looked up on that interface. 
This ire has - * RTF_MULTIRT flag, so the resolution loop will - * be re-entered to resolve additional routes on - * other interfaces. For that purpose, a copy of - * the packet is performed at this point. - */ - fire->ire_last_used_time = lbolt; - copy_mp = copymsg(first_mp); - if (copy_mp) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - if ((flags & RTF_SETSRC) && (fire != NULL) && - (fire->ire_flags & RTF_SETSRC)) { - /* - * As requested by flags, an IRE_OFFSUBET was - * looked up on that interface. This ire has - * RTF_SETSRC flag, so the source address of the - * packet must be changed. - */ - ipha->ipha_src = fire->ire_src_addr; - } - } else { - /* - * The only ways we can come here are: - * 1) IP_BOUND_IF socket option is set - * 2) SO_DONTROUTE socket option is set - * 3) IP_PKTINFO option is passed in as ancillary data. - * In all cases, the new ire will not be added - * into cache table. - */ - ASSERT(connp == NULL || connp->conn_dontroute || - connp->conn_outgoing_ill != NULL || - infop->ip_opt_ill_index != 0); - ire_marks |= IRE_MARK_NOADD; - } - - switch (ipif->ipif_net_type) { - case IRE_IF_NORESOLVER: { - /* We have what we need to build an IRE_CACHE. */ - - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ip_newroute_ipif: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - break; - } - - /* - * The new ire inherits the IRE_OFFSUBNET flags - * and source address, if this was requested. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* gateway address */ - &ipif->ipif_mtu, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - (save_ire != NULL ? save_ire->ire_mask : 0), - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - (save_ire != NULL) ? 
/* Interface handle */ - save_ire->ire_ihandle : 0, - (fire != NULL) ? - (fire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, - (save_ire == NULL ? &ire_uinfo_null : - &save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (ire == NULL) { - if (save_ire != NULL) - ire_refrele(save_ire); - break; - } - - ire->ire_marks |= ire_marks; - - /* - * If IRE_MARK_NOADD is set then we need to convert - * the max_fragp to a useable value now. This is - * normally done in ire_add_v[46]. We also need to - * associate the ire with an nce (normally would be - * done in ip_wput_nondata()). - * - * Note that IRE_MARK_NOADD packets created here - * do not have a non-null ire_mp pointer. The null - * value of ire_bucket indicates that they were - * never added. - */ - if (ire->ire_marks & IRE_MARK_NOADD) { - uint_t max_frag; - - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - - if ((ire->ire_nce = ndp_lookup_v4( - ire_to_ill(ire), - (ire->ire_gateway_addr != INADDR_ANY ? - &ire->ire_gateway_addr : &ire->ire_addr), - B_FALSE)) == NULL) { - if (save_ire != NULL) - ire_refrele(save_ire); - break; - } - ASSERT(ire->ire_nce->nce_state == - ND_REACHABLE); - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } - - /* Prevent save_ire from getting deleted */ - if (save_ire != NULL) { - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - } - - ire_add_then_send(q, ire, first_mp); - - /* Assert that save_ire is not deleted yet. */ - if (save_ire != NULL) { - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - save_ire = NULL; - } - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * the resolution loop is re-entered if this - * was requested through flags and if we - * actually are in a multirouting case. 
- */ - if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { - boolean_t need_resolve = - ire_multirt_need_resolve(ipha_dst, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group() calls - * ire_lookup_multi() that uses - * ire_ftable_lookup() to find - * an IRE_INTERFACE for the group. - * In the multirt case, - * ire_lookup_multi() then invokes - * ire_multirt_lookup() to find - * the next resolvable ire. - * As a result, we obtain an new - * interface, derived from the - * next ire. - */ - ipif_refrele(ipif); - ipif = ipif_lookup_group(ipha_dst, - zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, ipif %p\n", - htonl(dst), (void *)ipif)); - if (ipif != NULL) { - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - if (ipif != NULL) - ipif_refrele(ipif); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least - * we found a resolver that can help. - */ - res_mp = dst_ill->ill_resolver_mp; - if (!OK_RESOLVER_MP(res_mp)) - break; - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - * The new ire inherits the IRE_OFFSUBNET flags - * and source address, if this was requested. - * The ire_max_frag is atomically set under the - * irebucket lock in ire_add_v[46]. Only in the - * case of IRE_MARK_NOADD, we set it here itself. - */ - ire = ire_create_mp( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* gateway address */ - (ire_marks & IRE_MARK_NOADD) ? 
- ipif->ipif_mtu : 0, /* max_frag */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - (save_ire != NULL ? save_ire->ire_mask : 0), - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - (save_ire != NULL) ? /* Interface handle */ - save_ire->ire_ihandle : 0, - (fire != NULL) ? /* flags if any */ - (fire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT)) : 0, - (save_ire == NULL ? &ire_uinfo_null : - &save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - if (save_ire != NULL) { - ire_refrele(save_ire); - save_ire = NULL; - } - if (ire == NULL) - break; - - ire->ire_marks |= ire_marks; - /* - * Construct message chain for the resolver of the - * form: - * ARP_REQ_MBLK-->IRE_MBLK-->Packet - * - * NOTE : ire will be added later when the response - * comes back from ARP. If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. We may not be able to REFRELE the - * bucket if the response never comes back. - * Thus, before adding the ire, ire_add_v4 will make - * sure that the interface route does not get deleted. - * This is the only case unlike ip_newroute_v6, - * ip_newroute_ipif_v6 where we can always prevent - * deletions because ire_add_then_send is called after - * creating the IRE. - * If IRE_MARK_NOADD is set, then ire_add_then_send - * does not add this IRE into the IRE CACHE. - */ - ASSERT(ire->ire_mp != NULL); - ire->ire_mp->b_cont = first_mp; - /* Have saved_mp handy, for cleanup if canput fails */ - saved_mp = mp; - mp = copyb(res_mp); - if (mp == NULL) { - /* Prepare for cleanup */ - mp = saved_mp; /* pkt */ - ire_delete(ire); /* ire_mp */ - ire = NULL; - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - break; - } - linkb(mp, ire->ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. 
- * NOTE: this depends on memory layouts imposed by - * ill_init(). There are corner cases above where we - * might've created the IRE with an INADDR_ANY source - * address (e.g., if the zeroth ipif on an underlying - * ill in an IPMP group is 0.0.0.0, but another ipif - * on the ill has a usable test address). If so, tell - * ARP to use ipha_src as its sender address. - */ - areq = (areq_t *)mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_sender_addr_offset); - if (ire->ire_src_addr != INADDR_ANY) - *addrp = ire->ire_src_addr; - else - *addrp = ipha->ipha_src; - addrp = (ipaddr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - /* Up to the resolver. */ - if (canputnext(dst_ill->ill_rq) && - !(dst_ill->ill_arp_closing)) { - putnext(dst_ill->ill_rq, mp); - /* - * The response will come back in ip_wput - * with db_type IRE_DB_TYPE. - */ - } else { - mp->b_cont = NULL; - freeb(mp); /* areq */ - ire_delete(ire); /* ire_mp */ - saved_mp->b_next = NULL; - saved_mp->b_prev = NULL; - freemsg(first_mp); /* pkt */ - ip2dbg(("ip_newroute_ipif: dropped\n")); - } - - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } + int err; - /* - * The resolution loop is re-entered if this was - * requested through flags and we actually are - * in a multirouting case. - */ - if ((flags & RTF_MULTIRT) && (copy_mp != NULL)) { - boolean_t need_resolve = - ire_multirt_need_resolve(ipha_dst, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group() calls - * ire_lookup_multi() that uses - * ire_ftable_lookup() to find - * an IRE_INTERFACE for the group. - * In the multirt case, - * ire_lookup_multi() then invokes - * ire_multirt_lookup() to find - * the next resolvable ire. - * As a result, we obtain an new - * interface, derived from the - * next ire. 
- */ - ipif_refrele(ipif); - ipif = ipif_lookup_group(ipha_dst, - zoneid, ipst); - if (ipif != NULL) { - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - if (ipif != NULL) - ipif_refrele(ipif); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - break; - } - } while (multirt_resolve_next); - -err_ret: - ip2dbg(("ip_newroute_ipif: dropped\n")); - if (fire != NULL) - ire_refrele(fire); - ipif_refrele(ipif); - /* Did this packet originate externally? */ - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (mp->b_prev || mp->b_next) { - mp->b_next = NULL; - mp->b_prev = NULL; - } else { + mutex_enter(&ill->ill_lock); + while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. + * Return value of 0 indicates a pending signal. */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - return; + err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); + if (err == 0) { + mutex_exit(&ill->ill_lock); + return (EINTR); } } + mutex_exit(&ill->ill_lock); /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP_HOST_UNREACHABLE if RTF_BLACKHOLE is set. + * ip_rput_other could have set an error in ill_error on + * receipt of M_ERROR. 
*/ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - icmp_unreachable(q, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); -} - -/* Name/Value Table Lookup Routine */ -char * -ip_nv_lookup(nv_t *nv, int value) -{ - if (!nv) - return (NULL); - for (; nv->nv_name; nv++) { - if (nv->nv_value == value) - return (nv->nv_name); - } - return ("unknown"); + return (ill->ill_error); } /* @@ -9604,7 +5972,7 @@ ip_nv_lookup(nv_t *nv, int value) * to a DLPI device. We allocate an ill_t as the instance data in * this case. */ -int +static int ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { ill_t *ill; @@ -9644,6 +6012,7 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * down a DL_INFO_REQ after calling qprocson. */ err = ill_init(q, ill); + if (err != 0) { mi_free(ill); netstack_rele(ipst->ips_netstack); @@ -9652,41 +6021,26 @@ ip_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (err); } - /* ill_init initializes the ipsq marking this thread as writer */ - ipsq_exit(ill->ill_phyint->phyint_ipsq); - /* Wait for the DL_INFO_ACK */ - mutex_enter(&ill->ill_lock); - while (ill->ill_state_flags & ILL_LL_SUBNET_PENDING) { - /* - * Return value of 0 indicates a pending signal. - */ - err = cv_wait_sig(&ill->ill_cv, &ill->ill_lock); - if (err == 0) { - mutex_exit(&ill->ill_lock); - (void) ip_close(q, 0); - return (EINTR); - } - } - mutex_exit(&ill->ill_lock); - /* - * ip_rput_other could have set an error in ill_error on - * receipt of M_ERROR. + * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent. 
+ * + * ill_init initializes the ipsq marking this thread as + * writer */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + err = ip_wait_for_info_ack(ill); + if (err == 0) + ill->ill_credp = credp; + else + goto fail; - err = ill->ill_error; - if (err != 0) { - (void) ip_close(q, 0); - return (err); - } - - ill->ill_credp = credp; crhold(credp); mutex_enter(&ipst->ips_ip_mi_lock); - err = mi_open_link(&ipst->ips_ip_g_head, (IDP)ill, devp, flag, sflag, - credp); + err = mi_open_link(&ipst->ips_ip_g_head, (IDP)q->q_ptr, devp, flag, + sflag, credp); mutex_exit(&ipst->ips_ip_mi_lock); +fail: if (err) { (void) ip_close(q, 0); return (err); @@ -9719,8 +6073,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, netstack_t *ns; ip_stack_t *ipst; - TRACE_1(TR_FAC_IP, TR_IP_OPEN, "ip_open: q %p", q); - /* Allow reopen. */ if (q->q_ptr != NULL) return (0); @@ -9765,25 +6117,24 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, */ netstack_rele(ipst->ips_netstack); + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_zoneid = zoneid; - connp->conn_sqp = NULL; - connp->conn_initial_sqp = NULL; - connp->conn_final_sqp = NULL; - connp->conn_upq = q; + connp->conn_rq = q; q->q_ptr = WR(q)->q_ptr = connp; - if (flag & SO_SOCKSTR) - connp->conn_flags |= IPCL_SOCKET; - /* Minor tells us which /dev entry was opened */ if (isv6) { - connp->conn_af_isv6 = B_TRUE; - ip_setpktversion(connp, isv6, B_FALSE, ipst); - connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; } else { - connp->conn_af_isv6 = B_FALSE; - connp->conn_pkt_isv6 = B_FALSE; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + 
connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; } if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) && @@ -9812,11 +6163,17 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, * connp->conn_cred is crfree()ed in ipcl_conn_destroy() */ connp->conn_cred = credp; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* - * Handle IP_RTS_REQUEST and other ioctls which use conn_recv + * Handle IP_IOC_RTS_REQUEST and other ioctls which use conn_recv */ connp->conn_recv = ip_conn_input; + connp->conn_recvicmp = ip_conn_input_icmp; crhold(connp->conn_cred); @@ -9827,11 +6184,13 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); + connp->conn_rq = q; connp->conn_wq = WR(q); /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP; /* * Make the conn globally visible to walkers @@ -9847,210 +6206,6 @@ ip_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } /* - * Change the output format (IPv4 vs. IPv6) for a conn_t. - * Note that there is no race since either ip_output function works - it - * is just an optimization to enter the best ip_output routine directly. 
- */ -void -ip_setpktversion(conn_t *connp, boolean_t isv6, boolean_t bump_mib, - ip_stack_t *ipst) -{ - if (isv6) { - if (bump_mib) { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutSwitchIPVersion); - } - connp->conn_send = ip_output_v6; - connp->conn_pkt_isv6 = B_TRUE; - } else { - if (bump_mib) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutSwitchIPVersion); - } - connp->conn_send = ip_output; - connp->conn_pkt_isv6 = B_FALSE; - } - -} - -/* - * See if IPsec needs loading because of the options in mp. - */ -static boolean_t -ipsec_opt_present(mblk_t *mp) -{ - uint8_t *optcp, *next_optcp, *opt_endcp; - struct opthdr *opt; - struct T_opthdr *topt; - int opthdr_len; - t_uscalar_t optname, optlevel; - struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; - ipsec_req_t *ipsr; - - /* - * Walk through the mess, and find IP_SEC_OPT. If it's there, - * return TRUE. - */ - - optcp = mi_offset_param(mp, tor->OPT_offset, tor->OPT_length); - opt_endcp = optcp + tor->OPT_length; - if (tor->PRIM_type == T_OPTMGMT_REQ) { - opthdr_len = sizeof (struct T_opthdr); - } else { /* O_OPTMGMT_REQ */ - ASSERT(tor->PRIM_type == T_SVR4_OPTMGMT_REQ); - opthdr_len = sizeof (struct opthdr); - } - for (; optcp < opt_endcp; optcp = next_optcp) { - if (optcp + opthdr_len > opt_endcp) - return (B_FALSE); /* Not enough option header. 
*/ - if (tor->PRIM_type == T_OPTMGMT_REQ) { - topt = (struct T_opthdr *)optcp; - optlevel = topt->level; - optname = topt->name; - next_optcp = optcp + _TPI_ALIGN_TOPT(topt->len); - } else { - opt = (struct opthdr *)optcp; - optlevel = opt->level; - optname = opt->name; - next_optcp = optcp + opthdr_len + - _TPI_ALIGN_OPT(opt->len); - } - if ((next_optcp < optcp) || /* wraparound pointer space */ - ((next_optcp >= opt_endcp) && /* last option bad len */ - ((next_optcp - opt_endcp) >= __TPI_ALIGN_SIZE))) - return (B_FALSE); /* bad option buffer */ - if ((optlevel == IPPROTO_IP && optname == IP_SEC_OPT) || - (optlevel == IPPROTO_IPV6 && optname == IPV6_SEC_OPT)) { - /* - * Check to see if it's an all-bypass or all-zeroes - * IPsec request. Don't bother loading IPsec if - * the socket doesn't want to use it. (A good example - * is a bypass request.) - * - * Basically, if any of the non-NEVER bits are set, - * load IPsec. - */ - ipsr = (ipsec_req_t *)(optcp + opthdr_len); - if ((ipsr->ipsr_ah_req & ~IPSEC_PREF_NEVER) != 0 || - (ipsr->ipsr_esp_req & ~IPSEC_PREF_NEVER) != 0 || - (ipsr->ipsr_self_encap_req & ~IPSEC_PREF_NEVER) - != 0) - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * If conn is is waiting for ipsec to finish loading, kick it. - */ -/* ARGSUSED */ -static void -conn_restart_ipsec_waiter(conn_t *connp, void *arg) -{ - t_scalar_t optreq_prim; - mblk_t *mp; - cred_t *cr; - int err = 0; - - /* - * This function is called, after ipsec loading is complete. - * Since IP checks exclusively and atomically (i.e it prevents - * ipsec load from completing until ip_optcom_req completes) - * whether ipsec load is complete, there cannot be a race with IP - * trying to set the CONN_IPSEC_LOAD_WAIT flag on any conn now. 
- */ - mutex_enter(&connp->conn_lock); - if (connp->conn_state_flags & CONN_IPSEC_LOAD_WAIT) { - ASSERT(connp->conn_ipsec_opt_mp != NULL); - mp = connp->conn_ipsec_opt_mp; - connp->conn_ipsec_opt_mp = NULL; - connp->conn_state_flags &= ~CONN_IPSEC_LOAD_WAIT; - mutex_exit(&connp->conn_lock); - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); - if (mp != NULL) - qreply(connp->conn_wq, mp); - return; - } - - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - - optreq_prim = ((union T_primitives *)mp->b_rptr)->type; - if (optreq_prim == T_OPTMGMT_REQ) { - err = tpi_optcom_req(CONNP_TO_WQ(connp), mp, cr, - &ip_opt_obj, B_FALSE); - } else { - ASSERT(optreq_prim == T_SVR4_OPTMGMT_REQ); - err = svr4_optcom_req(CONNP_TO_WQ(connp), mp, cr, - &ip_opt_obj, B_FALSE); - } - if (err != EINPROGRESS) - CONN_OPER_PENDING_DONE(connp); - return; - } - mutex_exit(&connp->conn_lock); -} - -/* - * Called from the ipsec_loader thread, outside any perimeter, to tell - * ip qenable any of the queues waiting for the ipsec loader to - * complete. - */ -void -ip_ipsec_load_complete(ipsec_stack_t *ipss) -{ - netstack_t *ns = ipss->ipsec_netstack; - - ipcl_walk(conn_restart_ipsec_waiter, NULL, ns->netstack_ip); -} - -/* - * Can't be used. Need to call svr4* -> optset directly. the leaf routine - * determines the grp on which it has to become exclusive, queues the mp - * and IPSQ draining restarts the optmgmt - */ -static boolean_t -ip_check_for_ipsec_opt(queue_t *q, mblk_t *mp) -{ - conn_t *connp = Q_TO_CONN(q); - ipsec_stack_t *ipss = connp->conn_netstack->netstack_ipsec; - - /* - * Take IPsec requests and treat them special. 
- */ - if (ipsec_opt_present(mp)) { - /* First check if IPsec is loaded. */ - mutex_enter(&ipss->ipsec_loader_lock); - if (ipss->ipsec_loader_state != IPSEC_LOADER_WAIT) { - mutex_exit(&ipss->ipsec_loader_lock); - return (B_FALSE); - } - mutex_enter(&connp->conn_lock); - connp->conn_state_flags |= CONN_IPSEC_LOAD_WAIT; - - ASSERT(connp->conn_ipsec_opt_mp == NULL); - connp->conn_ipsec_opt_mp = mp; - mutex_exit(&connp->conn_lock); - mutex_exit(&ipss->ipsec_loader_lock); - - ipsec_loader_loadnow(ipss); - return (B_TRUE); - } - return (B_FALSE); -} - -/* * Set IPsec policy from an ipsec_req_t. If the req is not "zero" and valid, * all of them are copied to the conn_t. If the req is "zero", the policy is * zeroed out. A "zero" policy has zero ipsr_{ah,req,self_encap}_req @@ -10149,15 +6304,14 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) } } - mutex_enter(&connp->conn_lock); + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * If we have already cached policies in ip_bind_connected*(), don't + * If we have already cached policies in conn_connect(), don't * let them change now. We cache policies for connections * whose src,dst [addr, port] is known. */ if (connp->conn_policy_cached) { - mutex_exit(&connp->conn_lock); return (EINVAL); } @@ -10171,10 +6325,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) IPPH_REFRELE(connp->conn_policy, ipst->ips_netstack); connp->conn_policy = NULL; } - connp->conn_flags &= ~IPCL_CHECK_POLICY; connp->conn_in_enforce_policy = B_FALSE; connp->conn_out_enforce_policy = B_FALSE; - mutex_exit(&connp->conn_lock); return (0); } @@ -10203,7 +6355,7 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) * We're looking at a v6 socket, also insert the v6-specific * entries. 
*/ - if (connp->conn_af_isv6) { + if (connp->conn_family == AF_INET6) { if (!ipsec_polhead_insert(ph, actp, nact, IPSEC_AF_V6, IPSEC_TYPE_INBOUND, ns)) goto enomem; @@ -10217,10 +6369,10 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) /* * If the requests need security, set enforce_policy. * If the requests are IPSEC_PREF_NEVER, one should - * still set conn_out_enforce_policy so that an ipsec_out - * gets attached in ip_wput. This is needed so that - * for connections that we don't cache policy in ip_bind, - * if global policy matches in ip_wput_attach_policy, we + * still set conn_out_enforce_policy so that ip_set_destination + * marks the ip_xmit_attr_t appropriatly. This is needed so that + * for connections that we don't cache policy in at connect time, + * if global policy matches in ip_output_attach_policy, we * don't wrongly inherit global policy. Similarly, we need * to set conn_in_enforce_policy also so that we don't verify * policy wrongly. @@ -10230,10 +6382,8 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) (se_req & REQ_MASK) != 0) { connp->conn_in_enforce_policy = B_TRUE; connp->conn_out_enforce_policy = B_TRUE; - connp->conn_flags |= IPCL_CHECK_POLICY; } - mutex_exit(&connp->conn_lock); return (error); #undef REQ_MASK @@ -10241,7 +6391,6 @@ ipsec_set_req(cred_t *cr, conn_t *connp, ipsec_req_t *req) * Common memory-allocation-failure exit path. */ enomem: - mutex_exit(&connp->conn_lock); if (actp != NULL) ipsec_actvec_free(actp, nact); if (is_pol_inserted) @@ -10250,1250 +6399,283 @@ enomem: } /* - * Only for options that pass in an IP addr. Currently only V4 options - * pass in an ipif. V6 options always pass an ifindex specifying the ill. - * So this function assumes level is IPPROTO_IP + * Set socket options for joining and leaving multicast groups. + * Common to IPv4 and IPv6; inet6 indicates the type of socket. 
+ * The caller has already check that the option name is consistent with + * the address family of the socket. */ int -ip_opt_set_ipif(conn_t *connp, ipaddr_t addr, boolean_t checkonly, int option, - mblk_t *first_mp) +ip_opt_set_multicast_group(conn_t *connp, t_scalar_t name, + uchar_t *invalp, boolean_t inet6, boolean_t checkonly) { - ipif_t *ipif = NULL; - int error; - ill_t *ill; - int zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ip2dbg(("ip_opt_set_ipif: ipaddr %X\n", addr)); - - if (addr != INADDR_ANY || checkonly) { - ASSERT(connp != NULL); - zoneid = IPCL_ZONEID(connp); - if (option == IP_NEXTHOP) { - ipif = ipif_lookup_onlink_addr(addr, - connp->conn_zoneid, ipst); - } else { - ipif = ipif_lookup_addr(addr, NULL, zoneid, - CONNP_TO_WQ(connp), first_mp, ip_restart_optmgmt, - &error, ipst); - } - if (ipif == NULL) { - if (error == EINPROGRESS) - return (error); - if ((option == IP_MULTICAST_IF) || - (option == IP_NEXTHOP)) - return (EHOSTUNREACH); - else - return (EINVAL); - } else if (checkonly) { - if (option == IP_MULTICAST_IF) { - ill = ipif->ipif_ill; - /* not supported by the virtual network iface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } - ipif_refrele(ipif); - return (0); - } - ill = ipif->ipif_ill; - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if ((ill->ill_state_flags & ILL_CONDEMNED) || - (ipif->ipif_state_flags & IPIF_CONDEMNED)) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (option == IP_MULTICAST_IF ? 
- EHOSTUNREACH : EINVAL); - } - } else { - mutex_enter(&connp->conn_lock); - } - - /* None of the options below are supported on the VNI */ - if (ipif != NULL && IS_VNI(ipif->ipif_ill)) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (EINVAL); - } - - switch (option) { - case IP_MULTICAST_IF: - connp->conn_multicast_ipif = ipif; + int *i1 = (int *)invalp; + int error = 0; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct ip_mreq *v4_mreqp; + struct ipv6_mreq *v6_mreqp; + struct group_req *greqp; + ire_t *ire; + boolean_t done = B_FALSE; + ipaddr_t ifaddr; + in6_addr_t v6group; + uint_t ifindex; + boolean_t mcast_opt = B_TRUE; + mcast_record_t fmode; + int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, + ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); + + switch (name) { + case IP_ADD_MEMBERSHIP: + case IPV6_JOIN_GROUP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_JOIN_GROUP: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_add_group; break; - case IP_NEXTHOP: - connp->conn_nexthop_v4 = addr; - connp->conn_nexthop_set = B_TRUE; + + case IP_DROP_MEMBERSHIP: + case IPV6_LEAVE_GROUP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_LEAVE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_delete_group; break; + default: + ASSERT(0); } - if (ipif != NULL) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ipif_refrele(ipif); - return (0); - } - mutex_exit(&connp->conn_lock); - /* We succeded in cleared the option */ - return (0); -} + if (mcast_opt) { + struct sockaddr_in *sin; + struct sockaddr_in6 *sin6; -/* - * For options that pass in an ifindex specifying the ill. V6 options always - * pass in an ill. Some v4 options also pass in ifindex specifying the ill. 
- */ -int -ip_opt_set_ill(conn_t *connp, int ifindex, boolean_t isv6, boolean_t checkonly, - int level, int option, mblk_t *first_mp) -{ - ill_t *ill = NULL; - int error = 0; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ip2dbg(("ip_opt_set_ill: ifindex %d\n", ifindex)); - if (ifindex != 0) { - ASSERT(connp != NULL); - ill = ill_lookup_on_ifindex(ifindex, isv6, CONNP_TO_WQ(connp), - first_mp, ip_restart_optmgmt, &error, ipst); - if (ill != NULL) { - if (checkonly) { - /* not supported by the virtual network iface */ - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - ill_refrele(ill); - return (0); - } - if (!ipif_lookup_zoneid(ill, connp->conn_zoneid, - 0, NULL)) { - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - goto setit; - } - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - } - goto setit; - } else if (error == EINPROGRESS) { - return (error); + greqp = (struct group_req *)i1; + if (greqp->gr_group.ss_family == AF_INET) { + sin = (struct sockaddr_in *)&(greqp->gr_group); + IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &v6group); } else { - error = 0; - } + if (!inet6) + return (EINVAL); /* Not on INET socket */ + + sin6 = (struct sockaddr_in6 *)&(greqp->gr_group); + v6group = sin6->sin6_addr; + } + ifaddr = INADDR_ANY; + ifindex = greqp->gr_interface; + } else if (inet6) { + v6_mreqp = (struct ipv6_mreq *)i1; + v6group = v6_mreqp->ipv6mr_multiaddr; + ifaddr = INADDR_ANY; + ifindex = v6_mreqp->ipv6mr_interface; + } else { + v4_mreqp = (struct ip_mreq *)i1; + IN6_INADDR_TO_V4MAPPED(&v4_mreqp->imr_multiaddr, &v6group); + ifaddr = (ipaddr_t)v4_mreqp->imr_interface.s_addr; + ifindex = 0; } - mutex_enter(&connp->conn_lock); -setit: - ASSERT((level == IPPROTO_IP || level == IPPROTO_IPV6)); /* - * The options below assume that the ILL 
(if any) transmits and/or - * receives traffic. Neither of which is true for the virtual network - * interface, so fail setting these on a VNI. + * In the multirouting case, we need to replicate + * the request on all interfaces that will take part + * in replication. We do so because multirouting is + * reflective, thus we will probably receive multi- + * casts on those interfaces. + * The ip_multirt_apply_membership() succeeds if + * the operation succeeds on at least one interface. */ - if (IS_VNI(ill)) { - ASSERT(ill != NULL); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - return (EINVAL); - } - - if (level == IPPROTO_IP) { - switch (option) { - case IP_BOUND_IF: - connp->conn_incoming_ill = ill; - connp->conn_outgoing_ill = ill; - break; - - case IP_MULTICAST_IF: - /* - * This option is an internal special. The socket - * level IP_MULTICAST_IF specifies an 'ipaddr' and - * is handled in ip_opt_set_ipif. IPV6_MULTICAST_IF - * specifies an ifindex and we try first on V6 ill's. - * If we don't find one, we they try using on v4 ill's - * intenally and we come here. - */ - if (!checkonly && ill != NULL) { - ipif_t *ipif; - ipif = ill->ill_ipif; - - if (ipif->ipif_state_flags & IPIF_CONDEMNED) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - ill = NULL; - mutex_enter(&connp->conn_lock); - } else { - connp->conn_multicast_ipif = ipif; - } - } - break; + if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + ipaddr_t group; - case IP_DHCPINIT_IF: - if (connp->conn_dhcpinit_ill != NULL) { - /* - * We've locked the conn so conn_cleanup_ill() - * cannot clear conn_dhcpinit_ill -- so it's - * safe to access the ill. 
- */ - ill_t *oill = connp->conn_dhcpinit_ill; + IN6_V4MAPPED_TO_IPADDR(&v6group, group); - ASSERT(oill->ill_dhcpinit != 0); - atomic_dec_32(&oill->ill_dhcpinit); - connp->conn_dhcpinit_ill = NULL; - } - - if (ill != NULL) { - connp->conn_dhcpinit_ill = ill; - atomic_inc_32(&ill->ill_dhcpinit); - } - break; - } + ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); } else { - switch (option) { - case IPV6_BOUND_IF: - connp->conn_incoming_ill = ill; - connp->conn_outgoing_ill = ill; - break; - - case IPV6_MULTICAST_IF: - /* - * Set conn_multicast_ill to be the IPv6 ill. - * Set conn_multicast_ipif to be an IPv4 ipif - * for ifindex to make IPv4 mapped addresses - * on PF_INET6 sockets honor IPV6_MULTICAST_IF. - * Even if no IPv6 ill exists for the ifindex - * we need to check for an IPv4 ifindex in order - * for this to work with mapped addresses. In that - * case only set conn_multicast_ipif. - */ - if (!checkonly) { - if (ifindex == 0) { - connp->conn_multicast_ill = NULL; - connp->conn_multicast_ipif = NULL; - } else if (ill != NULL) { - connp->conn_multicast_ill = ill; - } - } - break; + ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } + if (ire != NULL) { + if (ire->ire_flags & RTF_MULTIRT) { + error = ip_multirt_apply_membership(optfn, ire, connp, + checkonly, &v6group, fmode, &ipv6_all_zeros); + done = B_TRUE; } + ire_refrele(ire); } - if (ill != NULL) { - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - ill_refrele(ill); - return (0); + if (!done) { + error = optfn(connp, checkonly, &v6group, ifaddr, ifindex, + fmode, &ipv6_all_zeros); } - mutex_exit(&connp->conn_lock); - /* - * We succeeded in clearing the option (ifindex == 0) or failed to - * locate the ill and could not set the option (ifindex != 0) - */ - return (ifindex == 0 ? 
0 : EINVAL); + return (error); } -/* This routine sets socket options. */ -/* ARGSUSED */ +/* + * Set socket options for joining and leaving multicast groups + * for specific sources. + * Common to IPv4 and IPv6; inet6 indicates the type of socket. + * The caller has already check that the option name is consistent with + * the address family of the socket. + */ int -ip_opt_set(queue_t *q, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *dummy, cred_t *cr, mblk_t *first_mp) +ip_opt_set_multicast_sources(conn_t *connp, t_scalar_t name, + uchar_t *invalp, boolean_t inet6, boolean_t checkonly) { int *i1 = (int *)invalp; - conn_t *connp = Q_TO_CONN(q); int error = 0; - boolean_t checkonly; - ire_t *ire; - boolean_t found; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + struct ip_mreq_source *imreqp; + struct group_source_req *gsreqp; + in6_addr_t v6group, v6src; + uint32_t ifindex; + ipaddr_t ifaddr; + boolean_t mcast_opt = B_TRUE; + mcast_record_t fmode; + ire_t *ire; + boolean_t done = B_FALSE; + int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, + ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); - switch (optset_context) { - - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. 
- */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - case SETFN_UD_NEGOTIATE: - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; + switch (name) { + case IP_BLOCK_SOURCE: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_BLOCK_SOURCE: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_add_group; break; - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - /* - * For fixed length options, no sanity check - * of passed in length is done. It is assumed *_optcom_req() - * routines do the right thing. - */ - - switch (level) { - case SOL_SOCKET: - /* - * conn_lock protects the bitfields, and is used to - * set the fields atomically. - */ - switch (name) { - case SO_BROADCAST: - if (!checkonly) { - /* TODO: use value someplace? */ - mutex_enter(&connp->conn_lock); - connp->conn_broadcast = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - if (!checkonly) { - /* TODO: use value someplace? */ - mutex_enter(&connp->conn_lock); - connp->conn_loopback = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_DONTROUTE: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_dontroute = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_REUSEADDR: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_reuseaddr = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_PROTOTYPE: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_proto = *i1; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_ALLZONES: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - if (IPCL_IS_BOUND(connp)) { - mutex_exit(&connp->conn_lock); - return (EINVAL); - } - connp->conn_allzones = *i1 != 0 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_ANON_MLP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_anon_mlp = *i1 != 0 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_MAC_EXEMPT: - if (secpolicy_net_mac_aware(cr) != 0 || - IPCL_IS_BOUND(connp)) - return (EACCES); - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_mac_mode = *i1 != 0 ? - CONN_MAC_AWARE : CONN_MAC_DEFAULT; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case SO_MAC_IMPLICIT: - if (secpolicy_net_mac_implicit(cr) != 0) - return (EACCES); - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_mac_mode = *i1 != 0 ? 
- CONN_MAC_IMPLICIT : CONN_MAC_DEFAULT; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } + case IP_UNBLOCK_SOURCE: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_UNBLOCK_SOURCE: + fmode = MODE_IS_EXCLUDE; + optfn = ip_opt_delete_group; break; - case IPPROTO_IP: - switch (name) { - case IP_NEXTHOP: - if (secpolicy_ip_config(cr, B_FALSE) != 0) - return (EPERM); - /* FALLTHRU */ - case IP_MULTICAST_IF: { - ipaddr_t addr = *i1; - - error = ip_opt_set_ipif(connp, addr, checkonly, name, - first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - } - - case IP_MULTICAST_TTL: - /* Recorded in transport above IP */ - *outvalp = *invalp; - *outlenp = sizeof (uchar_t); - return (0); - case IP_MULTICAST_LOOP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_multicast_loop = *invalp ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - *outvalp = *invalp; - *outlenp = sizeof (uchar_t); - return (0); - case IP_ADD_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case IP_DROP_MEMBERSHIP: - case MCAST_LEAVE_GROUP: { - struct ip_mreq *mreqp; - struct group_req *greqp; - ire_t *ire; - boolean_t done = B_FALSE; - ipaddr_t group, ifaddr; - struct sockaddr_in *sin; - uint32_t *ifindexp; - boolean_t mcast_opt = B_TRUE; - mcast_record_t fmode; - int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *); - - switch (name) { - case IP_ADD_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_GROUP: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_DROP_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group; - break; - } - - if (mcast_opt) { - greqp = (struct group_req *)i1; - sin = (struct sockaddr_in *)&greqp->gr_group; - if (sin->sin_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - group = (ipaddr_t)sin->sin_addr.s_addr; - ifaddr = INADDR_ANY; - ifindexp = &greqp->gr_interface; - } else { - mreqp = (struct ip_mreq *)i1; - group = (ipaddr_t)mreqp->imr_multiaddr.s_addr; - ifaddr = (ipaddr_t)mreqp->imr_interface.s_addr; - ifindexp = NULL; - } - - /* - * In the multirouting case, we need to replicate - * the request on all interfaces that will take part - * in replication. We do so because multirouting is - * reflective, thus we will probably receive multi- - * casts on those interfaces. - * The ip_multirt_apply_membership() succeeds if the - * operation succeeds on at least one interface. 
- */ - ire = ire_ftable_lookup(group, IP_HOST_MASK, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership( - optfn, ire, connp, checkonly, group, - fmode, INADDR_ANY, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, group, ifaddr, - ifindexp, fmode, INADDR_ANY, first_mp); - } - if (error) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. - */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: { - struct ip_mreq_source *imreqp; - struct group_source_req *gsreqp; - in_addr_t grp, src, ifaddr = INADDR_ANY; - uint32_t ifindex = 0; - mcast_record_t fmode; - struct sockaddr_in *sin; - ire_t *ire; - boolean_t mcast_opt = B_TRUE, done = B_FALSE; - int (*optfn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *); - - switch (name) { - case IP_BLOCK_SOURCE: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_BLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_UNBLOCK_SOURCE: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_UNBLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_delete_group; - break; - - case IP_ADD_SOURCE_MEMBERSHIP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_add_group; - break; - - case IP_DROP_SOURCE_MEMBERSHIP: - mcast_opt = 
B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group; - break; - } - - if (mcast_opt) { - gsreqp = (struct group_source_req *)i1; - if (gsreqp->gsr_group.ss_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - sin = (struct sockaddr_in *)&gsreqp->gsr_group; - grp = (ipaddr_t)sin->sin_addr.s_addr; - sin = (struct sockaddr_in *)&gsreqp->gsr_source; - src = (ipaddr_t)sin->sin_addr.s_addr; - ifindex = gsreqp->gsr_interface; - } else { - imreqp = (struct ip_mreq_source *)i1; - grp = (ipaddr_t)imreqp->imr_multiaddr.s_addr; - src = (ipaddr_t)imreqp->imr_sourceaddr.s_addr; - ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; - } - /* - * In the multirouting case, we need to replicate - * the request as noted in the mcast cases above. - */ - ire = ire_ftable_lookup(grp, IP_HOST_MASK, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership( - optfn, ire, connp, checkonly, grp, - fmode, src, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, grp, ifaddr, - &ifindex, fmode, src, first_mp); - } - if (error != 0) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. 
- */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case IP_SEC_OPT: - error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IP_HDRINCL: - case IP_OPTIONS: - case T_IP_OPTIONS: - case IP_TOS: - case T_IP_TOS: - case IP_TTL: - case IP_RECVDSTADDR: - case IP_RECVOPTS: - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_RECVIF: - /* Retrieve the inbound interface index */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_recvif = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IP_RECVPKTINFO: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ip_recvpktinfo = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IP_RECVSLLA: - /* Retrieve the source link layer address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_recvslla = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case MRT_INIT: - case MRT_DONE: - case MRT_ADD_VIF: - case MRT_DEL_VIF: - case MRT_ADD_MFC: - case MRT_DEL_MFC: - case MRT_ASSERT: - if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { - *outlenp = 0; - return (error); - } - error = ip_mrouter_set((int)name, q, checkonly, - (uchar_t *)invalp, inlen, first_mp); - if (error) { - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_BOUND_IF: - case IP_DHCPINIT_IF: - error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, - level, name, first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - - case IP_UNSPEC_SRC: - /* Allow sending with a zero source address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_unspec_src = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } + case IP_ADD_SOURCE_MEMBERSHIP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_JOIN_SOURCE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_add_group; break; - case IPPROTO_IPV6: - switch (name) { - case IPV6_BOUND_IF: - error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, - level, name, first_mp); - if (error != 0) - return (error); - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_IF: - /* - * The only possible errors are EINPROGRESS and - * EINVAL. EINPROGRESS will be restarted and is not - * a hard error. We call this option on both V4 and V6 - * If both return EINVAL, then this call returns - * EINVAL. If at least one of them succeeds we - * return success. 
- */ - found = B_FALSE; - error = ip_opt_set_ill(connp, *i1, B_TRUE, checkonly, - level, name, first_mp); - if (error == EINPROGRESS) - return (error); - if (error == 0) - found = B_TRUE; - error = ip_opt_set_ill(connp, *i1, B_FALSE, checkonly, - IPPROTO_IP, IP_MULTICAST_IF, first_mp); - if (error == 0) - found = B_TRUE; - if (!found) - return (error); - break; /* goto sizeof (int) option return */ - - case IPV6_MULTICAST_HOPS: - /* Recorded in transport above IP */ - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_LOOP: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_multicast_loop = *i1; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_JOIN_GROUP: - case MCAST_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_LEAVE_GROUP: { - struct ipv6_mreq *ip_mreqp; - struct group_req *greqp; - ire_t *ire; - boolean_t done = B_FALSE; - in6_addr_t groupv6; - uint32_t ifindex; - boolean_t mcast_opt = B_TRUE; - mcast_record_t fmode; - int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, - int, mcast_record_t, const in6_addr_t *, mblk_t *); - - switch (name) { - case IPV6_JOIN_GROUP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_JOIN_GROUP: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group_v6; - break; - - case IPV6_LEAVE_GROUP: - mcast_opt = B_FALSE; - /* FALLTHRU */ - case MCAST_LEAVE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group_v6; - break; - } + case IP_DROP_SOURCE_MEMBERSHIP: + mcast_opt = B_FALSE; + /* FALLTHRU */ + case MCAST_LEAVE_SOURCE_GROUP: + fmode = MODE_IS_INCLUDE; + optfn = ip_opt_delete_group; + break; + default: + ASSERT(0); + } - if (mcast_opt) { - struct sockaddr_in *sin; - struct sockaddr_in6 *sin6; - greqp = (struct group_req *)i1; - if (greqp->gr_group.ss_family == AF_INET) { - sin = (struct sockaddr_in *) - &(greqp->gr_group); - IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, - &groupv6); - } else { - sin6 = (struct sockaddr_in6 *) - 
&(greqp->gr_group); - groupv6 = sin6->sin6_addr; - } - ifindex = greqp->gr_interface; - } else { - ip_mreqp = (struct ipv6_mreq *)i1; - groupv6 = ip_mreqp->ipv6mr_multiaddr; - ifindex = ip_mreqp->ipv6mr_interface; - } - /* - * In the multirouting case, we need to replicate - * the request on all interfaces that will take part - * in replication. We do so because multirouting is - * reflective, thus we will probably receive multi- - * casts on those interfaces. - * The ip_multirt_apply_membership_v6() succeeds if - * the operation succeeds on at least one interface. - */ - ire = ire_ftable_lookup_v6(&groupv6, &ipv6_all_ones, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership_v6( - optfn, ire, connp, checkonly, - &groupv6, fmode, &ipv6_all_zeros, - first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, &groupv6, - ifindex, fmode, &ipv6_all_zeros, first_mp); - } - if (error) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. 
- */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - } - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: { - struct group_source_req *gsreqp; - in6_addr_t v6grp, v6src; - uint32_t ifindex; - mcast_record_t fmode; - ire_t *ire; - boolean_t done = B_FALSE; - int (*optfn)(conn_t *, boolean_t, const in6_addr_t *, - int, mcast_record_t, const in6_addr_t *, mblk_t *); - - switch (name) { - case MCAST_BLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_add_group_v6; - break; - case MCAST_UNBLOCK_SOURCE: - fmode = MODE_IS_EXCLUDE; - optfn = ip_opt_delete_group_v6; - break; - case MCAST_JOIN_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_add_group_v6; - break; - case MCAST_LEAVE_SOURCE_GROUP: - fmode = MODE_IS_INCLUDE; - optfn = ip_opt_delete_group_v6; - break; - } + if (mcast_opt) { + gsreqp = (struct group_source_req *)i1; + ifindex = gsreqp->gsr_interface; + if (gsreqp->gsr_group.ss_family == AF_INET) { + struct sockaddr_in *s; + s = (struct sockaddr_in *)&gsreqp->gsr_group; + IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6group); + s = (struct sockaddr_in *)&gsreqp->gsr_source; + IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); + } else { + struct sockaddr_in6 *s6; - gsreqp = (struct group_source_req *)i1; - ifindex = gsreqp->gsr_interface; - if (gsreqp->gsr_group.ss_family == AF_INET) { - struct sockaddr_in *s; - s = (struct sockaddr_in *)&gsreqp->gsr_group; - IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6grp); - s = (struct sockaddr_in *)&gsreqp->gsr_source; - IN6_INADDR_TO_V4MAPPED(&s->sin_addr, &v6src); - } else { - struct sockaddr_in6 *s6; - s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; - v6grp = s6->sin6_addr; - s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; - v6src = 
s6->sin6_addr; - } + if (!inet6) + return (EINVAL); /* Not on INET socket */ - /* - * In the multirouting case, we need to replicate - * the request as noted in the mcast cases above. - */ - ire = ire_ftable_lookup_v6(&v6grp, &ipv6_all_ones, 0, - IRE_HOST, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_MASK | MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - error = ip_multirt_apply_membership_v6( - optfn, ire, connp, checkonly, - &v6grp, fmode, &v6src, first_mp); - done = B_TRUE; - } - ire_refrele(ire); - } - if (!done) { - error = optfn(connp, checkonly, &v6grp, - ifindex, fmode, &v6src, first_mp); - } - if (error != 0) { - /* - * EINPROGRESS is a soft error, needs retry - * so don't make *outlenp zero. - */ - if (error != EINPROGRESS) - *outlenp = 0; - return (error); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); + s6 = (struct sockaddr_in6 *)&gsreqp->gsr_group; + v6group = s6->sin6_addr; + s6 = (struct sockaddr_in6 *)&gsreqp->gsr_source; + v6src = s6->sin6_addr; } - case IPV6_UNICAST_HOPS: - /* Recorded in transport above IP */ - break; /* goto sizeof (int) option return */ - case IPV6_UNSPEC_SRC: - /* Allow sending with a zero source address */ - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_unspec_src = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ip_recvpktinfo = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - if (!checkonly) { - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvtclass = *i1; - mutex_exit(&connp->conn_lock); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvpathmtu = *i1; - mutex_exit(&connp->conn_lock); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvhoplimit = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvhopopts = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvdstopts = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvrthdr = *i1 ? 1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_recvrtdstopts = *i1 ? 
1 : 0; - mutex_exit(&connp->conn_lock); - } - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: - if (inlen == 0) - return (-EINVAL); /* clearing option */ - error = ip6_set_pktinfo(cr, connp, - (struct in6_pktinfo *)invalp); - if (error != 0) - *outlenp = 0; - else - *outlenp = inlen; - return (error); - case IPV6_NEXTHOP: { - struct sockaddr_in6 *sin6; - - /* Verify that the nexthop is reachable */ - if (inlen == 0) - return (-EINVAL); /* clearing option */ + ifaddr = INADDR_ANY; + } else { + imreqp = (struct ip_mreq_source *)i1; + IN6_INADDR_TO_V4MAPPED(&imreqp->imr_multiaddr, &v6group); + IN6_INADDR_TO_V4MAPPED(&imreqp->imr_sourceaddr, &v6src); + ifaddr = (ipaddr_t)imreqp->imr_interface.s_addr; + ifindex = 0; + } - sin6 = (struct sockaddr_in6 *)invalp; - ire = ire_route_lookup_v6(&sin6->sin6_addr, - 0, 0, 0, NULL, NULL, connp->conn_zoneid, - NULL, MATCH_IRE_DEFAULT, ipst); + /* + * Handle src being mapped INADDR_ANY by changing it to unspecified. + */ + if (IN6_IS_ADDR_V4MAPPED_ANY(&v6src)) + v6src = ipv6_all_zeros; - if (ire == NULL) { - *outlenp = 0; - return (EHOSTUNREACH); - } - ire_refrele(ire); - return (-EINVAL); - } - case IPV6_SEC_OPT: - error = ipsec_set_req(cr, connp, (ipsec_req_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } - break; - case IPV6_SRC_PREFERENCES: { - /* - * This is implemented strictly in the ip module - * (here and in tcp_opt_*() to accomodate tcp - * sockets). Modules above ip pass this option - * down here since ip is the only one that needs to - * be aware of source address preferences. - * - * This socket option only affects connected - * sockets that haven't already bound to a specific - * IPv6 address. In other words, sockets that - * don't call bind() with an address other than the - * unspecified address and that call connect(). - * ip_bind_connected_v6() passes these preferences - * to the ipif_select_source_v6() function. 
- */ - if (inlen != sizeof (uint32_t)) - return (EINVAL); - error = ip6_set_src_preferences(connp, - *(uint32_t *)invalp); - if (error != 0) { - *outlenp = 0; - return (error); - } else { - *outlenp = sizeof (uint32_t); - } - break; - } - case IPV6_V6ONLY: - if (*i1 < 0 || *i1 > 1) { - return (EINVAL); - } - mutex_enter(&connp->conn_lock); - connp->conn_ipv6_v6only = *i1; - mutex_exit(&connp->conn_lock); - break; - default: - return (-EINVAL); - } - break; - default: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - } /* - * Common case of return from an option that is sizeof (int) + * In the multirouting case, we need to replicate + * the request as noted in the mcast cases above. */ - *(int *)outvalp = *i1; - *outlenp = sizeof (int); - return (0); -} + if (IN6_IS_ADDR_V4MAPPED(&v6group)) { + ipaddr_t group; -/* - * This routine gets default values of certain options whose default - * values are maintained by protocol specific code - */ -/* ARGSUSED */ -int -ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr) -{ - int *i1 = (int *)ptr; - ip_stack_t *ipst = CONNQ_TO_IPST(q); + IN6_V4MAPPED_TO_IPADDR(&v6group, group); - switch (level) { - case IPPROTO_IP: - switch (name) { - case IP_MULTICAST_TTL: - *ptr = (uchar_t)IP_DEFAULT_MULTICAST_TTL; - return (sizeof (uchar_t)); - case IP_MULTICAST_LOOP: - *ptr = (uchar_t)IP_DEFAULT_MULTICAST_LOOP; - return (sizeof (uchar_t)); - default: - return (-1); - } - case IPPROTO_IPV6: - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = ipst->ips_ipv6_def_hops; - return (sizeof (int)); - case IPV6_MULTICAST_HOPS: - *i1 = IP_DEFAULT_MULTICAST_TTL; - return (sizeof (int)); - case IPV6_MULTICAST_LOOP: - *i1 = IP_DEFAULT_MULTICAST_LOOP; - return (sizeof (int)); - case IPV6_V6ONLY: - *i1 = 1; - return (sizeof (int)); - default: - return (-1); + ire = ire_ftable_lookup_v4(group, IP_HOST_MASK, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + 
MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } else { + ire = ire_ftable_lookup_v6(&v6group, &ipv6_all_ones, 0, + IRE_HOST | IRE_INTERFACE, NULL, ALL_ZONES, NULL, + MATCH_IRE_MASK | MATCH_IRE_TYPE, 0, ipst, NULL); + } + if (ire != NULL) { + if (ire->ire_flags & RTF_MULTIRT) { + error = ip_multirt_apply_membership(optfn, ire, connp, + checkonly, &v6group, fmode, &v6src); + done = B_TRUE; } - default: - return (-1); + ire_refrele(ire); } - /* NOTREACHED */ + if (!done) { + error = optfn(connp, checkonly, &v6group, ifaddr, ifindex, + fmode, &v6src); + } + return (error); } /* * Given a destination address and a pointer to where to put the information * this routine fills in the mtuinfo. + * The socket must be connected. + * For sctp conn_faddr is the primary address. */ int -ip_fill_mtuinfo(struct in6_addr *in6, in_port_t port, - struct ip6_mtuinfo *mtuinfo, netstack_t *ns) +ip_fill_mtuinfo(conn_t *connp, ip_xmit_attr_t *ixa, struct ip6_mtuinfo *mtuinfo) { - ire_t *ire; - ip_stack_t *ipst = ns->netstack_ip; + uint32_t pmtu = IP_MAXPACKET; + uint_t scopeid; - if (IN6_IS_ADDR_UNSPECIFIED(in6)) + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) return (-1); + /* In case we never sent or called ip_set_destination_v4/v6 */ + if (ixa->ixa_ire != NULL) + pmtu = ip_get_pmtu(ixa); + + if (ixa->ixa_flags & IXAF_SCOPEID_SET) + scopeid = ixa->ixa_scopeid; + else + scopeid = 0; + bzero(mtuinfo, sizeof (*mtuinfo)); mtuinfo->ip6m_addr.sin6_family = AF_INET6; - mtuinfo->ip6m_addr.sin6_port = port; - mtuinfo->ip6m_addr.sin6_addr = *in6; + mtuinfo->ip6m_addr.sin6_port = connp->conn_fport; + mtuinfo->ip6m_addr.sin6_addr = connp->conn_faddr_v6; + mtuinfo->ip6m_addr.sin6_scope_id = scopeid; + mtuinfo->ip6m_mtu = pmtu; - ire = ire_cache_lookup_v6(in6, ALL_ZONES, NULL, ipst); - if (ire != NULL) { - mtuinfo->ip6m_mtu = ire->ire_max_frag; - ire_refrele(ire); - } else { - mtuinfo->ip6m_mtu = IPV6_MIN_MTU; - } return (sizeof (struct ip6_mtuinfo)); } -/* - * This routine gets socket 
options. For MRT_VERSION and MRT_ASSERT, error - * checking of cred and that ip_g_mrouter is set should be done and - * isn't. This doesn't matter as the error checking is done properly for the - * other MRT options coming in through ip_opt_set. - */ -int -ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr) -{ - conn_t *connp = Q_TO_CONN(q); - ipsec_req_t *req = (ipsec_req_t *)ptr; - - switch (level) { - case IPPROTO_IP: - switch (name) { - case MRT_VERSION: - case MRT_ASSERT: - (void) ip_mrouter_get(name, q, ptr); - return (sizeof (int)); - case IP_SEC_OPT: - return (ipsec_req_from_conn(connp, req, IPSEC_AF_V4)); - case IP_NEXTHOP: - if (connp->conn_nexthop_set) { - *(ipaddr_t *)ptr = connp->conn_nexthop_v4; - return (sizeof (ipaddr_t)); - } else - return (0); - case IP_RECVPKTINFO: - *(int *)ptr = connp->conn_ip_recvpktinfo ? 1: 0; - return (sizeof (int)); - default: - break; - } - break; - case IPPROTO_IPV6: - switch (name) { - case IPV6_SEC_OPT: - return (ipsec_req_from_conn(connp, req, IPSEC_AF_V6)); - case IPV6_SRC_PREFERENCES: { - return (ip6_get_src_preferences(connp, - (uint32_t *)ptr)); - } - case IPV6_V6ONLY: - *(int *)ptr = connp->conn_ipv6_v6only ? 1 : 0; - return (sizeof (int)); - case IPV6_PATHMTU: - return (ip_fill_mtuinfo(&connp->conn_remv6, 0, - (struct ip6_mtuinfo *)ptr, connp->conn_netstack)); - default: - break; - } - break; - default: - break; - } - return (-1); -} /* Named Dispatch routine to get a current value out of our parameter table. */ /* ARGSUSED */ static int @@ -11955,130 +7137,18 @@ ip_reassemble(mblk_t *mp, ipf_t *ipf, uint_t start, boolean_t more, ill_t *ill, } /* - * ipsec processing for the fast path, used for input UDP Packets - * Returns true if ready for passup to UDP. - * Return false if packet is not passable to UDP (e.g. it failed IPsec policy, - * was an ESP-in-UDP packet, etc.). 
- */ -static boolean_t -ip_udp_check(queue_t *q, conn_t *connp, ill_t *ill, ipha_t *ipha, - mblk_t **mpp, mblk_t **first_mpp, boolean_t mctl_present, ire_t *ire) -{ - uint32_t ill_index; - uint_t in_flags; /* IPF_RECVSLLA and/or IPF_RECVIF */ - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - udp_t *udp = connp->conn_udp; - - ASSERT(ipha->ipha_protocol == IPPROTO_UDP); - /* The ill_index of the incoming ILL */ - ill_index = ((ill_t *)q->q_ptr)->ill_phyint->phyint_ifindex; - - /* pass packet up to the transport */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - *first_mpp = ipsec_check_inbound_policy(*first_mpp, connp, ipha, - NULL, mctl_present); - if (*first_mpp == NULL) { - return (B_FALSE); - } - } - - /* Initiate IPPF processing for fastpath UDP */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, mpp, ill_index); - if (*mpp == NULL) { - ip2dbg(("ip_input_ipsec_process: UDP pkt " - "deferred/dropped during IPPF processing\n")); - return (B_FALSE); - } - } - /* - * Remove 0-spi if it's 0, or move everything behind - * the UDP header over it and forward to ESP via - * ip_proto_input(). - */ - if (udp->udp_nat_t_endpoint) { - if (mctl_present) { - /* mctl_present *shouldn't* happen. */ - ip_drop_packet(*first_mpp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_esp_nat_t_ipsec), - &ipss->ipsec_dropper); - *first_mpp = NULL; - return (B_FALSE); - } - - /* "ill" is "recv_ill" in actuality. */ - if (!zero_spi_check(q, *mpp, ire, ill, ipss)) - return (B_FALSE); - - /* Else continue like a normal UDP packet. 
*/ - } - - /* - * We make the checks as below since we are in the fast path - * and want to minimize the number of checks if the IP_RECVIF and/or - * IP_RECVSLLA and/or IPV6_RECVPKTINFO options are not set - */ - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - if (connp->conn_recvif) { - in_flags = IPF_RECVIF; - } - /* - * UDP supports IP_RECVPKTINFO option for both v4 and v6 - * so the flag passed to ip_add_info is based on IP version - * of connp. - */ - if (connp->conn_ip_recvpktinfo) { - if (connp->conn_af_isv6) { - /* - * V6 only needs index - */ - in_flags |= IPF_RECVIF; - } else { - /* - * V4 needs index + matching address. - */ - in_flags |= IPF_RECVADDR; - } - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - /* - * since in_flags are being set ill will be - * referenced in ip_add_info, so it better not - * be NULL. - */ - /* - * the actual data will be contained in b_cont - * upon successful return of the following call. - * If the call fails then the original mblk is - * returned. - */ - *mpp = ip_add_info(*mpp, ill, in_flags, IPCL_ZONEID(connp), - ipst); - } - - return (B_TRUE); -} - -/* * Fragmentation reassembly. Each ILL has a hash table for * queuing packets undergoing reassembly for all IPIFs * associated with the ILL. The hash is based on the packet * IP ident field. The ILL frag hash table was allocated * as a timer block at the time the ILL was created. Whenever * there is anything on the reassembly queue, the timer will - * be running. Returns B_TRUE if successful else B_FALSE; - * frees mp on failure. + * be running. Returns the reassembled packet if reassembly completes. 
*/ -static boolean_t -ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, - uint32_t *cksum_val, uint16_t *cksum_flags) +mblk_t * +ip_input_fragment(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { uint32_t frag_offset_flags; - mblk_t *mp = *mpp; mblk_t *t_mp; ipaddr_t dst; uint8_t proto = ipha->ipha_protocol; @@ -12099,12 +7169,8 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, uint8_t ecn_info = 0; uint32_t packet_size; boolean_t pruned = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; - - if (cksum_val != NULL) - *cksum_val = 0; - if (cksum_flags != NULL) - *cksum_flags = 0; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; /* * Drop the fragmented as early as possible, if @@ -12112,13 +7178,13 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, */ if (ipst->ips_ip_reass_queue_bytes == 0) { freemsg(mp); - return (B_FALSE); + return (NULL); } /* Check for fragmentation offset; return if there's none */ if ((frag_offset_flags = ntohs(ipha->ipha_fragment_offset_and_flags) & (IPH_MF | IPH_OFFSET)) == 0) - return (B_TRUE); + return (mp); /* * We utilize hardware computed checksum info only for UDP since @@ -12126,8 +7192,9 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(recv_ill != NULL); - if (proto == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(recv_ill) && + ASSERT(ira->ira_rill != NULL); + if (proto == IPPROTO_UDP && dohwcksum && + ILL_HCKSUM_CAPABLE(ira->ira_rill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -12178,7 +7245,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, /* If end == 0 then we have a packet with no data, so just free it */ if (end == 0) { freemsg(mp); - return (B_FALSE); + return (NULL); } /* Record the ECN field info. */ @@ -12192,16 +7259,25 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, end += offset; } - msg_len = MBLKSIZE(mp); + /* Handle vnic loopback of fragments */ + if (mp->b_datap->db_ref > 2) + msg_len = 0; + else + msg_len = MBLKSIZE(mp); + tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; - msg_len += MBLKSIZE(tail_mp); + if (tail_mp->b_datap->db_ref <= 2) + msg_len += MBLKSIZE(tail_mp); } /* If the reassembly list for this ILL will get too big, prune it */ if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= ipst->ips_ip_reass_queue_bytes) { + DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len, + uint_t, ill->ill_frag_count, + uint_t, ipst->ips_ip_reass_queue_bytes); ill_frag_prune(ill, (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : (ipst->ips_ip_reass_queue_bytes - msg_len)); @@ -12232,7 +7308,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, ill_frag_free_pkts(ill, ipfb, ipf, 1); freemsg(mp); mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); + return (NULL); } /* Found it. 
*/ break; @@ -12254,7 +7330,7 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, if (pruned && offset != 0) { mutex_exit(&ipfb->ipfb_lock); freemsg(mp); - return (B_FALSE); + return (NULL); } if (ipfb->ipfb_frag_pkts >= MAX_FRAG_PKTS(ipst)) { @@ -12269,10 +7345,11 @@ ip_rput_fragment(ill_t *ill, ill_t *recv_ill, mblk_t **mpp, ipha_t *ipha, mp1 = allocb(sizeof (*ipf), BPRI_MED); if (mp1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); reass_done: mutex_exit(&ipfb->ipfb_lock); - return (B_FALSE); + return (NULL); } BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); @@ -12478,19 +7555,22 @@ reass_done: /* Restore original IP length in header. */ packet_size = (uint32_t)msgdsize(mp); if (packet_size > IP_MAXPACKET) { - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - return (B_FALSE); + ip_drop_input("Reassembled packet too large", mp, ill); + freemsg(mp); + return (NULL); } if (DB_REF(mp) > 1) { mblk_t *mp2 = copymsg(mp); - freemsg(mp); if (mp2 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (B_FALSE); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return (NULL); } + freemsg(mp); mp = mp2; } ipha = (ipha_t *)mp->b_rptr; @@ -12501,1187 +7581,239 @@ reass_done: /* Record the ECN info. 
*/ ipha->ipha_type_of_service &= 0xFC; ipha->ipha_type_of_service |= ecn_info; - *mpp = mp; - /* Reassembly is successful; return checksum information if needed */ - if (cksum_val != NULL) - *cksum_val = sum_val; - if (cksum_flags != NULL) - *cksum_flags = sum_flags; + /* Update the receive attributes */ + ira->ira_pktlen = packet_size; + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); - return (B_TRUE); + /* Reassembly is successful; set checksum information in packet */ + DB_CKSUM16(mp) = (uint16_t)sum_val; + DB_CKSUMFLAGS(mp) = sum_flags; + DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length; + + return (mp); } /* - * Perform ip header check sum update local options. - * return B_TRUE if all is well, else return B_FALSE and release - * the mp. caller is responsible for decrementing ire ref cnt. + * Pullup function that should be used for IP input in order to + * ensure we do not loose the L2 source address; we need the l2 source + * address for IP_RECVSLLA and for ndp_input. + * + * We return either NULL or b_rptr. */ -static boolean_t -ip_options_cksum(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ip_stack_t *ipst) +void * +ip_pullup(mblk_t *mp, ssize_t len, ip_recv_attr_t *ira) { - mblk_t *first_mp; - boolean_t mctl_present; - uint16_t sum; + ill_t *ill = ira->ira_ill; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - /* - * Don't do the checksum if it has gone through AH/ESP - * processing. 
- */ - if (!mctl_present) { - sum = ip_csum_hdr(ipha); - if (sum != 0) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsInCksumErrs); - } - freemsg(first_mp); - return (B_FALSE); - } + if (ip_rput_pullups++ == 0) { + (void) mi_strlog(ill->ill_rq, 1, SL_ERROR|SL_TRACE, + "ip_pullup: %s forced us to " + " pullup pkt, hdr len %ld, hdr addr %p", + ill->ill_name, len, (void *)mp->b_rptr); } - - if (!ip_rput_local_options(q, mp, ipha, ire, ipst)) { - if (mctl_present) - freeb(first_mp); - return (B_FALSE); - } - - return (B_TRUE); + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + if (!pullupmsg(mp, len)) + return (NULL); + else + return (mp->b_rptr); } /* - * All udp packet are delivered to the local host via this routine. + * Make sure ira_l2src has an address. If we don't have one fill with zeros. + * When called from the ULP ira_rill will be NULL hence the caller has to + * pass in the ill. 
*/ +/* ARGSUSED */ void -ip_udp_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ill_t *recv_ill) +ip_setl2src(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill) { - uint32_t sum; - uint32_t u1; - boolean_t mctl_present; - conn_t *connp; - mblk_t *first_mp; - uint16_t *up; - ill_t *ill = (ill_t *)q->q_ptr; - uint16_t reass_hck_flags = 0; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; + const uchar_t *addr; + int alen; -#define rptr ((uchar_t *)ipha) + if (ira->ira_flags & IRAF_L2SRC_SET) + return; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); - ASSERT(ipha->ipha_protocol == IPPROTO_UDP); ASSERT(ill != NULL); - - /* - * FAST PATH for udp packets - */ - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1 != 0) - goto ipoptions; - - /* Check the IP header checksum. */ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { - /* Clear the IP header h/w cksum flag */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - } else if (!mctl_present) { - /* - * Don't verify header checksum if this packet is coming - * back from AH/ESP as we already did it. - */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + uph[5] + - uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - freemsg(first_mp); - return; - } - } - - /* - * Count for SNMP of inbound packets for ire. - * if mctl is present this might be a secure packet and - * has already been counted for in ip_proto_input(). 
- */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; + alen = ill->ill_phys_addr_length; + ASSERT(alen <= sizeof (ira->ira_l2src)); + if (ira->ira_mhip != NULL && + (addr = ira->ira_mhip->mhi_saddr) != NULL) { + bcopy(addr, ira->ira_l2src, alen); + } else if ((ira->ira_flags & IRAF_L2SRC_LOOPBACK) && + (addr = ill->ill_phys_addr) != NULL) { + bcopy(addr, ira->ira_l2src, alen); + } else { + bzero(ira->ira_l2src, alen); } + ira->ira_flags |= IRAF_L2SRC_SET; +} - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { - goto fragmented; - } +/* + * check ip header length and align it. + */ +mblk_t * +ip_check_and_align_header(mblk_t *mp, uint_t min_size, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ssize_t len; - /* u1 = IP header length (20 bytes) */ - u1 = IP_SIMPLE_HDR_LENGTH; + len = MBLKL(mp); - /* packet does not contain complete IP & UDP headers */ - if ((mp->b_wptr - rptr) < (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE)) - goto udppullup; + if (!OK_32PTR(mp->b_rptr)) + IP_STAT(ill->ill_ipst, ip_notaligned); + else + IP_STAT(ill->ill_ipst, ip_recv_pullup); - /* up points to UDP header */ - up = (uint16_t *)((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH); -#define iphs ((uint16_t *)ipha) + /* Guard against bogus device drivers */ + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + return (NULL); + } - /* if udp hdr cksum != 0, then need to checksum udp packet */ - if (up[3] != 0) { + if (len == 0) { + /* GLD sometimes sends up mblk with b_rptr == b_wptr! 
*/ mblk_t *mp1 = mp->b_cont; - boolean_t cksum_err; - uint16_t hck_flags = 0; - /* Pseudo-header checksum */ - u1 = IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + - iphs[9] + up[2]; + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); - /* - * Revert to software checksum calculation if the interface - * isn't capable of checksum offload or if IPsec is present. - */ - if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, u1, - (uchar_t *)(rptr + DB_CKSUMSTART(mp)), - (int32_t)((uchar_t *)up - rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); - if (hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_udp_in_sw_cksum_err); + freeb(mp); + mp = mp1; + if (mp == NULL) + return (NULL); - freemsg(first_mp); - return; - } + if (OK_32PTR(mp->b_rptr) && MBLKL(mp) >= min_size) + return (mp); } - - /* Non-fragmented broadcast or multicast packet? 
*/ - if (ire->ire_type == IRE_BROADCAST) - goto udpslowpath; - - if ((connp = ipcl_classify_v4(mp, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, - ire->ire_zoneid, ipst)) != NULL) { - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); - IP_STAT(ipst, ip_udp_fast_path); - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + if (ip_pullup(mp, min_size, ira) == NULL) { + if (msgdsize(mp) < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); } else { - if (!mctl_present) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - } - /* - * mp and first_mp can change. - */ - if (ip_udp_check(q, connp, recv_ill, - ipha, &mp, &first_mp, mctl_present, ire)) { - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - } - } - /* - * freeb() cannot deal with null mblk being passed - * in and first_mp can be set to null in the call - * ipsec_input_fast_proc()->ipsec_check_inbound_policy. - */ - if (mctl_present && first_mp != NULL) { - freeb(first_mp); - } - CONN_DEC_REF(connp); - return; - } - - /* - * if we got here we know the packet is not fragmented and - * has no options. The classifier could not find a conn_t and - * most likely its an icmp packet so send it through slow path. - */ - - goto udpslowpath; - -ipoptions: - if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { - goto slow_done; - } - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - /* - * "sum" and "reass_hck_flags" are non-zero if the - * reassembled packet has a valid hardware computed - * checksum information associated with it. 
- */ - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, &sum, - &reass_hck_flags)) { - goto slow_done; - } - - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - /* Pull up the UDP header, if necessary. */ - if ((MBLKL(mp)) < (u1 + UDPH_SIZE)) { -udppullup: - if (!pullupmsg(mp, u1 + UDPH_SIZE)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - goto slow_done; - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * Validate the checksum for the reassembled packet; for the - * pullup case we calculate the payload checksum in software. - */ - up = (uint16_t *)((uchar_t *)ipha + u1 + UDP_PORTS_OFFSET); - if (up[3] != 0) { - boolean_t cksum_err; - - if ((reass_hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV_REASS(reass_hck_flags, - (int32_t)((uchar_t *)up - (uchar_t *)ipha), - IP_UDP_CSUM_COMP + iphs[6] + iphs[7] + iphs[8] + - iphs[9] + up[2], sum, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); - - if (reass_hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_udp_in_full_hw_cksum_err); - else if (reass_hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_udp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_udp_in_sw_cksum_err); - - freemsg(first_mp); - goto slow_done; + ip_drop_input("ipIfStatsInDiscards", mp, ill); } + freemsg(mp); + return (NULL); } -udpslowpath: - - /* Clear hardware checksum flag to be safe */ - DB_CKSUMFLAGS(mp) = 0; - - ip_fanout_udp(q, first_mp, ill, ipha, *(uint32_t *)up, - (ire->ire_type == IRE_BROADCAST), - IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_IPINFO, - mctl_present, B_TRUE, recv_ill, ire->ire_zoneid); - -slow_done: - IP_STAT(ipst, ip_udp_slow_path); - return; - -#undef iphs -#undef rptr -} - 
-static boolean_t -ip_iptun_input(mblk_t *ipsec_mp, mblk_t *data_mp, ipha_t *ipha, ill_t *ill, - ire_t *ire, ip_stack_t *ipst) -{ - conn_t *connp; - - ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp); - - if ((connp = ipcl_classify_v4(data_mp, ipha->ipha_protocol, - IP_SIMPLE_HDR_LENGTH, ire->ire_zoneid, ipst)) != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp, - NULL); - CONN_DEC_REF(connp); - return (B_TRUE); - } - return (B_FALSE); + return (mp); } -/* ARGSUSED */ -static mblk_t * -ip_tcp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, - ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, - ill_rx_ring_t *ill_ring) +/* + * Common code for IPv4 and IPv6 to check and pullup multi-mblks + */ +mblk_t * +ip_check_length(mblk_t *mp, uchar_t *rptr, ssize_t len, uint_t pkt_len, + uint_t min_size, ip_recv_attr_t *ira) { - conn_t *connp; - uint32_t sum; - uint32_t u1; - uint16_t *up; - int offset; - ssize_t len; - mblk_t *mp1; - boolean_t syn_present = B_FALSE; - tcph_t *tcph; - uint_t tcph_flags; - uint_t ip_hdr_len; - ill_t *ill = (ill_t *)q->q_ptr; - zoneid_t zoneid = ire->ire_zoneid; - boolean_t cksum_err; - uint16_t hck_flags = 0; - ip_stack_t *ipst = recv_ill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - -#define rptr ((uchar_t *)ipha) - - ASSERT(ipha->ipha_protocol == IPPROTO_TCP); - ASSERT(ill != NULL); - - /* - * FAST PATH for tcp packets - */ - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1) { - goto ipoptions; - } else if (!mctl_present) { - /* Check the IP header checksum. 
*/ - if (IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill)) { - /* Clear the IP header h/w cksum flag */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - } else if (!mctl_present) { - /* - * Don't verify header checksum if this packet - * is coming back from AH/ESP as we already did it. - */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInCksumErrs); - goto error; - } - } - } - - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { - goto fragmented; - } - - /* u1 = IP header length (20 bytes) */ - u1 = ip_hdr_len = IP_SIMPLE_HDR_LENGTH; - - /* does packet contain IP+TCP headers? */ - len = mp->b_wptr - rptr; - if (len < (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH)) { - IP_STAT(ipst, ip_tcppullup); - goto tcppullup; - } - - /* TCP options present? */ - offset = ((uchar_t *)ipha)[IP_SIMPLE_HDR_LENGTH + 12] >> 4; - - /* - * If options need to be pulled up, then goto tcpoptions. - * otherwise we are still in the fast path - */ - if (len < (offset << 2) + IP_SIMPLE_HDR_LENGTH) { - IP_STAT(ipst, ip_tcpoptions); - goto tcpoptions; - } - - /* multiple mblks of tcp data? 
*/ - if ((mp1 = mp->b_cont) != NULL) { - IP_STAT(ipst, ip_multipkttcp); - len += msgdsize(mp1); - } - - up = (uint16_t *)(rptr + IP_SIMPLE_HDR_LENGTH + TCP_PORTS_OFFSET); - - /* part of pseudo checksum */ - - /* TCP datagram length */ - u1 = len - IP_SIMPLE_HDR_LENGTH; - -#define iphs ((uint16_t *)ipha) - -#ifdef _BIG_ENDIAN - u1 += IPPROTO_TCP; -#else - u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); -#endif - u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; - - /* - * Revert to software checksum calculation if the interface - * isn't capable of checksum offload or if IPsec is present. - */ - if (ILL_HCKSUM_CAPABLE(recv_ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP_STAT(ipst, ip_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, u1, - (uchar_t *)(rptr + DB_CKSUMSTART(mp)), - (int32_t)((uchar_t *)up - rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - - if (hck_flags & HCK_FULLCKSUM) - IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err); - else - IP_STAT(ipst, ip_tcp_in_sw_cksum_err); - - goto error; - } - -try_again: - - if ((connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_len, - zoneid, ipst)) == NULL) { - /* Send the TH_RST */ - goto no_conn; - } - - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - tcph_flags = tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG); + ill_t *ill = ira->ira_ill; /* - * TCP FAST PATH for AF_INET socket. - * - * TCP fast path to avoid extra work. An AF_INET socket type - * does not have facility to receive extra information via - * ip_process or ip_add_info. Also, when the connection was - * established, we made a check if this connection is impacted - * by any global IPsec policy or per connection policy (a - * policy that comes in effect later will not apply to this - * connection). 
Since all this can be determined at the - * connection establishment time, a quick check of flags - * can avoid extra work. + * Make sure we have data length consistent + * with the IP header. */ - if (IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) && !mctl_present && - !IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ASSERT(first_mp == mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (tcph_flags != (TH_SYN | TH_ACK)) { - SET_SQUEUE(mp, tcp_rput_data, connp); - return (mp); - } - mp->b_datap->db_struioflag |= STRUIO_CONNECT; - DB_CKSUMSTART(mp) = (intptr_t)ip_squeue_get(ill_ring); - SET_SQUEUE(mp, tcp_input, connp); - return (mp); - } - - if (tcph_flags == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = - (intptr_t)ip_squeue_get(ill_ring); - if (IPCL_IS_FULLY_BOUND(connp) && !mctl_present && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - SET_SQUEUE(mp, connp->conn_recv, connp); - return (mp); - } else if (IPCL_IS_BOUND(connp) && !mctl_present && - !CONN_INBOUND_POLICY_PRESENT(connp, ipss)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - ip_squeue_enter_unbound++; - SET_SQUEUE(mp, tcp_conn_request_unbound, - connp); - return (mp); - } - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* No need to send this packet to TCP */ - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return (NULL); - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, ip_hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - return (NULL); - } - - CONN_DEC_REF(connp); - freemsg(first_mp); - return (NULL); - } - - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - 
ipha, NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); + if (mp->b_cont == NULL) { + /* pkt_len is based on ipha_len, not the mblk length */ + if (pkt_len < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); return (NULL); } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. - */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_input_ipsec_process: TCP pkt " - "deferred/dropped during IPPF processing\n")); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); return (NULL); - } else if (mctl_present) { - /* - * ip_process might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } - - } - - if (!syn_present && connp->conn_ip_recvpktinfo) { - /* - * TCP does not support IP_RECVPKTINFO for v4 so lets - * make sure IPF_RECVIF is passed to ip_add_info. 
- */ - mp = ip_add_info(mp, recv_ill, flags|IPF_RECVIF, - IPCL_ZONEID(connp), ipst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); + /* Drop any pad */ + mp->b_wptr = rptr + pkt_len; + } else if ((len += msgdsize(mp->b_cont)) != 0) { + ASSERT(pkt_len >= min_size); + if (pkt_len < min_size) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); return (NULL); - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } - } - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - SET_SQUEUE(first_mp, connp->conn_recv, connp); - return (first_mp); - } else { - /* SOCK_RAW, IPPROTO_TCP case */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); - return (NULL); - } - -no_conn: - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - uint32_t ill_index; - ill_index = recv_ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) { + if (len < 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); return (NULL); } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - tcp_xmit_listeners_reset(first_mp, IPH_HDR_LENGTH(mp->b_rptr), zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - return (NULL); -ipoptions: - if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) { - goto slow_done; - } - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) { - if (mctl_present) - freeb(first_mp); - goto slow_done; - } - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = ip_hdr_len = IPH_HDR_LENGTH(ipha); - - len = mp->b_wptr - mp->b_rptr; - /* Pull up a minimal TCP header, if necessary. */ - if (len < (u1 + 20)) { -tcppullup: - if (!pullupmsg(mp, u1 + 20)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - mp->b_rptr; - } - - /* - * Extract the offset field from the TCP header. As usual, we - * try to help the compiler more than the reader. - */ - offset = ((uchar_t *)ipha)[u1 + 12] >> 4; - if (offset != 5) { -tcpoptions: - if (offset < 5) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - /* - * There must be TCP options. - * Make sure we can grab them. 
- */ - offset <<= 2; - offset += u1; - if (len < offset) { - if (!pullupmsg(mp, offset)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - } - } - - /* Get the total packet length in len, including headers. */ - if (mp->b_cont) - len = msgdsize(mp); - - /* - * Check the TCP checksum by pulling together the pseudo- - * header checksum, and passing it to ip_csum to be added in - * with the TCP datagram. - * - * Since we are not using the hwcksum if available we must - * clear the flag. We may come here via tcppullup or tcpoptions. - * If either of these fails along the way the mblk is freed. - * If this logic ever changes and mblk is reused to say send - * ICMP's back, then this flag may need to be cleared in - * other places as well. - */ - DB_CKSUMFLAGS(mp) = 0; - - up = (uint16_t *)(rptr + u1 + TCP_PORTS_OFFSET); - - u1 = (uint32_t)(len - u1); /* TCP datagram length. */ -#ifdef _BIG_ENDIAN - u1 += IPPROTO_TCP; -#else - u1 = ((u1 >> 8) & 0xFF) + (((u1 & 0xFF) + IPPROTO_TCP) << 8); -#endif - u1 += iphs[6] + iphs[7] + iphs[8] + iphs[9]; - /* - * Not M_DATA mblk or its a dup, so do the checksum now. 
- */ - IP_STAT(ipst, ip_in_sw_cksum); - if (IP_CSUM(mp, (int32_t)((uchar_t *)up - rptr), u1) != 0) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - goto error; - } - - IP_STAT(ipst, ip_tcp_slow_path); - goto try_again; -#undef iphs -#undef rptr - -error: - freemsg(first_mp); -slow_done: - return (NULL); -} - -/* ARGSUSED */ -static void -ip_sctp_input(mblk_t *mp, ipha_t *ipha, ill_t *recv_ill, boolean_t mctl_present, - ire_t *ire, mblk_t *first_mp, uint_t flags, queue_t *q, ipaddr_t dst) -{ - conn_t *connp; - uint32_t sum; - uint32_t u1; - ssize_t len; - sctp_hdr_t *sctph; - zoneid_t zoneid = ire->ire_zoneid; - uint32_t pktsum; - uint32_t calcsum; - uint32_t ports; - in6_addr_t map_src, map_dst; - ill_t *ill = (ill_t *)q->q_ptr; - ip_stack_t *ipst; - sctp_stack_t *sctps; - boolean_t sctp_csum_err = B_FALSE; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - sctps = ipst->ips_netstack->netstack_sctp; - -#define rptr ((uchar_t *)ipha) - - ASSERT(ipha->ipha_protocol == IPPROTO_SCTP); - ASSERT(ill != NULL); - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) - + IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* IP options present */ - if (u1 > 0) { - goto ipoptions; - } else { - /* Check the IP header checksum. */ - if (!IS_IP_HDR_HWCKSUM(mctl_present, mp, recv_ill) && - !mctl_present) { -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - /* - * Don't verify header checksum if this packet - * is coming back from AH/ESP as we already did it. - */ - if (sum != 0 && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - goto error; - } - } + /* Drop any pad */ + (void) adjmsg(mp, -len); /* - * Since there is no SCTP h/w cksum support yet, just - * clear the flag. 
+ * adjmsg may have freed an mblk from the chain, hence + * invalidate any hw checksum here. This will force IP to + * calculate the checksum in sw, but only for this packet. */ DB_CKSUMFLAGS(mp) = 0; + IP_STAT(ill->ill_ipst, ip_multimblk); } - - /* - * Don't verify header checksum if this packet is coming - * back from AH/ESP as we already did it. - */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* packet part of fragmented IP packet? */ - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) - goto fragmented; - - /* u1 = IP header length (20 bytes) */ - u1 = IP_SIMPLE_HDR_LENGTH; - -find_sctp_client: - /* Pullup if we don't have the sctp common header. */ - len = MBLKL(mp); - if (len < (u1 + SCTP_COMMON_HDR_LENGTH)) { - if (mp->b_cont == NULL || - !pullupmsg(mp, u1 + SCTP_COMMON_HDR_LENGTH)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto error; - } - ipha = (ipha_t *)mp->b_rptr; - len = MBLKL(mp); - } - - sctph = (sctp_hdr_t *)(rptr + u1); -#ifdef DEBUG - if (!skip_sctp_cksum) { -#endif - pktsum = sctph->sh_chksum; - sctph->sh_chksum = 0; - calcsum = sctp_cksum(mp, u1); - sctph->sh_chksum = pktsum; - if (calcsum != pktsum) - sctp_csum_err = B_TRUE; -#ifdef DEBUG /* skip_sctp_cksum */ - } -#endif - /* get the ports */ - ports = *(uint32_t *)&sctph->sh_sport; - - IRE_REFRELE(ire); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); - if (sctp_csum_err) { - /* - * No potential sctp checksum errors go to the Sun - * sctp stack however they might be Adler-32 summed - * packets a userland stack bound to a raw IP socket - * could reasonably use. Note though that Adler-32 is - * a long deprecated algorithm and customer sctp - * networks should eventually migrate to CRC-32 at - * which time this facility should be removed. 
- */ - flags |= IP_FF_SCTP_CSUM_ERR; - goto no_conn; - } - if ((connp = sctp_fanout(&map_src, &map_dst, ports, zoneid, mp, - sctps)) == NULL) { - /* Check for raw socket or OOTB handling */ - goto no_conn; - } - - /* Found a client; up it goes */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - sctp_input(connp, ipha, mp, first_mp, recv_ill, B_TRUE, mctl_present); - return; - -no_conn: - ip_fanout_sctp_raw(first_mp, recv_ill, ipha, B_TRUE, - ports, mctl_present, flags, B_TRUE, zoneid); - return; - -ipoptions: - DB_CKSUMFLAGS(mp) = 0; - if (!ip_options_cksum(q, ill, first_mp, ipha, ire, ipst)) - goto slow_done; - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - u1 = ntohs(ipha->ipha_fragment_offset_and_flags); - if (u1 & (IPH_MF | IPH_OFFSET)) { -fragmented: - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) - goto slow_done; - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ASSERT(!mctl_present); - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - goto find_sctp_client; -#undef iphs -#undef rptr - -error: - freemsg(first_mp); -slow_done: - IRE_REFRELE(ire); + return (mp); } -#define VER_BITS 0xF0 -#define VERSION_6 0x60 - -static boolean_t -ip_rput_multimblk_ipoptions(queue_t *q, ill_t *ill, mblk_t *mp, ipha_t **iphapp, - ipaddr_t *dstp, ip_stack_t *ipst) +/* + * Check that the IPv4 opt_len is consistent with the packet and pullup + * the options. 
+ */ +mblk_t * +ip_check_optlen(mblk_t *mp, ipha_t *ipha, uint_t opt_len, uint_t pkt_len, + ip_recv_attr_t *ira) { - uint_t opt_len; - ipha_t *ipha; + ill_t *ill = ira->ira_ill; ssize_t len; - uint_t pkt_len; - ASSERT(ill != NULL); - IP_STAT(ipst, ip_ipoptions); - ipha = *iphapp; - -#define rptr ((uchar_t *)ipha) /* Assume no IPv6 packets arrive over the IPv4 queue */ - if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); - freemsg(mp); - return (B_FALSE); - } - - /* multiple mblk or too short */ - pkt_len = ntohs(ipha->ipha_length); - - /* Get the number of words of IP options in the IP header. */ - opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; - if (opt_len) { - /* IP Options present! Validate and process. */ - if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto done; - } - /* - * Recompute complete header length and make sure we - * have access to all of it. - */ - len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; - if (len > (mp->b_wptr - rptr)) { - if (len > pkt_len) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto done; - } - if (!pullupmsg(mp, len)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto done; - } - ipha = (ipha_t *)mp->b_rptr; - } - /* - * Go off to ip_rput_options which returns the next hop - * destination address, which may have been affected - * by source routing. - */ - IP_STAT(ipst, ip_opt); - if (ip_rput_options(q, mp, ipha, dstp, ipst) == -1) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (B_FALSE); - } - } - *iphapp = ipha; - return (B_TRUE); -done: - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - freemsg(mp); - return (B_FALSE); -#undef rptr -} - -/* - * Deal with the fact that there is no ire for the destination. 
- */ -static ire_t * -ip_rput_noire(queue_t *q, mblk_t *mp, int ll_multicast, ipaddr_t dst) -{ - ipha_t *ipha; - ill_t *ill; - ire_t *ire; - ip_stack_t *ipst; - enum ire_forward_action ret_action; - - ipha = (ipha_t *)mp->b_rptr; - ill = (ill_t *)q->q_ptr; - - ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - /* - * No IRE for this destination, so it can't be for us. - * Unless we are forwarding, drop the packet. - * We have to let source routed packets through - * since we don't yet know if they are 'ping -l' - * packets i.e. if they will go out over the - * same interface as they came in on. - */ - if (ll_multicast) { + ip_drop_input("IPvN packet on IPv4 ill", mp, ill); freemsg(mp); return (NULL); } - if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + + if (opt_len > (15 - IP_SIMPLE_HDR_LENGTH_IN_WORDS)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); return (NULL); } - /* - * Mark this packet as having originated externally. - * - * For non-forwarding code path, ire_send later double - * checks this interface to see if it is still exists - * post-ARP resolution. - * - * Also, IPQOS uses this to differentiate between - * IPP_FWD_OUT and IPP_LOCAL_OUT for post-ARP - * QOS packet processing in ip_wput_attach_llhdr(). - * The QoS module can mark the b_band for a fastpath message - * or the dl_priority field in a unitdata_req header for - * CoS marking. This info can only be found in - * ip_wput_attach_llhdr(). + * Recompute complete header length and make sure we + * have access to all of it. 
*/ - mp->b_prev = (mblk_t *)(uintptr_t)ill->ill_phyint->phyint_ifindex; - /* - * Clear the indication that this may have a hardware checksum - * as we are not using it - */ - DB_CKSUMFLAGS(mp) = 0; - - ire = ire_forward(dst, &ret_action, NULL, NULL, - msg_getlabel(mp), ipst); - - if (ire == NULL && ret_action == Forward_check_multirt) { - /* Let ip_newroute handle CGTP */ - ip_newroute(q, mp, dst, NULL, GLOBAL_ZONEID, ipst); - return (NULL); - } - - if (ire != NULL) - return (ire); - - mp->b_prev = mp->b_next = 0; - - if (ret_action == Forward_blackhole) { - freemsg(mp); - return (NULL); - } - /* send icmp unreachable */ - q = WR(q); - /* Sent by forwarding path, and router is global zone */ - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, GLOBAL_ZONEID, - ipst); - } - - return (NULL); - -} - -/* - * check ip header length and align it. - */ -static boolean_t -ip_check_and_align_header(queue_t *q, mblk_t *mp, ip_stack_t *ipst) -{ - ssize_t len; - ill_t *ill; - ipha_t *ipha; - - len = MBLKL(mp); - - if (!OK_32PTR(mp->b_rptr) || len < IP_SIMPLE_HDR_LENGTH) { - ill = (ill_t *)q->q_ptr; - - if (!OK_32PTR(mp->b_rptr)) - IP_STAT(ipst, ip_notaligned1); - else - IP_STAT(ipst, ip_notaligned2); - /* Guard against bogus device drivers */ - if (len < 0) { - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; + len = ((size_t)opt_len + IP_SIMPLE_HDR_LENGTH_IN_WORDS) << 2; + if (len > (mp->b_wptr - mp->b_rptr)) { + if (len > pkt_len) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); freemsg(mp); - return (B_FALSE); - } - - if (ip_rput_pullups++ == 0) { - ipha = (ipha_t *)mp->b_rptr; - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_check_and_align_header: %s forced us to " - " pullup pkt, hdr len %ld, hdr addr %p", - ill->ill_name, len, (void *)ipha); + return (NULL); } - if (!pullupmsg(mp, 
IP_SIMPLE_HDR_LENGTH)) { - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; + if (ip_pullup(mp, len, ira) == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); - return (B_FALSE); + return (NULL); } } - return (B_TRUE); + return (mp); } /* - * Handle the situation where a packet came in on `ill' but matched an IRE - * whose ire_rfq doesn't match `ill'. We return the IRE that should be used - * for interface statistics. + * Returns a new ire, or the same ire, or NULL. + * If a different IRE is returned, then it is held; the caller + * needs to release it. + * In no case is there any hold/release on the ire argument. */ ire_t * ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) @@ -13697,10 +7829,9 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * issue (e.g. packet received on an underlying interface matched an * IRE_LOCAL on its associated group interface). */ - if (ire->ire_rfq != NULL && - IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { + ASSERT(ire->ire_ill != NULL); + if (IS_IN_SAME_ILLGRP(ill, ire->ire_ill)) return (ire); - } /* * Do another ire lookup here, using the ingress ill, to see if the @@ -13711,25 +7842,24 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * ip*_strict_dst_multihoming switch is on. * We also need to check for IPIF_UNNUMBERED point2point interfaces * where the local address may not be unique. In this case we were - * at the mercy of the initial ire cache lookup and the IRE_LOCAL it + * at the mercy of the initial ire lookup and the IRE_LOCAL it * actually returned. The new lookup, which is more specific, should * only find the IRE_LOCAL associated with the ingress ill if one * exists. 
*/ - if (ire->ire_ipversion == IPV4_VERSION) { if (ipst->ips_ip_strict_dst_multihoming) strict_check = B_TRUE; - new_ire = ire_ctable_lookup(*((ipaddr_t *)addr), 0, IRE_LOCAL, - ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); + new_ire = ire_ftable_lookup_v4(*((ipaddr_t *)addr), 0, 0, + IRE_LOCAL, ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL); } else { ASSERT(!IN6_IS_ADDR_MULTICAST((in6_addr_t *)addr)); if (ipst->ips_ipv6_strict_dst_multihoming) strict_check = B_TRUE; - new_ire = ire_ctable_lookup_v6((in6_addr_t *)addr, NULL, - IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE|MATCH_IRE_ILL), ipst); + new_ire = ire_ftable_lookup_v6((in6_addr_t *)addr, NULL, NULL, + IRE_LOCAL, ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE|MATCH_IRE_ILL), 0, ipst, NULL); } /* * If the same ire that was returned in ip_input() is found then this @@ -13741,38 +7871,27 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) * order to have accurate interface statistics. */ if (new_ire != NULL) { - if ((new_ire != ire) && (new_ire->ire_rfq != NULL)) { - ire_refrele(ire); - ire = new_ire; - } else { - ire_refrele(new_ire); - } - return (ire); - } else if ((ire->ire_rfq == NULL) && - (ire->ire_ipversion == IPV4_VERSION)) { - /* - * The best match could have been the original ire which - * was created against an IRE_LOCAL on lo0. In the IPv4 case - * the strict multihoming checks are irrelevant as we consider - * local addresses hosted on lo0 to be interface agnostic. We - * only expect a null ire_rfq on IREs which are associated with - * lo0 hence we can return now. - */ + /* Note: held in one case but not the other? Caller handles */ + if (new_ire != ire) + return (new_ire); + /* Unchanged */ + ire_refrele(new_ire); return (ire); } /* * Chase pointers once and store locally. */ - ire_ill = (ire->ire_rfq == NULL) ? 
NULL : - (ill_t *)(ire->ire_rfq->q_ptr); + ASSERT(ire->ire_ill != NULL); + ire_ill = ire->ire_ill; ifindex = ill->ill_usesrc_ifindex; /* * Check if it's a legal address on the 'usesrc' interface. + * For IPMP data addresses the IRE_LOCAL is the upper, hence we + * can just check phyint_ifindex. */ - if ((ifindex != 0) && (ire_ill != NULL) && - (ifindex == ire_ill->ill_phyint->phyint_ifindex)) { + if (ifindex != 0 && ifindex == ire_ill->ill_phyint->phyint_ifindex) { return (ire); } @@ -13783,905 +7902,234 @@ ip_check_multihome(void *addr, ire_t *ire, ill_t *ill) if (!(strict_check)) return (ire); - if ((ill->ill_flags & ire->ire_ipif->ipif_ill->ill_flags & - ILLF_ROUTER) != 0) { + if ((ill->ill_flags & ire->ire_ill->ill_flags & ILLF_ROUTER) != 0) { return (ire); } - - ire_refrele(ire); return (NULL); } /* + * This function is used to construct a mac_header_info_s from a + * DL_UNITDATA_IND message. + * The address fields in the mhi structure points into the message, + * thus the caller can't use those fields after freeing the message. * - * This is the fast forward path. If we are here, we dont need to - * worry about RSVP, CGTP, or TSol. Furthermore the ftable lookup - * needed to find the nexthop in this case is much simpler + * We determine whether the packet received is a non-unicast packet + * and in doing so, determine whether or not it is broadcast vs multicast. + * For it to be a broadcast packet, we must have the appropriate mblk_t + * hanging off the ill_t. If this is either not present or doesn't match + * the destination mac address in the DL_UNITDATA_IND, the packet is deemed + * to be multicast. Thus NICs that have no broadcast address (or no + * capability for one, such as point to point links) cannot return as + * the packet being broadcast. 
*/ -ire_t * -ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) +void +ip_dlur_to_mhi(ill_t *ill, mblk_t *mb, struct mac_header_info_s *mhip) { - ipha_t *ipha; - ire_t *src_ire; - ill_t *stq_ill; - uint_t hlen; - uint_t pkt_len; - uint32_t sum; - queue_t *dev_q; - ip_stack_t *ipst = ill->ill_ipst; - mblk_t *fpmp; - enum ire_forward_action ret_action; - - ipha = (ipha_t *)mp->b_rptr; - - if (ire != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible to the global - * zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup(dst, GLOBAL_ZONEID, NULL, ipst); - /* - * ire_cache_lookup() can return ire of IRE_LOCAL in - * transient cases. In such case, just drop the packet - */ - if (ire != NULL && ire->ire_type != IRE_CACHE) - goto indiscard; - } - - /* - * Martian Address Filtering [RFC 1812, Section 5.3.7] - * The loopback address check for both src and dst has already - * been checked in ip_input - */ - - if (dst == INADDR_ANY || CLASSD(ipha->ipha_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - goto drop; - } - src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (src_ire != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - ire_refrele(src_ire); - goto drop; - } - - /* No ire cache of nexthop. So first create one */ - if (ire == NULL) { - - ire = ire_forward_simple(dst, &ret_action, ipst); + dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; + mblk_t *bmp; + uint_t extra_offset; - /* - * We only come to ip_fast_forward if ip_cgtp_filter - * is not set. So ire_forward() should not return with - * Forward_check_multirt as the next action. 
- */ - ASSERT(ret_action != Forward_check_multirt); - if (ire == NULL) { - /* An attempt was made to forward the packet */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - mp->b_prev = mp->b_next = 0; - /* send icmp unreachable */ - /* Sent by forwarding path, and router is global zone */ - if (ret_action == Forward_ret_icmp_err) { - if (ip_source_routed(ipha, ipst)) { - icmp_unreachable(ill->ill_wq, mp, - ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - icmp_unreachable(ill->ill_wq, mp, - ICMP_HOST_UNREACHABLE, - GLOBAL_ZONEID, ipst); - } - } else { - freemsg(mp); - } - return (NULL); - } - } + bzero(mhip, sizeof (struct mac_header_info_s)); - /* - * Forwarding fastpath exception case: - * If any of the following are true, we take the slowpath: - * o forwarding is not enabled - * o incoming and outgoing interface are the same, or in the same - * IPMP group. - * o corresponding ire is in incomplete state - * o packet needs fragmentation - * o ARP cache is not resolved - * - * The codeflow from here on is thus: - * ip_rput_process_forward->ip_rput_forward->ip_xmit_v4 - */ - pkt_len = ntohs(ipha->ipha_length); - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - if (!(stq_ill->ill_flags & ILLF_ROUTER) || - (ill == stq_ill) || IS_IN_SAME_ILLGRP(ill, stq_ill) || - (ire->ire_nce == NULL) || - (pkt_len > ire->ire_max_frag) || - ((fpmp = ire->ire_nce->nce_fp_mp) == NULL) || - ((hlen = MBLKL(fpmp)) > MBLKHEAD(mp)) || - ipha->ipha_ttl <= 1) { - ip_rput_process_forward(ill->ill_rq, mp, ire, - ipha, ill, B_FALSE, B_TRUE); - return (ire); - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST; - DTRACE_PROBE4(ip4__forwarding__start, - ill_t *, ill, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); + if (ill->ill_sap_length < 0) + extra_offset = 0; + else + extra_offset = ill->ill_sap_length; - FW_HOOKS(ipst->ips_ip4_forwarding_event, - 
ipst->ips_ipv4firewall_forwarding, - ill, stq_ill, ipha, mp, mp, 0, ipst); + mhip->mhi_daddr = (uchar_t *)ind + ind->dl_dest_addr_offset + + extra_offset; + mhip->mhi_saddr = (uchar_t *)ind + ind->dl_src_addr_offset + + extra_offset; - DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); + if (!ind->dl_group_address) + return; - if (mp == NULL) - goto drop; + /* Multicast or broadcast */ + mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST; - mp->b_datap->db_struioun.cksum.flags = 0; - /* Adjust the checksum to reflect the ttl decrement. */ - sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; - ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); - ipha->ipha_ttl--; + if (ind->dl_dest_addr_offset > sizeof (*ind) && + ind->dl_dest_addr_offset + ind->dl_dest_addr_length < MBLKL(mb) && + (bmp = ill->ill_bcast_mp) != NULL) { + dl_unitdata_req_t *dlur; + uint8_t *bphys_addr; - /* - * Write the link layer header. We can do this safely here, - * because we have already tested to make sure that the IP - * policy is not set, and that we have a fast path destination - * header. 
- */ - mp->b_rptr -= hlen; - bcopy(fpmp->b_rptr, mp->b_rptr, hlen); - - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - BUMP_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(stq_ill->ill_ip_mib, ipIfStatsHCOutOctets, pkt_len); - - if (!ILL_DIRECT_CAPABLE(stq_ill) || DB_TYPE(mp) != M_DATA) { - dev_q = ire->ire_stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) - goto indiscard; - } - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, stq_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, stq_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, stq_ill, ipha_t *, ipha, - ip6_t *, NULL, int, 0); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; + dlur = (dl_unitdata_req_t *)bmp->b_rptr; + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + extra_offset; - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. - */ - mp->b_rptr += hlen; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= hlen; - } - ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL); + if (bcmp(mhip->mhi_daddr, bphys_addr, + ind->dl_dest_addr_length) == 0) + mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST; } - return (ire); - -indiscard: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); -drop: - if (mp != NULL) - freemsg(mp); - return (ire); - } /* - * This function is called in the forwarding slowpath, when - * either the ire lacks the link-layer address, or the packet needs - * further processing(eg. fragmentation), before transmission. 
+ * This function is used to construct a mac_header_info_s from a + * M_DATA fastpath message from a DLPI driver. + * The address fields in the mhi structure points into the message, + * thus the caller can't use those fields after freeing the message. + * + * We determine whether the packet received is a non-unicast packet + * and in doing so, determine whether or not it is broadcast vs multicast. + * For it to be a broadcast packet, we must have the appropriate mblk_t + * hanging off the ill_t. If this is either not present or doesn't match + * the destination mac address in the DL_UNITDATA_IND, the packet is deemed + * to be multicast. Thus NICs that have no broadcast address (or no + * capability for one, such as point to point links) cannot return as + * the packet being broadcast. */ - -static void -ip_rput_process_forward(queue_t *q, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, boolean_t ll_multicast, boolean_t from_ip_fast_forward) +void +ip_mdata_to_mhi(ill_t *ill, mblk_t *mp, struct mac_header_info_s *mhip) { - queue_t *dev_q; - ire_t *src_ire; - ip_stack_t *ipst = ill->ill_ipst; - boolean_t same_illgrp = B_FALSE; - - ASSERT(ire->ire_stq != NULL); - - mp->b_prev = NULL; /* ip_rput_noire sets incoming interface here */ - mp->b_next = NULL; /* ip_rput_noire sets dst here */ + mblk_t *bmp; + struct ether_header *pether; - /* - * If the caller of this function is ip_fast_forward() skip the - * next three checks as it does not apply. - */ - if (from_ip_fast_forward) - goto skip; + bzero(mhip, sizeof (struct mac_header_info_s)); - if (ll_multicast != 0) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto drop_pkt; - } + mhip->mhi_dsttype = MAC_ADDRTYPE_UNICAST; - /* - * check if ipha_src is a broadcast address. Note that this - * check is redundant when we get here from ip_fast_forward() - * which has already done this check. 
However, since we can - * also get here from ip_rput_process_broadcast() or, for - * for the slow path through ip_fast_forward(), we perform - * the check again for code-reusability - */ - src_ire = ire_ctable_lookup(ipha->ipha_src, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire != NULL || ipha->ipha_dst == INADDR_ANY) { - if (src_ire != NULL) - ire_refrele(src_ire); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - ip2dbg(("ip_rput_process_forward: Received packet with" - " bad src/dst address on %s\n", ill->ill_name)); - goto drop_pkt; - } + pether = (struct ether_header *)((char *)mp->b_rptr + - sizeof (struct ether_header)); /* - * Check if we want to forward this one at this time. - * We allow source routed packets on a host provided that - * they go out the same ill or illgrp as they came in on. - * - * XXX To be quicker, we may wish to not chase pointers to - * get the ILLF_ROUTER flag and instead store the - * forwarding policy in the ire. An unfortunate - * side-effect of that would be requiring an ire flush - * whenever the ILLF_ROUTER flag changes. + * Make sure the interface is an ethernet type, since we don't + * know the header format for anything but Ethernet. Also make + * sure we are pointing correctly above db_base. */ -skip: - same_illgrp = IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr); - - if (((ill->ill_flags & - ((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_ROUTER) == 0) && - !(ip_source_routed(ipha, ipst) && - (ire->ire_rfq == q || same_illgrp))) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (ip_source_routed(ipha, ipst)) { - q = WR(q); - /* - * Clear the indication that this may have - * hardware checksum as we are not using it. 
- */ - DB_CKSUMFLAGS(mp) = 0; - /* Sent by forwarding path, and router is global zone */ - icmp_unreachable(q, mp, - ICMP_SOURCE_ROUTE_FAILED, GLOBAL_ZONEID, ipst); - return; - } - goto drop_pkt; - } - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - - /* Packet is being forwarded. Turning off hwcksum flag. */ - DB_CKSUMFLAGS(mp) = 0; - if (ipst->ips_ip_g_send_redirects) { - /* - * Check whether the incoming interface and outgoing - * interface is part of the same group. If so, - * send redirects. - * - * Check the source address to see if it originated - * on the same logical subnet it is going back out on. - * If so, we should be able to send it a redirect. - * Avoid sending a redirect if the destination - * is directly connected (i.e., ipha_dst is the same - * as ire_gateway_addr or the ire_addr of the - * nexthop IRE_CACHE ), or if the packet was source - * routed out this interface. - */ - ipaddr_t src, nhop; - mblk_t *mp1; - ire_t *nhop_ire = NULL; - - /* - * Check whether ire_rfq and q are from the same ill or illgrp. - * If so, send redirects. - */ - if ((ire->ire_rfq == q || same_illgrp) && - !ip_source_routed(ipha, ipst)) { - - nhop = (ire->ire_gateway_addr != 0 ? - ire->ire_gateway_addr : ire->ire_addr); - - if (ipha->ipha_dst == nhop) { - /* - * We avoid sending a redirect if the - * destination is directly connected - * because it is possible that multiple - * IP subnets may have been configured on - * the link, and the source may not - * be on the same subnet as ip destination, - * even though they are on the same - * physical link. - */ - goto sendit; - } - - src = ipha->ipha_src; - - /* - * We look up the interface ire for the nexthop, - * to see if ipha_src is in the same subnet - * as the nexthop. - * - * Note that, if, in the future, IRE_CACHE entries - * are obsoleted, this lookup will not be needed, - * as the ire passed to this function will be the - * same as the nhop_ire computed below. 
- */ - nhop_ire = ire_ftable_lookup(nhop, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, - 0, NULL, MATCH_IRE_TYPE, ipst); - - if (nhop_ire != NULL) { - if ((src & nhop_ire->ire_mask) == - (nhop & nhop_ire->ire_mask)) { - /* - * The source is directly connected. - * Just copy the ip header (which is - * in the first mblk) - */ - mp1 = copyb(mp); - if (mp1 != NULL) { - icmp_send_redirect(WR(q), mp1, - nhop, ipst); - } - } - ire_refrele(nhop_ire); - } - } - } -sendit: - dev_q = ire->ire_stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); + if (ill->ill_type != IFT_ETHER) return; - } - - ip_rput_forward(ire, ipha, mp, ill); - return; - -drop_pkt: - ip2dbg(("ip_rput_process_forward: drop pkt\n")); - freemsg(mp); -} - -ire_t * -ip_rput_process_broadcast(queue_t **qp, mblk_t *mp, ire_t *ire, ipha_t *ipha, - ill_t *ill, ipaddr_t dst, int cgtp_flt_pkt, int ll_multicast) -{ - queue_t *q; - uint16_t hcksumflags; - ip_stack_t *ipst = ill->ill_ipst; - - q = *qp; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); - - /* - * Clear the indication that this may have hardware - * checksum as we are not using it for forwarding. - */ - hcksumflags = DB_CKSUMFLAGS(mp); - DB_CKSUMFLAGS(mp) = 0; - - /* - * Directed broadcast forwarding: if the packet came in over a - * different interface then it is routed out over we can forward it. - */ - if (ipha->ipha_protocol == IPPROTO_TCP) { - ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - /* - * For multicast we have set dst to be INADDR_BROADCAST - * for delivering to all STREAMS. 
- */ - if (!CLASSD(ipha->ipha_dst)) { - ire_t *new_ire; - ipif_t *ipif; - - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { -discard: ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - new_ire = ire_ctable_lookup(dst, 0, 0, - ipif, ALL_ZONES, NULL, MATCH_IRE_ILL, ipst); - ipif_refrele(ipif); - if (new_ire != NULL) { - /* - * If the matching IRE_BROADCAST is part of an IPMP - * group, then drop the packet unless our ill has been - * nominated to receive for the group. - */ - if (IS_IPMP(new_ire->ire_ipif->ipif_ill) && - new_ire->ire_rfq != q) { - ire_refrele(new_ire); - goto discard; - } - - /* - * In the special case of multirouted broadcast - * packets, we unconditionally need to "gateway" - * them to the appropriate interface here. - * In the normal case, this cannot happen, because - * there is no broadcast IRE tagged with the - * RTF_MULTIRT flag. - */ - if (new_ire->ire_flags & RTF_MULTIRT) { - ire_refrele(new_ire); - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - *qp = q; - } - } else { - ire_refrele(ire); - ire = new_ire; - } - } else if (cgtp_flt_pkt == CGTP_IP_PKT_NOT_CGTP) { - if (!ipst->ips_ip_g_forward_directed_bcast) { - /* - * Free the message if - * ip_g_forward_directed_bcast is turned - * off for non-local broadcast. - */ - ire_refrele(ire); - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (NULL); - } - } else { - /* - * This CGTP packet successfully passed the - * CGTP filter, but the related CGTP - * broadcast IRE has not been found, - * meaning that the redundant ipif is - * probably down. However, if we discarded - * this packet, its duplicate would be - * filtered out by the CGTP filter so none - * of them would get through. So we keep - * going with this one. 
- */ - ASSERT(cgtp_flt_pkt == CGTP_IP_PKT_PREMIUM); - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - *qp = q; - } - } - } - if (ipst->ips_ip_g_forward_directed_bcast && ll_multicast == 0) { - /* - * Verify that there are not more then one - * IRE_BROADCAST with this broadcast address which - * has ire_stq set. - * TODO: simplify, loop over all IRE's - */ - ire_t *ire1; - int num_stq = 0; - mblk_t *mp1; - - /* Find the first one with ire_stq set */ - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - for (ire1 = ire; ire1 && - !ire1->ire_stq && ire1->ire_addr == ire->ire_addr; - ire1 = ire1->ire_next) - ; - if (ire1) { - ire_refrele(ire); - ire = ire1; - IRE_REFHOLD(ire); - } +retry: + if ((uchar_t *)pether < mp->b_datap->db_base) + return; - /* Check if there are additional ones with stq set */ - for (ire1 = ire; ire1; ire1 = ire1->ire_next) { - if (ire->ire_addr != ire1->ire_addr) - break; - if (ire1->ire_stq) { - num_stq++; - break; - } + /* Is there a VLAN tag? */ + if (ill->ill_isv6) { + if (pether->ether_type != htons(ETHERTYPE_IPV6)) { + pether = (struct ether_header *)((char *)pether - 4); + goto retry; } - rw_exit(&ire->ire_bucket->irb_lock); - if (num_stq == 1 && ire->ire_stq != NULL) { - ip1dbg(("ip_rput_process_broadcast: directed " - "broadcast to 0x%x\n", - ntohl(ire->ire_addr))); - mp1 = copymsg(mp); - if (mp1) { - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(q, mp1, ipha, ire, ill); - break; - default: - ip_proto_input(q, mp1, ipha, ire, ill, - 0); - break; - } - } - /* - * Adjust ttl to 2 (1+1 - the forward engine - * will decrement it by one. 
- */ - if (ip_csum_hdr(ipha)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - ip2dbg(("ip_rput_broadcast:drop pkt\n")); - freemsg(mp); - ire_refrele(ire); - return (NULL); - } - ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - ip_rput_process_forward(q, mp, ire, ipha, - ill, ll_multicast, B_FALSE); - ire_refrele(ire); - return (NULL); + } else { + if (pether->ether_type != htons(ETHERTYPE_IP)) { + pether = (struct ether_header *)((char *)pether - 4); + goto retry; } - ip1dbg(("ip_rput: NO directed broadcast to 0x%x\n", - ntohl(ire->ire_addr))); } + mhip->mhi_daddr = (uchar_t *)&pether->ether_dhost; + mhip->mhi_saddr = (uchar_t *)&pether->ether_shost; - /* Restore any hardware checksum flags */ - DB_CKSUMFLAGS(mp) = hcksumflags; - return (ire); -} - -/* ARGSUSED */ -static boolean_t -ip_rput_process_multicast(queue_t *q, mblk_t *mp, ill_t *ill, ipha_t *ipha, - int *ll_multicast, ipaddr_t *dstp) -{ - ip_stack_t *ipst = ill->ill_ipst; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, - ntohs(ipha->ipha_length)); + if (!(mhip->mhi_daddr[0] & 0x01)) + return; - /* - * So that we don't end up with dups, only one ill in an IPMP group is - * nominated to receive multicast traffic. - */ - if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast) - goto drop_pkt; + /* Multicast or broadcast */ + mhip->mhi_dsttype = MAC_ADDRTYPE_MULTICAST; - /* - * Forward packets only if we have joined the allmulti - * group on this interface. - */ - if (ipst->ips_ip_g_mrouter && ill->ill_join_allmulti) { - int retval; + if ((bmp = ill->ill_bcast_mp) != NULL) { + dl_unitdata_req_t *dlur; + uint8_t *bphys_addr; + uint_t addrlen; - /* - * Clear the indication that this may have hardware - * checksum as we are not using it. 
- */ - DB_CKSUMFLAGS(mp) = 0; - retval = ip_mforward(ill, ipha, mp); - /* ip_mforward updates mib variables if needed */ - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - - switch (retval) { - case 0: - /* - * pkt is okay and arrived on phyint. - * - * If we are running as a multicast router - * we need to see all IGMP and/or PIM packets. - */ - if ((ipha->ipha_protocol == IPPROTO_IGMP) || - (ipha->ipha_protocol == IPPROTO_PIM)) { - goto done; - } - break; - case -1: - /* pkt is mal-formed, toss it */ - goto drop_pkt; - case 1: - /* pkt is okay and arrived on a tunnel */ - /* - * If we are running a multicast router - * we need to see all igmp packets. - */ - if (ipha->ipha_protocol == IPPROTO_IGMP) { - *dstp = INADDR_BROADCAST; - *ll_multicast = 1; - return (B_FALSE); - } - - goto drop_pkt; + dlur = (dl_unitdata_req_t *)bmp->b_rptr; + addrlen = dlur->dl_dest_addr_length; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset; + addrlen += ill->ill_sap_length; + } else { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + addrlen -= ill->ill_sap_length; } + if (bcmp(mhip->mhi_daddr, bphys_addr, addrlen) == 0) + mhip->mhi_dsttype = MAC_ADDRTYPE_BROADCAST; } - - if (ilm_lookup_ill(ill, *dstp, ALL_ZONES) == NULL) { - /* - * This might just be caused by the fact that - * multiple IP Multicast addresses map to the same - * link layer multicast - no need to increment counter! - */ - freemsg(mp); - return (B_TRUE); - } -done: - ip2dbg(("ip_rput: multicast for us: 0x%x\n", ntohl(*dstp))); - /* - * This assumes the we deliver to all streams for multicast - * and broadcast packets. 
- */ - *dstp = INADDR_BROADCAST; - *ll_multicast = 1; - return (B_FALSE); -drop_pkt: - ip2dbg(("ip_rput: drop pkt\n")); - freemsg(mp); - return (B_TRUE); } /* - * This function is used to both return an indication of whether or not - * the packet received is a non-unicast packet (by way of the DL_UNITDATA_IND) - * and in doing so, determine whether or not it is broadcast vs multicast. - * For it to be a broadcast packet, we must have the appropriate mblk_t - * hanging off the ill_t. If this is either not present or doesn't match - * the destination mac address in the DL_UNITDATA_IND, the packet is deemed - * to be multicast. Thus NICs that have no broadcast address (or no - * capability for one, such as point to point links) cannot return as - * the packet being broadcast. The use of HPE_BROADCAST/HPE_MULTICAST as - * the return values simplifies the current use of the return value of this - * function, which is to pass through the multicast/broadcast characteristic - * to consumers of the netinfo/pfhooks API. While this is not cast in stone, - * changing the return value to some other symbol demands the appropriate - * "translation" when hpe_flags is set prior to calling hook_run() for - * packet events. + * Handle anything but M_DATA messages + * We see the DL_UNITDATA_IND which are part + * of the data path, and also the other messages from the driver. 
*/ -int -ip_get_dlpi_mbcast(ill_t *ill, mblk_t *mb) -{ - dl_unitdata_ind_t *ind = (dl_unitdata_ind_t *)mb->b_rptr; - mblk_t *bmp; - - if (ind->dl_group_address) { - if (ind->dl_dest_addr_offset > sizeof (*ind) && - ind->dl_dest_addr_offset + ind->dl_dest_addr_length < - MBLKL(mb) && - (bmp = ill->ill_bcast_mp) != NULL) { - dl_unitdata_req_t *dlur; - uint8_t *bphys_addr; - - dlur = (dl_unitdata_req_t *)bmp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + - ill->ill_sap_length; - - if (bcmp(mb->b_rptr + ind->dl_dest_addr_offset, - bphys_addr, ind->dl_dest_addr_length) == 0) { - return (HPE_BROADCAST); - } - return (HPE_MULTICAST); - } - return (HPE_MULTICAST); - } - return (0); -} - -static boolean_t -ip_rput_process_notdata(queue_t *q, mblk_t **first_mpp, ill_t *ill, - int *ll_multicast, mblk_t **mpp) +void +ip_rput_notdata(ill_t *ill, mblk_t *mp) { - mblk_t *mp1, *from_mp, *to_mp, *mp, *first_mp; - boolean_t must_copy = B_FALSE; + mblk_t *first_mp; struct iocblk *iocp; - ipha_t *ipha; - ip_stack_t *ipst = ill->ill_ipst; - -#define rptr ((uchar_t *)ipha) - - first_mp = *first_mpp; - mp = *mpp; + struct mac_header_info_s mhi; - ASSERT(first_mp == mp); - - /* - * if db_ref > 1 then copymsg and free original. Packet may be - * changed and do not want other entity who has a reference to this - * message to trip over the changes. This is a blind change because - * trying to catch all places that might change packet is too - * difficult (since it may be a module above this one) - * - * This corresponds to the non-fast path case. We walk down the full - * chain in this case, and check the db_ref count of all the dblks, - * and do a copymsg if required. It is possible that the db_ref counts - * of the data blocks in the mblk chain can be different. 
- * For Example, we can get a DL_UNITDATA_IND(M_PROTO) with a db_ref - * count of 1, followed by a M_DATA block with a ref count of 2, if - * 'snoop' is running. - */ - for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { - if (mp1->b_datap->db_ref > 1) { - must_copy = B_TRUE; - break; - } - } - - if (must_copy) { - mp1 = copymsg(mp); - if (mp1 == NULL) { - for (mp1 = mp; mp1 != NULL; - mp1 = mp1->b_cont) { - mp1->b_next = NULL; - mp1->b_prev = NULL; - } - freemsg(mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsInDiscards); - } - return (B_TRUE); - } - for (from_mp = mp, to_mp = mp1; from_mp != NULL; - from_mp = from_mp->b_cont, to_mp = to_mp->b_cont) { - /* Copy b_prev - used by ip_mroute_decap */ - to_mp->b_prev = from_mp->b_prev; - from_mp->b_prev = NULL; - } - *first_mpp = first_mp = mp1; - freemsg(mp); - mp = mp1; - *mpp = mp1; - } - - ipha = (ipha_t *)mp->b_rptr; - - /* - * previous code has a case for M_DATA. - * We want to check how that happens. - */ - ASSERT(first_mp->b_datap->db_type != M_DATA); - switch (first_mp->b_datap->db_type) { + switch (DB_TYPE(mp)) { case M_PROTO: - case M_PCPROTO: - if (((dl_unitdata_ind_t *)rptr)->dl_primitive != + case M_PCPROTO: { + if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive != DL_UNITDATA_IND) { /* Go handle anything other than data elsewhere. */ - ip_rput_dlpi(q, mp); - return (B_TRUE); + ip_rput_dlpi(ill, mp); + return; } - *ll_multicast = ip_get_dlpi_mbcast(ill, mp); + first_mp = mp; + mp = first_mp->b_cont; + first_mp->b_cont = NULL; + + if (mp == NULL) { + freeb(first_mp); + return; + } + ip_dlur_to_mhi(ill, first_mp, &mhi); + if (ill->ill_isv6) + ip_input_v6(ill, NULL, mp, &mhi); + else + ip_input(ill, NULL, mp, &mhi); + /* Ditch the DLPI header. 
*/ - mp1 = mp->b_cont; - ASSERT(first_mp == mp); - *first_mpp = mp1; - freeb(mp); - *mpp = mp1; - return (B_FALSE); + freeb(first_mp); + return; + } case M_IOCACK: - ip1dbg(("got iocack ")); iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { case DL_IOC_HDR_INFO: - ill = (ill_t *)q->q_ptr; ill_fastpath_ack(ill, mp); - return (B_TRUE); + return; default: - putnext(q, mp); - return (B_TRUE); + putnext(ill->ill_rq, mp); + return; } /* FALLTHRU */ case M_ERROR: case M_HANGUP: - /* - * Since this is on the ill stream we unconditionally - * bump up the refcount - */ - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); - return (B_TRUE); - case M_CTL: - if ((MBLKL(first_mp) >= sizeof (da_ipsec_t)) && - (((da_ipsec_t *)first_mp->b_rptr)->da_type == - IPHADA_M_CTL)) { - /* - * It's an IPsec accelerated packet. - * Make sure that the ill from which we received the - * packet has enabled IPsec hardware acceleration. - */ - if (!(ill->ill_capabilities & - (ILL_CAPAB_AH|ILL_CAPAB_ESP))) { - /* IPsec kstats: bean counter */ - freemsg(mp); - return (B_TRUE); - } - - /* - * Make mp point to the mblk following the M_CTL, - * then process according to type of mp. - * After this processing, first_mp will point to - * the data-attributes and mp to the pkt following - * the M_CTL. - */ - mp = first_mp->b_cont; - if (mp == NULL) { - freemsg(first_mp); - return (B_TRUE); - } - /* - * A Hardware Accelerated packet can only be M_DATA - * ESP or AH packet. 
- */ - if (mp->b_datap->db_type != M_DATA) { - /* non-M_DATA IPsec accelerated packet */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("non-M_DATA IPsec accelerated pkt\n")); - freemsg(first_mp); - return (B_TRUE); - } - ipha = (ipha_t *)mp->b_rptr; - if (ipha->ipha_protocol != IPPROTO_AH && - ipha->ipha_protocol != IPPROTO_ESP) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("non-M_DATA IPsec accelerated pkt\n")); - freemsg(first_mp); - return (B_TRUE); - } - *mpp = mp; - return (B_FALSE); + mutex_enter(&ill->ill_lock); + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + freemsg(mp); + return; } - putnext(q, mp); - return (B_TRUE); + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + qwriter_ip(ill, ill->ill_rq, mp, ip_rput_other, CUR_OP, + B_FALSE); + return; + case M_CTL: + putnext(ill->ill_rq, mp); + return; case M_IOCNAK: ip1dbg(("got iocnak ")); iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { case DL_IOC_HDR_INFO: - ip_rput_other(NULL, q, mp, NULL); - return (B_TRUE); + ip_rput_other(NULL, ill->ill_rq, mp, NULL); + return; default: break; } /* FALLTHRU */ default: - putnext(q, mp); - return (B_TRUE); + putnext(ill->ill_rq, mp); + return; } } @@ -14692,8 +8140,6 @@ ip_rput(queue_t *q, mblk_t *mp) ill_t *ill; union DL_primitives *dl; - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_rput_start: q %p", q); - ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { @@ -14707,70 +8153,42 @@ ip_rput(queue_t *q, mblk_t *mp) if (DB_TYPE(mp) != M_PCPROTO || dl->dl_primitive == DL_UNITDATA_IND) { inet_freemsg(mp); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_rput_end: q %p (%S)", q, "uninit"); return; } } + if (DB_TYPE(mp) == M_DATA) { + struct mac_header_info_s mhi; - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_rput_end: q %p (%S)", q, "end"); - - ip_input(ill, NULL, mp, NULL); + ip_mdata_to_mhi(ill, mp, &mhi); + ip_input(ill, NULL, mp, &mhi); + } else { + ip_rput_notdata(ill, mp); + } } -static mblk_t * -ip_fix_dbref(ill_t 
*ill, mblk_t *mp) +/* + * Move the information to a copy. + */ +mblk_t * +ip_fix_dbref(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp1; - boolean_t adjusted = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + mblk_t *mp1; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; IP_STAT(ipst, ip_db_ref); - /* - * The IP_RECVSLLA option depends on having the - * link layer header. First check that: - * a> the underlying device is of type ether, - * since this option is currently supported only - * over ethernet. - * b> there is enough room to copy over the link - * layer header. - * - * Once the checks are done, adjust rptr so that - * the link layer header will be copied via - * copymsg. Note that, IFT_ETHER may be returned - * by some non-ethernet drivers but in this case - * the second check will fail. - */ - if (ill->ill_type == IFT_ETHER && - (mp->b_rptr - mp->b_datap->db_base) >= - sizeof (struct ether_header)) { - mp->b_rptr -= sizeof (struct ether_header); - adjusted = B_TRUE; - } - mp1 = copymsg(mp); + /* Make sure we have ira_l2src before we loose the original mblk */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp1 = copymsg(mp); if (mp1 == NULL) { - mp->b_next = NULL; - /* clear b_prev - used by ip_mroute_decap */ - mp->b_prev = NULL; - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); return (NULL); } - - if (adjusted) { - /* - * Copy is done. 
Restore the pointer in - * the _new_ mblk - */ - mp1->b_rptr += sizeof (struct ether_header); - } - - /* Copy b_prev - used by ip_mroute_decap */ - mp1->b_prev = mp->b_prev; - mp->b_prev = NULL; - /* preserve the hardware checksum flags and data, if present */ if (DB_CKSUMFLAGS(mp) != 0) { DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); @@ -14779,888 +8197,10 @@ ip_fix_dbref(ill_t *ill, mblk_t *mp) DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); DB_CKSUM16(mp1) = DB_CKSUM16(mp); } - freemsg(mp); return (mp1); } -#define ADD_TO_CHAIN(head, tail, cnt, mp) { \ - if (tail != NULL) \ - tail->b_next = mp; \ - else \ - head = mp; \ - tail = mp; \ - cnt++; \ -} - -/* - * Direct read side procedure capable of dealing with chains. GLDv3 based - * drivers call this function directly with mblk chains while STREAMS - * read side procedure ip_rput() calls this for single packet with ip_ring - * set to NULL to process one packet at a time. - * - * The ill will always be valid if this function is called directly from - * the driver. - * - * If ip_input() is called from GLDv3: - * - * - This must be a non-VLAN IP stream. - * - 'mp' is either an untagged or a special priority-tagged packet. - * - Any VLAN tag that was in the MAC header has been stripped. - * - * If the IP header in packet is not 32-bit aligned, every message in the - * chain will be aligned before further operations. This is required on SPARC - * platform. 
- */ -/* ARGSUSED */ -void -ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, - struct mac_header_info_s *mhip) -{ - ipaddr_t dst = NULL; - ipaddr_t prev_dst; - ire_t *ire = NULL; - ipha_t *ipha; - uint_t pkt_len; - ssize_t len; - uint_t opt_len; - int ll_multicast; - int cgtp_flt_pkt; - queue_t *q = ill->ill_rq; - squeue_t *curr_sqp = NULL; - mblk_t *head = NULL; - mblk_t *tail = NULL; - mblk_t *first_mp; - int cnt = 0; - ip_stack_t *ipst = ill->ill_ipst; - mblk_t *mp; - mblk_t *dmp; - uint8_t tag; - ilb_stack_t *ilbs; - ipaddr_t lb_dst; - - ASSERT(mp_chain != NULL); - ASSERT(ill != NULL); - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_input_start: q %p", q); - - tag = (ip_ring != NULL) ? SQTAG_IP_INPUT_RX_RING : SQTAG_IP_INPUT; - -#define rptr ((uchar_t *)ipha) - - ilbs = ipst->ips_netstack->netstack_ilb; - while (mp_chain != NULL) { - mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - ll_multicast = 0; - - /* - * We do ire caching from one iteration to - * another. In the event the packet chain contains - * all packets from the same dst, this caching saves - * an ire_cache_lookup for each of the succeeding - * packets in a packet chain. - */ - prev_dst = dst; - - /* - * if db_ref > 1 then copymsg and free original. Packet - * may be changed and we do not want the other entity - * who has a reference to this message to trip over the - * changes. This is a blind change because trying to - * catch all places that might change the packet is too - * difficult. - * - * This corresponds to the fast path case, where we have - * a chain of M_DATA mblks. We check the db_ref count - * of only the 1st data block in the mblk chain. There - * doesn't seem to be a reason why a device driver would - * send up data with varying db_ref counts in the mblk - * chain. In any case the Fast path is a private - * interface, and our drivers don't do such a thing. 
- * Given the above assumption, there is no need to walk - * down the entire mblk chain (which could have a - * potential performance problem) - * - * The "(DB_REF(mp) > 1)" check was moved from ip_rput() - * to here because of exclusive ip stacks and vnics. - * Packets transmitted from exclusive stack over vnic - * can have db_ref > 1 and when it gets looped back to - * another vnic in a different zone, you have ip_input() - * getting dblks with db_ref > 1. So if someone - * complains of TCP performance under this scenario, - * take a serious look here on the impact of copymsg(). - */ - - if (DB_REF(mp) > 1) { - if ((mp = ip_fix_dbref(ill, mp)) == NULL) - continue; - } - - /* - * Check and align the IP header. - */ - first_mp = mp; - if (DB_TYPE(mp) == M_DATA) { - dmp = mp; - } else if (DB_TYPE(mp) == M_PROTO && - *(t_uscalar_t *)mp->b_rptr == DL_UNITDATA_IND) { - dmp = mp->b_cont; - } else { - dmp = NULL; - } - if (dmp != NULL) { - /* - * IP header ptr not aligned? - * OR IP header not complete in first mblk - */ - if (!OK_32PTR(dmp->b_rptr) || - MBLKL(dmp) < IP_SIMPLE_HDR_LENGTH) { - if (!ip_check_and_align_header(q, dmp, ipst)) - continue; - } - } - - /* - * ip_input fast path - */ - - /* mblk type is not M_DATA */ - if (DB_TYPE(mp) != M_DATA) { - if (ip_rput_process_notdata(q, &first_mp, ill, - &ll_multicast, &mp)) - continue; - - /* - * The only way we can get here is if we had a - * packet that was either a DL_UNITDATA_IND or - * an M_CTL for an IPsec accelerated packet. - * - * In either case, the first_mp will point to - * the leading M_PROTO or M_CTL. - */ - ASSERT(first_mp != NULL); - } else if (mhip != NULL) { - /* - * ll_multicast is set here so that it is ready - * for easy use with FW_HOOKS(). ip_get_dlpi_mbcast - * manipulates ll_multicast in the same fashion when - * called from ip_rput_process_notdata. 
- */ - switch (mhip->mhi_dsttype) { - case MAC_ADDRTYPE_MULTICAST : - ll_multicast = HPE_MULTICAST; - break; - case MAC_ADDRTYPE_BROADCAST : - ll_multicast = HPE_BROADCAST; - break; - default : - break; - } - } - - /* Only M_DATA can come here and it is always aligned */ - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(DB_REF(mp) == 1 && OK_32PTR(mp->b_rptr)); - - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - pkt_len = ntohs(ipha->ipha_length); - - /* - * We must count all incoming packets, even if they end - * up being dropped later on. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); - - /* multiple mblk or too short */ - len -= pkt_len; - if (len != 0) { - /* - * Make sure we have data length consistent - * with the IP header. - */ - if (mp->b_cont == NULL) { - if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInHdrErrors); - ip2dbg(("ip_input: drop pkt\n")); - freemsg(mp); - continue; - } - mp->b_wptr = rptr + pkt_len; - } else if ((len += msgdsize(mp->b_cont)) != 0) { - if (len < 0 || pkt_len < IP_SIMPLE_HDR_LENGTH) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInHdrErrors); - ip2dbg(("ip_input: drop pkt\n")); - freemsg(mp); - continue; - } - (void) adjmsg(mp, -len); - /* - * adjmsg may have freed an mblk from the chain, - * hence invalidate any hw checksum here. This - * will force IP to calculate the checksum in - * sw, but only for this packet. - */ - DB_CKSUMFLAGS(mp) = 0; - IP_STAT(ipst, ip_multimblk3); - } - } - - /* Obtain the dst of the current packet */ - dst = ipha->ipha_dst; - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, - ipha, ip6_t *, NULL, int, 0); - - /* - * The following test for loopback is faster than - * IP_LOOPBACK_ADDR(), because it avoids any bitwise - * operations. 
- * Note that these addresses are always in network byte order - */ - if (((*(uchar_t *)&ipha->ipha_dst) == 127) || - ((*(uchar_t *)&ipha->ipha_src) == 127)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); - freemsg(mp); - continue; - } - - /* - * The event for packets being received from a 'physical' - * interface is placed after validation of the source and/or - * destination address as being local so that packets can be - * redirected to loopback addresses using ipnat. - */ - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, first_mp, mp, ll_multicast, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) { - continue; - } - dst = ipha->ipha_dst; - /* - * Attach any necessary label information to - * this packet - */ - if (is_system_labeled() && - !tsol_get_pkt_label(mp, IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - continue; - } - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t dzone; - - /* - * On the inbound path the src zone will be unknown as - * this packet has come from the wire. - */ - dzone = ip_get_zoneid_v4(dst, mp, ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, - ill, ipst); - } - - /* - * Here we check to see if we machine is setup as - * L3 loadbalancer and if the incoming packet is for a VIP - * - * Check the following: - * - there is at least a rule - * - protocol of the packet is supported - */ - if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { - int lb_ret; - - /* For convenience, we pull up the mblk. 
*/ - if (mp->b_cont != NULL) { - if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } - ipha = (ipha_t *)mp->b_rptr; - } - - /* - * We just drop all fragments going to any VIP, at - * least for now.... - */ - if (ntohs(ipha->ipha_fragment_offset_and_flags) & - (IPH_MF | IPH_OFFSET)) { - if (!ilb_rule_match_vip_v4(ilbs, - ipha->ipha_dst, NULL)) { - goto after_ilb; - } - - ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); - ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } - lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, - ipha->ipha_protocol, (uint8_t *)ipha + - IPH_HDR_LENGTH(ipha), &lb_dst); - - if (lb_ret == ILB_DROPPED) { - /* Is this the right counter to increase? */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - continue; - } else if (lb_ret == ILB_BALANCED) { - /* Set the dst to that of the chosen server */ - dst = lb_dst; - DB_CKSUMFLAGS(mp) = 0; - } - } - -after_ilb: - /* - * Reuse the cached ire only if the ipha_dst of the previous - * packet is the same as the current packet AND it is not - * INADDR_ANY. - */ - if (!(dst == prev_dst && dst != INADDR_ANY) && - (ire != NULL)) { - ire_refrele(ire); - ire = NULL; - } - - opt_len = ipha->ipha_version_and_hdr_length - - IP_SIMPLE_HDR_VERSION; - - /* - * Check to see if we can take the fastpath. 
- * That is possible if the following conditions are met - * o Tsol disabled - * o CGTP disabled - * o ipp_action_count is 0 - * o no options in the packet - * o not a RSVP packet - * o not a multicast packet - * o ill not in IP_DHCPINIT_IF mode - */ - if (!is_system_labeled() && - !ipst->ips_ip_cgtp_filter && ipp_action_count == 0 && - opt_len == 0 && ipha->ipha_protocol != IPPROTO_RSVP && - !ll_multicast && !CLASSD(dst) && ill->ill_dhcpinit == 0) { - if (ire == NULL) - ire = ire_cache_lookup_simple(dst, ipst); - /* - * Unless forwarding is enabled, dont call - * ip_fast_forward(). Incoming packet is for forwarding - */ - if ((ill->ill_flags & ILLF_ROUTER) && - (ire == NULL || (ire->ire_type & IRE_CACHE))) { - ire = ip_fast_forward(ire, dst, ill, mp); - continue; - } - /* incoming packet is for local consumption */ - if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) - goto local; - } - - /* - * Disable ire caching for anything more complex - * than the simple fast path case we checked for above. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - - /* - * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP - * server to unicast DHCP packets to a DHCP client using the - * IP address it is offering to the client. This can be - * disabled through the "broadcast bit", but not all DHCP - * servers honor that bit. Therefore, to interoperate with as - * many DHCP servers as possible, the DHCP client allows the - * server to unicast, but we treat those packets as broadcast - * here. Note that we don't rewrite the packet itself since - * (a) that would mess up the checksums and (b) the DHCP - * client conn is bound to INADDR_ANY so ip_fanout_udp() will - * hand it the packet regardless. - */ - if (ill->ill_dhcpinit != 0 && - IS_SIMPLE_IPH(ipha) && ipha->ipha_protocol == IPPROTO_UDP && - pullupmsg(mp, sizeof (ipha_t) + sizeof (udpha_t)) == 1) { - udpha_t *udpha; - - /* - * Reload ipha since pullupmsg() can change b_rptr. 
- */ - ipha = (ipha_t *)mp->b_rptr; - udpha = (udpha_t *)&ipha[1]; - - if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { - DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, - mblk_t *, mp); - dst = INADDR_BROADCAST; - } - } - - /* Full-blown slow path */ - if (opt_len != 0) { - if (len != 0) - IP_STAT(ipst, ip_multimblk4); - else - IP_STAT(ipst, ip_ipoptions); - if (!ip_rput_multimblk_ipoptions(q, ill, mp, &ipha, - &dst, ipst)) - continue; - } - - /* - * Invoke the CGTP (multirouting) filtering module to process - * the incoming packet. Packets identified as duplicates - * must be discarded. Filtering is active only if the - * the ip_cgtp_filter ndd variable is non-zero. - */ - cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; - if (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL) { - netstackid_t stackid; - - stackid = ipst->ips_netstack->netstack_stackid; - cgtp_flt_pkt = - ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, - ill->ill_phyint->phyint_ifindex, mp); - if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { - freemsg(first_mp); - continue; - } - } - - /* - * If rsvpd is running, let RSVP daemon handle its processing - * and forwarding of RSVP multicast/unicast packets. - * If rsvpd is not running but mrouted is running, RSVP - * multicast packets are forwarded as multicast traffic - * and RSVP unicast packets are forwarded by unicast router. - * If neither rsvpd nor mrouted is running, RSVP multicast - * packets are not forwarded, but the unicast packets are - * forwarded like unicast traffic. - */ - if (ipha->ipha_protocol == IPPROTO_RSVP && - ipst->ips_ipcl_proto_fanout[IPPROTO_RSVP].connf_head != - NULL) { - /* RSVP packet and rsvpd running. Treat as ours */ - ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(dst))); - /* - * This assumes that we deliver to all streams for - * multicast and broadcast packets. - * We have to force ll_multicast to 1 to handle the - * M_DATA messages passed in from ip_mroute_decap. 
- */ - dst = INADDR_BROADCAST; - ll_multicast = 1; - } else if (CLASSD(dst)) { - /* packet is multicast */ - mp->b_next = NULL; - if (ip_rput_process_multicast(q, mp, ill, ipha, - &ll_multicast, &dst)) - continue; - } - - if (ire == NULL) { - ire = ire_cache_lookup(dst, ALL_ZONES, - msg_getlabel(mp), ipst); - } - - if (ire != NULL && ire->ire_stq != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible from the - * global zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup(dst, GLOBAL_ZONEID, - msg_getlabel(mp), ipst); - } - - if (ire == NULL) { - /* - * No IRE for this destination, so it can't be for us. - * Unless we are forwarding, drop the packet. - * We have to let source routed packets through - * since we don't yet know if they are 'ping -l' - * packets i.e. if they will go out over the - * same interface as they came in on. - */ - ire = ip_rput_noire(q, mp, ll_multicast, dst); - if (ire == NULL) - continue; - } - - /* - * Broadcast IRE may indicate either broadcast or - * multicast packet - */ - if (ire->ire_type == IRE_BROADCAST) { - /* - * Skip broadcast checks if packet is UDP multicast; - * we'd rather not enter ip_rput_process_broadcast() - * unless the packet is broadcast for real, since - * that routine is a no-op for multicast. - */ - if (ipha->ipha_protocol != IPPROTO_UDP || - !CLASSD(ipha->ipha_dst)) { - ire = ip_rput_process_broadcast(&q, mp, - ire, ipha, ill, dst, cgtp_flt_pkt, - ll_multicast); - if (ire == NULL) - continue; - } - } else if (ire->ire_stq != NULL) { - /* fowarding? */ - ip_rput_process_forward(q, mp, ire, ipha, ill, - ll_multicast, B_FALSE); - /* ip_rput_process_forward consumed the packet */ - continue; - } - -local: - /* - * If the queue in the ire is different to the ingress queue - * then we need to check to see if we can accept the packet. 
- * Note that for multicast packets and broadcast packets sent - * to a broadcast address which is shared between multiple - * interfaces we should not do this since we just got a random - * broadcast ire. - */ - if ((ire->ire_rfq != q) && (ire->ire_type != IRE_BROADCAST)) { - ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); - if (ire == NULL) { - /* Drop packet */ - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(mp); - continue; - } - if (ire->ire_rfq != NULL) - q = ire->ire_rfq; - } - - switch (ipha->ipha_protocol) { - case IPPROTO_TCP: - ASSERT(first_mp == mp); - if ((mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, - mp, 0, q, ip_ring)) != NULL) { - if (curr_sqp == NULL) { - curr_sqp = GET_SQUEUE(mp); - ASSERT(cnt == 0); - cnt++; - head = tail = mp; - } else if (curr_sqp == GET_SQUEUE(mp)) { - ASSERT(tail != NULL); - cnt++; - tail->b_next = mp; - tail = mp; - } else { - /* - * A different squeue. Send the - * chain for the previous squeue on - * its way. This shouldn't happen - * often unless interrupt binding - * changes. - */ - IP_STAT(ipst, ip_input_multi_squeue); - SQUEUE_ENTER(curr_sqp, head, - tail, cnt, SQ_PROCESS, tag); - curr_sqp = GET_SQUEUE(mp); - head = mp; - tail = mp; - cnt = 1; - } - } - continue; - case IPPROTO_UDP: - ASSERT(first_mp == mp); - ip_udp_input(q, mp, ipha, ire, ill); - continue; - case IPPROTO_SCTP: - ASSERT(first_mp == mp); - ip_sctp_input(mp, ipha, ill, B_FALSE, ire, mp, 0, - q, dst); - /* ire has been released by ip_sctp_input */ - ire = NULL; - continue; - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - ASSERT(first_mp == mp); - if (ip_iptun_input(NULL, mp, ipha, ill, ire, ipst)) - break; - /* - * If there was no IP tunnel data-link bound to - * receive this packet, then we fall through to - * allow potential raw sockets bound to either of - * these protocols to pick it up. 
- */ - default: - ip_proto_input(q, first_mp, ipha, ire, ill, 0); - continue; - } - } - - if (ire != NULL) - ire_refrele(ire); - - if (head != NULL) - SQUEUE_ENTER(curr_sqp, head, tail, cnt, SQ_PROCESS, tag); - - TRACE_2(TR_FAC_IP, TR_IP_RPUT_END, - "ip_input_end: q %p (%S)", q, "end"); -#undef rptr -} - -/* - * ip_accept_tcp() - This function is called by the squeue when it retrieves - * a chain of packets in the poll mode. The packets have gone through the - * data link processing but not IP processing. For performance and latency - * reasons, the squeue wants to process the chain in line instead of feeding - * it back via ip_input path. - * - * So this is a light weight function which checks to see if the packets - * retrived are indeed TCP packets (TCP squeue always polls TCP soft ring - * but we still do the paranoid check) meant for local machine and we don't - * have labels etc enabled. Packets that meet the criterion are returned to - * the squeue and processed inline while the rest go via ip_input path. - */ -/*ARGSUSED*/ -mblk_t * -ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, - mblk_t *mp_chain, mblk_t **last, uint_t *cnt) -{ - mblk_t *mp; - ipaddr_t dst = NULL; - ipaddr_t prev_dst; - ire_t *ire = NULL; - ipha_t *ipha; - uint_t pkt_len; - ssize_t len; - uint_t opt_len; - queue_t *q = ill->ill_rq; - squeue_t *curr_sqp; - mblk_t *ahead = NULL; /* Accepted head */ - mblk_t *atail = NULL; /* Accepted tail */ - uint_t acnt = 0; /* Accepted count */ - mblk_t *utail = NULL; /* Unaccepted head */ - mblk_t *uhead = NULL; /* Unaccepted tail */ - uint_t ucnt = 0; /* Unaccepted cnt */ - ip_stack_t *ipst = ill->ill_ipst; - ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; - - *cnt = 0; - - ASSERT(ill != NULL); - ASSERT(ip_ring != NULL); - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_START, "ip_accept_tcp: q %p", q); - - /* If ILB is enabled, don't do fast processing. 
*/ - if (ilb_has_rules(ilbs)) { - uhead = mp_chain; - goto all_reject; - } - -#define rptr ((uchar_t *)ipha) - - while (mp_chain != NULL) { - mp = mp_chain; - mp_chain = mp_chain->b_next; - mp->b_next = NULL; - - /* - * We do ire caching from one iteration to - * another. In the event the packet chain contains - * all packets from the same dst, this caching saves - * an ire_cache_lookup for each of the succeeding - * packets in a packet chain. - */ - prev_dst = dst; - - ipha = (ipha_t *)mp->b_rptr; - len = mp->b_wptr - rptr; - - ASSERT(!MBLK_RX_FANOUT_SLOWPATH(mp, ipha)); - - /* - * If it is a non TCP packet, or doesn't have H/W cksum, - * or doesn't have min len, reject. - */ - if ((ipha->ipha_protocol != IPPROTO_TCP) || (len < - (IP_SIMPLE_HDR_LENGTH + TCP_MIN_HEADER_LENGTH))) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - - pkt_len = ntohs(ipha->ipha_length); - if (len != pkt_len) { - if (len > pkt_len) { - mp->b_wptr = rptr + pkt_len; - } else { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - } - - opt_len = ipha->ipha_version_and_hdr_length - - IP_SIMPLE_HDR_VERSION; - dst = ipha->ipha_dst; - - /* IP version bad or there are IP options */ - if (opt_len && (!ip_rput_multimblk_ipoptions(q, ill, - mp, &ipha, &dst, ipst))) - continue; - - if (is_system_labeled() || (ill->ill_dhcpinit != 0) || - (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL)) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - continue; - } - - /* - * Reuse the cached ire only if the ipha_dst of the previous - * packet is the same as the current packet AND it is not - * INADDR_ANY. - */ - if (!(dst == prev_dst && dst != INADDR_ANY) && - (ire != NULL)) { - ire_refrele(ire); - ire = NULL; - } - - if (ire == NULL) - ire = ire_cache_lookup_simple(dst, ipst); - - /* - * Unless forwarding is enabled, dont call - * ip_fast_forward(). 
Incoming packet is for forwarding - */ - if ((ill->ill_flags & ILLF_ROUTER) && - (ire == NULL || (ire->ire_type & IRE_CACHE))) { - - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, - pkt_len); - - if (mp != NULL) - ire = ip_fast_forward(ire, dst, ill, mp); - continue; - } - - /* incoming packet is for local consumption */ - if ((ire != NULL) && (ire->ire_type & IRE_LOCAL)) - goto local_accept; - - /* - * Disable ire caching for anything more complex - * than the simple fast path case we checked for above. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - - ire = ire_cache_lookup(dst, ALL_ZONES, msg_getlabel(mp), - ipst); - if (ire == NULL || ire->ire_type == IRE_BROADCAST || - ire->ire_stq != NULL) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - continue; - } - -local_accept: - - if (ire->ire_rfq != q) { - ADD_TO_CHAIN(uhead, utail, ucnt, mp); - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - continue; - } - - /* - * The event for packets being received from a 'physical' - * interface is placed after validation of the source and/or - * destination address as being local so that packets can be - * redirected to loopback addresses using ipnat. 
- */ - DTRACE_PROBE4(ip4__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_physical_in_event, - ipst->ips_ipv4firewall_physical_in, - ill, NULL, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pkt_len); - - if (mp != NULL && - (mp = ip_tcp_input(mp, ipha, ill, B_FALSE, ire, mp, - 0, q, ip_ring)) != NULL) { - if ((curr_sqp = GET_SQUEUE(mp)) == target_sqp) { - ADD_TO_CHAIN(ahead, atail, acnt, mp); - } else { - SQUEUE_ENTER(curr_sqp, mp, mp, 1, - SQ_FILL, SQTAG_IP_INPUT); - } - } - } - - if (ire != NULL) - ire_refrele(ire); - -all_reject: - if (uhead != NULL) - ip_input(ill, ip_ring, uhead, NULL); - - if (ahead != NULL) { - *last = atail; - *cnt = acnt; - return (ahead); - } - - return (NULL); -#undef rptr -} - static void ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, t_uscalar_t err) @@ -15684,14 +8224,16 @@ ip_dlpi_error(ill_t *ill, t_uscalar_t prim, t_uscalar_t dl_err, * ill_refhold before that, since qwriter_ip does an ill_refrele. */ void -ip_rput_dlpi(queue_t *q, mblk_t *mp) +ip_rput_dlpi(ill_t *ill, mblk_t *mp) { dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; - ill_t *ill = q->q_ptr; + queue_t *q = ill->ill_rq; t_uscalar_t prim = dloa->dl_primitive; t_uscalar_t reqprim = DL_PRIM_INVAL; + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi", + char *, dl_primstr(prim), ill_t *, ill); ip1dbg(("ip_rput_dlpi")); /* @@ -15721,9 +8263,6 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp) case DL_NOTIFY_ACK: reqprim = DL_NOTIFY_REQ; break; - case DL_CONTROL_ACK: - reqprim = DL_CONTROL_REQ; - break; case DL_CAPABILITY_ACK: reqprim = DL_CAPABILITY_REQ; break; @@ -15781,7 +8320,7 @@ ip_rput_dlpi(queue_t *q, mblk_t *mp) /* * Handling of DLPI messages that require exclusive access to the ipsq. 
* - * Need to do ill_pending_mp_release on ioctl completion, which could + * Need to do ipsq_pending_mp_get on ioctl completion, which could * happen here. (along with mi_copy_done) */ /* ARGSUSED */ @@ -15791,7 +8330,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) dl_ok_ack_t *dloa = (dl_ok_ack_t *)mp->b_rptr; dl_error_ack_t *dlea = (dl_error_ack_t *)dloa; int err = 0; - ill_t *ill; + ill_t *ill = (ill_t *)q->q_ptr; ipif_t *ipif = NULL; mblk_t *mp1 = NULL; conn_t *connp = NULL; @@ -15800,15 +8339,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; - ip_stack_t *ipst; + + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer", + char *, dl_primstr(dloa->dl_primitive), ill_t *, ill); ip1dbg(("ip_rput_dlpi_writer ..")); - ill = (ill_t *)q->q_ptr; ASSERT(ipsq->ipsq_xop == ill->ill_phyint->phyint_ipsq->ipsq_xop); ASSERT(IAM_WRITER_ILL(ill)); - ipst = ill->ill_ipst; - ipif = ipsq->ipsq_xop->ipx_pending_ipif; /* * The current ioctl could have been aborted by the user and a new @@ -15823,6 +8361,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for %s\n", dl_primstr(dlea->dl_error_primitive))); + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer error", + char *, dl_primstr(dlea->dl_error_primitive), + ill_t *, ill); + switch (dlea->dl_error_primitive) { case DL_DISABMULTI_REQ: ill_dlpi_done(ill, dlea->dl_error_primitive); @@ -15916,7 +8458,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ill->ill_dlpi_multicast_state == IDS_INPROGRESS) ill->ill_dlpi_multicast_state = IDS_FAILED; if (ill->ill_dlpi_multicast_state == IDS_FAILED) { - ipif_t *ipif; printf("ip: joining multicasts failed (%d)" " on %s - will use link layer " @@ -15924,32 +8465,18 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) 
dlea->dl_errno, ill->ill_name); /* - * Set up the multicast mapping alone. + * Set up for multi_bcast; We are the * writer, so ok to access ill->ill_ipif * without any lock. */ - ipif = ill->ill_ipif; mutex_enter(&ill->ill_phyint->phyint_lock); ill->ill_phyint->phyint_flags |= PHYI_MULTI_BCAST; mutex_exit(&ill->ill_phyint->phyint_lock); - if (!ill->ill_isv6) { - (void) ipif_arp_setup_multicast(ipif, - NULL); - } else { - (void) ipif_ndp_setup_multicast(ipif, - NULL); - } } freemsg(mp); /* Don't want to pass this up */ return; - case DL_CONTROL_REQ: - ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " - "DL_CONTROL_REQ\n")); - ill_dlpi_done(ill, dlea->dl_error_primitive); - freemsg(mp); - return; case DL_CAPABILITY_REQ: ip1dbg(("ip_rput_dlpi_writer: got DL_ERROR_ACK for " "DL_CAPABILITY REQ\n")); @@ -16003,10 +8530,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mp = NULL; break; - case DL_CONTROL_ACK: - /* We treat all of these as "fire and forget" */ - ill_dlpi_done(ill, DL_CONTROL_REQ); - break; case DL_INFO_ACK: /* Call a routine to handle this one. */ ill_dlpi_done(ill, DL_INFO_REQ); @@ -16019,29 +8542,33 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * sent by ill_dl_phys, in which case just return */ ill_dlpi_done(ill, DL_BIND_REQ); - if (ill->ill_ifname_pending) + if (ill->ill_ifname_pending) { + DTRACE_PROBE2(ip__rput__dlpi__ifname__pending, + ill_t *, ill, mblk_t *, mp); break; - + } if (!ioctl_aborted) mp1 = ipsq_pending_mp_get(ipsq, &connp); - if (mp1 == NULL) + if (mp1 == NULL) { + DTRACE_PROBE1(ip__rput__dlpi__no__mblk, ill_t *, ill); break; + } /* * mp1 was added by ill_dl_up(). if that is a result of * a DL_NOTE_REPLUMB notification, connp could be NULL. */ if (connp != NULL) q = CONNP_TO_WQ(connp); - /* * We are exclusive. So nothing can change even after - * we get the pending mp. If need be we can put it back - * and restart, as in calling ipif_arp_up() below. 
+ * we get the pending mp. */ ip1dbg(("ip_rput_dlpi: bind_ack %s\n", ill->ill_name)); + DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); mutex_enter(&ill->ill_lock); ill->ill_dl_up = 1; + ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); mutex_exit(&ill->ill_lock); @@ -16052,34 +8579,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * ill_dl_up(), which stopped ipif_up()'s processing. */ if (ill->ill_isv6) { - if (ill->ill_flags & ILLF_XRESOLV) { - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, - mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, - Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - ASSERT(err != 0); - mp1 = ipsq_pending_mp_get(ipsq, &connp); - ASSERT(mp1 != NULL); - } else { - /* conn has started closing */ - err = EINTR; - } - } else { /* Non XRESOLV interface */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } + /* + * v6 interfaces. + * Unlike ARP which has to do another bind + * and attach, once we get here we are + * done with NDP + */ + (void) ipif_resolver_up(ipif, Res_act_initial); + if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) + err = ipif_up_done_v6(ipif); } else if (ill->ill_net_type == IRE_IF_RESOLVER) { /* * ARP and other v4 external resolvers. 
@@ -16099,7 +8607,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) freemsg(mp); return; } - ASSERT(err != 0); + ASSERT(arp_no_defense || err != 0); mp1 = ipsq_pending_mp_get(ipsq, &connp); } else { /* The conn has started closing */ @@ -16144,10 +8652,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTIFY_IND: { dl_notify_ind_t *notify = (dl_notify_ind_t *)mp->b_rptr; - ire_t *ire; uint_t orig_mtu; - boolean_t need_ire_walk_v4 = B_FALSE; - boolean_t need_ire_walk_v6 = B_FALSE; switch (notify->dl_notification) { case DL_NOTE_PHYS_ADDR: @@ -16164,95 +8669,52 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) return; case DL_NOTE_FASTPATH_FLUSH: - ill_fastpath_flush(ill); + nce_flush(ill, B_FALSE); break; case DL_NOTE_SDU_SIZE: /* - * Change the MTU size of the interface, of all - * attached ipif's, and of all relevant ire's. The - * new value's a uint32_t at notify->dl_data. - * Mtu change Vs. new ire creation - protocol below. - * - * a Mark the ipif as IPIF_CHANGING. - * b Set the new mtu in the ipif. - * c Change the ire_max_frag on all affected ires - * d Unmark the IPIF_CHANGING + * The dce and fragmentation code can cope with + * this changing while packets are being sent. + * When packets are sent ip_output will discover + * a change. * - * To see how the protocol works, assume an interface - * route is also being added simultaneously by - * ip_rt_add and let 'ipif' be the ipif referenced by - * the ire. If the ire is created before step a, - * it will be cleaned up by step c. If the ire is - * created after step d, it will see the new value of - * ipif_mtu. Any attempt to create the ire between - * steps a to d will fail because of the IPIF_CHANGING - * flag. Note that ire_create() is passed a pointer to - * the ipif_mtu, and not the value. 
During ire_add - * under the bucket lock, the ire_max_frag of the - * new ire being created is set from the ipif/ire from - * which it is being derived. + * Change the MTU size of the interface. */ mutex_enter(&ill->ill_lock); + ill->ill_current_frag = (uint_t)notify->dl_data; + if (ill->ill_current_frag > ill->ill_max_frag) + ill->ill_max_frag = ill->ill_current_frag; - orig_mtu = ill->ill_max_mtu; - ill->ill_max_frag = (uint_t)notify->dl_data; - ill->ill_max_mtu = (uint_t)notify->dl_data; - - /* - * If ill_user_mtu was set (via SIOCSLIFLNKINFO), - * clamp ill_max_mtu at it. - */ - if (ill->ill_user_mtu != 0 && - ill->ill_user_mtu < ill->ill_max_mtu) - ill->ill_max_mtu = ill->ill_user_mtu; + orig_mtu = ill->ill_mtu; + if (!(ill->ill_flags & ILLF_FIXEDMTU)) { + ill->ill_mtu = ill->ill_current_frag; - /* - * If the MTU is unchanged, we're done. - */ - if (orig_mtu == ill->ill_max_mtu) { - mutex_exit(&ill->ill_lock); - break; - } - - if (ill->ill_isv6) { - if (ill->ill_max_mtu < IPV6_MIN_MTU) - ill->ill_max_mtu = IPV6_MIN_MTU; - } else { - if (ill->ill_max_mtu < IP_MIN_MTU) - ill->ill_max_mtu = IP_MIN_MTU; - } - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { /* - * Don't override the mtu if the user - * has explicitly set it. + * If ill_user_mtu was set (via + * SIOCSLIFLNKINFO), clamp ill_mtu at it. 
*/ - if (ipif->ipif_flags & IPIF_FIXEDMTU) - continue; - ipif->ipif_mtu = (uint_t)notify->dl_data; - if (ipif->ipif_isv6) - ire = ipif_to_ire_v6(ipif); - else - ire = ipif_to_ire(ipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - if (ipif->ipif_flags & IPIF_UP) { - if (ill->ill_isv6) - need_ire_walk_v6 = B_TRUE; - else - need_ire_walk_v4 = B_TRUE; + if (ill->ill_user_mtu != 0 && + ill->ill_user_mtu < ill->ill_mtu) + ill->ill_mtu = ill->ill_user_mtu; + + if (ill->ill_isv6) { + if (ill->ill_mtu < IPV6_MIN_MTU) + ill->ill_mtu = IPV6_MIN_MTU; + } else { + if (ill->ill_mtu < IP_MIN_MTU) + ill->ill_mtu = IP_MIN_MTU; } } mutex_exit(&ill->ill_lock); - if (need_ire_walk_v4) - ire_walk_v4(ill_mtu_change, (char *)ill, - ALL_ZONES, ipst); - if (need_ire_walk_v6) - ire_walk_v6(ill_mtu_change, (char *)ill, - ALL_ZONES, ipst); + /* + * Make sure all dce_generation checks find out + * that ill_mtu has changed. + */ + if (orig_mtu != ill->ill_mtu) { + dce_increment_all_generations(ill->ill_isv6, + ill->ill_ipst); + } /* * Refresh IPMP meta-interface MTU if necessary. 
@@ -16303,8 +8765,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTE_PROMISC_ON_PHYS: { phyint_t *phyint = ill->ill_phyint; - IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " - "got a DL_NOTE_PROMISC_ON_PHYS\n")); mutex_enter(&phyint->phyint_lock); phyint->phyint_flags |= PHYI_PROMISC; mutex_exit(&phyint->phyint_lock); @@ -16313,8 +8773,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) case DL_NOTE_PROMISC_OFF_PHYS: { phyint_t *phyint = ill->ill_phyint; - IPSECHW_DEBUG(IPSECHW_PKT, ("ip_rput_dlpi_writer: " - "got a DL_NOTE_PROMISC_OFF_PHYS\n")); mutex_enter(&phyint->phyint_lock); phyint->phyint_flags &= ~PHYI_PROMISC; mutex_exit(&phyint->phyint_lock); @@ -16474,6 +8932,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ip2dbg(("DL_OK_ACK %s (0x%x)\n", dl_primstr((int)dloa->dl_correct_primitive), dloa->dl_correct_primitive)); + DTRACE_PROBE3(ill__dlpi, char *, "ip_rput_dlpi_writer ok", + char *, dl_primstr(dloa->dl_correct_primitive), + ill_t *, ill); + switch (dloa->dl_correct_primitive) { case DL_ENABMULTI_REQ: case DL_DISABMULTI_REQ: @@ -16502,6 +8964,10 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ ASSERT(err != EINPROGRESS); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_rput_dlpi_writer finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, ill_t *, ill, + ipif_t *, NULL); + switch (ipsq->ipsq_xop->ipx_current_ioctl) { case 0: ipsq_current_finish(ipsq); @@ -16595,7 +9061,10 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) if (ill->ill_dlpi_fastpath_state == IDS_INPROGRESS) { ill->ill_dlpi_fastpath_state = IDS_FAILED; mutex_exit(&ill->ill_lock); - ill_fastpath_nack(ill); + /* + * don't flush the nce_t entries: we use them + * as an index to the ncec itself. 
+ */ ip1dbg(("ip_rput: DLPI fastpath off on interface %s\n", ill->ill_name)); } else { @@ -16611,235 +9080,24 @@ ip_rput_other(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } /* - * NOTE : This function does not ire_refrele the ire argument passed in. - * - * IPQoS notes - * IP policy is invoked twice for a forwarded packet, once on the read side - * and again on the write side if both, IPP_FWD_IN and IPP_FWD_OUT are - * enabled. An additional parameter, in_ill, has been added for this purpose. - * Note that in_ill could be NULL when called from ip_rput_forward_multicast - * because ip_mroute drops this information. - * + * Update any source route, record route or timestamp options + * When it fails it has consumed the message and BUMPed the MIB. */ -void -ip_rput_forward(ire_t *ire, ipha_t *ipha, mblk_t *mp, ill_t *in_ill) -{ - uint32_t old_pkt_len; - uint32_t pkt_len; - queue_t *q; - uint32_t sum; -#define rptr ((uchar_t *)ipha) - uint32_t max_frag; - uint32_t ill_index; - ill_t *out_ill; - mib2_ipIfStatsEntry_t *mibptr; - ip_stack_t *ipst = ((ill_t *)(ire->ire_stq->q_ptr))->ill_ipst; - - /* Get the ill_index of the incoming ILL */ - ill_index = (in_ill != NULL) ? in_ill->ill_phyint->phyint_ifindex : 0; - mibptr = (in_ill != NULL) ? in_ill->ill_ip_mib : &ipst->ips_ip_mib; - - /* Initiate Read side IPPF processing */ - if (IPP_ENABLED(IPP_FWD_IN, ipst)) { - ip_process(IPP_FWD_IN, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_rput_forward: pkt dropped/deferred "\ - "during IPPF processing\n")); - return; - } - } - - /* Adjust the checksum to reflect the ttl decrement. */ - sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; - ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); - - if (ipha->ipha_ttl-- <= 1) { - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - /* - * Note: ire_stq this will be NULL for multicast - * datagrams using the long path through arp (the IRE - * is not an IRE_CACHE). 
This should not cause - * problems since we don't generate ICMP errors for - * multicast packets. - */ - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - q = ire->ire_stq; - if (q != NULL) { - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded(q, mp, ICMP_TTL_EXCEEDED, - GLOBAL_ZONEID, ipst); - } else - freemsg(mp); - return; - } - - /* - * Don't forward if the interface is down - */ - if (ire->ire_ipif->ipif_ill->ill_ipif_up_count == 0) { - BUMP_MIB(mibptr, ipIfStatsInDiscards); - ip2dbg(("ip_rput_forward:interface is down\n")); - goto drop_pkt; - } - - /* Get the ill_index of the outgoing ILL */ - out_ill = ire_to_ill(ire); - ill_index = out_ill->ill_phyint->phyint_ifindex; - - DTRACE_PROBE4(ip4__forwarding__start, - ill_t *, in_ill, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - - FW_HOOKS(ipst->ips_ip4_forwarding_event, - ipst->ips_ipv4firewall_forwarding, - in_ill, out_ill, ipha, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); - - if (mp == NULL) - return; - old_pkt_len = pkt_len = ntohs(ipha->ipha_length); - - if (is_system_labeled()) { - mblk_t *mp1; - - if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - goto drop_pkt; - } - /* Size may have changed */ - mp = mp1; - ipha = (ipha_t *)mp->b_rptr; - pkt_len = ntohs(ipha->ipha_length); - } - - /* Check if there are options to update */ - if (!IS_SIMPLE_IPH(ipha)) { - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - if (ip_rput_forward_options(mp, ipha, ire, ipst)) { - BUMP_MIB(mibptr, ipIfStatsForwProhibits); - return; - } - - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - } - max_frag = ire->ire_max_frag; - if (pkt_len > max_frag) { - /* - * It needs fragging on its way out. We haven't - * verified the header checksum yet. 
Since we - * are going to put a surely good checksum in the - * outgoing header, we have to make sure that it - * was good coming in. - */ - if (ip_csum_hdr(ipha)) { - BUMP_MIB(mibptr, ipIfStatsInCksumErrs); - goto drop_pkt; - } - /* Initiate Write side IPPF processing */ - if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { - ip_process(IPP_FWD_OUT, &mp, ill_index); - if (mp == NULL) { - ip2dbg(("ip_rput_forward: pkt dropped/deferred"\ - " during IPPF processing\n")); - return; - } - } - /* - * Handle labeled packet resizing. - * - * If we have added a label, inform ip_wput_frag() of its - * effect on the MTU for ICMP messages. - */ - if (pkt_len > old_pkt_len) { - uint32_t secopt_size; - - secopt_size = pkt_len - old_pkt_len; - if (secopt_size < max_frag) - max_frag -= secopt_size; - } - - ip_wput_frag(ire, mp, IB_PKT, max_frag, 0, - GLOBAL_ZONEID, ipst, NULL); - ip2dbg(("ip_rput_forward:sent to ip_wput_frag\n")); - return; - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (mp == NULL) - return; - - mp->b_prev = (mblk_t *)IPP_FWD_OUT; - ip1dbg(("ip_rput_forward: Calling ip_xmit_v4\n")); - (void) ip_xmit_v4(mp, ire, NULL, B_FALSE, NULL); - /* ip_xmit_v4 always consumes the packet */ - return; - -drop_pkt:; - ip1dbg(("ip_rput_forward: drop pkt\n")); - freemsg(mp); -#undef rptr -} - -void -ip_rput_forward_multicast(ipaddr_t dst, mblk_t *mp, ipif_t *ipif) -{ - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(!ipif->ipif_isv6); - /* - * Find an IRE which matches the destination and the outgoing - * queue in the cache table. All we need is an IRE_CACHE which - * is pointing at ipif->ipif_ill. 
- */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) - dst = ipif->ipif_pp_dst_addr; - - ire = ire_ctable_lookup(dst, 0, 0, ipif, ALL_ZONES, msg_getlabel(mp), - MATCH_IRE_ILL | MATCH_IRE_SECATTR, ipst); - if (ire == NULL) { - /* - * Mark this packet to make it be delivered to - * ip_rput_forward after the new ire has been - * created. - */ - mp->b_prev = NULL; - mp->b_next = mp; - ip_newroute_ipif(ipif->ipif_ill->ill_wq, mp, ipif, dst, - NULL, 0, GLOBAL_ZONEID, &zero_info); - } else { - ip_rput_forward(ire, (ipha_t *)mp->b_rptr, mp, NULL); - IRE_REFRELE(ire); - } -} - -/* Update any source route, record route or timestamp options */ -static int -ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) +boolean_t +ip_forward_options(mblk_t *mp, ipha_t *ipha, ill_t *dst_ill, + ip_recv_attr_t *ira) { ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; ipaddr_t dst; + ipaddr_t ifaddr; uint32_t ts; - ire_t *dst_ire = NULL; - ire_t *tmp_ire = NULL; timestruc_t now; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - ip2dbg(("ip_rput_forward_options\n")); + ip2dbg(("ip_forward_options\n")); dst = ipha->ipha_dst; for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; @@ -16847,7 +9105,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_forward_options: opt %d, len %d\n", + ip2dbg(("ip_forward_options: opt %d, len %d\n", optval, opts.ipoptp_len)); switch (optval) { uint32_t off; @@ -16855,27 +9113,17 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) case IPOPT_LSRR: /* Check if adminstratively disabled */ if (!ipst->ips_ip_forward_src_routed) { - if (ire->ire_stq != NULL) { - /* - * Sent by forwarding path, and router - * is global zone - */ - icmp_unreachable(ire->ire_stq, mp, - ICMP_SOURCE_ROUTE_FAILED, - GLOBAL_ZONEID, ipst); - } else { - 
ip0dbg(("ip_rput_forward_options: " - "unable to send unreach\n")); - freemsg(mp); - } - return (-1); + BUMP_MIB(dst_ill->ill_ip_mib, + ipIfStatsForwProhibits); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", + mp, dst_ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, + ira); + return (B_FALSE); } - - dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* - * Must be partial since ip_rput_options + * Must be partial since ip_input_options * checked for strict. */ break; @@ -16887,31 +9135,33 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* End of source route */ ip1dbg(( - "ip_rput_forward_options: end of SR\n")); - ire_refrele(dst_ire); + "ip_forward_options: end of SR\n")); break; } + /* Pick a reasonable address on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); - ip1dbg(("ip_rput_forward_options: next hop 0x%x\n", + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); + ip1dbg(("ip_forward_options: next hop 0x%x\n", ntohl(dst))); /* * Check if our address is present more than * once as consecutive hops in source route. 
*/ - tmp_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (tmp_ire != NULL) { - ire_refrele(tmp_ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; opt[IPOPT_OFFSET] += IP_ADDR_LEN; goto redo_srr; } ipha->ipha_dst = dst; opt[IPOPT_OFFSET] += IP_ADDR_LEN; - ire_refrele(dst_ire); break; case IPOPT_RR: off = opt[IPOPT_OFFSET]; @@ -16920,11 +9170,18 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_rput_forward_options: end of RR\n")); + "ip_forward_options: end of RR\n")); break; } - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable address on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; break; case IPOPT_TS: @@ -16938,14 +9195,10 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - dst_ire = ire_ctable_lookup(dst, 0, - IRE_LOCAL, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(dst_ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -16955,9 +9208,9 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_rput_forward_options: " - "unknown IT - bug in ip_rput_options?\n"); - return (0); /* Keep "lint" happy */ + cmn_err(CE_PANIC, "ip_forward_options: " + "unknown IT - bug in ip_input_options?\n"); + return (B_TRUE); /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { /* Increase overflow counter */ @@ -16972,8 +9225,15 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) case IPOPT_TS_PRESPEC: case IPOPT_TS_PRESPEC_RFC791: case IPOPT_TS_TSANDADDR: - bcopy(&ire->ire_src_addr, - (char *)opt + off, IP_ADDR_LEN); + /* Pick a reasonable addr on the outbound if */ + ASSERT(dst_ill != NULL); + if (ip_select_source_v4(dst_ill, INADDR_ANY, + dst, INADDR_ANY, ALL_ZONES, ipst, &ifaddr, + NULL, NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; /* FALLTHRU */ case IPOPT_TS_TSONLY: @@ -16989,223 +9249,7 @@ ip_rput_forward_options(mblk_t *mp, ipha_t *ipha, ire_t *ire, ip_stack_t *ipst) break; } } - return (0); -} - -/* - * This is called after processing at least one of AH/ESP headers. - * - * NOTE: the ill corresponding to ipsec_in_ill_index may not be - * the actual, physical interface on which the packet was received, - * but, when ip_strict_dst_multihoming is set to 1, could be the - * interface which had the ipha_dst configured when the packet went - * through ip_rput. The ill_index corresponding to the recv_ill - * is saved in ipsec_in_rill_index - * - * NOTE2: The "ire" argument is only used in IPv4 cases. This function - * cannot assume "ire" points to valid data for any IPv6 cases. 
- */ -void -ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) -{ - mblk_t *mp; - ipaddr_t dst; - in6_addr_t *v6dstp; - ipha_t *ipha; - ip6_t *ip6h; - ipsec_in_t *ii; - boolean_t ill_need_rele = B_FALSE; - boolean_t rill_need_rele = B_FALSE; - boolean_t ire_need_rele = B_FALSE; - netstack_t *ns; - ip_stack_t *ipst; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_ill_index != 0); - ns = ii->ipsec_in_ns; - ASSERT(ii->ipsec_in_ns != NULL); - ipst = ns->netstack_ip; - - mp = ipsec_mp->b_cont; - ASSERT(mp != NULL); - - if (ill == NULL) { - ASSERT(recv_ill == NULL); - /* - * We need to get the original queue on which ip_rput_local - * or ip_rput_data_v6 was called. - */ - ill = ill_lookup_on_ifindex(ii->ipsec_in_ill_index, - !ii->ipsec_in_v4, NULL, NULL, NULL, NULL, ipst); - ill_need_rele = B_TRUE; - - if (ii->ipsec_in_ill_index != ii->ipsec_in_rill_index) { - recv_ill = ill_lookup_on_ifindex( - ii->ipsec_in_rill_index, !ii->ipsec_in_v4, - NULL, NULL, NULL, NULL, ipst); - rill_need_rele = B_TRUE; - } else { - recv_ill = ill; - } - - if ((ill == NULL) || (recv_ill == NULL)) { - ip0dbg(("ip_fanout_proto_again: interface " - "disappeared\n")); - if (ill != NULL) - ill_refrele(ill); - if (recv_ill != NULL) - ill_refrele(recv_ill); - freemsg(ipsec_mp); - return; - } - } - - ASSERT(ill != NULL && recv_ill != NULL); - - if (mp->b_datap->db_type == M_CTL) { - /* - * AH/ESP is returning the ICMP message after - * removing their headers. Fanout again till - * it gets to the right protocol. - */ - if (ii->ipsec_in_v4) { - icmph_t *icmph; - int iph_hdr_length; - int hdr_length; - - ipha = (ipha_t *)mp->b_rptr; - iph_hdr_length = IPH_HDR_LENGTH(ipha); - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; - hdr_length = IPH_HDR_LENGTH(ipha); - /* - * icmp_inbound_error_fanout may need to do pullupmsg. - * Reset the type to M_DATA. 
- */ - mp->b_datap->db_type = M_DATA; - icmp_inbound_error_fanout(ill->ill_rq, ill, ipsec_mp, - icmph, ipha, iph_hdr_length, hdr_length, B_TRUE, - B_FALSE, ill, ii->ipsec_in_zoneid); - } else { - icmp6_t *icmp6; - int hdr_length; - - ip6h = (ip6_t *)mp->b_rptr; - /* Don't call hdr_length_v6() unless you have to. */ - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; - - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - /* - * icmp_inbound_error_fanout_v6 may need to do - * pullupmsg. Reset the type to M_DATA. - */ - mp->b_datap->db_type = M_DATA; - icmp_inbound_error_fanout_v6(ill->ill_rq, ipsec_mp, - ip6h, icmp6, ill, recv_ill, B_TRUE, - ii->ipsec_in_zoneid); - } - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); - return; - } - - if (ii->ipsec_in_v4) { - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - if (CLASSD(dst)) { - /* - * Multicast has to be delivered to all streams. - */ - dst = INADDR_BROADCAST; - } - - if (ire == NULL) { - ire = ire_cache_lookup(dst, ii->ipsec_in_zoneid, - msg_getlabel(mp), ipst); - if (ire == NULL) { - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); - ip1dbg(("ip_fanout_proto_again: " - "IRE not found")); - freemsg(ipsec_mp); - return; - } - ire_need_rele = B_TRUE; - } - - switch (ipha->ipha_protocol) { - case IPPROTO_UDP: - ip_udp_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill); - if (ire_need_rele) - ire_refrele(ire); - break; - case IPPROTO_TCP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - mp = ip_tcp_input(mp, ipha, ill, B_TRUE, - ire, ipsec_mp, 0, ill->ill_rq, NULL); - IRE_REFRELE(ire); - if (mp != NULL) { - SQUEUE_ENTER(GET_SQUEUE(mp), mp, - mp, 1, SQ_PROCESS, - SQTAG_IP_PROTO_AGAIN); - } - break; - case IPPROTO_SCTP: - if (!ire_need_rele) - IRE_REFHOLD(ire); - ip_sctp_input(mp, ipha, ill, B_TRUE, ire, - ipsec_mp, 0, ill->ill_rq, dst); - break; - case IPPROTO_ENCAP: - case 
IPPROTO_IPV6: - if (ip_iptun_input(ipsec_mp, mp, ipha, ill, ire, - ill->ill_ipst)) { - /* - * If we made it here, we don't need to worry - * about the raw-socket/protocol fanout. - */ - if (ire_need_rele) - ire_refrele(ire); - break; - } - /* else FALLTHRU */ - default: - ip_proto_input(ill->ill_rq, ipsec_mp, ipha, ire, - recv_ill, 0); - if (ire_need_rele) - ire_refrele(ire); - break; - } - } else { - uint32_t rput_flags = 0; - - ip6h = (ip6_t *)mp->b_rptr; - v6dstp = &ip6h->ip6_dst; - /* - * XXX Assumes ip_rput_v6 sets ll_multicast only for multicast - * address. - * - * Currently, we don't store that state in the IPSEC_IN - * message, and we may need to. - */ - rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? - IP6_IN_LLMCAST : 0); - ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, - NULL, NULL); - } - if (ill_need_rele) - ill_refrele(ill); - if (rill_need_rele) - ill_refrele(recv_ill); + return (B_TRUE); } /* @@ -17290,609 +9334,25 @@ ill_frag_timer_start(ill_t *ill) } /* - * This routine is needed for loopback when forwarding multicasts. - * - * IPQoS Notes: - * IPPF processing is done in fanout routines. - * Policy processing is done only if IPP_lOCAL_IN is enabled. Further, - * processing for IPsec packets is done when it comes back in clear. - * NOTE : The callers of this function need to do the ire_refrele for the - * ire that is being passed in. 
- */ -void -ip_proto_input(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ill_t *recv_ill, uint32_t esp_udp_ports) -{ - boolean_t esp_in_udp_packet = (esp_udp_ports != 0); - ill_t *ill = (ill_t *)q->q_ptr; - uint32_t sum; - uint32_t u1; - uint32_t u2; - int hdr_length; - boolean_t mctl_present; - mblk_t *first_mp = mp; - mblk_t *hada_mp = NULL; - ipha_t *inner_ipha; - ip_stack_t *ipst; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - - TRACE_1(TR_FAC_IP, TR_IP_RPUT_LOCL_START, - "ip_rput_locl_start: q %p", q); - - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(ill != NULL); - -#define rptr ((uchar_t *)ipha) -#define iphs ((uint16_t *)ipha) - - /* - * no UDP or TCP packet should come here anymore. - */ - ASSERT(ipha->ipha_protocol != IPPROTO_TCP && - ipha->ipha_protocol != IPPROTO_UDP); - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - if (mctl_present && - ((da_ipsec_t *)first_mp->b_rptr)->da_type == IPHADA_M_CTL) { - ASSERT(MBLKL(first_mp) >= sizeof (da_ipsec_t)); - - /* - * It's an IPsec accelerated packet. - * Keep a pointer to the data attributes around until - * we allocate the ipsec_info_t. - */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_local: inbound HW accelerated IPsec pkt\n")); - hada_mp = first_mp; - hada_mp->b_cont = NULL; - /* - * Since it is accelerated, it comes directly from - * the ill and the data attributes is followed by - * the packet data. - */ - ASSERT(mp->b_datap->db_type != M_CTL); - first_mp = mp; - mctl_present = B_FALSE; - } - - /* - * IF M_CTL is not present, then ipsec_in_is_secure - * should return B_TRUE. There is a case where loopback - * packets has an M_CTL in the front with all the - * IPsec options set to IPSEC_PREF_NEVER - which means - * ipsec_in_is_secure will return B_FALSE. As loopback - * packets never comes here, it is safe to ASSERT the - * following. 
- */ - ASSERT(!mctl_present || ipsec_in_is_secure(first_mp)); - - /* - * Also, we should never have an mctl_present if this is an - * ESP-in-UDP packet. - */ - ASSERT(!mctl_present || !esp_in_udp_packet); - - /* u1 is # words of IP options */ - u1 = ipha->ipha_version_and_hdr_length - (uchar_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - - /* - * Don't verify header checksum if we just removed UDP header or - * packet is coming back from AH/ESP. - */ - if (!esp_in_udp_packet && !mctl_present) { - if (u1) { - if (!ip_options_cksum(q, ill, mp, ipha, ire, ipst)) { - if (hada_mp != NULL) - freemsg(hada_mp); - return; - } - } else { - /* Check the IP header checksum. */ -#define uph ((uint16_t *)ipha) - sum = uph[0] + uph[1] + uph[2] + uph[3] + uph[4] + - uph[5] + uph[6] + uph[7] + uph[8] + uph[9]; -#undef uph - /* finish doing IP checksum */ - sum = (sum & 0xFFFF) + (sum >> 16); - sum = ~(sum + (sum >> 16)) & 0xFFFF; - if (sum && sum != 0xFFFF) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); - goto drop_pkt; - } - } - } - - /* - * Count for SNMP of inbound packets for ire. As ip_proto_input - * might be called more than once for secure packets, count only - * the first time. - */ - if (!mctl_present) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* Check for fragmentation offset. */ - u2 = ntohs(ipha->ipha_fragment_offset_and_flags); - u1 = u2 & (IPH_MF | IPH_OFFSET); - if (u1) { - /* - * We re-assemble fragments before we do the AH/ESP - * processing. Thus, M_CTL should not be present - * while we are re-assembling. - */ - ASSERT(!mctl_present); - ASSERT(first_mp == mp); - if (!ip_rput_fragment(ill, recv_ill, &mp, ipha, NULL, NULL)) - return; - - /* - * Make sure that first_mp points back to mp as - * the mp we came in with could have changed in - * ip_rput_fragment(). - */ - ipha = (ipha_t *)mp->b_rptr; - first_mp = mp; - } - - /* - * Clear hardware checksumming flag as it is currently only - * used by TCP and UDP. 
- */ - DB_CKSUMFLAGS(mp) = 0; - - /* Now we have a complete datagram, destined for this machine. */ - u1 = IPH_HDR_LENGTH(ipha); - switch (ipha->ipha_protocol) { - case IPPROTO_ICMP: { - ire_t *ire_zone; - ilm_t *ilm; - mblk_t *mp1; - zoneid_t last_zoneid; - ilm_walker_t ilw; - - if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(recv_ill)) { - ASSERT(ire->ire_type == IRE_BROADCAST); - - /* - * In the multicast case, applications may have joined - * the group from different zones, so we need to deliver - * the packet to each of them. Loop through the - * multicast memberships structures (ilm) on the receive - * ill and send a copy of the packet up each matching - * one. However, we don't do this for multicasts sent on - * the loopback interface (PHYI_LOOPBACK flag set) as - * they must stay in the sender's zone. - * - * ilm_add_v6() ensures that ilms in the same zone are - * contiguous in the ill_ilm list. We use this property - * to avoid sending duplicates needed when two - * applications in the same zone join the same group on - * different logical interfaces: we ignore the ilm if - * its zoneid is the same as the last matching one. - * In addition, the sending of the packet for - * ire_zoneid is delayed until all of the other ilms - * have been exhausted. - */ - last_zoneid = -1; - ilm = ilm_walker_start(&ilw, recv_ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ipha->ipha_dst != ilm->ilm_addr || - ilm->ilm_zoneid == last_zoneid || - ilm->ilm_zoneid == ire->ire_zoneid || - ilm->ilm_zoneid == ALL_ZONES || - !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) - continue; - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, - 0, sum, mctl_present, B_TRUE, - recv_ill, ilm->ilm_zoneid); - last_zoneid = ilm->ilm_zoneid; - } - ilm_walker_finish(&ilw); - } else if (ire->ire_type == IRE_BROADCAST) { - /* - * In the broadcast case, there may be many zones - * which need a copy of the packet delivered to them. 
- * There is one IRE_BROADCAST per broadcast address - * and per zone; we walk those using a helper function. - * In addition, the sending of the packet for ire is - * delayed until all of the other ires have been - * processed. - */ - IRB_REFHOLD(ire->ire_bucket); - ire_zone = NULL; - while ((ire_zone = ire_get_next_bcast_ire(ire_zone, - ire)) != NULL) { - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - - UPDATE_IB_PKT_COUNT(ire_zone); - ire_zone->ire_last_used_time = lbolt; - icmp_inbound(q, mp1, B_TRUE, ill, - 0, sum, mctl_present, B_TRUE, - recv_ill, ire_zone->ire_zoneid); - } - IRB_REFRELE(ire->ire_bucket); - } - icmp_inbound(q, first_mp, (ire->ire_type == IRE_BROADCAST), - ill, 0, sum, mctl_present, B_TRUE, recv_ill, - ire->ire_zoneid); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "icmp"); - return; - } - case IPPROTO_IGMP: - /* - * If we are not willing to accept IGMP packets in clear, - * then check with global policy. - */ - if (ipst->ips_igmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { - freemsg(first_mp); - ip1dbg(("ip_proto_input: zone all cannot accept raw")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - if ((mp = igmp_input(q, mp, ill)) == NULL) { - /* Bad packet - discarded by igmp_input */ - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "igmp"); - if (mctl_present) - freeb(first_mp); - return; - } - /* - * igmp_input() may have returned the pulled up message. - * So first_mp and ipha need to be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. 
- connf_head != NULL) { - /* No user-level listener for IGMP packets */ - goto drop_pkt; - } - /* deliver to local raw users */ - break; - case IPPROTO_PIM: - /* - * If we are not willing to accept PIM packets in clear, - * then check with global policy. - */ - if (ipst->ips_pim_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - ipha, NULL, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_TRUE)) { - freemsg(first_mp); - ip1dbg(("ip_proto_input: zone all cannot accept PIM")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - if (pim_input(q, mp, ill) != 0) { - /* Bad packet - discarded by pim_input */ - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "pim"); - if (mctl_present) - freeb(first_mp); - return; - } - - /* - * pim_input() may have pulled up the message so ipha needs to - * be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (ipst->ips_ipcl_proto_fanout[ipha->ipha_protocol]. - connf_head != NULL) { - /* No user-level listener for PIM packets */ - goto drop_pkt; - } - /* deliver to local raw users */ - break; - case IPPROTO_ENCAP: - /* - * Handle self-encapsulated packets (IP-in-IP where - * the inner addresses == the outer addresses). - */ - hdr_length = IPH_HDR_LENGTH(ipha); - if ((uchar_t *)ipha + hdr_length + sizeof (ipha_t) > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + hdr_length + - sizeof (ipha_t) - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - ipha = (ipha_t *)mp->b_rptr; - } - inner_ipha = (ipha_t *)((uchar_t *)ipha + hdr_length); - /* - * Check the sanity of the inner IP header. 
- */ - if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - if (inner_ipha->ipha_src == ipha->ipha_src && - inner_ipha->ipha_dst == ipha->ipha_dst) { - ipsec_in_t *ii; - - /* - * Self-encapsulated tunnel packet. Remove - * the outer IP header and fanout again. - * We also need to make sure that the inner - * header is pulled up until options. - */ - mp->b_rptr = (uchar_t *)inner_ipha; - ipha = inner_ipha; - hdr_length = IPH_HDR_LENGTH(ipha); - if ((uchar_t *)ipha + hdr_length > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ipha + - + hdr_length - mp->b_rptr)) { - freemsg(first_mp); - return; - } - ipha = (ipha_t *)mp->b_rptr; - } - if (hdr_length > sizeof (ipha_t)) { - /* We got options on the inner packet. */ - ipaddr_t dst = ipha->ipha_dst; - - if (ip_rput_options(q, mp, ipha, &dst, ipst) == - -1) { - /* Bad options! */ - return; - } - if (dst != ipha->ipha_dst) { - /* - * Someone put a source-route in - * the inside header of a self- - * encapsulated packet. Drop it - * with extreme prejudice and let - * the sender know. - */ - icmp_unreachable(q, first_mp, - ICMP_SOURCE_ROUTE_FAILED, - recv_ill->ill_zoneid, ipst); - return; - } - } - if (!mctl_present) { - ASSERT(first_mp == mp); - /* - * This means that somebody is sending - * Self-encapsualted packets without AH/ESP. - * If AH/ESP was present, we would have already - * allocated the first_mp. - * - * Send this packet to find a tunnel endpoint. - * if I can't find one, an ICMP - * PROTOCOL_UNREACHABLE will get sent. - */ - goto fanout; - } - /* - * We generally store the ill_index if we need to - * do IPsec processing as we lose the ill queue when - * we come back. 
But in this case, we never should - * have to store the ill_index here as it should have - * been stored previously when we processed the - * AH/ESP header in this routine or for non-ipsec - * cases, we still have the queue. But for some bad - * packets from the wire, we can get to IPsec after - * this and we better store the index for that case. - */ - ill = (ill_t *)q->q_ptr; - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - if (ii->ipsec_in_decaps) { - /* - * This packet is self-encapsulated multiple - * times. We don't want to recurse infinitely. - * To keep it simple, drop the packet. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - ii->ipsec_in_decaps = B_TRUE; - ip_fanout_proto_again(first_mp, recv_ill, recv_ill, - ire); - return; - } - break; - case IPPROTO_AH: - case IPPROTO_ESP: { - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - /* - * Fast path for AH/ESP. If this is the first time - * we are sending a datagram to AH/ESP, allocate - * a IPSEC_IN message and prepend it. Otherwise, - * just fanout. - */ - - int ipsec_rc; - ipsec_in_t *ii; - netstack_t *ns = ipst->ips_netstack; - - IP_STAT(ipst, ipsec_proto_ahesp); - if (!mctl_present) { - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_TRUE, ns); - if (first_mp == NULL) { - ip1dbg(("ip_proto_input: IPSEC_IN " - "allocation failure.\n")); - freemsg(hada_mp); /* okay ifnull */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - /* - * Store the ill_index so that when we come back - * from IPsec we ride on the same queue. - */ - ill = (ill_t *)q->q_ptr; - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - recv_ill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_local: caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - hada_mp = NULL; - } - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - } - - ii->ipsec_in_esp_udp_ports = esp_udp_ports; - - if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, - ire->ire_zoneid, ipst); - return; - } - - ns = ipst->ips_netstack; - /* select inbound SA and have IPsec process the pkt */ - if (ipha->ipha_protocol == IPPROTO_ESP) { - esph_t *esph = ipsec_inbound_esp_sa(first_mp, ns); - boolean_t esp_in_udp_sa; - if (esph == NULL) - return; - ASSERT(ii->ipsec_in_esp_sa != NULL); - ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != NULL); - esp_in_udp_sa = ((ii->ipsec_in_esp_sa->ipsa_flags & - IPSA_F_NATT) != 0); - /* - * The following is a fancy, but quick, way of saying: - * ESP-in-UDP SA and Raw ESP packet --> drop - * OR - * ESP SA and ESP-in-UDP packet --> drop - */ - if (esp_in_udp_sa != esp_in_udp_packet) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip_drop_packet(first_mp, B_TRUE, ill, NULL, - DROPPER(ns->netstack_ipsec, ipds_esp_no_sa), - &ns->netstack_ipsec->ipsec_dropper); - return; - } - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( - first_mp, esph); - } else { - ah_t *ah = ipsec_inbound_ah_sa(first_mp, ns); - if (ah == NULL) - return; - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( - first_mp, ah); - } - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; - } - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, recv_ill, ire); - return; - } - default: - break; - } - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("ip_proto_input: zone %d cannot 
accept raw IP", - ire->ire_zoneid)); - goto drop_pkt; - } - /* - * Handle protocols with which IP is less intimate. There - * can be more than one stream bound to a particular - * protocol. When this is the case, each one gets a copy - * of any incoming packets. - */ -fanout: - ip_fanout_proto(q, first_mp, ill, ipha, - IP_FF_SEND_ICMP | IP_FF_CKSUM | IP_FF_RAWIP, mctl_present, - B_TRUE, recv_ill, ire->ire_zoneid); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "ip_fanout_proto"); - return; - -drop_pkt: - freemsg(first_mp); - if (hada_mp != NULL) - freeb(hada_mp); - TRACE_2(TR_FAC_IP, TR_IP_RPUT_LOCL_END, - "ip_rput_locl_end: q %p (%S)", q, "droppkt"); -#undef rptr -#undef iphs - -} - -/* * Update any source route, record route or timestamp options. * Check that we are at end of strict source route. - * The options have already been checked for sanity in ip_rput_options(). + * The options have already been checked for sanity in ip_input_options(). */ -static boolean_t -ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, - ip_stack_t *ipst) +boolean_t +ip_input_local_options(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) { ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; ipaddr_t dst; + ipaddr_t ifaddr; uint32_t ts; - ire_t *dst_ire; timestruc_t now; - zoneid_t zoneid; - ill_t *ill; - - ASSERT(ire->ire_ipversion == IPV4_VERSION); + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - ip2dbg(("ip_rput_local_options\n")); + ip2dbg(("ip_input_local_options\n")); for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; @@ -17900,7 +9360,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, ASSERT((opts.ipoptp_flags & IPOPTP_ERROR) == 0); opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_local_options: opt %d, len %d\n", + ip2dbg(("ip_input_local_options: opt %d, len %d\n", optval, optlen)); switch (optval) { uint32_t off; @@ -17911,7 +9371,7 @@ 
ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, if (optlen < IP_ADDR_LEN || off > optlen - IP_ADDR_LEN) { /* End of source route */ - ip1dbg(("ip_rput_local_options: end of SR\n")); + ip1dbg(("ip_input_local_options: end of SR\n")); break; } /* @@ -17920,7 +9380,7 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * it is a packet with a loose source route which * reaches us before consuming the whole source route */ - ip1dbg(("ip_rput_local_options: not end of SR\n")); + ip1dbg(("ip_input_local_options: not end of SR\n")); if (optval == IPOPT_SSRR) { goto bad_src_route; } @@ -17941,11 +9401,17 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_rput_local_options: end of RR\n")); + "ip_input_local_options: end of RR\n")); break; } - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable address on the outbound if */ + if (ip_select_source_v4(ill, INADDR_ANY, ipha->ipha_dst, + INADDR_ANY, ALL_ZONES, ipst, &ifaddr, NULL, + NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; break; case IPOPT_TS: @@ -17959,14 +9425,10 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - dst_ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (dst_ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(dst_ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -17976,8 +9438,8 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_rput_local_options: " - "unknown IT - bug in ip_rput_options?\n"); + cmn_err(CE_PANIC, "ip_input_local_options: " + "unknown IT - bug in ip_input_options?\n"); return (B_TRUE); /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { @@ -17993,8 +9455,14 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, case IPOPT_TS_PRESPEC: case IPOPT_TS_PRESPEC_RFC791: case IPOPT_TS_TSANDADDR: - bcopy(&ire->ire_src_addr, (char *)opt + off, - IP_ADDR_LEN); + /* Pick a reasonable addr on the outbound if */ + if (ip_select_source_v4(ill, INADDR_ANY, + ipha->ipha_dst, INADDR_ANY, ALL_ZONES, ipst, + &ifaddr, NULL, NULL) != 0) { + /* No source! Shouldn't happen */ + ifaddr = INADDR_ANY; + } + bcopy(&ifaddr, (char *)opt + off, IP_ADDR_LEN); opt[IPOPT_OFFSET] += IP_ADDR_LEN; /* FALLTHRU */ case IPOPT_TS_TSONLY: @@ -18013,51 +9481,41 @@ ip_rput_local_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ire_t *ire, return (B_TRUE); bad_src_route: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - zoneid = ipif_lookup_addr_zoneid(ipha->ipha_dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); return (B_FALSE); } /* - * Process IP options in an inbound packet. If an option affects the - * effective destination address, return the next hop address via dstp. - * Returns -1 if something fails in which case an ICMP error has been sent + * Process IP options in an inbound packet. Always returns the nexthop. + * Normally this is the passed in nexthop, but if there is an option + * that effects the nexthop (such as a source route) that will be returned. 
+ * Sets *errorp if there is an error, in which case an ICMP error has been sent * and mp freed. */ -static int -ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, - ip_stack_t *ipst) +ipaddr_t +ip_input_options(ipha_t *ipha, ipaddr_t dst, mblk_t *mp, + ip_recv_attr_t *ira, int *errorp) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ipoptp_t opts; uchar_t *opt; uint8_t optval; uint8_t optlen; - ipaddr_t dst; intptr_t code = 0; - ire_t *ire = NULL; - zoneid_t zoneid; - ill_t *ill; + ire_t *ire; - ip2dbg(("ip_rput_options\n")); - dst = ipha->ipha_dst; + ip2dbg(("ip_input_options\n")); + *errorp = 0; for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_rput_options: opt %d, len %d\n", + ip2dbg(("ip_input_options: opt %d, len %d\n", optval, optlen)); /* * Note: we need to verify the checksum before we @@ -18068,27 +9526,24 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, uint32_t off; case IPOPT_SSRR: case IPOPT_LSRR: - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { if (optval == IPOPT_SSRR) { - ip1dbg(("ip_rput_options: not next" + ip1dbg(("ip_input_options: not next" " strict source route 0x%x\n", ntohl(dst))); code = (char *)&ipha->ipha_dst - (char *)ipha; goto param_prob; /* RouterReq's */ } - ip2dbg(("ip_rput_options: " + ip2dbg(("ip_input_options: " "not next source route 0x%x\n", ntohl(dst))); break; } - ire_refrele(ire); if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -18099,11 +9554,11 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, if (optlen < IP_ADDR_LEN || off > optlen - IP_ADDR_LEN) { /* End of source route */ - 
ip1dbg(("ip_rput_options: end of SR\n")); + ip1dbg(("ip_input_options: end of SR\n")); break; } bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - ip1dbg(("ip_rput_options: next hop 0x%x\n", + ip1dbg(("ip_input_options: next hop 0x%x\n", ntohl(dst))); /* @@ -18112,17 +9567,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, * XXX verify per-interface ip_forwarding * for source route? */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v4(dst, ipst) == IRE_LOCAL) { off += IP_ADDR_LEN; goto redo_srr; } if (dst == htonl(INADDR_LOOPBACK)) { - ip1dbg(("ip_rput_options: loopback addr in " + ip1dbg(("ip_input_options: loopback addr in " "source route!\n")); goto bad_src_route; } @@ -18131,12 +9582,13 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, * reachable. */ if (optval == IPOPT_SSRR) { - ire = ire_ftable_lookup(dst, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); + ire = ire_ftable_lookup_v4(dst, 0, 0, + IRE_IF_ALL, NULL, ALL_ZONES, + ira->ira_tsl, + MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst, + NULL); if (ire == NULL) { - ip1dbg(("ip_rput_options: SSRR not " + ip1dbg(("ip_input_options: SSRR not " "directly reachable: 0x%x\n", ntohl(dst))); goto bad_src_route; @@ -18151,7 +9603,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, case IPOPT_RR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -18169,7 +9621,7 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, } if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_rput_options: bad option offset\n")); + "ip_input_options: bad option offset\n")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; 
goto param_prob; @@ -18201,45 +9653,27 @@ ip_rput_options(queue_t *q, mblk_t *mp, ipha_t *ipha, ipaddr_t *dstp, } if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) { - *dstp = dst; - return (0); + return (dst); } - ip1dbg(("ip_rput_options: error processing IP options.")); + ip1dbg(("ip_input_options: error processing IP options.")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; param_prob: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - /* Don't know whether this is for non-global or global/forwarding */ - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_param_problem(q, mp, (uint8_t)code, zoneid, ipst); - return (-1); + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ira->ira_ill); + icmp_param_problem(mp, (uint8_t)code, ira); + *errorp = -1; + return (dst); bad_src_route: - q = WR(q); - if (q->q_next != NULL) - ill = q->q_ptr; - else - ill = NULL; - /* make sure we clear any indication of a hardware checksum */ DB_CKSUMFLAGS(mp) = 0; - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) - freemsg(mp); - else - icmp_unreachable(q, mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); - return (-1); + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ira->ira_ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + *errorp = -1; + return (dst); } /* @@ -18248,7 +9682,7 @@ bad_src_route: * - icmp fixed part (mib2_icmp_t) * - ipAddrEntryTable (ip 20) all IPv4 ipifs * - ipRouteEntryTable (ip 21) all IPv4 IREs - * - ipNetToMediaEntryTable (ip 22) [filled in by the arp module] + * - ipNetToMediaEntryTable (ip 22) all IPv4 Neighbor Cache entries * - ipRouteAttributeTable (ip 102) labeled routes * - ip multicast membership (ip_member_t) * - ip multicast source filtering (ip_grpsrc_t) @@ -18262,13 +9696,11 @@ bad_src_route: * One per ill plus one generic * - ipv6RouteEntry all IPv6 IREs * - 
ipv6RouteAttributeTable (ip6 102) labeled routes - * - ipv6NetToMediaEntry all Neighbor Cache entries + * - ipv6NetToMediaEntry all IPv6 Neighbor Cache entries * - ipv6AddrEntry all IPv6 ipifs * - ipv6 multicast membership (ipv6_member_t) * - ipv6 multicast source filtering (ipv6_grpsrc_t) * - * MIB2_IP_MEDIA is filled in by the arp module with ARP cache entries. - * * NOTE: original mpctl is copied for msg's 2..N, since its ctl part is * already filled in by the caller. * Return value of 0 indicates that no messages were sent and caller @@ -18387,6 +9819,9 @@ ip_snmp_get(queue_t *q, mblk_t *mpctl, int level) if ((mpctl = sctp_snmp_get_mib2(q, mpctl, sctps)) == NULL) { return (1); } + if ((mpctl = ip_snmp_get_mib2_ip_dce(q, mpctl, ipst)) == NULL) { + return (1); + } freemsg(mpctl); return (1); } @@ -18426,6 +9861,7 @@ ip_snmp_get_mib2_ip(queue_t *q, mblk_t *mpctl, mib2_ipIfStatsEntry_t *ipmib, SET_MIB(old_ip_mib.ipRouteAttributeSize, sizeof (mib2_ipAttributeEntry_t)); SET_MIB(old_ip_mib.transportMLPSize, sizeof (mib2_transportMLPEntry_t)); + SET_MIB(old_ip_mib.ipDestEntrySize, sizeof (dest_cache_entry_t)); /* * Grab the statistics from the new IP MIB @@ -18681,9 +10117,14 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) if (ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + /* Sum of count from dead IRE_LO* and our current */ mae.ipAdEntInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; - mae.ipAdEntInfo.ae_obcnt = ipif->ipif_ob_pkt_count; - mae.ipAdEntInfo.ae_focnt = ipif->ipif_fo_pkt_count; + if (ipif->ipif_ire_local != NULL) { + mae.ipAdEntInfo.ae_ibcnt += + ipif->ipif_ire_local->ire_ib_pkt_count; + } + mae.ipAdEntInfo.ae_obcnt = 0; + mae.ipAdEntInfo.ae_focnt = 0; ipif_get_name(ipif, mae.ipAdEntIfIndex.o_bytes, OCTET_LENGTH); @@ -18694,7 +10135,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae.ipAdEntInfo.ae_subnet = ipif->ipif_subnet; mae.ipAdEntInfo.ae_subnet_len = 
ip_mask_to_plen(ipif->ipif_net_mask); - mae.ipAdEntInfo.ae_src_addr = ipif->ipif_src_addr; + mae.ipAdEntInfo.ae_src_addr = ipif->ipif_lcl_addr; for (bitval = 1; bitval && !(bitval & ipif->ipif_brd_addr); @@ -18702,7 +10143,7 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) noop; mae.ipAdEntBcastAddr = bitval; mae.ipAdEntReasmMaxSize = IP_MAXPACKET; - mae.ipAdEntInfo.ae_mtu = ipif->ipif_mtu; + mae.ipAdEntInfo.ae_mtu = ipif->ipif_ill->ill_mtu; mae.ipAdEntInfo.ae_metric = ipif->ipif_metric; mae.ipAdEntInfo.ae_broadcast_addr = ipif->ipif_brd_addr; @@ -18710,7 +10151,8 @@ ip_snmp_get_mib2_ip_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) ipif->ipif_pp_dst_addr; mae.ipAdEntInfo.ae_flags = ipif->ipif_flags | ill->ill_flags | ill->ill_phyint->phyint_flags; - mae.ipAdEntRetransmitTime = AR_EQ_DEFAULT_XMIT_INTERVAL; + mae.ipAdEntRetransmitTime = + ill->ill_reachable_retrans_time; if (!snmp_append_data2(mpctl->b_cont, &mp_tail, (char *)&mae, (int)sizeof (mib2_ipAddrEntry_t))) { @@ -18762,9 +10204,14 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) if (ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + /* Sum of count from dead IRE_LO* and our current */ mae6.ipv6AddrInfo.ae_ibcnt = ipif->ipif_ib_pkt_count; - mae6.ipv6AddrInfo.ae_obcnt = ipif->ipif_ob_pkt_count; - mae6.ipv6AddrInfo.ae_focnt = ipif->ipif_fo_pkt_count; + if (ipif->ipif_ire_local != NULL) { + mae6.ipv6AddrInfo.ae_ibcnt += + ipif->ipif_ire_local->ire_ib_pkt_count; + } + mae6.ipv6AddrInfo.ae_obcnt = 0; + mae6.ipv6AddrInfo.ae_focnt = 0; ipif_get_name(ipif, mae6.ipv6AddrIfIndex.o_bytes, OCTET_LENGTH); @@ -18776,7 +10223,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae6.ipv6AddrInfo.ae_subnet = ipif->ipif_v6subnet; mae6.ipv6AddrInfo.ae_subnet_len = mae6.ipv6AddrPfxLength; - mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6src_addr; + mae6.ipv6AddrInfo.ae_src_addr = ipif->ipif_v6lcl_addr; /* Type: stateless(1), 
stateful(2), unknown(3) */ if (ipif->ipif_flags & IPIF_ADDRCONF) @@ -18799,7 +10246,7 @@ ip_snmp_get_mib2_ip6_addr(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mae6.ipv6AddrStatus = 2; else mae6.ipv6AddrStatus = 1; - mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_mtu; + mae6.ipv6AddrInfo.ae_mtu = ipif->ipif_ill->ill_mtu; mae6.ipv6AddrInfo.ae_metric = ipif->ipif_metric; mae6.ipv6AddrInfo.ae_pp_dst_addr = ipif->ipif_v6pp_dst_addr; @@ -18842,7 +10289,6 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18859,36 +10305,49 @@ ip_snmp_get_mib2_ip_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; - ilm = ilm_walker_start(&ilw, ill); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; /* not this zone */ - ipif_get_name(ipif, ipm.ipGroupMemberIfIndex.o_bytes, - OCTET_LENGTH); + /* Is there an ipif for ilm_ifaddr? 
*/ + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(ipif) && + ipif->ipif_lcl_addr == ilm->ilm_ifaddr && + ilm->ilm_ifaddr != INADDR_ANY) + break; + } + if (ipif != NULL) { + ipif_get_name(ipif, + ipm.ipGroupMemberIfIndex.o_bytes, + OCTET_LENGTH); + } else { + ill_get_name(ill, + ipm.ipGroupMemberIfIndex.o_bytes, + OCTET_LENGTH); + } ipm.ipGroupMemberIfIndex.o_length = mi_strlen(ipm.ipGroupMemberIfIndex.o_bytes); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif != NULL); - ASSERT(ilm->ilm_ill == NULL); - if (ilm->ilm_ipif != ipif) - continue; - ipm.ipGroupMemberAddress = ilm->ilm_addr; - ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; - ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; - if (!snmp_append_data2(mpctl->b_cont, &mp_tail, - (char *)&ipm, (int)sizeof (ipm))) { - ip1dbg(("ip_snmp_get_mib2_ip_group: " - "failed to allocate %u bytes\n", - (uint_t)sizeof (ipm))); - } + + ipm.ipGroupMemberAddress = ilm->ilm_addr; + ipm.ipGroupMemberRefCnt = ilm->ilm_refcnt; + ipm.ipGroupMemberFilterMode = ilm->ilm_fmode; + if (!snmp_append_data2(mpctl->b_cont, &mp_tail, + (char *)&ipm, (int)sizeof (ipm))) { + ip1dbg(("ip_snmp_get_mib2_ip_group: " + "failed to allocate %u bytes\n", + (uint_t)sizeof (ipm))); } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -18910,7 +10369,6 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) mblk_t *mp_tail = NULL; ill_walk_context_t ctx; zoneid_t zoneid; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18926,15 +10384,19 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure 
the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; - - ilm = ilm_walker_start(&ilw, ill); + rw_exit(&ipst->ips_ill_g_lock); + /* + * Normally we don't have any members on under IPMP interfaces. + * We report them as a debugging aid. + */ + rw_enter(&ill->ill_mcast_lock, RW_READER); ipm6.ipv6GroupMemberIfIndex = ill->ill_phyint->phyint_ifindex; - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif == NULL); - ASSERT(ilm->ilm_ill != NULL); - if (ilm->ilm_zoneid != zoneid) + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) continue; /* not this zone */ ipm6.ipv6GroupMemberAddress = ilm->ilm_v6addr; ipm6.ipv6GroupMemberRefCnt = ilm->ilm_refcnt; @@ -18947,7 +10409,9 @@ ip_snmp_get_mib2_ip6_group_mem(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) (uint_t)sizeof (ipm6))); } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); @@ -18973,7 +10437,6 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; - ilm_walker_t ilw; /* * make a copy of the original message @@ -18990,43 +10453,56 @@ ip_snmp_get_mib2_ip_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. 
*/ + if (!ill_check_and_refhold(ill)) continue; + rw_exit(&ipst->ips_ill_g_lock); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + sl = ilm->ilm_filter; + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; + if (SLIST_IS_EMPTY(sl)) + continue; - ilm = ilm_walker_start(&ilw, ill); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (ipif->ipif_zoneid != zoneid) - continue; /* not this zone */ - ipif_get_name(ipif, ips.ipGroupSourceIfIndex.o_bytes, - OCTET_LENGTH); + /* Is there an ipif for ilm_ifaddr? */ + for (ipif = ill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(ipif) && + ipif->ipif_lcl_addr == ilm->ilm_ifaddr && + ilm->ilm_ifaddr != INADDR_ANY) + break; + } + if (ipif != NULL) { + ipif_get_name(ipif, + ips.ipGroupSourceIfIndex.o_bytes, + OCTET_LENGTH); + } else { + ill_get_name(ill, + ips.ipGroupSourceIfIndex.o_bytes, + OCTET_LENGTH); + } ips.ipGroupSourceIfIndex.o_length = mi_strlen(ips.ipGroupSourceIfIndex.o_bytes); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif != NULL); - ASSERT(ilm->ilm_ill == NULL); - sl = ilm->ilm_filter; - if (ilm->ilm_ipif != ipif || SLIST_IS_EMPTY(sl)) + + ips.ipGroupSourceGroup = ilm->ilm_addr; + for (i = 0; i < sl->sl_numsrc; i++) { + if (!IN6_IS_ADDR_V4MAPPED(&sl->sl_addr[i])) continue; - ips.ipGroupSourceGroup = ilm->ilm_addr; - for (i = 0; i < sl->sl_numsrc; i++) { - if (!IN6_IS_ADDR_V4MAPPED( - &sl->sl_addr[i])) - continue; - IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], - ips.ipGroupSourceAddress); - if (snmp_append_data2(mpctl->b_cont, - &mp_tail, (char *)&ips, - (int)sizeof (ips)) == 0) { - ip1dbg(("ip_snmp_get_mib2_" - "ip_group_src: failed to " - "allocate %u bytes\n", - (uint_t)sizeof (ips))); - } + IN6_V4MAPPED_TO_IPADDR(&sl->sl_addr[i], + ips.ipGroupSourceAddress); + if (snmp_append_data2(mpctl->b_cont, &mp_tail, + (char *)&ips, (int)sizeof (ips)) == 0) { + 
ip1dbg(("ip_snmp_get_mib2_ip_group_src:" + " failed to allocate %u bytes\n", + (uint_t)sizeof (ips))); } } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont); @@ -19050,7 +10526,6 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) zoneid_t zoneid; int i; slist_t *sl; - ilm_walker_t ilw; /* * make a copy of the original message @@ -19066,16 +10541,22 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) + /* Make sure the ill isn't going away. */ + if (!ill_check_and_refhold(ill)) continue; - - ilm = ilm_walker_start(&ilw, ill); + rw_exit(&ipst->ips_ill_g_lock); + /* + * Normally we don't have any members on under IPMP interfaces. + * We report them as a debugging aid. 
+ */ + rw_enter(&ill->ill_mcast_lock, RW_READER); ips6.ipv6GroupSourceIfIndex = ill->ill_phyint->phyint_ifindex; - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - ASSERT(ilm->ilm_ipif == NULL); - ASSERT(ilm->ilm_ill != NULL); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { sl = ilm->ilm_filter; - if (ilm->ilm_zoneid != zoneid || SLIST_IS_EMPTY(sl)) + if (ilm->ilm_zoneid != zoneid && + ilm->ilm_zoneid != ALL_ZONES) + continue; + if (SLIST_IS_EMPTY(sl)) continue; ips6.ipv6GroupSourceGroup = ilm->ilm_v6addr; for (i = 0; i < sl->sl_numsrc; i++) { @@ -19089,7 +10570,9 @@ ip_snmp_get_mib2_ip6_group_src(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst) } } } - ilm_walker_finish(&ilw); + rw_exit(&ill->ill_mcast_lock); + ill_refrele(ill); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); } rw_exit(&ipst->ips_ill_g_lock); @@ -19189,13 +10672,13 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; /* - * If the level has been set the special EXPER_IP_AND_TESTHIDDEN - * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * If the level has been set the special EXPER_IP_AND_ALL_IRES value, + * then also include ire_testhidden IREs and IRE_IF_CLONE. This is * intended a temporary solution until a proper MIB API is provided * that provides complete filtering/caller-opt-in. 
*/ - if (level == EXPER_IP_AND_TESTHIDDEN) - ird.ird_flags |= IRD_REPORT_TESTHIDDEN; + if (level == EXPER_IP_AND_ALL_IRES) + ird.ird_flags |= IRD_REPORT_ALL; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v4(ip_snmp_get2_v4, &ird, zoneid, ipst); @@ -19210,6 +10693,8 @@ ip_snmp_get_mib2_ip_route_media(queue_t *q, mblk_t *mpctl, int level, qreply(q, mpctl); /* ipNetToMediaEntryTable in mp3ctl */ + ncec_walk(NULL, ip_snmp_get2_v4_media, &ird, ipst); + optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_IP; optp->name = MIB2_IP_MEDIA; @@ -19272,13 +10757,13 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, ird.ird_netmedia.lp_head = mp3ctl->b_cont; ird.ird_attrs.lp_head = mp4ctl->b_cont; /* - * If the level has been set the special EXPER_IP_AND_TESTHIDDEN - * value, then also include IRE_MARK_TESTHIDDEN IREs. This is + * If the level has been set the special EXPER_IP_AND_ALL_IRES value, + * then also include ire_testhidden IREs and IRE_IF_CLONE. This is * intended a temporary solution until a proper MIB API is provided * that provides complete filtering/caller-opt-in. 
*/ - if (level == EXPER_IP_AND_TESTHIDDEN) - ird.ird_flags |= IRD_REPORT_TESTHIDDEN; + if (level == EXPER_IP_AND_ALL_IRES) + ird.ird_flags |= IRD_REPORT_ALL; zoneid = Q_TO_CONN(q)->conn_zoneid; ire_walk_v6(ip_snmp_get2_v6_route, &ird, zoneid, ipst); @@ -19292,7 +10777,7 @@ ip_snmp_get_mib2_ip6_route_media(queue_t *q, mblk_t *mpctl, int level, qreply(q, mpctl); /* ipv6NetToMediaEntryTable in mp3ctl */ - ndp_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); + ncec_walk(NULL, ip_snmp_get2_v6_media, &ird, ipst); optp = (struct opthdr *)&mp3ctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_IP6; @@ -19487,21 +10972,20 @@ static void ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) { ill_t *ill; - ipif_t *ipif; mib2_ipRouteEntry_t *re; - mib2_ipAttributeEntry_t *iae, *iaeptr; - ipaddr_t gw_addr; + mib2_ipAttributeEntry_t iaes; tsol_ire_gw_secattr_t *attrp; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - uint_t sacnt = 0; - int i; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV4_VERSION); - if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && - ire->ire_marks & IRE_MARK_TESTHIDDEN) { - return; + if (!(ird->ird_flags & IRD_REPORT_ALL)) { + if (ire->ire_testhidden) + return; + if (ire->ire_type & IRE_IF_CLONE) + return; } if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) @@ -19513,52 +10997,17 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); } - 
ASSERT(sacnt == 0 || gc != NULL); - - if (sacnt != 0 && - (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { - kmem_free(re, sizeof (*re)); - rw_exit(&gcgrp->gcgrp_rwlock); - return; - } - /* * Return all IRE types for route table... let caller pick and choose */ re->ipRouteDest = ire->ire_addr; - ipif = ire->ire_ipif; + ill = ire->ire_ill; re->ipRouteIfIndex.o_length = 0; - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - re->ipRouteIfIndex.o_length = - ill->ill_name_length == 0 ? 0 : - MIN(OCTET_LENGTH, ill->ill_name_length - 1); - bcopy(ill->ill_name, re->ipRouteIfIndex.o_bytes, - re->ipRouteIfIndex.o_length); - } else if (ipif != NULL) { - ipif_get_name(ipif, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); + if (ill != NULL) { + ill_get_name(ill, re->ipRouteIfIndex.o_bytes, OCTET_LENGTH); re->ipRouteIfIndex.o_length = mi_strlen(re->ipRouteIfIndex.o_bytes); } @@ -19567,30 +11016,45 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) re->ipRouteMetric3 = -1; re->ipRouteMetric4 = -1; - gw_addr = ire->ire_gateway_addr; - - if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK|IRE_BROADCAST)) - re->ipRouteNextHop = ire->ire_src_addr; - else - re->ipRouteNextHop = gw_addr; + re->ipRouteNextHop = ire->ire_gateway_addr; /* indirect(4), direct(3), or invalid(2) */ if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) re->ipRouteType = 2; + else if (ire->ire_type & IRE_ONLINK) + re->ipRouteType = 3; else - re->ipRouteType = (gw_addr != 0) ? 
4 : 3; + re->ipRouteType = 4; + re->ipRouteProto = -1; re->ipRouteAge = gethrestime_sec() - ire->ire_create_time; re->ipRouteMask = ire->ire_mask; re->ipRouteMetric5 = -1; - re->ipRouteInfo.re_max_frag = ire->ire_max_frag; - re->ipRouteInfo.re_frag_flag = ire->ire_frag_flag; - re->ipRouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; + re->ipRouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu; + if (ire->ire_ill != NULL && re->ipRouteInfo.re_max_frag == 0) + re->ipRouteInfo.re_max_frag = ire->ire_ill->ill_mtu; + + re->ipRouteInfo.re_frag_flag = 0; + re->ipRouteInfo.re_rtt = 0; + re->ipRouteInfo.re_src_addr = 0; re->ipRouteInfo.re_ref = ire->ire_refcnt; - re->ipRouteInfo.re_src_addr = ire->ire_src_addr; re->ipRouteInfo.re_obpkt = ire->ire_ob_pkt_count; re->ipRouteInfo.re_ibpkt = ire->ire_ib_pkt_count; re->ipRouteInfo.re_flags = ire->ire_flags; + /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *child; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + child = ire->ire_dep_children; + while (child != NULL) { + re->ipRouteInfo.re_obpkt += child->ire_ob_pkt_count; + re->ipRouteInfo.re_ibpkt += child->ire_ib_pkt_count; + child = child->ire_dep_sib_next; + } + rw_exit(&ipst->ips_ire_dep_lock); + } + if (ire->ire_flags & RTF_DYNAMIC) { re->ipRouteInfo.re_ire_type = IRE_HOST_REDIRECT; } else { @@ -19603,25 +11067,22 @@ ip_snmp_get2_v4(ire_t *ire, iproutedata_t *ird) (uint_t)sizeof (*re))); } - for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { - iaeptr->iae_routeidx = ird->ird_idx; - iaeptr->iae_doi = gc->gc_db->gcdb_doi; - iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; - } + if (gc != NULL) { + iaes.iae_routeidx = ird->ird_idx; + iaes.iae_doi = gc->gc_db->gcdb_doi; + iaes.iae_slrange = gc->gc_db->gcdb_slrange; - if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, - (char *)iae, sacnt * sizeof (*iae))) { - ip1dbg(("ip_snmp_get2_v4: failed to allocate %u bytes\n", - 
(unsigned)(sacnt * sizeof (*iae)))); + if (!snmp_append_data2(ird->ird_attrs.lp_head, + &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) { + ip1dbg(("ip_snmp_get2_v4: failed to allocate %u " + "bytes\n", (uint_t)sizeof (iaes))); + } } /* bump route index for next pass */ ird->ird_idx++; kmem_free(re, sizeof (*re)); - if (sacnt != 0) - kmem_free(iae, sacnt * sizeof (*iae)); - if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); } @@ -19633,21 +11094,20 @@ static void ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) { ill_t *ill; - ipif_t *ipif; mib2_ipv6RouteEntry_t *re; - mib2_ipAttributeEntry_t *iae, *iaeptr; - in6_addr_t gw_addr_v6; + mib2_ipAttributeEntry_t iaes; tsol_ire_gw_secattr_t *attrp; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - uint_t sacnt = 0; - int i; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); - if (!(ird->ird_flags & IRD_REPORT_TESTHIDDEN) && - ire->ire_marks & IRE_MARK_TESTHIDDEN) { - return; + if (!(ird->ird_flags & IRD_REPORT_ALL)) { + if (ire->ire_testhidden) + return; + if (ire->ire_type & IRE_IF_CLONE) + return; } if ((re = kmem_zalloc(sizeof (*re), KM_NOSLEEP)) == NULL) @@ -19659,37 +11119,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); - } - ASSERT(sacnt == 0 || gc != NULL); - - if (sacnt != 0 && - (iae = kmem_alloc(sacnt * sizeof (*iae), KM_NOSLEEP)) == NULL) { - kmem_free(re, sizeof (*re)); 
- rw_exit(&gcgrp->gcgrp_rwlock); - return; } - /* * Return all IRE types for route table... let caller pick and choose */ @@ -19697,16 +11129,9 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) re->ipv6RoutePfxLength = ip_mask_to_plen_v6(&ire->ire_mask_v6); re->ipv6RouteIndex = 0; /* Unique when multiple with same dest/plen */ re->ipv6RouteIfIndex.o_length = 0; - ipif = ire->ire_ipif; - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - re->ipv6RouteIfIndex.o_length = - ill->ill_name_length == 0 ? 0 : - MIN(OCTET_LENGTH, ill->ill_name_length - 1); - bcopy(ill->ill_name, re->ipv6RouteIfIndex.o_bytes, - re->ipv6RouteIfIndex.o_length); - } else if (ipif != NULL) { - ipif_get_name(ipif, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); + ill = ire->ire_ill; + if (ill != NULL) { + ill_get_name(ill, re->ipv6RouteIfIndex.o_bytes, OCTET_LENGTH); re->ipv6RouteIfIndex.o_length = mi_strlen(re->ipv6RouteIfIndex.o_bytes); } @@ -19714,18 +11139,13 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) ASSERT(!(ire->ire_type & IRE_BROADCAST)); mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; + re->ipv6RouteNextHop = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - if (ire->ire_type & (IRE_INTERFACE|IRE_LOOPBACK)) - re->ipv6RouteNextHop = ire->ire_src_addr_v6; - else - re->ipv6RouteNextHop = gw_addr_v6; - /* remote(4), local(3), or discard(2) */ if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) re->ipv6RouteType = 2; - else if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) + else if (ire->ire_type & IRE_ONLINK) re->ipv6RouteType = 3; else re->ipv6RouteType = 4; @@ -19736,15 +11156,31 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) re->ipv6RouteNextHopRDI = 0; re->ipv6RouteWeight = 0; re->ipv6RouteMetric = 0; - re->ipv6RouteInfo.re_max_frag = ire->ire_max_frag; - re->ipv6RouteInfo.re_frag_flag = ire->ire_frag_flag; - re->ipv6RouteInfo.re_rtt = ire->ire_uinfo.iulp_rtt; - re->ipv6RouteInfo.re_src_addr = ire->ire_src_addr_v6; + 
re->ipv6RouteInfo.re_max_frag = ire->ire_metrics.iulp_mtu; + if (ire->ire_ill != NULL && re->ipv6RouteInfo.re_max_frag == 0) + re->ipv6RouteInfo.re_max_frag = ire->ire_ill->ill_mtu; + + re->ipv6RouteInfo.re_frag_flag = 0; + re->ipv6RouteInfo.re_rtt = 0; + re->ipv6RouteInfo.re_src_addr = ipv6_all_zeros; re->ipv6RouteInfo.re_obpkt = ire->ire_ob_pkt_count; re->ipv6RouteInfo.re_ibpkt = ire->ire_ib_pkt_count; re->ipv6RouteInfo.re_ref = ire->ire_refcnt; re->ipv6RouteInfo.re_flags = ire->ire_flags; + /* Add the IRE_IF_CLONE's counters to their parent IRE_INTERFACE */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *child; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + child = ire->ire_dep_children; + while (child != NULL) { + re->ipv6RouteInfo.re_obpkt += child->ire_ob_pkt_count; + re->ipv6RouteInfo.re_ibpkt += child->ire_ib_pkt_count; + child = child->ire_dep_sib_next; + } + rw_exit(&ipst->ips_ire_dep_lock); + } if (ire->ire_flags & RTF_DYNAMIC) { re->ipv6RouteInfo.re_ire_type = IRE_HOST_REDIRECT; } else { @@ -19757,79 +11193,67 @@ ip_snmp_get2_v6_route(ire_t *ire, iproutedata_t *ird) (uint_t)sizeof (*re))); } - for (iaeptr = iae, i = 0; i < sacnt; i++, iaeptr++, gc = gc->gc_next) { - iaeptr->iae_routeidx = ird->ird_idx; - iaeptr->iae_doi = gc->gc_db->gcdb_doi; - iaeptr->iae_slrange = gc->gc_db->gcdb_slrange; - } + if (gc != NULL) { + iaes.iae_routeidx = ird->ird_idx; + iaes.iae_doi = gc->gc_db->gcdb_doi; + iaes.iae_slrange = gc->gc_db->gcdb_slrange; - if (!snmp_append_data2(ird->ird_attrs.lp_head, &ird->ird_attrs.lp_tail, - (char *)iae, sacnt * sizeof (*iae))) { - ip1dbg(("ip_snmp_get2_v6: failed to allocate %u bytes\n", - (unsigned)(sacnt * sizeof (*iae)))); + if (!snmp_append_data2(ird->ird_attrs.lp_head, + &ird->ird_attrs.lp_tail, (char *)&iaes, sizeof (iaes))) { + ip1dbg(("ip_snmp_get2_v6: failed to allocate %u " + "bytes\n", (uint_t)sizeof (iaes))); + } } /* bump route index for next pass */ ird->ird_idx++; kmem_free(re, sizeof (*re)); - if (sacnt != 0) - 
kmem_free(iae, sacnt * sizeof (*iae)); - if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); } /* - * ndp_walk routine to create ipv6NetToMediaEntryTable + * ncec_walk routine to create ipv6NetToMediaEntryTable */ static int -ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) +ip_snmp_get2_v6_media(ncec_t *ncec, iproutedata_t *ird) { ill_t *ill; mib2_ipv6NetToMediaEntry_t ntme; - dl_unitdata_req_t *dl; - ill = nce->nce_ill; - if (ill->ill_isv6 == B_FALSE) /* skip arpce entry */ + ill = ncec->ncec_ill; + /* skip arpce entries, and loopback ncec entries */ + if (ill->ill_isv6 == B_FALSE || ill->ill_net_type == IRE_LOOPBACK) return (0); - /* * Neighbor cache entry attached to IRE with on-link * destination. + * We report all IPMP groups on ncec_ill which is normally the upper. */ ntme.ipv6NetToMediaIfIndex = ill->ill_phyint->phyint_ifindex; - ntme.ipv6NetToMediaNetAddress = nce->nce_addr; - if ((ill->ill_flags & ILLF_XRESOLV) && - (nce->nce_res_mp != NULL)) { - dl = (dl_unitdata_req_t *)(nce->nce_res_mp->b_rptr); - ntme.ipv6NetToMediaPhysAddress.o_length = - dl->dl_dest_addr_length; - } else { - ntme.ipv6NetToMediaPhysAddress.o_length = - ill->ill_phys_addr_length; - } - if (nce->nce_res_mp != NULL) { - bcopy((char *)nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), - ntme.ipv6NetToMediaPhysAddress.o_bytes, + ntme.ipv6NetToMediaNetAddress = ncec->ncec_addr; + ntme.ipv6NetToMediaPhysAddress.o_length = ill->ill_phys_addr_length; + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, ntme.ipv6NetToMediaPhysAddress.o_bytes, ntme.ipv6NetToMediaPhysAddress.o_length); - } else { - bzero(ntme.ipv6NetToMediaPhysAddress.o_bytes, - ill->ill_phys_addr_length); } /* * Note: Returns ND_* states. 
Should be: * reachable(1), stale(2), delay(3), probe(4), * invalid(5), unknown(6) */ - ntme.ipv6NetToMediaState = nce->nce_state; + ntme.ipv6NetToMediaState = ncec->ncec_state; ntme.ipv6NetToMediaLastUpdated = 0; /* other(1), dynamic(2), static(3), local(4) */ - if (IN6_IS_ADDR_LOOPBACK(&nce->nce_addr)) { + if (NCE_MYADDR(ncec)) { ntme.ipv6NetToMediaType = 4; - } else if (IN6_IS_ADDR_MULTICAST(&nce->nce_addr)) { + } else if (ncec->ncec_flags & NCE_F_PUBLISH) { + ntme.ipv6NetToMediaType = 1; /* proxy */ + } else if (ncec->ncec_flags & NCE_F_STATIC) { + ntme.ipv6NetToMediaType = 3; + } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST)) { ntme.ipv6NetToMediaType = 1; } else { ntme.ipv6NetToMediaType = 2; @@ -19843,6 +11267,93 @@ ip_snmp_get2_v6_media(nce_t *nce, iproutedata_t *ird) return (0); } +int +nce2ace(ncec_t *ncec) +{ + int flags = 0; + + if (NCE_ISREACHABLE(ncec)) + flags |= ACE_F_RESOLVED; + if (ncec->ncec_flags & NCE_F_AUTHORITY) + flags |= ACE_F_AUTHORITY; + if (ncec->ncec_flags & NCE_F_PUBLISH) + flags |= ACE_F_PUBLISH; + if ((ncec->ncec_flags & NCE_F_NONUD) != 0) + flags |= ACE_F_PERMANENT; + if (NCE_MYADDR(ncec)) + flags |= (ACE_F_MYADDR | ACE_F_AUTHORITY); + if (ncec->ncec_flags & NCE_F_UNVERIFIED) + flags |= ACE_F_UNVERIFIED; + if (ncec->ncec_flags & NCE_F_AUTHORITY) + flags |= ACE_F_AUTHORITY; + if (ncec->ncec_flags & NCE_F_DELAYED) + flags |= ACE_F_DELAYED; + return (flags); +} + +/* + * ncec_walk routine to create ipNetToMediaEntryTable + */ +static int +ip_snmp_get2_v4_media(ncec_t *ncec, iproutedata_t *ird) +{ + ill_t *ill; + mib2_ipNetToMediaEntry_t ntme; + const char *name = "unknown"; + ipaddr_t ncec_addr; + + ill = ncec->ncec_ill; + if (ill->ill_isv6 || (ncec->ncec_flags & NCE_F_BCAST) || + ill->ill_net_type == IRE_LOOPBACK) + return (0); + + /* We report all IPMP groups on ncec_ill which is normally the upper. 
*/ + name = ill->ill_name; + /* Based on RFC 4293: other(1), inval(2), dyn(3), stat(4) */ + if (NCE_MYADDR(ncec)) { + ntme.ipNetToMediaType = 4; + } else if (ncec->ncec_flags & (NCE_F_MCAST|NCE_F_BCAST|NCE_F_PUBLISH)) { + ntme.ipNetToMediaType = 1; + } else { + ntme.ipNetToMediaType = 3; + } + ntme.ipNetToMediaIfIndex.o_length = MIN(OCTET_LENGTH, strlen(name)); + bcopy(name, ntme.ipNetToMediaIfIndex.o_bytes, + ntme.ipNetToMediaIfIndex.o_length); + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + bcopy(&ncec_addr, &ntme.ipNetToMediaNetAddress, sizeof (ncec_addr)); + + ntme.ipNetToMediaInfo.ntm_mask.o_length = sizeof (ipaddr_t); + ncec_addr = INADDR_BROADCAST; + bcopy(&ncec_addr, ntme.ipNetToMediaInfo.ntm_mask.o_bytes, + sizeof (ncec_addr)); + /* + * map all the flags to the ACE counterpart. + */ + ntme.ipNetToMediaInfo.ntm_flags = nce2ace(ncec); + + ntme.ipNetToMediaPhysAddress.o_length = + MIN(OCTET_LENGTH, ill->ill_phys_addr_length); + + if (!NCE_ISREACHABLE(ncec)) + ntme.ipNetToMediaPhysAddress.o_length = 0; + else { + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, + ntme.ipNetToMediaPhysAddress.o_bytes, + ntme.ipNetToMediaPhysAddress.o_length); + } + } + + if (!snmp_append_data2(ird->ird_netmedia.lp_head, + &ird->ird_netmedia.lp_tail, (char *)&ntme, sizeof (ntme))) { + ip1dbg(("ip_snmp_get2_v4_media: failed to allocate %u bytes\n", + (uint_t)sizeof (ntme))); + } + return (0); +} + /* * return (0) if invalid set request, 1 otherwise, including non-tcp requests */ @@ -19999,7 +11510,7 @@ ip_mib2_add_icmp6_stats(mib2_ipv6IfIcmpEntry_t *o1, mib2_ipv6IfIcmpEntry_t *o2) * This routine assumes that the options are well formed i.e. that they * have already been checked. 
*/ -static boolean_t +boolean_t ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) { ipoptp_t opts; @@ -20007,7 +11518,6 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) uint8_t optval; uint8_t optlen; ipaddr_t dst; - ire_t *ire; if (IS_SIMPLE_IPH(ipha)) { ip2dbg(("not source routed\n")); @@ -20030,15 +11540,12 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) * If dst is one of our addresses and there are some * entries left in the source route return (true). */ - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { ip2dbg(("ip_source_routed: not next" " source route 0x%x\n", ntohl(dst))); return (B_FALSE); } - ire_refrele(ire); off = opt[IPOPT_OFFSET]; off--; if (optlen < IP_ADDR_LEN || @@ -20055,267 +11562,18 @@ ip_source_routed(ipha_t *ipha, ip_stack_t *ipst) } /* - * Check if the packet contains any source route. - */ -static boolean_t -ip_source_route_included(ipha_t *ipha) -{ - ipoptp_t opts; - uint8_t optval; - - if (IS_SIMPLE_IPH(ipha)) - return (B_FALSE); - for (optval = ipoptp_first(&opts, ipha); - optval != IPOPT_EOL; - optval = ipoptp_next(&opts)) { - switch (optval) { - case IPOPT_SSRR: - case IPOPT_LSRR: - return (B_TRUE); - } - } - return (B_FALSE); -} - -/* - * Called when the IRE expiration timer fires. - */ -void -ip_trash_timer_expire(void *args) -{ - int flush_flag = 0; - ire_expire_arg_t iea; - ip_stack_t *ipst = (ip_stack_t *)args; - - iea.iea_ipst = ipst; /* No netstack_hold */ - - /* - * ip_ire_expire_id is protected by ip_trash_timer_lock. 
- * This lock makes sure that a new invocation of this function - * that occurs due to an almost immediate timer firing will not - * progress beyond this point until the current invocation is done - */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - ipst->ips_ip_ire_expire_id = 0; - mutex_exit(&ipst->ips_ip_trash_timer_lock); - - /* Periodic timer */ - if (ipst->ips_ip_ire_arp_time_elapsed >= - ipst->ips_ip_ire_arp_interval) { - /* - * Remove all IRE_CACHE entries since they might - * contain arp information. - */ - flush_flag |= FLUSH_ARP_TIME; - ipst->ips_ip_ire_arp_time_elapsed = 0; - IP_STAT(ipst, ip_ire_arp_timer_expired); - } - if (ipst->ips_ip_ire_rd_time_elapsed >= - ipst->ips_ip_ire_redir_interval) { - /* Remove all redirects */ - flush_flag |= FLUSH_REDIRECT_TIME; - ipst->ips_ip_ire_rd_time_elapsed = 0; - IP_STAT(ipst, ip_ire_redirect_timer_expired); - } - if (ipst->ips_ip_ire_pmtu_time_elapsed >= - ipst->ips_ip_ire_pathmtu_interval) { - /* Increase path mtu */ - flush_flag |= FLUSH_MTU_TIME; - ipst->ips_ip_ire_pmtu_time_elapsed = 0; - IP_STAT(ipst, ip_ire_pmtu_timer_expired); - } - - /* - * Optimize for the case when there are no redirects in the - * ftable, that is, no need to walk the ftable in that case. - */ - if (flush_flag & (FLUSH_MTU_TIME|FLUSH_ARP_TIME)) { - iea.iea_flush_flag = flush_flag; - ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_CACHETABLE, ire_expire, - (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, 0, NULL, - ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, - NULL, ALL_ZONES, ipst); - } - if ((flush_flag & FLUSH_REDIRECT_TIME) && - ipst->ips_ip_redirect_cnt > 0) { - iea.iea_flush_flag = flush_flag; - ire_walk_ill_tables(MATCH_IRE_TYPE, IRE_FORWARDTABLE, - ire_expire, (char *)(uintptr_t)&iea, IP_MASK_TABLE_SIZE, - 0, NULL, 0, NULL, NULL, ALL_ZONES, ipst); - } - if (flush_flag & FLUSH_MTU_TIME) { - /* - * Walk all IPv6 IRE's and update them - * Note that ARP and redirect timers are not - * needed since NUD handles stale entries. 
- */ - flush_flag = FLUSH_MTU_TIME; - iea.iea_flush_flag = flush_flag; - ire_walk_v6(ire_expire, (char *)(uintptr_t)&iea, - ALL_ZONES, ipst); - } - - ipst->ips_ip_ire_arp_time_elapsed += ipst->ips_ip_timer_interval; - ipst->ips_ip_ire_rd_time_elapsed += ipst->ips_ip_timer_interval; - ipst->ips_ip_ire_pmtu_time_elapsed += ipst->ips_ip_timer_interval; - - /* - * Hold the lock to serialize timeout calls and prevent - * stale values in ip_ire_expire_id. Otherwise it is possible - * for the timer to fire and a new invocation of this function - * to start before the return value of timeout has been stored - * in ip_ire_expire_id by the current invocation. - */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - ipst->ips_ip_ire_expire_id = timeout(ip_trash_timer_expire, - (void *)ipst, MSEC_TO_TICK(ipst->ips_ip_timer_interval)); - mutex_exit(&ipst->ips_ip_trash_timer_lock); -} - -/* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. - */ -/* ARGSUSED */ -void -ip_trash_ire_reclaim(void *args) -{ - netstack_handle_t nh; - netstack_t *ns; - - netstack_next_init(&nh); - while ((ns = netstack_next(&nh)) != NULL) { - ip_trash_ire_reclaim_stack(ns->netstack_ip); - netstack_rele(ns); - } - netstack_next_fini(&nh); -} - -static void -ip_trash_ire_reclaim_stack(ip_stack_t *ipst) -{ - ire_cache_count_t icc; - ire_cache_reclaim_t icr; - ncc_cache_count_t ncc; - nce_cache_reclaim_t ncr; - uint_t delete_cnt; - /* - * Memory reclaim call back. - * Count unused, offlink, pmtu, and onlink IRE_CACHE entries. - * Then, with a target of freeing 1/Nth of IRE_CACHE - * entries, determine what fraction to free for - * each category of IRE_CACHE entries giving absolute priority - * in the order of onlink, pmtu, offlink, unused (e.g. no pmtu - * entry will be freed unless all offlink entries are freed). 
- */ - icc.icc_total = 0; - icc.icc_unused = 0; - icc.icc_offlink = 0; - icc.icc_pmtu = 0; - icc.icc_onlink = 0; - ire_walk(ire_cache_count, (char *)&icc, ipst); - - /* - * Free NCEs for IPv6 like the onlink ires. - */ - ncc.ncc_total = 0; - ncc.ncc_host = 0; - ndp_walk(NULL, (pfi_t)ndp_cache_count, (uchar_t *)&ncc, ipst); - - ASSERT(icc.icc_total == icc.icc_unused + icc.icc_offlink + - icc.icc_pmtu + icc.icc_onlink); - delete_cnt = icc.icc_total/ipst->ips_ip_ire_reclaim_fraction; - IP_STAT(ipst, ip_trash_ire_reclaim_calls); - if (delete_cnt == 0) - return; - IP_STAT(ipst, ip_trash_ire_reclaim_success); - /* Always delete all unused offlink entries */ - icr.icr_ipst = ipst; - icr.icr_unused = 1; - if (delete_cnt <= icc.icc_unused) { - /* - * Only need to free unused entries. In other words, - * there are enough unused entries to free to meet our - * target number of freed ire cache entries. - */ - icr.icr_offlink = icr.icr_pmtu = icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else if (delete_cnt <= icc.icc_unused + icc.icc_offlink) { - /* - * Only need to free unused entries, plus a fraction of offlink - * entries. It follows from the first if statement that - * icc_offlink is non-zero, and that delete_cnt != icc_unused. - */ - delete_cnt -= icc.icc_unused; - /* Round up # deleted by truncating fraction */ - icr.icr_offlink = icc.icc_offlink / delete_cnt; - icr.icr_pmtu = icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else if (delete_cnt <= - icc.icc_unused + icc.icc_offlink + icc.icc_pmtu) { - /* - * Free all unused and offlink entries, plus a fraction of - * pmtu entries. It follows from the previous if statement - * that icc_pmtu is non-zero, and that - * delete_cnt != icc_unused + icc_offlink. 
- */ - icr.icr_offlink = 1; - delete_cnt -= icc.icc_unused + icc.icc_offlink; - /* Round up # deleted by truncating fraction */ - icr.icr_pmtu = icc.icc_pmtu / delete_cnt; - icr.icr_onlink = 0; - ncr.ncr_host = 0; - } else { - /* - * Free all unused, offlink, and pmtu entries, plus a fraction - * of onlink entries. If we're here, then we know that - * icc_onlink is non-zero, and that - * delete_cnt != icc_unused + icc_offlink + icc_pmtu. - */ - icr.icr_offlink = icr.icr_pmtu = 1; - delete_cnt -= icc.icc_unused + icc.icc_offlink + - icc.icc_pmtu; - /* Round up # deleted by truncating fraction */ - icr.icr_onlink = icc.icc_onlink / delete_cnt; - /* Using the same delete fraction as for onlink IREs */ - ncr.ncr_host = ncc.ncc_host / delete_cnt; - } -#ifdef DEBUG - ip1dbg(("IP reclaim: target %d out of %d current %d/%d/%d/%d " - "fractions %d/%d/%d/%d\n", - icc.icc_total/ipst->ips_ip_ire_reclaim_fraction, icc.icc_total, - icc.icc_unused, icc.icc_offlink, - icc.icc_pmtu, icc.icc_onlink, - icr.icr_unused, icr.icr_offlink, - icr.icr_pmtu, icr.icr_onlink)); -#endif - ire_walk(ire_cache_reclaim, (char *)&icr, ipst); - if (ncr.ncr_host != 0) - ndp_walk(NULL, (pfi_t)ndp_cache_reclaim, - (uchar_t *)&ncr, ipst); -#ifdef DEBUG - icc.icc_total = 0; icc.icc_unused = 0; icc.icc_offlink = 0; - icc.icc_pmtu = 0; icc.icc_onlink = 0; - ire_walk(ire_cache_count, (char *)&icc, ipst); - ip1dbg(("IP reclaim: result total %d %d/%d/%d/%d\n", - icc.icc_total, icc.icc_unused, icc.icc_offlink, - icc.icc_pmtu, icc.icc_onlink)); -#endif -} - -/* - * ip_unbind is called when a copy of an unbind request is received from the - * upper level protocol. We remove this conn from any fanout hash list it is - * on, and zero out the bind information. No reply is expected up above. + * ip_unbind is called by the transports to remove a conn from + * the fanout table. 
*/ void ip_unbind(conn_t *connp) { + ASSERT(!MUTEX_HELD(&connp->conn_lock)); if (is_system_labeled() && connp->conn_anon_port) { (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, + connp->conn_mlp_type, connp->conn_proto, ntohs(connp->conn_lport), B_FALSE); connp->conn_anon_port = 0; } @@ -20325,1489 +11583,6 @@ ip_unbind(conn_t *connp) } /* - * Write side put procedure. Outbound data, IOCTLs, responses from - * resolvers, etc, come down through here. - * - * arg2 is always a queue_t *. - * When that queue is an ill_t (i.e. q_next != NULL), then arg must be - * the zoneid. - * When that queue is not an ill_t, then arg must be a conn_t pointer. - */ -void -ip_output(void *arg, mblk_t *mp, void *arg2, int caller) -{ - ip_output_options(arg, mp, arg2, caller, &zero_info); -} - -void -ip_output_options(void *arg, mblk_t *mp, void *arg2, int caller, - ip_opt_info_t *infop) -{ - conn_t *connp = NULL; - queue_t *q = (queue_t *)arg2; - ipha_t *ipha; -#define rptr ((uchar_t *)ipha) - ire_t *ire = NULL; - ire_t *sctp_ire = NULL; - uint32_t v_hlen_tos_len; - ipaddr_t dst; - mblk_t *first_mp = NULL; - boolean_t mctl_present; - ipsec_out_t *io; - int match_flags; - ill_t *xmit_ill = NULL; /* IP_PKTINFO etc. */ - ipif_t *dst_ipif; - boolean_t multirt_need_resolve = B_FALSE; - mblk_t *copy_mp = NULL; - int err = 0; - zoneid_t zoneid; - boolean_t need_decref = B_FALSE; - boolean_t ignore_dontroute = B_FALSE; - boolean_t ignore_nexthop = B_FALSE; - boolean_t ip_nexthop = B_FALSE; - ipaddr_t nexthop_addr; - ip_stack_t *ipst; - -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#endif - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_START, - "ip_wput_start: q %p", q); - - /* - * ip_wput fast path - */ - - /* is packet from ARP ? 
*/ - if (q->q_next != NULL) { - zoneid = (zoneid_t)(uintptr_t)arg; - goto qnext; - } - - connp = (conn_t *)arg; - ASSERT(connp != NULL); - zoneid = connp->conn_zoneid; - ipst = connp->conn_netstack->netstack_ip; - ASSERT(ipst != NULL); - - /* is queue flow controlled? */ - if ((q->q_first != NULL || connp->conn_draining) && - (caller == IP_WPUT)) { - ASSERT(!need_decref); - ASSERT(!IP_FLOW_CONTROLLED_ULP(connp->conn_ulp)); - (void) putq(q, mp); - return; - } - - /* Multidata transmit? */ - if (DB_TYPE(mp) == M_MULTIDATA) { - /* - * We should never get here, since all Multidata messages - * originating from tcp should have been directed over to - * tcp_multisend() in the first place. - */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } else if (DB_TYPE(mp) != M_DATA) - goto notdata; - - if (mp->b_flag & MSGHASREF) { - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - mp->b_flag &= ~MSGHASREF; - SCTP_EXTRACT_IPINFO(mp, sctp_ire); - need_decref = B_TRUE; - } - ipha = (ipha_t *)mp->b_rptr; - - /* is IP header non-aligned or mblk smaller than basic IP header */ -#ifndef SAFETY_BEFORE_SPEED - if (!OK_32PTR(rptr) || - (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH) - goto hdrtoosmall; -#endif - - ASSERT(OK_32PTR(ipha)); - - /* - * This function assumes that mp points to an IPv4 packet. If it's the - * wrong version, we'll catch it again in ip_output_v6. - * - * Note that this is *only* locally-generated output here, and never - * forwarded data, and that we need to deal only with transports that - * don't know how to label. (TCP, UDP, and ICMP/raw-IP all know how to - * label.) 
- */ - if (is_system_labeled() && - (ipha->ipha_version_and_hdr_length & 0xf0) == (IPV4_VERSION << 4) && - !connp->conn_ulp_labeled) { - cred_t *credp; - pid_t pid; - - credp = BEST_CRED(mp, connp, &pid); - err = tsol_check_label(credp, &mp, - connp->conn_mac_mode, ipst, pid); - ipha = (ipha_t *)mp->b_rptr; - if (err != 0) { - first_mp = mp; - if (err == EINVAL) - goto icmp_parameter_problem; - ip2dbg(("ip_wput: label check failed (%d)\n", err)); - goto discard_pkt; - } - } - - ASSERT(infop != NULL); - - if (infop->ip_opt_flags & IP_VERIFY_SRC) { - /* - * IP_PKTINFO ancillary option is present. - * IPCL_ZONEID is used to honor IP_ALLZONES option which - * allows using address of any zone as the source address. - */ - ire = ire_ctable_lookup(ipha->ipha_src, 0, - (IRE_LOCAL|IRE_LOOPBACK), NULL, IPCL_ZONEID(connp), - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); - if (ire == NULL) - goto drop_pkt; - ire_refrele(ire); - ire = NULL; - } - - /* - * IP_BOUND_IF has precedence over the ill index passed in IP_PKTINFO. - */ - if (infop->ip_opt_ill_index != 0 && connp->conn_outgoing_ill == NULL) { - xmit_ill = ill_lookup_on_ifindex(infop->ip_opt_ill_index, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - - if (xmit_ill == NULL || IS_VNI(xmit_ill)) - goto drop_pkt; - /* - * check that there is an ipif belonging - * to our zone. IPCL_ZONEID is not used because - * IP_ALLZONES option is valid only when the ill is - * accessible from all zones i.e has a valid ipif in - * all zones. - */ - if (!ipif_lookup_zoneid(xmit_ill, zoneid, 0, NULL)) { - goto drop_pkt; - } - } - - /* - * If there is a policy, try to attach an ipsec_out in - * the front. At the end, first_mp either points to a - * M_DATA message or IPSEC_OUT message linked to a - * M_DATA message. We have to do it now as we might - * lose the "conn" if we go through ip_newroute. 
- */ - if (connp->conn_out_enforce_policy || (connp->conn_latch != NULL)) { - if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, - ipha->ipha_protocol, ipst->ips_netstack)) == NULL)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - - /* is wrong version or IP options present */ - if (V_HLEN != IP_SIMPLE_HDR_VERSION) - goto version_hdrlen_check; - dst = ipha->ipha_dst; - - /* If IP_BOUND_IF has been set, use that ill. */ - if (connp->conn_outgoing_ill != NULL) { - xmit_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) - goto drop_pkt; - - goto send_from_ill; - } - - /* is packet multicast? */ - if (CLASSD(dst)) - goto multicast; - - /* - * If xmit_ill is set above due to index passed in ip_pkt_info. It - * takes precedence over conn_dontroute and conn_nexthop_set - */ - if (xmit_ill != NULL) - goto send_from_ill; - - if (connp->conn_dontroute || connp->conn_nexthop_set) { - /* - * If the destination is a broadcast, local, or loopback - * address, SO_DONTROUTE and IP_NEXTHOP go through the - * standard path. - */ - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if ((ire == NULL) || (ire->ire_type & - (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK)) == 0) { - if (ire != NULL) { - ire_refrele(ire); - /* No more access to ire */ - ire = NULL; - } - /* - * bypass routing checks and go directly to interface. 
- */ - if (connp->conn_dontroute) - goto dontroute; - - ASSERT(connp->conn_nexthop_set); - ip_nexthop = B_TRUE; - nexthop_addr = connp->conn_nexthop_v4; - goto send_from_ill; - } - - /* Must be a broadcast, a loopback or a local ire */ - ire_refrele(ire); - /* No more access to ire */ - ire = NULL; - } - - /* - * We cache IRE_CACHEs to avoid lookups. We don't do - * this for the tcp global queue and listen end point - * as it does not really have a real destination to - * talk to. This is also true for SCTP. - */ - if (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && - !connp->conn_fully_bound) { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if (ire == NULL) - goto noirefound; - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = - ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[TCP]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve() deemed it necessary. - */ - if (copy_mp != NULL) - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - - /* - * Access to conn_ire_cache. (protected by conn_lock) - * - * IRE_MARK_CONDEMNED is marked in ire_delete. We don't grab - * the ire bucket lock here to check for CONDEMNED as it is okay to - * send a packet or two with the IRE_CACHE that is going away. - * Access to the ire requires an ire refhold on the ire prior to - * its use since an interface unplumb thread may delete the cached - * ire and release the refhold at any time. - * - * Caching an ire in the conn_ire_cache - * - * o Caching an ire pointer in the conn requires a strict check for - * IRE_MARK_CONDEMNED. An interface unplumb thread deletes all relevant - * ires before cleaning up the conns. So the caching of an ire pointer - * in the conn is done after making sure under the bucket lock that the - * ire has not yet been marked CONDEMNED. Otherwise we will end up - * caching an ire after the unplumb thread has cleaned up the conn. - * If the conn does not send a packet subsequently the unplumb thread - * will be hanging waiting for the ire count to drop to zero. - * - * o We also need to atomically test for a null conn_ire_cache and - * set the conn_ire_cache under the the protection of the conn_lock - * to avoid races among concurrent threads trying to simultaneously - * cache an ire in the conn_ire_cache. 
- */ - mutex_enter(&connp->conn_lock); - ire = sctp_ire != NULL ? sctp_ire : connp->conn_ire_cache; - - if (ire != NULL && ire->ire_addr == dst && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - - } else { - boolean_t cached = B_FALSE; - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL && sctp_ire == NULL) - IRE_REFRELE_NOTR(ire); - - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if (ire == NULL) - goto noirefound; - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - if (connp->conn_ulp == IPPROTO_TCP) - TCP_CHECK_IREINFO(connp->conn_tcp, ire); - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it was - * not cached, we should drop the extra reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); - } - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - - /* - * At this point, we check to see if there are any pending - * unresolved routes. 
ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. - */ - multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[not TCP]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - - /* - * Try to resolve another multiroute if - * ire_multirt_resolvable() deemed it necessary - */ - if (copy_mp != NULL) - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - if (need_decref) - CONN_DEC_REF(connp); - return; - -qnext: - /* - * Upper Level Protocols pass down complete IP datagrams - * as M_DATA messages. Everything else is a sideshow. - * - * 1) We could be re-entering ip_wput because of ip_neworute - * in which case we could have a IPSEC_OUT message. We - * need to pass through ip_wput like other datagrams and - * hence cannot branch to ip_wput_nondata. - * - * 2) ARP, AH, ESP, and other clients who are on the module - * instance of IP stream, give us something to deal with. - * We will handle AH and ESP here and rest in ip_wput_nondata. - * - * 3) ICMP replies also could come here. - */ - ipst = ILLQ_TO_IPST(q); - - if (DB_TYPE(mp) != M_DATA) { -notdata: - if (DB_TYPE(mp) == M_CTL) { - /* - * M_CTL messages are used by ARP, AH and ESP to - * communicate with IP. We deal with IPSEC_IN and - * IPSEC_OUT here. ip_wput_nondata handles other - * cases. 
- */ - ipsec_info_t *ii = (ipsec_info_t *)mp->b_rptr; - if (mp->b_cont && (mp->b_cont->b_flag & MSGHASREF)) { - first_mp = mp->b_cont; - first_mp->b_flag &= ~MSGHASREF; - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - SCTP_EXTRACT_IPINFO(first_mp, sctp_ire); - CONN_DEC_REF(connp); - connp = NULL; - } - if (ii->ipsec_info_type == IPSEC_IN) { - /* - * Either this message goes back to - * IPsec for further processing or to - * ULP after policy checks. - */ - ip_fanout_proto_again(mp, NULL, NULL, NULL); - return; - } else if (ii->ipsec_info_type == IPSEC_OUT) { - io = (ipsec_out_t *)ii; - if (io->ipsec_out_proc_begin) { - /* - * IPsec processing has already started. - * Complete it. - * IPQoS notes: We don't care what is - * in ipsec_out_ill_index since this - * won't be processed for IPQoS policies - * in ipsec_out_process. - */ - ipsec_out_process(q, mp, NULL, - io->ipsec_out_ill_index); - return; - } else { - connp = (q->q_next != NULL) ? - NULL : Q_TO_CONN(q); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - } else if (ii->ipsec_info_type == IPSEC_CTL) { - /* - * It's an IPsec control message requesting - * an SADB update to be sent to the IPsec - * hardware acceleration capable ills. - */ - ipsec_ctl_t *ipsec_ctl = - (ipsec_ctl_t *)mp->b_rptr; - ipsa_t *sa = (ipsa_t *)ipsec_ctl->ipsec_ctl_sa; - uint_t satype = ipsec_ctl->ipsec_ctl_sa_type; - mblk_t *cmp = mp->b_cont; - - ASSERT(MBLKL(mp) >= sizeof (ipsec_ctl_t)); - ASSERT(cmp != NULL); - - freeb(mp); - ill_ipsec_capab_send_all(satype, cmp, sa, - ipst->ips_netstack); - return; - } else { - /* - * This must be ARP or special TSOL signaling. - */ - ip_wput_nondata(NULL, q, mp, NULL); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "nondata"); - return; - } - } else { - /* - * This must be non-(ARP/AH/ESP) messages. 
- */ - ASSERT(!need_decref); - ip_wput_nondata(NULL, q, mp, NULL); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "nondata"); - return; - } - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - - ASSERT(first_mp != NULL); - - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - if (io->ipsec_out_ip_nexthop) { - /* - * We may have lost the conn context if we are - * coming here from ip_newroute(). Copy the - * nexthop information. - */ - ip_nexthop = B_TRUE; - nexthop_addr = io->ipsec_out_nexthop_addr; - - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - goto send_from_ill; - } - } - - ASSERT(xmit_ill == NULL); - - /* We have a complete IP datagram heading outbound. */ - ipha = (ipha_t *)mp->b_rptr; - -#ifndef SPEED_BEFORE_SAFETY - /* - * Make sure we have a full-word aligned message and that at least - * a simple IP header is accessible in the first message. If not, - * try a pullup. For labeled systems we need to always take this - * path as M_CTLs are "notdata" but have trailing data to process. - */ - if (!OK_32PTR(rptr) || - (mp->b_wptr - rptr) < IP_SIMPLE_HDR_LENGTH || is_system_labeled()) { -hdrtoosmall: - if (!pullupmsg(mp, IP_SIMPLE_HDR_LENGTH)) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "pullupfailed"); - if (first_mp == NULL) - first_mp = mp; - goto discard_pkt; - } - - /* This function assumes that mp points to an IPv4 packet. 
*/ - if (is_system_labeled() && - (*mp->b_rptr & 0xf0) == (IPV4_VERSION << 4) && - (connp == NULL || !connp->conn_ulp_labeled)) { - cred_t *credp; - pid_t pid; - - if (connp != NULL) { - credp = BEST_CRED(mp, connp, &pid); - err = tsol_check_label(credp, &mp, - connp->conn_mac_mode, ipst, pid); - } else if ((credp = msg_getcred(mp, &pid)) != NULL) { - err = tsol_check_label(credp, &mp, - CONN_MAC_DEFAULT, ipst, pid); - } - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (err != 0) { - if (err == EINVAL) - goto icmp_parameter_problem; - ip2dbg(("ip_wput: label check failed (%d)\n", - err)); - goto discard_pkt; - } - } - - ipha = (ipha_t *)mp->b_rptr; - if (first_mp == NULL) { - ASSERT(xmit_ill == NULL); - /* - * If we got here because of "goto hdrtoosmall" - * We need to attach a IPSEC_OUT. - */ - if (connp->conn_out_enforce_policy) { - if (((mp = ipsec_attach_ipsec_out(&mp, connp, - NULL, ipha->ipha_protocol, - ipst->ips_netstack)) == NULL)) { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } else { - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - } - } else { - first_mp = mp; - mctl_present = B_FALSE; - } - } - } -#endif - - /* Most of the code below is written for speed, not readability */ - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - - /* - * If ip_newroute() fails, we're going to need a full - * header for the icmp wraparound. - */ - if (V_HLEN != IP_SIMPLE_HDR_VERSION) { - uint_t v_hlen; -version_hdrlen_check: - ASSERT(first_mp != NULL); - v_hlen = V_HLEN; - /* - * siphon off IPv6 packets coming down from transport - * layer modules here. - * Note: high-order bit carries NUD reachability confirmation - */ - if (((v_hlen >> 4) & 0x7) == IPV6_VERSION) { - /* - * FIXME: assume that callers of ip_output* call - * the right version? 
- */ - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutWrongIPVersion); - ASSERT(xmit_ill == NULL); - if (need_decref) - mp->b_flag |= MSGHASREF; - (void) ip_output_v6(arg, first_mp, arg2, caller); - return; - } - - if ((v_hlen >> 4) != IP_VERSION) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badvers"); - goto discard_pkt; - } - /* - * Is the header length at least 20 bytes? - * - * Are there enough bytes accessible in the header? If - * not, try a pullup. - */ - v_hlen &= 0xF; - v_hlen <<= 2; - if (v_hlen < IP_SIMPLE_HDR_LENGTH) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badlen"); - goto discard_pkt; - } - if (v_hlen > (mp->b_wptr - rptr)) { - if (!pullupmsg(mp, v_hlen)) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badpullup2"); - goto discard_pkt; - } - ipha = (ipha_t *)mp->b_rptr; - } - /* - * Move first entry from any source route into ipha_dst and - * verify the options - */ - if (ip_wput_options(q, first_mp, ipha, mctl_present, - zoneid, ipst)) { - ASSERT(xmit_ill == NULL); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "badopts"); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - } - dst = ipha->ipha_dst; - - /* - * Try to get an IRE_CACHE for the destination address. If we can't, - * we have to run the packet through ip_newroute which will take - * the appropriate action to arrange for an IRE_CACHE, such as querying - * a resolver, or assigning a default gateway, etc. - */ - if (CLASSD(dst)) { - ipif_t *ipif; - uint32_t setsrc = 0; - -multicast: - ASSERT(first_mp != NULL); - ip2dbg(("ip_wput: CLASSD\n")); - if (connp == NULL) { - /* - * Use the first good ipif on the ill. - * XXX Should this ever happen? (Appears - * to show up with just ppp and no ethernet due - * to in.rdisc.) - * However, ire_send should be able to - * call ip_wput_ire directly. 
- * - * XXX Also, this can happen for ICMP and other packets - * with multicast source addresses. Perhaps we should - * fix things so that we drop the packet in question, - * but for now, just run with it. - */ - ill_t *ill = (ill_t *)q->q_ptr; - - ipif = ipif_select_source(ill, dst, GLOBAL_ZONEID); - if (ipif == NULL) { - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - ip1dbg(("ip_wput: CLASSD no CONN: dst 0x%x on %s\n", - ntohl(dst), ill->ill_name)); - } else { - /* - * The order of precedence is IP_BOUND_IF, IP_PKTINFO - * and IP_MULTICAST_IF. The block comment above this - * function explains the locking mechanism used here. - */ - if (xmit_ill == NULL) { - xmit_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_wput: No ill for " - "IP_BOUND_IF\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - - if (xmit_ill == NULL) { - ipif = conn_get_held_ipif(connp, - &connp->conn_multicast_ipif, &err); - if (err == IPIF_LOOKUP_FAILED) { - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - if (xmit_ill != NULL) { - ipif = ipif_get_next_ipif(NULL, xmit_ill); - if (ipif == NULL) { - ip1dbg(("ip_wput: No ipif for " - "xmit_ill\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } else if (ipif == NULL || ipif->ipif_isv6) { - /* - * We must do this ipif determination here - * else we could pass through ip_newroute - * and come back here without the conn context. - * - * Note: we do late binding i.e. we bind to - * the interface when the first packet is sent. - * For performance reasons we do not rebind on - * each packet but keep the binding until the - * next IP_MULTICAST_IF option. - * - * conn_multicast_{ipif,ill} are shared between - * IPv4 and IPv6 and AF_INET6 sockets can - * send both IPv4 and IPv6 packets. 
Hence - * we have to check that "isv6" matches above. - */ - if (ipif != NULL) - ipif_refrele(ipif); - ipif = ipif_lookup_group(dst, zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - err = conn_set_held_ipif(connp, - &connp->conn_multicast_ipif, ipif); - if (err == IPIF_LOOKUP_FAILED) { - ipif_refrele(ipif); - ip1dbg(("ip_wput: No ipif for " - "multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - goto drop_pkt; - } - } - } - ASSERT(!ipif->ipif_isv6); - /* - * As we may lose the conn by the time we reach ip_wput_ire, - * we copy conn_multicast_loop and conn_dontroute on to an - * ipsec_out. In case if this datagram goes out secure, - * we need the ill_index also. Copy that also into the - * ipsec_out. - */ - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - if ((first_mp = allocb(sizeof (ipsec_info_t), - BPRI_HI)) == NULL) { - ipif_refrele(ipif); - first_mp = mp; - goto discard_pkt; - } - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += sizeof (ipsec_info_t); - /* ipsec_out_secure is B_FALSE now */ - bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_ns = ipst->ips_netstack; - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - io->ipsec_out_ill_index = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - - if (connp != NULL) { - io->ipsec_out_multicast_loop = - connp->conn_multicast_loop; - io->ipsec_out_dontroute = connp->conn_dontroute; - io->ipsec_out_zoneid = connp->conn_zoneid; - } - /* - * If the application uses IP_MULTICAST_IF with - * different logical 
addresses of the same ILL, we - * need to make sure that the soruce address of - * the packet matches the logical IP address used - * in the option. We do it by initializing ipha_src - * here. This should keep IPsec also happy as - * when we return from IPsec processing, we don't - * have to worry about getting the right address on - * the packet. Thus it is sufficient to look for - * IRE_CACHE using MATCH_IRE_ILL rathen than - * MATCH_IRE_IPIF. - * - * NOTE : We need to do it for non-secure case also as - * this might go out secure if there is a global policy - * match in ip_wput_ire. - * - * As we do not have the ire yet, it is possible that - * we set the source address here and then later discover - * that the ire implies the source address to be assigned - * through the RTF_SETSRC flag. - * In that case, the setsrc variable will remind us - * that overwritting the source address by the one - * of the RTF_SETSRC-flagged ire is allowed. - */ - if (ipha->ipha_src == INADDR_ANY && - (connp == NULL || !connp->conn_unspec_src)) { - ipha->ipha_src = ipif->ipif_src_addr; - setsrc = RTF_SETSRC; - } - /* - * Find an IRE which matches the destination and the outgoing - * queue (i.e. the outgoing interface.) - * For loopback use a unicast IP address for - * the ire lookup. - */ - if (IS_LOOPBACK(ipif->ipif_ill)) - dst = ipif->ipif_lcl_addr; - - /* - * If xmit_ill is set, we branch out to ip_newroute_ipif. - * We don't need to lookup ire in ctable as the packet - * needs to be sent to the destination through the specified - * ill irrespective of ires in the cache table. - */ - ire = NULL; - if (xmit_ill == NULL) { - ire = ire_ctable_lookup(dst, 0, 0, ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - } - - if (ire == NULL) { - /* - * Multicast loopback and multicast forwarding is - * done in ip_wput_ire. - * - * Mark this packet to make it be delivered to - * ip_wput_ire after the new ire has been - * created. 
- * - * The call to ip_newroute_ipif takes into account - * the setsrc reminder. In any case, we take care - * of the RTF_MULTIRT flag. - */ - mp->b_prev = mp->b_next = NULL; - if (xmit_ill == NULL || - xmit_ill->ill_ipif_up_count > 0) { - ip_newroute_ipif(q, first_mp, ipif, dst, connp, - setsrc | RTF_MULTIRT, zoneid, infop); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "noire"); - } else { - freemsg(first_mp); - } - ipif_refrele(ipif); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - - ipif_refrele(ipif); - ipif = NULL; - ASSERT(xmit_ill == NULL); - - /* - * Honor the RTF_SETSRC flag for multicast packets, - * if allowed by the setsrc reminder. - */ - if ((ire->ire_flags & RTF_SETSRC) && setsrc) { - ipha->ipha_src = ire->ire_src_addr; - } - - /* - * Unconditionally force the TTL to 1 for - * multirouted multicast packets: - * multirouted multicast should not cross - * multicast routers. - */ - if (ire->ire_flags & RTF_MULTIRT) { - if (ipha->ipha_ttl > 1) { - ip2dbg(("ip_wput: forcing multicast " - "multirt TTL to 1 (was %d), dst 0x%08x\n", - ipha->ipha_ttl, ntohl(ire->ire_addr))); - ipha->ipha_ttl = 1; - } - } - } else { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), ipst); - if ((ire != NULL) && (ire->ire_type & - (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { - ignore_dontroute = B_TRUE; - ignore_nexthop = B_TRUE; - } - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - /* - * Guard against coming in from arp in which case conn is NULL. - * Also guard against non M_DATA with dontroute set but - * destined to local, loopback or broadcast addresses. - */ - if (connp != NULL && connp->conn_dontroute && - !ignore_dontroute) { -dontroute: - /* - * Set TTL to 1 if SO_DONTROUTE is set to prevent - * routing protocols from seeing false direct - * connectivity. 
- */ - ipha->ipha_ttl = 1; - /* If suitable ipif not found, drop packet */ - dst_ipif = ipif_lookup_onlink_addr(dst, zoneid, ipst); - if (dst_ipif == NULL) { -noroute: - ip1dbg(("ip_wput: no route for dst using" - " SO_DONTROUTE\n")); - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutNoRoutes); - mp->b_prev = mp->b_next = NULL; - if (first_mp == NULL) - first_mp = mp; - goto drop_pkt; - } else { - /* - * If suitable ipif has been found, set - * xmit_ill to the corresponding - * ipif_ill because we'll be using the - * send_from_ill logic below. - */ - ASSERT(xmit_ill == NULL); - xmit_ill = dst_ipif->ipif_ill; - mutex_enter(&xmit_ill->ill_lock); - if (!ILL_CAN_LOOKUP(xmit_ill)) { - mutex_exit(&xmit_ill->ill_lock); - xmit_ill = NULL; - ipif_refrele(dst_ipif); - goto noroute; - } - ill_refhold_locked(xmit_ill); - mutex_exit(&xmit_ill->ill_lock); - ipif_refrele(dst_ipif); - } - } - -send_from_ill: - if (xmit_ill != NULL) { - ipif_t *ipif; - - /* - * Mark this packet as originated locally - */ - mp->b_prev = mp->b_next = NULL; - - /* - * Could be SO_DONTROUTE case also. - * Verify that at least one ipif is up on the ill. - */ - if (xmit_ill->ill_ipif_up_count == 0) { - ip1dbg(("ip_output: xmit_ill %s is down\n", - xmit_ill->ill_name)); - goto drop_pkt; - } - - ipif = ipif_get_next_ipif(NULL, xmit_ill); - if (ipif == NULL) { - ip1dbg(("ip_output: xmit_ill %s NULL ipif\n", - xmit_ill->ill_name)); - goto drop_pkt; - } - - match_flags = 0; - if (IS_UNDER_IPMP(xmit_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - /* - * Look for a ire that is part of the group, - * if found use it else call ip_newroute_ipif. - * IPCL_ZONEID is not used for matching because - * IP_ALLZONES option is valid only when the - * ill is accessible from all zones i.e has a - * valid ipif in all zones. 
- */ - match_flags |= MATCH_IRE_ILL | MATCH_IRE_SECATTR; - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, - msg_getlabel(mp), match_flags, ipst); - /* - * If an ire exists use it or else create - * an ire but don't add it to the cache. - * Adding an ire may cause issues with - * asymmetric routing. - * In case of multiroute always act as if - * ire does not exist. - */ - if (ire == NULL || ire->ire_flags & RTF_MULTIRT) { - if (ire != NULL) - ire_refrele(ire); - ip_newroute_ipif(q, first_mp, ipif, - dst, connp, 0, zoneid, infop); - ipif_refrele(ipif); - ip1dbg(("ip_output: xmit_ill via %s\n", - xmit_ill->ill_name)); - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ipif_refrele(ipif); - } else if (ip_nexthop || (connp != NULL && - (connp->conn_nexthop_set)) && !ignore_nexthop) { - if (!ip_nexthop) { - ip_nexthop = B_TRUE; - nexthop_addr = connp->conn_nexthop_v4; - } - match_flags = MATCH_IRE_MARK_PRIVATE_ADDR | - MATCH_IRE_GW; - ire = ire_ctable_lookup(dst, nexthop_addr, 0, - NULL, zoneid, msg_getlabel(mp), match_flags, ipst); - } else { - ire = ire_cache_lookup(dst, zoneid, msg_getlabel(mp), - ipst); - } - if (!ire) { - if (ip_nexthop && !ignore_nexthop) { - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == - M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - first_mp = allocb( - sizeof (ipsec_info_t), BPRI_HI); - if (first_mp == NULL) { - first_mp = mp; - goto discard_pkt; - } - first_mp->b_datap->db_type = M_CTL; - first_mp->b_wptr += - sizeof (ipsec_info_t); - /* ipsec_out_secure is B_FALSE now */ - bzero(first_mp->b_rptr, - sizeof (ipsec_info_t)); - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = - sizeof (ipsec_out_t); - io->ipsec_out_use_global_policy = - B_TRUE; - io->ipsec_out_ns = ipst->ips_netstack; - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - io->ipsec_out_ip_nexthop = 
ip_nexthop; - io->ipsec_out_nexthop_addr = nexthop_addr; - } -noirefound: - /* - * Mark this packet as having originated on - * this machine. This will be noted in - * ire_add_then_send, which needs to know - * whether to run it back through ip_wput or - * ip_rput following successful resolution. - */ - mp->b_prev = NULL; - mp->b_next = NULL; - ip_newroute(q, first_mp, dst, connp, zoneid, ipst); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "newroute"); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - } - - /* We now know where we are going with it. */ - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "end"); - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute. - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force the TTL of multirouted packets if required. - * The TTL of such packets is bounded by the - * ip_multirt_ttl ndd variable. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { - ip2dbg(("ip_wput: forcing multirt TTL to %d " - "(was %d), dst 0x%08x\n", - ipst->ips_ip_multirt_ttl, ipha->ipha_ttl, - ntohl(ire->ire_addr))); - ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; - } - /* - * At this point, we check to see if there are any pending - * unresolved routes. ire_multirt_resolvable() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = ire_multirt_need_resolve(ire->ire_addr, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput[noirefound]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip_wput_ire(q, first_mp, ire, connp, caller, zoneid); - /* - * Try to resolve another multiroute if - * ire_multirt_resolvable() deemed it necessary. - * At this point, we need to distinguish - * multicasts from other packets. For multicasts, - * we call ip_newroute_ipif() and request that both - * multirouting and setsrc flags are checked. - */ - if (copy_mp != NULL) { - if (CLASSD(dst)) { - ipif_t *ipif = ipif_lookup_group(dst, zoneid, ipst); - if (ipif) { - ASSERT(infop->ip_opt_ill_index == 0); - ip_newroute_ipif(q, copy_mp, ipif, dst, connp, - RTF_SETSRC | RTF_MULTIRT, zoneid, infop); - ipif_refrele(ipif); - } else { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } - } else { - ip_newroute(q, copy_mp, dst, connp, zoneid, ipst); - } - } - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - -icmp_parameter_problem: - /* could not have originated externally */ - ASSERT(mp->b_prev == NULL); - if (ip_hdr_complete(ipha, zoneid, ipst) == 0) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - /* it's the IP header length that's in trouble */ - icmp_param_problem(q, first_mp, 0, zoneid, ipst); - first_mp = NULL; - } - -discard_pkt: - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); -drop_pkt: - ip1dbg(("ip_wput: dropped packet\n")); - if (ire != NULL) - ire_refrele(ire); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - if (xmit_ill != NULL) - ill_refrele(xmit_ill); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_END, - "ip_wput_end: q %p (%S)", q, "droppkt"); -} - -/* - * If this is a conn_t queue, then we pass in the 
conn. This includes the - * zoneid. - * Otherwise, this is a message coming back from ARP or for an ill_t queue, - * in which case we use the global zoneid since those are all part of - * the global zone. - */ -void -ip_wput(queue_t *q, mblk_t *mp) -{ - if (CONN_Q(q)) - ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); - else - ip_output(GLOBAL_ZONEID, mp, q, IP_WPUT); -} - -/* - * - * The following rules must be observed when accessing any ipif or ill - * that has been cached in the conn. Typically conn_outgoing_ill, - * conn_multicast_ipif and conn_multicast_ill. - * - * Access: The ipif or ill pointed to from the conn can be accessed under - * the protection of the conn_lock or after it has been refheld under the - * protection of the conn lock. In addition the IPIF_CAN_LOOKUP or - * ILL_CAN_LOOKUP macros must be used before actually doing the refhold. - * The reason for this is that a concurrent unplumb could actually be - * cleaning up these cached pointers by walking the conns and might have - * finished cleaning up the conn in question. The macros check that an - * unplumb has not yet started on the ipif or ill. - * - * Caching: An ipif or ill pointer may be cached in the conn only after - * making sure that an unplumb has not started. So the caching is done - * while holding both the conn_lock and the ill_lock and after using the - * ILL_CAN_LOOKUP/IPIF_CAN_LOOKUP macro. An unplumb will set the ILL_CONDEMNED - * flag before starting the cleanup of conns. - * - * The list of ipifs hanging off the ill is protected by ill_g_lock and ill_lock - * On the other hand to access ipif->ipif_ill, we need one of either ill_g_lock - * or a reference to the ipif or a reference to an ire that references the - * ipif. An ipif only changes its ill when migrating from an underlying ill - * to an IPMP ill in ipif_up(). 
- */ -ipif_t * -conn_get_held_ipif(conn_t *connp, ipif_t **ipifp, int *err) -{ - ipif_t *ipif; - ill_t *ill; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - *err = 0; - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - mutex_enter(&connp->conn_lock); - ipif = *ipifp; - if (ipif != NULL) { - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (ipif); - } else { - *err = IPIF_LOOKUP_FAILED; - } - mutex_exit(&ill->ill_lock); - } - mutex_exit(&connp->conn_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (NULL); -} - -ill_t * -conn_get_held_ill(conn_t *connp, ill_t **illp, int *err) -{ - ill_t *ill; - - *err = 0; - mutex_enter(&connp->conn_lock); - ill = *illp; - if (ill != NULL) { - mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (ill); - } else { - *err = ILL_LOOKUP_FAILED; - } - mutex_exit(&ill->ill_lock); - } - mutex_exit(&connp->conn_lock); - return (NULL); -} - -static int -conn_set_held_ipif(conn_t *connp, ipif_t **ipifp, ipif_t *ipif) -{ - ill_t *ill; - - ill = ipif->ipif_ill; - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { - *ipifp = ipif; - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (0); - } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - return (IPIF_LOOKUP_FAILED); -} - -/* - * This is called if the outbound datagram needs fragmentation. - * - * NOTE : This function does not ire_refrele the ire argument passed in. 
- */ -static void -ip_wput_ire_fragmentit(mblk_t *ipsec_mp, ire_t *ire, zoneid_t zoneid, - ip_stack_t *ipst, conn_t *connp) -{ - ipha_t *ipha; - mblk_t *mp; - uint32_t v_hlen_tos_len; - uint32_t max_frag; - uint32_t frag_flag; - boolean_t dont_use; - - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - } else { - mp = ipsec_mp; - } - - ipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#endif - -#ifndef SPEED_BEFORE_SAFETY - /* - * Check that ipha_length is consistent with - * the mblk length - */ - if (LENGTH != (mp->b_cont ? msgdsize(mp) : mp->b_wptr - rptr)) { - ip0dbg(("Packet length mismatch: %d, %ld\n", - LENGTH, msgdsize(mp))); - freemsg(ipsec_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_fragmentit: mp %p (%S)", mp, - "packet length mismatch"); - return; - } -#endif - /* - * Don't use frag_flag if pre-built packet or source - * routed or if multicast (since multicast packets do not solicit - * ICMP "packet too big" messages). Get the values of - * max_frag and frag_flag atomically by acquiring the - * ire_lock. - */ - mutex_enter(&ire->ire_lock); - max_frag = ire->ire_max_frag; - frag_flag = ire->ire_frag_flag; - mutex_exit(&ire->ire_lock); - - dont_use = ((ipha->ipha_ident == IP_HDR_INCLUDED) || - (V_HLEN != IP_SIMPLE_HDR_VERSION && - ip_source_route_included(ipha)) || CLASSD(ipha->ipha_dst)); - - ip_wput_frag(ire, ipsec_mp, OB_PKT, max_frag, - (dont_use ? 0 : frag_flag), zoneid, ipst, connp); -} - -/* * Used for deciding the MSS size for the upper layer. Thus * we need to check the outbound policy values in the conn. 
*/ @@ -21820,10 +11595,10 @@ conn_ipsec_length(conn_t *connp) if (ipl == NULL) return (0); - if (ipl->ipl_out_policy == NULL) + if (connp->conn_ixa->ixa_ipsec_policy == NULL) return (0); - return (ipl->ipl_out_policy->ipsp_act->ipa_ovhd); + return (connp->conn_ixa->ixa_ipsec_policy->ipsp_act->ipa_ovhd); } /* @@ -21831,20 +11606,17 @@ conn_ipsec_length(conn_t *connp) * we don't want to call into IPsec to get the exact size. */ int -ipsec_out_extra_length(mblk_t *ipsec_mp) +ipsec_out_extra_length(ip_xmit_attr_t *ixa) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; ipsec_action_t *a; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - if (!io->ipsec_out_secure) + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) return (0); - a = io->ipsec_out_act; - + a = ixa->ixa_ipsec_action; if (a == NULL) { - ASSERT(io->ipsec_out_policy != NULL); - a = io->ipsec_out_policy->ipsp_act; + ASSERT(ixa->ixa_ipsec_policy != NULL); + a = ixa->ixa_ipsec_policy->ipsp_act; } ASSERT(a != NULL); @@ -21852,22 +11624,6 @@ ipsec_out_extra_length(mblk_t *ipsec_mp) } /* - * Returns an estimate of the IPsec headers size. This is used if - * we don't want to call into IPsec to get the exact size. - */ -int -ipsec_in_extra_length(mblk_t *ipsec_mp) -{ - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ipsec_action_t *a; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - a = ii->ipsec_in_action; - return (a == NULL ? 0 : a->ipa_ovhd); -} - -/* * If there are any source route options, return the true final * destination. Otherwise, return the destination. 
*/ @@ -21914,2257 +11670,70 @@ ip_get_dst(ipha_t *ipha) return (dst); } -mblk_t * -ip_wput_ire_parse_ipsec_out(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, - conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) -{ - ipsec_out_t *io; - mblk_t *first_mp; - boolean_t policy_present; - ip_stack_t *ipst; - ipsec_stack_t *ipss; - - ASSERT(ire != NULL); - ipst = ire->ire_ipst; - ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)first_mp->b_rptr; - /* - * ip_wput[_v6] attaches an IPSEC_OUT in two cases. - * - * 1) There is per-socket policy (including cached global - * policy) or a policy on the IP-in-IP tunnel. - * 2) There is no per-socket policy, but it is - * a multicast packet that needs to go out - * on a specific interface. This is the case - * where (ip_wput and ip_wput_multicast) attaches - * an IPSEC_OUT and sets ipsec_out_secure B_FALSE. - * - * In case (2) we check with global policy to - * see if there is a match and set the ill_index - * appropriately so that we can lookup the ire - * properly in ip_wput_ipsec_out. - */ - - /* - * ipsec_out_use_global_policy is set to B_FALSE - * in ipsec_in_to_out(). Refer to that function for - * details. - */ - if ((io->ipsec_out_latch == NULL) && - (io->ipsec_out_use_global_policy)) { - return (ip_wput_attach_policy(first_mp, ipha, ip6h, - ire, connp, unspec_src, zoneid)); - } - if (!io->ipsec_out_secure) { - /* - * If this is not a secure packet, drop - * the IPSEC_OUT mp and treat it as a clear - * packet. This happens when we are sending - * a ICMP reply back to a clear packet. See - * ipsec_in_to_out() for details. - */ - mp = first_mp->b_cont; - freeb(first_mp); - } - return (mp); - } - /* - * See whether we need to attach a global policy here. We - * don't depend on the conn (as it could be null) for deciding - * what policy this datagram should go through because it - * should have happened in ip_wput if there was some - * policy. 
This normally happens for connections which are not - * fully bound preventing us from caching policies in - * ip_bind. Packets coming from the TCP listener/global queue - * - which are non-hard_bound - could also be affected by - * applying policy here. - * - * If this packet is coming from tcp global queue or listener, - * we will be applying policy here. This may not be *right* - * if these packets are coming from the detached connection as - * it could have gone in clear before. This happens only if a - * TCP connection started when there is no policy and somebody - * added policy before it became detached. Thus packets of the - * detached connection could go out secure and the other end - * would drop it because it will be expecting in clear. The - * converse is not true i.e if somebody starts a TCP - * connection and deletes the policy, all the packets will - * still go out with the policy that existed before deleting - * because ip_unbind sends up policy information which is used - * by TCP on subsequent ip_wputs. The right solution is to fix - * TCP to attach a dummy IPSEC_OUT and set - * ipsec_out_use_global_policy to B_FALSE. As this might - * affect performance for normal cases, we are not doing it. - * Thus, set policy before starting any TCP connections. - * - * NOTE - We might apply policy even for a hard bound connection - * - for which we cached policy in ip_bind - if somebody added - * global policy after we inherited the policy in ip_bind. - * This means that the packets that were going out in clear - * previously would start going secure and hence get dropped - * on the other side. To fix this, TCP attaches a dummy - * ipsec_out and make sure that we don't apply global policy. 
- */ - if (ipha != NULL) - policy_present = ipss->ipsec_outbound_v4_policy_present; - else - policy_present = ipss->ipsec_outbound_v6_policy_present; - if (!policy_present) - return (mp); - - return (ip_wput_attach_policy(mp, ipha, ip6h, ire, connp, unspec_src, - zoneid)); -} - -/* - * This function does the ire_refrele of the ire passed in as the - * argument. As this function looks up more ires i.e broadcast ires, - * it needs to REFRELE them. Currently, for simplicity we don't - * differentiate the one passed in and looked up here. We always - * REFRELE. - * IPQoS Notes: - * IP policy is invoked if IPP_LOCAL_OUT is enabled. Processing for - * IPsec packets are done in ipsec_out_process. - */ -void -ip_wput_ire(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, int caller, - zoneid_t zoneid) -{ - ipha_t *ipha; -#define rptr ((uchar_t *)ipha) - queue_t *stq; -#define Q_TO_INDEX(stq) (((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex) - uint32_t v_hlen_tos_len; - uint32_t ttl_protocol; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - ipaddr_t orig_src; - ire_t *ire1; - mblk_t *next_mp; - uint_t hlen; - uint16_t *up; - uint32_t max_frag = ire->ire_max_frag; - ill_t *ill = ire_to_ill(ire); - int clusterwide; - uint16_t ip_hdr_included; /* IP header included by ULP? */ - int ipsec_len; - mblk_t *first_mp; - ipsec_out_t *io; - boolean_t conn_dontroute; /* conn value for multicast */ - boolean_t conn_multicast_loop; /* conn value for multicast */ - boolean_t multicast_forward; /* Should we forward ? 
*/ - boolean_t unspec_src; - ill_t *conn_outgoing_ill = NULL; - ill_t *ire_ill; - ill_t *ire1_ill; - ill_t *out_ill; - uint32_t ill_index = 0; - boolean_t multirt_send = B_FALSE; - int err; - ipxmit_state_t pktxmit_state; - ip_stack_t *ipst = ire->ire_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_IRE_START, - "ip_wput_ire_start: q %p", q); - - multicast_forward = B_FALSE; - unspec_src = (connp != NULL && connp->conn_unspec_src); - - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Multirouting case. The bucket where ire is stored - * probably holds other RTF_MULTIRT flagged ire - * to the destination. In this call to ip_wput_ire, - * we attempt to send the packet through all - * those ires. Thus, we first ensure that ire is the - * first RTF_MULTIRT ire in the bucket, - * before walking the ire list. - */ - ire_t *first_ire; - irb_t *irb = ire->ire_bucket; - ASSERT(irb != NULL); - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - IRE_REFHOLD(first_ire); - ire_refrele(ire); - ire = first_ire; - ill = ire_to_ill(ire); - } - IRB_REFRELE(irb); - } - - /* - * conn_outgoing_ill variable is used only in the broadcast loop. 
- * for performance we don't grab the mutexs in the fastpath - */ - if (ire->ire_type == IRE_BROADCAST && connp != NULL && - connp->conn_outgoing_ill != NULL) { - conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ire_refrele(ire); - freemsg(mp); - return; - } - } - - if (mp->b_datap->db_type != M_CTL) { - ipha = (ipha_t *)mp->b_rptr; - } else { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - ipha = (ipha_t *)mp->b_cont->b_rptr; - dst = ipha->ipha_dst; - /* - * For the multicast case, ipsec_out carries conn_dontroute and - * conn_multicast_loop as conn may not be available here. We - * need this for multicast loopback and forwarding which is done - * later in the code. - */ - if (CLASSD(dst)) { - conn_dontroute = io->ipsec_out_dontroute; - conn_multicast_loop = io->ipsec_out_multicast_loop; - /* - * If conn_dontroute is not set or conn_multicast_loop - * is set, we need to do forwarding/loopback. For - * datagrams from ip_wput_multicast, conn_dontroute is - * set to B_TRUE and conn_multicast_loop is set to - * B_FALSE so that we neither do forwarding nor - * loopback. - */ - if (!conn_dontroute || conn_multicast_loop) - multicast_forward = B_TRUE; - } - } - - if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) { - /* - * When a zone sends a packet to another zone, we try to deliver - * the packet under the same conditions as if the destination - * was a real node on the network. To do so, we look for a - * matching route in the forwarding table. - * RTF_REJECT and RTF_BLACKHOLE are handled just like - * ip_newroute() does. - * Note that IRE_LOCAL are special, since they are used - * when the zoneid doesn't match in some cases. 
This means that - * we need to handle ipha_src differently since ire_src_addr - * belongs to the receiving zone instead of the sending zone. - * When ip_restrict_interzone_loopback is set, then - * ire_cache_lookup() ensures that IRE_LOCAL are only used - * for loopback between zones when the logical "Ethernet" would - * have looped them back. - */ - ire_t *src_ire; - - src_ire = ire_ftable_lookup(ipha->ipha_dst, 0, 0, 0, - NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); - if (src_ire != NULL && - !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_lan(ire, src_ire))) { - if (ipha->ipha_src == INADDR_ANY && !unspec_src) - ipha->ipha_src = src_ire->ire_src_addr; - ire_refrele(src_ire); - } else { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - if (src_ire != NULL) { - if (src_ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(src_ire); - freemsg(mp); - return; - } - ire_refrele(src_ire); - } - if (ip_hdr_complete(ipha, zoneid, ipst)) { - /* Failed */ - freemsg(mp); - return; - } - icmp_unreachable(q, mp, ICMP_HOST_UNREACHABLE, zoneid, - ipst); - return; - } - } - - if (mp->b_datap->db_type == M_CTL || - ipss->ipsec_outbound_v4_policy_present) { - mp = ip_wput_ire_parse_ipsec_out(mp, ipha, NULL, ire, connp, - unspec_src, zoneid); - if (mp == NULL) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* - * Trusted Extensions supports all-zones interfaces, so - * zoneid == ALL_ZONES is valid, but IPsec maps ALL_ZONES to - * the global zone. 
- */ - if (zoneid == ALL_ZONES && mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - zoneid = io->ipsec_out_zoneid; - } - } - - first_mp = mp; - ipsec_len = 0; - - if (first_mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - mp = first_mp->b_cont; - ipsec_len = ipsec_out_extra_length(first_mp); - ASSERT(ipsec_len >= 0); - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - /* We already picked up the zoneid from the M_CTL above */ - ASSERT(zoneid == io->ipsec_out_zoneid); - - /* - * Drop M_CTL here if IPsec processing is not needed. - * (Non-IPsec use of M_CTL extracted any information it - * needed above). - */ - if (ipsec_len == 0) { - freeb(first_mp); - first_mp = mp; - } - } - - /* - * Fast path for ip_wput_ire - */ - - ipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - dst = ipha->ipha_dst; - - /* - * ICMP(RAWIP) module should set the ipha_ident to IP_HDR_INCLUDED - * if the socket is a SOCK_RAW type. The transport checksum should - * be provided in the pre-built packet, so we don't need to compute it. - * Also, other application set flags, like DF, should not be altered. - * Other transport MUST pass down zero. 
- */ - ip_hdr_included = ipha->ipha_ident; - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - - if (CLASSD(dst)) { - ip1dbg(("ip_wput_ire: to 0x%x ire %s addr 0x%x\n", - ntohl(dst), - ip_nv_lookup(ire_nv_tbl, ire->ire_type), - ntohl(ire->ire_addr))); - } - -/* Macros to extract header fields from data already in registers */ -#ifdef _BIG_ENDIAN -#define V_HLEN (v_hlen_tos_len >> 24) -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#define PROTO (ttl_protocol & 0xFF) -#else -#define V_HLEN (v_hlen_tos_len & 0xFF) -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#define PROTO (ttl_protocol >> 8) -#endif - - orig_src = src = ipha->ipha_src; - /* (The loop back to "another" is explained down below.) */ -another:; - /* - * Assign an ident value for this packet. We assign idents on - * a per destination basis out of the IRE. There could be - * other threads targeting the same destination, so we have to - * arrange for a atomic increment. Note that we use a 32-bit - * atomic add because it has better performance than its - * 16-bit sibling. - * - * If running in cluster mode and if the source address - * belongs to a replicated service then vector through - * cl_inet_ipident vector to allocate ip identifier - * NOTE: This is a contract private interface with the - * clustering group. 
- */ - clusterwide = 0; - if (cl_inet_ipident) { - ASSERT(cl_inet_isclusterwide); - netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; - - if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, - AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { - ipha->ipha_ident = (*cl_inet_ipident)(stack_id, - IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, - (uint8_t *)(uintptr_t)dst, NULL); - clusterwide = 1; - } - } - if (!clusterwide) { - ipha->ipha_ident = - (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); - } - -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - /* - * Set source address unless sent on an ill or conn_unspec_src is set. - * This is needed to obey conn_unspec_src when packets go through - * ip_newroute + arp. - * Assumes ip_newroute{,_multi} sets the source address as well. - */ - if (src == INADDR_ANY && !unspec_src) { - /* - * Assign the appropriate source address from the IRE if none - * was specified. - */ - ASSERT(ire->ire_ipversion == IPV4_VERSION); - - src = ire->ire_src_addr; - if (connp == NULL) { - ip1dbg(("ip_wput_ire: no connp and no src " - "address for dst 0x%x, using src 0x%x\n", - ntohl(dst), - ntohl(src))); - } - ipha->ipha_src = src; - } - stq = ire->ire_stq; - - /* - * We only allow ire chains for broadcasts since there will - * be multiple IRE_CACHE entries for the same multicast - * address (one per ipif). - */ - next_mp = NULL; - - /* broadcast packet */ - if (ire->ire_type == IRE_BROADCAST) - goto broadcast; - - /* loopback ? 
*/ - if (stq == NULL) - goto nullstq; - - /* The ill_index for outbound ILL */ - ill_index = Q_TO_INDEX(stq); - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - ttl_protocol = ((uint16_t *)ipha)[4]; - - /* pseudo checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { - queue_t *dev_q = stq->q_next; - - /* - * For DIRECT_CAPABLE, we do flow control at - * the time of sending the packet. See - * ILL_SEND_TX(). - */ - if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && - (DEV_Q_FLOW_BLOCKED(dev_q))) - goto blocked; - - if ((PROTO == IPPROTO_UDP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - hlen = (V_HLEN & 0xF) << 2; - up = IPH_UDPH_CHECKSUMP(ipha, hlen); - if (*up != 0) { - IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, - hlen, LENGTH, max_frag, ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, - ip_udp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } - } - } else if (ip_hdr_included != IP_HDR_INCLUDED) { - hlen = (V_HLEN & 0xF) << 2; - if (PROTO == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, hlen); - /* - * The packet header is processed once and for all, even - * in the multirouting case. We disable hardware - * checksum if the packet is multirouted, as it will be - * replicated via several interfaces, and not all of - * them may have this capability. - */ - IP_CKSUM_XMIT(ill, ire, mp, ipha, up, PROTO, hlen, - LENGTH, max_frag, ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } else { - sctp_hdr_t *sctph; - - ASSERT(PROTO == IPPROTO_SCTP); - ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); - sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); - /* - * Zero out the checksum field to ensure proper - * checksum calculation. 
- */ - sctph->sh_chksum = 0; -#ifdef DEBUG - if (!skip_sctp_cksum) -#endif - sctph->sh_chksum = sctp_cksum(mp, hlen); - } - } - - /* - * If this is a multicast packet and originated from ip_wput - * we need to do loopback and forwarding checks. If it comes - * from ip_wput_multicast, we SHOULD not do this. - */ - if (CLASSD(ipha->ipha_dst) && multicast_forward) goto multi_loopback; - - /* checksum */ - cksum += ttl_protocol; - - /* fragment the packet */ - if (max_frag < (uint_t)(LENGTH + ipsec_len)) - goto fragmentit; - /* - * Don't use frag_flag if packet is pre-built or source - * routed or if multicast (since multicast packets do - * not solicit ICMP "packet too big" messages). - */ - if ((ip_hdr_included != IP_HDR_INCLUDED) && - (V_HLEN == IP_SIMPLE_HDR_VERSION || - !ip_source_route_included(ipha)) && - !CLASSD(ipha->ipha_dst)) - ipha->ipha_fragment_offset_and_flags |= - htons(ire->ire_frag_flag); - - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - /* calculate IP header checksum */ - cksum += ipha->ipha_ident; - cksum += (v_hlen_tos_len >> 16)+(v_hlen_tos_len & 0xFFFF); - cksum += ipha->ipha_fragment_offset_and_flags; - - /* IP options present */ - hlen = (V_HLEN & 0xF) - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - if (hlen) - goto checksumoptions; - - /* calculate hdr checksum */ - cksum = ((cksum & 0xFFFF) + (cksum >> 16)); - cksum = ~(cksum + (cksum >> 16)); - ipha->ipha_hdr_checksum = (uint16_t)cksum; - } - if (ipsec_len != 0) { - /* - * We will do the rest of the processing after - * we come back from IPsec in ip_wput_ipsec_out(). - */ - ASSERT(MBLKL(first_mp) >= sizeof (ipsec_out_t)); - - io = (ipsec_out_t *)first_mp->b_rptr; - io->ipsec_out_ill_index = - ire->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; - ipsec_out_process(q, first_mp, ire, 0); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - /* - * In most cases, the emission loop below is entered only - * once. 
Only in the case where the ire holds the - * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT - * flagged ires in the bucket, and send the packet - * through all crossed RTF_MULTIRT routes. - */ - if (ire->ire_flags & RTF_MULTIRT) { - multirt_send = B_TRUE; - } - do { - if (multirt_send) { - irb_t *irb; - /* - * We are in a multiple send case, need to get - * the next ire and make a duplicate of the packet. - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. - */ - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - - /* Got one */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha, - mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ire->ire_ipif->ipif_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - - if (mp == NULL) - goto release_ire_and_ill; - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - /* - * On the outbound path the destination zone will be - * unknown as we're sending this packet out on the - * wire. 
- */ - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst, - ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, - ire->ire_ipif->ipif_ill, ipst); - } - mp->b_prev = SET_BPREV_FLAG(IPP_LOCAL_OUT); - DTRACE_PROBE2(ip__xmit__1, mblk_t *, mp, ire_t *, ire); - - pktxmit_state = ip_xmit_v4(mp, ire, NULL, B_TRUE, connp); - - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { - ip2dbg(("ip_wput_ire: ip_xmit_v4 failed" - "- packet dropped\n")); -release_ire_and_ill: - ire_refrele(ire); - if (next_mp != NULL) { - freemsg(next_mp); - ire_refrele(ire1); - } - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - if (CLASSD(dst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutMcastOctets, - LENGTH); - } - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last copy out"); - IRE_REFRELE(ire); - - if (multirt_send) { - ASSERT(ire1); - /* - * Proceed with the next RTF_MULTIRT ire, - * Also set up the send-to queue accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - ipha = (ipha_t *)mp->b_rptr; - ill_index = Q_TO_INDEX(stq); - ill = (ill_t *)stq->q_ptr; - } - } while (multirt_send); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - - /* - * ire->ire_type == IRE_BROADCAST (minimize diffs) - */ -broadcast: - { - /* - * To avoid broadcast storms, we usually set the TTL to 1 for - * broadcasts. However, if SO_DONTROUTE isn't set, this value - * can be overridden stack-wide through the ip_broadcast_ttl - * ndd tunable, or on a per-connection basis through the - * IP_BROADCAST_TTL socket option. - * - * In the event that we are replying to incoming ICMP packets, - * connp could be NULL. 
- */ - ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; - if (connp != NULL) { - if (connp->conn_dontroute) - ipha->ipha_ttl = 1; - else if (connp->conn_broadcast_ttl != 0) - ipha->ipha_ttl = connp->conn_broadcast_ttl; - } - - /* - * Note that we are not doing a IRB_REFHOLD here. - * Actually we don't care if the list changes i.e - * if somebody deletes an IRE from the list while - * we drop the lock, the next time we come around - * ire_next will be NULL and hence we won't send - * out multiple copies which is fine. - */ - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - ire1 = ire->ire_next; - if (conn_outgoing_ill != NULL) { - while (ire->ire_ipif->ipif_ill != conn_outgoing_ill) { - ASSERT(ire1 == ire->ire_next); - if (ire1 != NULL && ire1->ire_addr == dst) { - ire_refrele(ire); - ire = ire1; - IRE_REFHOLD(ire); - ire1 = ire->ire_next; - continue; - } - rw_exit(&ire->ire_bucket->irb_lock); - /* Did not find a matching ill */ - ip1dbg(("ip_wput_ire: broadcast with no " - "matching IP_BOUND_IF ill %s dst %x\n", - conn_outgoing_ill->ill_name, dst)); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - ill_refrele(conn_outgoing_ill); - return; - } - } else if (ire1 != NULL && ire1->ire_addr == dst) { - /* - * If the next IRE has the same address and is not one - * of the two copies that we need to send, try to see - * whether this copy should be sent at all. This - * assumes that we insert loopbacks first and then - * non-loopbacks. This is acheived by inserting the - * loopback always before non-loopback. - * This is used to send a single copy of a broadcast - * packet out all physical interfaces that have an - * matching IRE_BROADCAST while also looping - * back one copy (to ip_wput_local) for each - * matching physical interface. However, we avoid - * sending packets out different logical that match by - * having ipif_up/ipif_down supress duplicate - * IRE_BROADCASTS. 
- * - * This feature is currently used to get broadcasts - * sent to multiple interfaces, when the broadcast - * address being used applies to multiple interfaces. - * For example, a whole net broadcast will be - * replicated on every connected subnet of - * the target net. - * - * Each zone has its own set of IRE_BROADCASTs, so that - * we're able to distribute inbound packets to multiple - * zones who share a broadcast address. We avoid looping - * back outbound packets in different zones but on the - * same ill, as the application would see duplicates. - * - * This logic assumes that ire_add_v4() groups the - * IRE_BROADCAST entries so that those with the same - * ire_addr are kept together. - */ - ire_ill = ire->ire_ipif->ipif_ill; - if (ire->ire_stq != NULL || ire1->ire_stq == NULL) { - while (ire1 != NULL && ire1->ire_addr == dst) { - ire1_ill = ire1->ire_ipif->ipif_ill; - if (ire1_ill != ire_ill) - break; - ire1 = ire1->ire_next; - } - } - } - ASSERT(multirt_send == B_FALSE); - if (ire1 != NULL && ire1->ire_addr == dst) { - if ((ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_flags & RTF_MULTIRT)) { - /* - * We are in the multirouting case. - * The message must be sent at least - * on both ires. These ires have been - * inserted AFTER the standard ones - * in ip_rt_add(). There are thus no - * other ire entries for the destination - * address in the rest of the bucket - * that do not have the RTF_MULTIRT - * flag. We don't process a copy - * of the message here. This will be - * done in the final sending loop. - */ - multirt_send = B_TRUE; - } else { - next_mp = ip_copymsg(first_mp); - if (next_mp != NULL) - IRE_REFHOLD(ire1); - } - } - rw_exit(&ire->ire_bucket->irb_lock); - } - - if (stq) { - /* - * A non-NULL send-to queue means this packet is going - * out of this machine. 
- */ - out_ill = (ill_t *)stq->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutRequests); - ttl_protocol = ((uint16_t *)ipha)[4]; - /* - * We accumulate the pseudo header checksum in cksum. - * This is pretty hairy code, so watch close. One - * thing to keep in mind is that UDP and TCP have - * stored their respective datagram lengths in their - * checksum fields. This lines things up real nice. - */ - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - /* - * We assume the udp checksum field contains the - * length, so to compute the pseudo header checksum, - * all we need is the protocol number and src/dst. - */ - /* Provide the checksums for UDP and TCP. */ - if ((PROTO == IPPROTO_TCP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - /* hlen gets the number of uchar_ts in the IP header */ - hlen = (V_HLEN & 0xF) << 2; - up = IPH_TCPH_CHECKSUMP(ipha, hlen); - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, ip_tcp_out_sw_cksum_bytes, - LENGTH - hlen); - *up = IP_CSUM(mp, hlen, cksum + IP_TCP_CSUM_COMP); - } else if (PROTO == IPPROTO_SCTP && - (ip_hdr_included != IP_HDR_INCLUDED)) { - sctp_hdr_t *sctph; - - hlen = (V_HLEN & 0xF) << 2; - ASSERT(MBLKL(mp) >= (hlen + sizeof (*sctph))); - sctph = (sctp_hdr_t *)(mp->b_rptr + hlen); - sctph->sh_chksum = 0; -#ifdef DEBUG - if (!skip_sctp_cksum) -#endif - sctph->sh_chksum = sctp_cksum(mp, hlen); - } else { - queue_t *dev_q = stq->q_next; - - if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && - (DEV_Q_FLOW_BLOCKED(dev_q))) { -blocked: - ipha->ipha_ident = ip_hdr_included; - /* - * If we don't have a conn to apply - * backpressure, free the message. - * In the ire_send path, we don't know - * the position to requeue the packet. Rather - * than reorder packets, we just drop this - * packet. 
- */ - if (ipst->ips_ip_output_queue && - connp != NULL && - caller != IRE_SEND) { - if (caller == IP_WSRV) { - idl_tx_list_t *idl_txl; - - idl_txl = - &ipst->ips_idl_tx_list[0]; - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, - first_mp); - conn_drain_insert(connp, - idl_txl); - /* - * This is the service thread, - * and the queue is already - * noenabled. The check for - * canput and the putbq is not - * atomic. So we need to check - * again. - */ - if (canput(stq->q_next)) - connp->conn_did_putbq - = 0; - IP_STAT(ipst, ip_conn_flputbq); - } else { - /* - * We are not the service proc. - * ip_wsrv will be scheduled or - * is already running. - */ - - (void) putq(connp->conn_wq, - first_mp); - } - } else { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(first_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "discard"); - } - ire_refrele(ire); - if (next_mp) { - ire_refrele(ire1); - freemsg(next_mp); - } - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - if ((PROTO == IPPROTO_UDP) && - (ip_hdr_included != IP_HDR_INCLUDED)) { - /* - * hlen gets the number of uchar_ts in the - * IP header - */ - hlen = (V_HLEN & 0xF) << 2; - up = IPH_UDPH_CHECKSUMP(ipha, hlen); - max_frag = ire->ire_max_frag; - if (*up != 0) { - IP_CKSUM_XMIT(out_ill, ire, mp, ipha, - up, PROTO, hlen, LENGTH, max_frag, - ipsec_len, cksum); - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP_STAT(ipst, ip_out_sw_cksum); - IP_STAT_UPDATE(ipst, - ip_udp_out_sw_cksum_bytes, - LENGTH - hlen); - } - } - } - } - /* - * Need to do this even when fragmenting. The local - * loopback can be done without computing checksums - * but forwarding out other interface must be done - * after the IP checksum (and ULP checksums) have been - * computed. - * - * NOTE : multicast_forward is set only if this packet - * originated from ip_wput. 
For packets originating from - * ip_wput_multicast, it is not set. - */ - if (CLASSD(ipha->ipha_dst) && multicast_forward) { -multi_loopback: - ip2dbg(("ip_wput: multicast, loop %d\n", - conn_multicast_loop)); - - /* Forget header checksum offload */ - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - - /* - * Local loopback of multicasts? Check the - * ill. - * - * Note that the loopback function will not come - * in through ip_rput - it will only do the - * client fanout thus we need to do an mforward - * as well. The is different from the BSD - * logic. - */ - if (ill != NULL) { - if (ilm_lookup_ill(ill, ipha->ipha_dst, - ALL_ZONES) != NULL) { - /* - * Pass along the virtual output q. - * ip_wput_local() will distribute the - * packet to all the matching zones, - * except the sending zone when - * IP_MULTICAST_LOOP is false. - */ - ip_multicast_loopback(q, ill, first_mp, - conn_multicast_loop ? 0 : - IP_FF_NO_MCAST_LOOP, zoneid); - } - } - if (ipha->ipha_ttl == 0) { - /* - * 0 => only to this host i.e. we are - * done. We are also done if this was the - * loopback interface since it is sufficient - * to loopback one copy of a multicast packet. - */ - freemsg(first_mp); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "loopback"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* - * ILLF_MULTICAST is checked in ip_newroute - * i.e. we don't need to check it here since - * all IRE_CACHEs come from ip_newroute. - * For multicast traffic, SO_DONTROUTE is interpreted - * to mean only send the packet out the interface - * (optionally specified with IP_MULTICAST_IF) - * and do not forward it out additional interfaces. - * RSVP and the rsvp daemon is an example of a - * protocol and user level process that - * handles it's own routing. Hence, it uses the - * SO_DONTROUTE option to accomplish this. 
- */ - - if (ipst->ips_ip_g_mrouter && !conn_dontroute && - ill != NULL) { - /* Unconditionally redo the checksum */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - - /* - * If this needs to go out secure, we need - * to wait till we finish the IPsec - * processing. - */ - if (ipsec_len == 0 && - ip_mforward(ill, ipha, mp)) { - freemsg(first_mp); - ip1dbg(("ip_wput: mforward failed\n")); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "mforward failed"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - } - } - max_frag = ire->ire_max_frag; - cksum += ttl_protocol; - if (max_frag >= (uint_t)(LENGTH + ipsec_len)) { - /* No fragmentation required for this one. */ - /* - * Don't use frag_flag if packet is pre-built or source - * routed or if multicast (since multicast packets do - * not solicit ICMP "packet too big" messages). - */ - if ((ip_hdr_included != IP_HDR_INCLUDED) && - (V_HLEN == IP_SIMPLE_HDR_VERSION || - !ip_source_route_included(ipha)) && - !CLASSD(ipha->ipha_dst)) - ipha->ipha_fragment_offset_and_flags |= - htons(ire->ire_frag_flag); - - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - /* Complete the IP header checksum. */ - cksum += ipha->ipha_ident; - cksum += (v_hlen_tos_len >> 16)+ - (v_hlen_tos_len & 0xFFFF); - cksum += ipha->ipha_fragment_offset_and_flags; - hlen = (V_HLEN & 0xF) - - IP_SIMPLE_HDR_LENGTH_IN_WORDS; - if (hlen) { -checksumoptions: - /* - * Account for the IP Options in the IP - * header checksum. 
- */ - up = (uint16_t *)(rptr+ - IP_SIMPLE_HDR_LENGTH); - do { - cksum += up[0]; - cksum += up[1]; - up += 2; - } while (--hlen); - } - cksum = ((cksum & 0xFFFF) + (cksum >> 16)); - cksum = ~(cksum + (cksum >> 16)); - ipha->ipha_hdr_checksum = (uint16_t)cksum; - } - if (ipsec_len != 0) { - ipsec_out_process(q, first_mp, ire, ill_index); - if (!next_mp) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - goto next; - } - - /* - * multirt_send has already been handled - * for broadcast, but not yet for multicast - * or IP options. - */ - if (next_mp == NULL) { - if (ire->ire_flags & RTF_MULTIRT) { - multirt_send = B_TRUE; - } - } - - /* - * In most cases, the emission loop below is - * entered only once. Only in the case where - * the ire holds the RTF_MULTIRT flag, do we loop - * to process all RTF_MULTIRT ires in the bucket, - * and send the packet through all crossed - * RTF_MULTIRT routes. - */ - do { - if (multirt_send) { - irb_t *irb; - - irb = ire->ire_bucket; - ASSERT(irb != NULL); - /* - * We are in a multiple send case, - * need to get the next IRE and make - * a duplicate of the packet. - */ - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & - RTF_MULTIRT)) - continue; - - if (ire1->ire_addr != - ire->ire_addr) - continue; - - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - /* Got one */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) - == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* - * Last multiroute ire; don't loop - * anymore. The emission is over - * and next_mp is NULL. 
- */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - out_ill = ire_to_ill(ire); - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, - ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, - mblk_t *, mp); - if (mp == NULL) - goto release_ire_and_ill_2; - - ASSERT(ipsec_len == 0); - mp->b_prev = - SET_BPREV_FLAG(IPP_LOCAL_OUT); - DTRACE_PROBE2(ip__xmit__2, - mblk_t *, mp, ire_t *, ire); - pktxmit_state = ip_xmit_v4(mp, ire, - NULL, B_TRUE, connp); - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { -release_ire_and_ill_2: - if (next_mp) { - freemsg(next_mp); - ire_refrele(ire1); - } - ire_refrele(ire); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "discard MDATA"); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - if (CLASSD(dst)) { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutMcastPkts); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutMcastOctets, - LENGTH); - } else if (ire->ire_type == IRE_BROADCAST) { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutBcastPkts); - } - - if (multirt_send) { - /* - * We are in a multiple send case, - * need to re-enter the sending loop - * using the next ire. - */ - ire_refrele(ire); - ire = ire1; - stq = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - ipha = (ipha_t *)mp->b_rptr; - ill_index = Q_TO_INDEX(stq); - } - } while (multirt_send); - - if (!next_mp) { - /* - * Last copy going out (the ultra-common - * case). Note that we intentionally replicate - * the putnext rather than calling it before - * the next_mp check in hopes of a little - * tail-call action out of the compiler. 
- */ - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last copy out(1)"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - /* More copies going out below. */ - } else { - int offset; -fragmentit: - offset = ntohs(ipha->ipha_fragment_offset_and_flags); - /* - * If this would generate a icmp_frag_needed message, - * we need to handle it before we do the IPsec - * processing. Otherwise, we need to strip the IPsec - * headers before we send up the message to the ULPs - * which becomes messy and difficult. - */ - if (ipsec_len != 0) { - if ((max_frag < (unsigned int)(LENGTH + - ipsec_len)) && (offset & IPH_DF)) { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragFails); - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragReqds); - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = - (uint16_t)ip_csum_hdr(ipha); - icmp_frag_needed(ire->ire_stq, first_mp, - max_frag, zoneid, ipst); - if (!next_mp) { - ire_refrele(ire); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - } else { - /* - * This won't cause a icmp_frag_needed - * message. to be generated. Send it on - * the wire. Note that this could still - * cause fragmentation and all we - * do is the generation of the message - * to the ULP if needed before IPsec. - */ - if (!next_mp) { - ipsec_out_process(q, first_mp, - ire, ill_index); - TRACE_2(TR_FAC_IP, - TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p " - "(%S)", q, - "last ipsec_out_process"); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - ipsec_out_process(q, first_mp, - ire, ill_index); - } - } else { - /* - * Initiate IPPF processing. 
For - * fragmentable packets we finish - * all QOS packet processing before - * calling: - * ip_wput_ire_fragmentit->ip_wput_frag - */ - - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, - ill_index); - if (mp == NULL) { - out_ill = (ill_t *)stq->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - if (next_mp != NULL) { - freemsg(next_mp); - ire_refrele(ire1); - } - ire_refrele(ire); - TRACE_2(TR_FAC_IP, - TR_IP_WPUT_IRE_END, - "ip_wput_ire: q %p (%S)", - q, "discard MDATA"); - if (conn_outgoing_ill != NULL) { - ill_refrele( - conn_outgoing_ill); - } - return; - } - } - if (!next_mp) { - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "last fragmentation"); - ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst, connp); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - ip_wput_ire_fragmentit(mp, ire, - zoneid, ipst, connp); - } - } - } else { -nullstq: - /* A NULL stq means the destination address is local. */ - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ASSERT(ire->ire_ipif != NULL); - if (!next_mp) { - /* - * Is there an "in" and "out" for traffic local - * to a host (loopback)? The code in Solaris doesn't - * explicitly draw a line in its code for in vs out, - * so we've had to draw a line in the sand: ip_wput_ire - * is considered to be the "output" side and - * ip_wput_local to be the "input" side. - */ - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will - * fire the send probe, but not the receive probe. 
- */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out_end, - mblk_t *, first_mp); - - TRACE_2(TR_FAC_IP, TR_IP_WPUT_IRE_END, - "ip_wput_ire_end: q %p (%S)", - q, "local address"); - - if (first_mp != NULL) - ip_wput_local(q, out_ill, ipha, - first_mp, ire, 0, ire->ire_zoneid); - ire_refrele(ire); - if (conn_outgoing_ill != NULL) - ill_refrele(conn_outgoing_ill); - return; - } - - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, first_mp); - - if (first_mp != NULL) - ip_wput_local(q, out_ill, ipha, - first_mp, ire, 0, ire->ire_zoneid); - } -next: - /* - * More copies going out to additional interfaces. - * ire1 has already been held. We don't need the - * "ire" anymore. 
- */ - ire_refrele(ire); - ire = ire1; - ASSERT(ire != NULL && ire->ire_refcnt >= 1 && next_mp != NULL); - mp = next_mp; - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ill = ire_to_ill(ire); - first_mp = mp; - if (ipsec_len != 0) { - ASSERT(first_mp->b_datap->db_type == M_CTL); - mp = mp->b_cont; - } - dst = ire->ire_addr; - ipha = (ipha_t *)mp->b_rptr; - /* - * Restore src so that we will pick up ire->ire_src_addr if src was 0. - * Restore ipha_ident "no checksum" flag. - */ - src = orig_src; - ipha->ipha_ident = ip_hdr_included; - goto another; - -#undef rptr -#undef Q_TO_INDEX -} - -/* - * Routine to allocate a message that is used to notify the ULP about MDT. - * The caller may provide a pointer to the link-layer MDT capabilities, - * or NULL if MDT is to be disabled on the stream. - */ -mblk_t * -ip_mdinfo_alloc(ill_mdt_capab_t *isrc) -{ - mblk_t *mp; - ip_mdt_info_t *mdti; - ill_mdt_capab_t *idst; - - if ((mp = allocb(sizeof (*mdti), BPRI_HI)) != NULL) { - DB_TYPE(mp) = M_CTL; - mp->b_wptr = mp->b_rptr + sizeof (*mdti); - mdti = (ip_mdt_info_t *)mp->b_rptr; - mdti->mdt_info_id = MDT_IOC_INFO_UPDATE; - idst = &(mdti->mdt_capab); - - /* - * If the caller provides us with the capability, copy - * it over into our notification message; otherwise - * we zero out the capability portion. - */ - if (isrc != NULL) - bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); - else - bzero((caddr_t)idst, sizeof (*idst)); - } - return (mp); -} - -/* - * Routine which determines whether MDT can be enabled on the destination - * IRE and IPC combination, and if so, allocates and returns the MDT - * notification mblk that may be used by ULP. We also check if we need to - * turn MDT back to 'on' when certain restrictions prohibiting us to allow - * MDT usage in the past have been lifted. This gets called during IP - * and ULP binding. 
- */ -mblk_t * -ip_mdinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, - ill_mdt_capab_t *mdt_cap) -{ - mblk_t *mp; - boolean_t rc = B_FALSE; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(dst_ire != NULL); - ASSERT(connp != NULL); - ASSERT(mdt_cap != NULL); - - /* - * Currently, we only support simple TCP/{IPv4,IPv6} with - * Multidata, which is handled in tcp_multisend(). This - * is the reason why we do all these checks here, to ensure - * that we don't enable Multidata for the cases which we - * can't handle at the moment. - */ - do { - /* Only do TCP at the moment */ - if (connp->conn_ulp != IPPROTO_TCP) - break; - - /* - * IPsec outbound policy present? Note that we get here - * after calling ipsec_conn_cache_policy() where the global - * policy checking is performed. conn_latch will be - * non-NULL as long as there's a policy defined, - * i.e. conn_out_enforce_policy may be NULL in such case - * when the connection is non-secure, and hence we check - * further if the latch refers to an outbound policy. - */ - if (CONN_IPSEC_OUT_ENCAPSULATED(connp)) - break; - - /* CGTP (multiroute) is enabled? */ - if (dst_ire->ire_flags & RTF_MULTIRT) - break; - - /* Outbound IPQoS enabled? */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - /* - * In this case, we disable MDT for this and all - * future connections going over the interface. - */ - mdt_cap->ill_mdt_on = 0; - break; - } - - /* socket option(s) present? */ - if (!CONN_IS_LSO_MD_FASTPATH(connp)) - break; - - rc = B_TRUE; - /* CONSTCOND */ - } while (0); - - /* Remember the result */ - connp->conn_mdt_ok = rc; - - if (!rc) - return (NULL); - else if (!mdt_cap->ill_mdt_on) { - /* - * If MDT has been previously turned off in the past, and we - * currently can do MDT (due to IPQoS policy removal, etc.) - * then enable it for this interface. 
- */ - mdt_cap->ill_mdt_on = 1; - ip1dbg(("ip_mdinfo_return: reenabling MDT for " - "interface %s\n", ill_name)); - } - - /* Allocate the MDT info mblk */ - if ((mp = ip_mdinfo_alloc(mdt_cap)) == NULL) { - ip0dbg(("ip_mdinfo_return: can't enable Multidata for " - "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); - return (NULL); - } - return (mp); -} - -/* - * Routine to allocate a message that is used to notify the ULP about LSO. - * The caller may provide a pointer to the link-layer LSO capabilities, - * or NULL if LSO is to be disabled on the stream. - */ -mblk_t * -ip_lsoinfo_alloc(ill_lso_capab_t *isrc) -{ - mblk_t *mp; - ip_lso_info_t *lsoi; - ill_lso_capab_t *idst; - - if ((mp = allocb(sizeof (*lsoi), BPRI_HI)) != NULL) { - DB_TYPE(mp) = M_CTL; - mp->b_wptr = mp->b_rptr + sizeof (*lsoi); - lsoi = (ip_lso_info_t *)mp->b_rptr; - lsoi->lso_info_id = LSO_IOC_INFO_UPDATE; - idst = &(lsoi->lso_capab); - - /* - * If the caller provides us with the capability, copy - * it over into our notification message; otherwise - * we zero out the capability portion. - */ - if (isrc != NULL) - bcopy((caddr_t)isrc, (caddr_t)idst, sizeof (*idst)); - else - bzero((caddr_t)idst, sizeof (*idst)); - } - return (mp); -} - -/* - * Routine which determines whether LSO can be enabled on the destination - * IRE and IPC combination, and if so, allocates and returns the LSO - * notification mblk that may be used by ULP. We also check if we need to - * turn LSO back to 'on' when certain restrictions prohibiting us to allow - * LSO usage in the past have been lifted. This gets called during IP - * and ULP binding. 
- */ -mblk_t * -ip_lsoinfo_return(ire_t *dst_ire, conn_t *connp, char *ill_name, - ill_lso_capab_t *lso_cap) -{ - mblk_t *mp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(dst_ire != NULL); - ASSERT(connp != NULL); - ASSERT(lso_cap != NULL); - - connp->conn_lso_ok = B_TRUE; - - if ((connp->conn_ulp != IPPROTO_TCP) || - CONN_IPSEC_OUT_ENCAPSULATED(connp) || - (dst_ire->ire_flags & RTF_MULTIRT) || - !CONN_IS_LSO_MD_FASTPATH(connp) || - (IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { - connp->conn_lso_ok = B_FALSE; - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - /* - * Disable LSO for this and all future connections going - * over the interface. - */ - lso_cap->ill_lso_on = 0; - } - } - - if (!connp->conn_lso_ok) - return (NULL); - else if (!lso_cap->ill_lso_on) { - /* - * If LSO has been previously turned off in the past, and we - * currently can do LSO (due to IPQoS policy removal, etc.) - * then enable it for this interface. - */ - lso_cap->ill_lso_on = 1; - ip1dbg(("ip_mdinfo_return: reenabling LSO for interface %s\n", - ill_name)); - } - - /* Allocate the LSO info mblk */ - if ((mp = ip_lsoinfo_alloc(lso_cap)) == NULL) - ip0dbg(("ip_lsoinfo_return: can't enable LSO for " - "conn %p on %s (ENOMEM)\n", (void *)connp, ill_name)); - - return (mp); -} - -/* - * Create destination address attribute, and fill it with the physical - * destination address and SAP taken from the template DL_UNITDATA_REQ - * message block. 
- */ -boolean_t -ip_md_addr_attr(multidata_t *mmd, pdesc_t *pd, const mblk_t *dlmp) -{ - dl_unitdata_req_t *dlurp; - pattr_t *pa; - pattrinfo_t pa_info; - pattr_addr_t **das = (pattr_addr_t **)&pa_info.buf; - uint_t das_len, das_off; - - ASSERT(dlmp != NULL); - - dlurp = (dl_unitdata_req_t *)dlmp->b_rptr; - das_len = dlurp->dl_dest_addr_length; - das_off = dlurp->dl_dest_addr_offset; - - pa_info.type = PATTR_DSTADDRSAP; - pa_info.len = sizeof (**das) + das_len - 1; - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - ASSERT(*das != NULL); - (*das)->addr_is_group = 0; - (*das)->addr_len = (uint8_t)das_len; - bcopy((caddr_t)dlurp + das_off, (*das)->addr, das_len); - } - - return (pa != NULL); -} - -/* - * Create hardware checksum attribute and fill it with the values passed. - */ -boolean_t -ip_md_hcksum_attr(multidata_t *mmd, pdesc_t *pd, uint32_t start_offset, - uint32_t stuff_offset, uint32_t end_offset, uint32_t flags) -{ - pattr_t *pa; - pattrinfo_t pa_info; - - ASSERT(mmd != NULL); - - pa_info.type = PATTR_HCKSUM; - pa_info.len = sizeof (pattr_hcksum_t); - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - pattr_hcksum_t *hck = (pattr_hcksum_t *)pa_info.buf; - - hck->hcksum_start_offset = start_offset; - hck->hcksum_stuff_offset = stuff_offset; - hck->hcksum_end_offset = end_offset; - hck->hcksum_flags = flags; - } - return (pa != NULL); -} - -/* - * Create zerocopy attribute and fill it with the specified flags - */ -boolean_t -ip_md_zcopy_attr(multidata_t *mmd, pdesc_t *pd, uint_t flags) -{ - pattr_t *pa; - pattrinfo_t pa_info; - - ASSERT(mmd != NULL); - pa_info.type = PATTR_ZCOPY; - pa_info.len = sizeof (pattr_zcopy_t); - - /* create and associate the attribute */ - pa = mmd_addpattr(mmd, pd, &pa_info, B_TRUE, KM_NOSLEEP); - if (pa != NULL) { - pattr_zcopy_t *zcopy = (pattr_zcopy_t *)pa_info.buf; - - 
zcopy->zcopy_flags = flags; - } - return (pa != NULL); -} - -/* - * Check if ip_wput_frag_mdt() and ip_wput_frag_mdt_v6() can handle a message - * block chain. We could rewrite to handle arbitrary message block chains but - * that would make the code complicated and slow. Right now there three - * restrictions: - * - * 1. The first message block must contain the complete IP header and - * at least 1 byte of payload data. - * 2. At most MULTIDATA_MAX_PBUFS non-empty message blocks are allowed - * so that we can use a single Multidata message. - * 3. No frag must be distributed over two or more message blocks so - * that we don't need more than two packet descriptors per frag. - * - * The above restrictions allow us to support userland applications (which - * will send down a single message block) and NFS over UDP (which will - * send down a chain of at most three message blocks). - * - * We also don't use MDT for payloads with less than or equal to - * ip_wput_frag_mdt_min bytes because it would cause too much overhead. - */ -boolean_t -ip_can_frag_mdt(mblk_t *mp, ssize_t hdr_len, ssize_t len) -{ - int blocks; - ssize_t total, missing, size; - - ASSERT(mp != NULL); - ASSERT(hdr_len > 0); - - size = MBLKL(mp) - hdr_len; - if (size <= 0) - return (B_FALSE); - - /* The first mblk contains the header and some payload. */ - blocks = 1; - total = size; - size %= len; - missing = (size == 0) ? 0 : (len - size); - mp = mp->b_cont; - - while (mp != NULL) { - /* - * Give up if we encounter a zero length message block. - * In practice, this should rarely happen and therefore - * not worth the trouble of freeing and re-linking the - * mblk from the chain to handle such case. - */ - if ((size = MBLKL(mp)) == 0) - return (B_FALSE); - - /* Too many payload buffers for a single Multidata message? */ - if (++blocks > MULTIDATA_MAX_PBUFS) - return (B_FALSE); - - total += size; - /* Is a frag distributed over two or more message blocks? 
*/ - if (missing > size) - return (B_FALSE); - size -= missing; - - size %= len; - missing = (size == 0) ? 0 : (len - size); - - mp = mp->b_cont; - } - - return (total > ip_wput_frag_mdt_min); -} - -/* - * Outbound IPv4 fragmentation routine using MDT. - */ -static void -ip_wput_frag_mdt(ire_t *ire, mblk_t *mp, ip_pkt_t pkt_type, int len, - uint32_t frag_flag, int offset) -{ - ipha_t *ipha_orig; - int i1, ip_data_end; - uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; - mblk_t *hdr_mp, *md_mp = NULL; - unsigned char *hdr_ptr, *pld_ptr; - multidata_t *mmd; - ip_pdescinfo_t pdi; - ill_t *ill; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(MBLKL(mp) > sizeof (ipha_t)); - - ill = ire_to_ill(ire); - ASSERT(ill != NULL); - - ipha_orig = (ipha_t *)mp->b_rptr; - mp->b_rptr += sizeof (ipha_t); - - /* Calculate how many packets we will send out */ - i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); - pkts = (i1 + len - 1) / len; - ASSERT(pkts > 1); - - /* Allocate a message block which will hold all the IP Headers. */ - wroff = ipst->ips_ip_wroff_extra; - hdr_chunk_len = wroff + IP_SIMPLE_HDR_LENGTH; - - i1 = pkts * hdr_chunk_len; - /* - * Create the header buffer, Multidata and destination address - * and SAP attribute that should be associated with it. - */ - if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || - ((hdr_mp->b_wptr += i1), - (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || - !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { - freemsg(mp); - if (md_mp == NULL) { - freemsg(hdr_mp); - } else { -free_mmd: IP_STAT(ipst, ip_frag_mdt_discarded); - freemsg(md_mp); - } - IP_STAT(ipst, ip_frag_mdt_allocfail); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; - } - IP_STAT(ipst, ip_frag_mdt_allocd); - - /* - * Add a payload buffer to the Multidata; this operation must not - * fail, or otherwise our logic in this routine is broken. 
There - * is no memory allocation done by the routine, so any returned - * failure simply tells us that we've done something wrong. - * - * A failure tells us that either we're adding the same payload - * buffer more than once, or we're trying to add more buffers than - * allowed. None of the above cases should happen, and we panic - * because either there's horrible heap corruption, and/or - * programming mistake. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - - hdr_ptr = hdr_mp->b_rptr; - pld_ptr = mp->b_rptr; - - /* Establish the ending byte offset, based on the starting offset. */ - offset <<= 3; - ip_data_end = offset + ntohs(ipha_orig->ipha_length) - - IP_SIMPLE_HDR_LENGTH; - - pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; - - while (pld_ptr < mp->b_wptr) { - ipha_t *ipha; - uint16_t offset_and_flags; - uint16_t ip_len; - int error; - - ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); - ipha = (ipha_t *)(hdr_ptr + wroff); - ASSERT(OK_32PTR(ipha)); - *ipha = *ipha_orig; - - if (ip_data_end - offset > len) { - offset_and_flags = IPH_MF; - } else { - /* - * Last frag. Set len to the length of this last piece. - */ - len = ip_data_end - offset; - /* A frag of a frag might have IPH_MF non-zero */ - offset_and_flags = - ntohs(ipha->ipha_fragment_offset_and_flags) & - IPH_MF; - } - offset_and_flags |= (uint16_t)(offset >> 3); - offset_and_flags |= (uint16_t)frag_flag; - /* Store the offset and flags in the IP header. */ - ipha->ipha_fragment_offset_and_flags = htons(offset_and_flags); - - /* Store the length in the IP header. */ - ip_len = (uint16_t)(len + IP_SIMPLE_HDR_LENGTH); - ipha->ipha_length = htons(ip_len); - - /* - * Set the IP header checksum. Note that mp is just - * the header, so this is easy to pass to ip_csum. 
- */ - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - - DTRACE_IP7(send, mblk_t *, md_mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, - NULL, int, 0); - - /* - * Record offset and size of header and data of the next packet - * in the multidata message. - */ - PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, IP_SIMPLE_HDR_LENGTH, 0); - PDESC_PLD_INIT(&pdi); - i1 = MIN(mp->b_wptr - pld_ptr, len); - ASSERT(i1 > 0); - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); - if (i1 == len) { - pld_ptr += len; - } else { - i1 = len - i1; - mp = mp->b_cont; - ASSERT(mp != NULL); - ASSERT(MBLKL(mp) >= i1); - /* - * Attach the next payload message block to the - * multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); - pld_ptr = mp->b_rptr + i1; - } - - if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates that we - * have passed in invalid pdesc info or parameters - * to mmd_addpdesc, which must not happen. - * - * EINVAL is a result of failure on boundary checks - * against the pdesc info contents. It should not - * happen, and we panic because either there's - * horrible heap corruption, and/or programming - * mistake. - */ - if (error != ENOMEM) { - cmn_err(CE_PANIC, "ip_wput_frag_mdt: " - "pdesc logic error detected for " - "mmd %p pinfo %p (%d)\n", - (void *)mmd, (void *)&pdi, error); - /* NOTREACHED */ - } - IP_STAT(ipst, ip_frag_mdt_addpdescfail); - /* Free unattached payload message blocks as well */ - md_mp->b_cont = mp->b_cont; - goto free_mmd; - } - - /* Advance fragment offset. */ - offset += len; - - /* Advance to location for next header in the buffer. */ - hdr_ptr += hdr_chunk_len; - - /* Did we reach the next payload message block? 
*/ - if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { - mp = mp->b_cont; - /* - * Attach the next message block with payload - * data to the multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - pld_ptr = mp->b_rptr; - } - } - - ASSERT(hdr_mp->b_wptr == hdr_ptr); - ASSERT(mp->b_wptr == pld_ptr); - - /* Update IP statistics */ - IP_STAT_UPDATE(ipst, ip_frag_mdt_pkt_out, pkts); - - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); - - len = ntohs(ipha_orig->ipha_length) + (pkts - 1) * IP_SIMPLE_HDR_LENGTH; - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, len); - - if (pkt_type == OB_PKT) { - ire->ire_ob_pkt_count += pkts; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); - } else { - /* The type is IB_PKT in the forwarding path. */ - ire->ire_ib_pkt_count += pkts; - ASSERT(!IRE_IS_LOCAL(ire)); - if (ire->ire_type & IRE_BROADCAST) { - atomic_add_32(&ire->ire_ipif->ipif_ib_pkt_count, pkts); - } else { - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams, pkts); - atomic_add_32(&ire->ire_ipif->ipif_fo_pkt_count, pkts); - } - } - ire->ire_last_used_time = lbolt; - /* Send it down */ - putnext(ire->ire_stq, md_mp); - return; - -pbuf_panic: - cmn_err(CE_PANIC, "ip_wput_frag_mdt: payload buffer logic " - "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, - pbuf_idx); - /* NOTREACHED */ -} - /* * Outbound IP fragmentation routine. - * - * NOTE : This routine does not ire_refrele the ire that is passed in - * as the argument. + * Assumes the caller has checked whether or not fragmentation should + * be allowed. Here we copy the DF bit from the header to all the generated + * fragments. 
*/ -static void -ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, - uint32_t frag_flag, zoneid_t zoneid, ip_stack_t *ipst, conn_t *connp) +int +ip_fragment_v4(mblk_t *mp_orig, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, + zoneid_t nolzid, pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie) { int i1; - mblk_t *ll_hdr_mp; - int ll_hdr_len; int hdr_len; mblk_t *hdr_mp; ipha_t *ipha; int ip_data_end; int len; - mblk_t *mp = mp_orig, *mp1; + mblk_t *mp = mp_orig; int offset; - queue_t *q; - uint32_t v_hlen_tos_len; - mblk_t *first_mp; - boolean_t mctl_present; - ill_t *ill; - ill_t *out_ill; - mblk_t *xmit_mp; + ill_t *ill = nce->nce_ill; + ip_stack_t *ipst = ill->ill_ipst; mblk_t *carve_mp; - ire_t *ire1 = NULL; - ire_t *save_ire = NULL; - mblk_t *next_mp = NULL; - boolean_t last_frag = B_FALSE; - boolean_t multirt_send = B_FALSE; - ire_t *first_ire = NULL; - irb_t *irb = NULL; - mib2_ipIfStatsEntry_t *mibptr = NULL; - - ill = ire_to_ill(ire); - mibptr = (ill != NULL) ? ill->ill_ip_mib : &ipst->ips_ip_mib; + uint32_t frag_flag; + uint_t priority = mp->b_band; + int error = 0; - BUMP_MIB(mibptr, ipIfStatsOutFragReqds); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); - if (max_frag == 0) { - ip1dbg(("ip_wput_frag: ire frag size is 0" - " - dropping packet\n")); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (pkt_len != msgdsize(mp)) { + ip0dbg(("Packet length mismatch: %d, %ld\n", + pkt_len, msgdsize(mp))); freemsg(mp); - return; + return (EINVAL); } - /* - * IPsec does not allow hw accelerated packets to be fragmented - * This check is made in ip_wput_ipsec_out prior to coming here - * via ip_wput_ire_fragmentit. - * - * If at this point we have an ire whose ARP request has not - * been sent out, we call ip_xmit_v4->ire_arpresolve to trigger - * sending of ARP query and change ire's state to ND_INCOMPLETE. 
- * This packet and all fragmentable packets for this ire will - * continue to get dropped while ire_nce->nce_state remains in - * ND_INCOMPLETE. Post-ARP resolution, after ire's nce_state changes to - * ND_REACHABLE, all subsquent large packets for this ire will - * get fragemented and sent out by this function. - */ - if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { - /* If nce_state is ND_INITIAL, trigger ARP query */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - ip1dbg(("ip_wput_frag: mac address for ire is unresolved" - " - dropping packet\n")); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (max_frag == 0) { + ip1dbg(("ip_fragment_v4: max_frag is zero. Dropping packet\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: zero max_frag", mp, ill); freemsg(mp); - return; - } - - TRACE_0(TR_FAC_IP, TR_IP_WPUT_FRAG_START, - "ip_wput_frag_start:"); - - if (mp->b_datap->db_type == M_CTL) { - first_mp = mp; - mp_orig = mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - first_mp = mp; - mctl_present = B_FALSE; + return (EINVAL); } ASSERT(MBLKL(mp) >= sizeof (ipha_t)); ipha = (ipha_t *)mp->b_rptr; + ASSERT(ntohs(ipha->ipha_length) == pkt_len); + frag_flag = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_DF; /* - * If the Don't Fragment flag is on, generate an ICMP destination - * unreachable, fragmentation needed. - */ - offset = ntohs(ipha->ipha_fragment_offset_and_flags); - if (offset & IPH_DF) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - if (is_system_labeled()) { - max_frag = tsol_pmtu_adjust(mp, ire->ire_max_frag, - ire->ire_max_frag - max_frag, AF_INET); - } - /* - * Need to compute hdr checksum if called from ip_wput_ire. - * Note that ip_rput_forward verifies the checksum before - * calling this routine so in that case this is a noop. 
- */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - icmp_frag_needed(ire->ire_stq, first_mp, max_frag, zoneid, - ipst); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "don't fragment"); - return; - } - /* - * Labeled systems adjust max_frag if they add a label - * to send the correct path mtu. We need the real mtu since we - * are fragmenting the packet after label adjustment. - */ - if (is_system_labeled()) - max_frag = ire->ire_max_frag; - if (mctl_present) - freeb(first_mp); - /* * Establish the starting offset. May not be zero if we are fragging * a fragment that is being forwarded. */ - offset = offset & IPH_OFFSET; + offset = ntohs(ipha->ipha_fragment_offset_and_flags) & IPH_OFFSET; /* TODO why is this test needed? */ - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - if (((max_frag - LENGTH) & ~7) < 8) { + if (((max_frag - ntohs(ipha->ipha_length)) & ~7) < 8) { /* TODO: notify ulp somehow */ - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: bad starting offset", mp, ill); freemsg(mp); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "len < 8"); - return; + return (EINVAL); } - hdr_len = (V_HLEN & 0xF) << 2; - + hdr_len = IPH_HDR_LENGTH(ipha); ipha->ipha_hdr_checksum = 0; /* @@ -24173,40 +11742,14 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ len = (max_frag - hdr_len) & ~7; - /* Check if we can use MDT to send out the frags. 
*/ - ASSERT(!IRE_IS_LOCAL(ire)); - if (hdr_len == IP_SIMPLE_HDR_LENGTH && - ipst->ips_ip_multidata_outbound && - !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && - ill != NULL && ILL_MDT_CAPABLE(ill) && - IP_CAN_FRAG_MDT(mp, IP_SIMPLE_HDR_LENGTH, len)) { - ASSERT(ill->ill_mdt_capab != NULL); - if (!ill->ill_mdt_capab->ill_mdt_on) { - /* - * If MDT has been previously turned off in the past, - * and we currently can do MDT (due to IPQoS policy - * removal, etc.) then enable it for this interface. - */ - ill->ill_mdt_capab->ill_mdt_on = 1; - ip1dbg(("ip_wput_frag: enabled MDT for interface %s\n", - ill->ill_name)); - } - ip_wput_frag_mdt(ire, mp, pkt_type, len, frag_flag, - offset); - return; - } - /* Get a copy of the header for the trailing frags */ - hdr_mp = ip_wput_frag_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst, + hdr_mp = ip_fragment_copyhdr((uchar_t *)ipha, hdr_len, offset, ipst, mp); - if (!hdr_mp) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + if (hdr_mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: no hdr_mp", mp, ill); freemsg(mp); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "couldn't copy hdr"); - return; + return (ENOBUFS); } /* Store the starting offset, with the MoreFrags flag. */ @@ -24233,279 +11776,28 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, * original IP header. */ if (!(mp = ip_carve_mp(&mp_orig, i1))) { - BUMP_MIB(mibptr, ipIfStatsOutFragFails); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: could not carve mp", mp_orig, ill); freeb(hdr_mp); freemsg(mp_orig); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "couldn't carve first"); - return; + return (ENOBUFS); } - /* - * Multirouting case. Each fragment is replicated - * via all non-condemned RTF_MULTIRT routes - * currently resolved. 
- * We ensure that first_ire is the first RTF_MULTIRT - * ire in the bucket. - */ - if (ire->ire_flags & RTF_MULTIRT) { - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - multirt_send = B_TRUE; - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if (first_ire != NULL) { - if (first_ire != ire) { - IRE_REFHOLD(first_ire); - /* - * Do not release the ire passed in - * as the argument. - */ - ire = first_ire; - } else { - first_ire = NULL; - } - } - IRB_REFRELE(irb); - - /* - * Save the first ire; we will need to restore it - * for the trailing frags. - * We REFHOLD save_ire, as each iterated ire will be - * REFRELEd. - */ - save_ire = ire; - IRE_REFHOLD(save_ire); - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); - /* - * First fragment emission loop. - * In most cases, the emission loop below is entered only - * once. Only in the case where the ire holds the RTF_MULTIRT - * flag, do we loop to process all RTF_MULTIRT ires in the - * bucket, and send the fragment through all crossed - * RTF_MULTIRT routes. - */ - do { - if (ire->ire_flags & RTF_MULTIRT) { - /* - * We are in a multiple send case, need to get - * the next ire and make a copy of the packet. - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. - * - * We have to take into account the MTU of - * each walked ire. max_frag is set by the - * the caller and generally refers to - * the primary ire entry. Here we ensure that - * no route with a lower MTU will be used, as - * fragments are carved once for all ires, - * then replicated. 
- */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < max_frag) { - ip_multirt_bad_mtu(ire1, max_frag); - continue; - } - - /* Got one. */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - ll_hdr_len = 0; - LOCK_IRE_FP_MP(ire); - ll_hdr_mp = ire->ire_nce->nce_fp_mp; - if (ll_hdr_mp != NULL) { - ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); - ll_hdr_len = ll_hdr_mp->b_wptr - ll_hdr_mp->b_rptr; - } else { - ll_hdr_mp = ire->ire_nce->nce_res_mp; - } - - /* If there is a transmit header, get a copy for this frag. */ - /* - * TODO: should check db_ref before calling ip_carve_mp since - * it might give us a dup. - */ - if (!ll_hdr_mp) { - /* No xmit header. */ - xmit_mp = mp; - - /* We have a link-layer header that can fit in our mblk. 
*/ - } else if (mp->b_datap->db_ref == 1 && - ll_hdr_len != 0 && - ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { - /* M_DATA fastpath */ - mp->b_rptr -= ll_hdr_len; - bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, ll_hdr_len); - xmit_mp = mp; - - /* Corner case if copyb has failed */ - } else if (!(xmit_mp = copyb(ll_hdr_mp))) { - UNLOCK_IRE_FP_MP(ire); - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - freeb(hdr_mp); - freemsg(mp); - freemsg(mp_orig); - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "discard"); - - if (multirt_send) { - ASSERT(ire1); - ASSERT(next_mp); - - freemsg(next_mp); - ire_refrele(ire1); - } - if (save_ire != NULL) - IRE_REFRELE(save_ire); - - if (first_ire != NULL) - ire_refrele(first_ire); - return; - - /* - * Case of res_mp OR the fastpath mp can't fit - * in the mblk - */ - } else { - xmit_mp->b_cont = mp; - - /* - * Get priority marking, if any. - * We propagate the CoS marking from the - * original packet that went to QoS processing - * in ip_wput_ire to the newly carved mp. 
- */ - if (DB_TYPE(xmit_mp) == M_DATA) - xmit_mp->b_band = mp->b_band; - } - UNLOCK_IRE_FP_MP(ire); - - q = ire->ire_stq; - out_ill = (ill_t *)q->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, xmit_mp); - - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, xmit_mp); - - if (xmit_mp != NULL) { - DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); - - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp); - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutOctets, i1); - - if (pkt_type != OB_PKT) { - /* - * Update the packet count and MIB stats - * of trailing RTF_MULTIRT ires. - */ - UPDATE_OB_PKT_COUNT(ire); - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragReqds); - } - } - - if (multirt_send) { - /* - * We are in a multiple send case; look for - * the next ire and re-enter the loop. 
- */ - ASSERT(ire1); - ASSERT(next_mp); - /* REFRELE the current ire before looping */ - ire_refrele(ire); - ire = ire1; - ire1 = NULL; - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - - ASSERT(ire1 == NULL); - - /* Restore the original ire; we need it for the trailing frags */ - if (save_ire != NULL) { - /* REFRELE the last iterated ire */ - ire_refrele(ire); - /* save_ire has been REFHOLDed */ - ire = save_ire; - save_ire = NULL; - q = ire->ire_stq; + error = postfragfn(mp, nce, ixaflags, i1, xmit_hint, szone, nolzid, + ixa_cookie); + if (error != 0 && error != EWOULDBLOCK) { + /* No point in sending the other fragments */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: postfragfn failed", mp_orig, ill); + freeb(hdr_mp); + freemsg(mp_orig); + return (error); } - if (pkt_type == OB_PKT) { - UPDATE_OB_PKT_COUNT(ire); - } else { - out_ill = (ill_t *)q->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - UPDATE_IB_PKT_COUNT(ire); - } + /* No need to redo state machine in loop */ + ixaflags &= ~IXAF_REACH_CONF; /* Advance the offset to the second frag starting point. */ offset += len; @@ -24547,7 +11839,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, break; } /* Get priority marking, if any. */ - mp->b_band = carve_mp->b_band; + mp->b_band = priority; mp->b_cont = carve_mp; } ipha = (ipha_t *)mp->b_rptr; @@ -24581,7 +11873,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, } else { mp = hdr_mp; /* Get priority marking, if any. */ - mp->b_band = carve_mp->b_band; + mp->b_band = priority; mp->b_cont = carve_mp; } ipha = (ipha_t *)mp->b_rptr; @@ -24605,254 +11897,40 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, */ ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); - /* Attach a transmit header, if any, and ship it. 
*/ - if (pkt_type == OB_PKT) { - UPDATE_OB_PKT_COUNT(ire); - } else { - out_ill = (ill_t *)q->q_ptr; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams); - UPDATE_IB_PKT_COUNT(ire); - } - - if (ire->ire_flags & RTF_MULTIRT) { - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - multirt_send = B_TRUE; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); - /* - * Save the original ire; we will need to restore it - * for the tailing frags. - */ - save_ire = ire; - IRE_REFHOLD(save_ire); + error = postfragfn(mp, nce, ixaflags, ip_len, xmit_hint, szone, + nolzid, ixa_cookie); + /* All done if we just consumed the hdr_mp. */ + if (mp == hdr_mp) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); + return (error); } - /* - * Emission loop for this fragment, similar - * to what is done for the first fragment. - */ - do { - if (multirt_send) { - /* - * We are in a multiple send case, need to get - * the next ire and make a copy of the packet. - */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & RTF_MULTIRT)) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < max_frag) { - ip_multirt_bad_mtu(ire1, - max_frag); - continue; - } - - /* Got one. */ - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. 
*/ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - /* Update transmit header */ - ll_hdr_len = 0; - LOCK_IRE_FP_MP(ire); - ll_hdr_mp = ire->ire_nce->nce_fp_mp; - if (ll_hdr_mp != NULL) { - ASSERT(ll_hdr_mp->b_datap->db_type == M_DATA); - ll_hdr_len = MBLKL(ll_hdr_mp); - } else { - ll_hdr_mp = ire->ire_nce->nce_res_mp; - } - - if (!ll_hdr_mp) { - xmit_mp = mp; - - /* - * We have link-layer header that can fit in - * our mblk. - */ - } else if (mp->b_datap->db_ref == 1 && - ll_hdr_len != 0 && - ll_hdr_len <= mp->b_rptr - mp->b_datap->db_base) { - /* M_DATA fastpath */ - mp->b_rptr -= ll_hdr_len; - bcopy(ll_hdr_mp->b_rptr, mp->b_rptr, - ll_hdr_len); - xmit_mp = mp; - - /* - * Case of res_mp OR the fastpath mp can't fit - * in the mblk - */ - } else if ((xmit_mp = copyb(ll_hdr_mp)) != NULL) { - xmit_mp->b_cont = mp; - /* Get priority marking, if any. */ - if (DB_TYPE(xmit_mp) == M_DATA) - xmit_mp->b_band = mp->b_band; - - /* Corner case if copyb failed */ - } else { - /* - * Exit both the replication and - * fragmentation loops. 
- */ - UNLOCK_IRE_FP_MP(ire); - goto drop_pkt; - } - UNLOCK_IRE_FP_MP(ire); - - mp1 = mp; - out_ill = (ill_t *)q->q_ptr; - - BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutFragCreates); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha, mblk_t *, xmit_mp); - - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, out_ill, ipha, xmit_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__physical__out__end, - mblk_t *, xmit_mp); - - if (mp != mp1 && hdr_mp == mp1) - hdr_mp = mp; - if (mp != mp1 && mp_orig == mp1) - mp_orig = mp; - - if (xmit_mp != NULL) { - DTRACE_IP7(send, mblk_t *, xmit_mp, conn_t *, - NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, out_ill, ipha_t *, - ipha, ip6_t *, NULL, int, 0); - - ILL_SEND_TX(out_ill, ire, connp, - xmit_mp, 0, connp); - - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutOctets, ip_len); - - if (pkt_type != OB_PKT) { - /* - * Update the packet count of trailing - * RTF_MULTIRT ires. - */ - UPDATE_OB_PKT_COUNT(ire); - } - } - - /* All done if we just consumed the hdr_mp. */ - if (mp == hdr_mp) { - last_frag = B_TRUE; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutFragOKs); - } - - if (multirt_send) { - /* - * We are in a multiple send case; look for - * the next ire and re-enter the loop. 
- */ - ASSERT(ire1); - ASSERT(next_mp); - /* REFRELE the current ire before looping */ - ire_refrele(ire); - ire = ire1; - ire1 = NULL; - q = ire->ire_stq; - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - /* - * Restore the original ire; we need it for the - * trailing frags - */ - if (save_ire != NULL) { - ASSERT(ire1 == NULL); - /* REFRELE the last iterated ire */ - ire_refrele(ire); - /* save_ire has been REFHOLDed */ - ire = save_ire; - q = ire->ire_stq; - save_ire = NULL; + if (error != 0 && error != EWOULDBLOCK) { + DTRACE_PROBE2(ip__xmit__frag__fail, ill_t *, ill, + mblk_t *, hdr_mp); + /* No point in sending the other fragments */ + break; } - if (last_frag) { - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "consumed hdr_mp"); - - if (first_ire != NULL) - ire_refrele(first_ire); - return; - } /* Otherwise, advance and loop. */ offset += len; } - -drop_pkt: /* Clean up following allocation failure. */ - BUMP_MIB(mibptr, ipIfStatsOutFragFails); - freemsg(mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: loop ended", NULL, ill); if (mp != hdr_mp) freeb(hdr_mp); if (mp != mp_orig) freemsg(mp_orig); - - if (save_ire != NULL) - IRE_REFRELE(save_ire); - if (first_ire != NULL) - ire_refrele(first_ire); - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_FRAG_END, - "ip_wput_frag_end:(%S)", - "end--alloc failure"); + return (error); } /* * Copy the header plus those options which have the copy bit set - * src is the template to make sure we preserve the cred for TX purposes. */ static mblk_t * -ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, +ip_fragment_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, mblk_t *src) { mblk_t *mp; @@ -24908,310 +11986,13 @@ ip_wput_frag_copyhdr(uchar_t *rptr, int hdr_len, int offset, ip_stack_t *ipst, } /* - * Delivery to local recipients including fanout to multiple recipients. - * Does not do checksumming of UDP/TCP. 
- * Note: q should be the read side queue for either the ill or conn. - * Note: rq should be the read side q for the lower (ill) stream. - * We don't send packets to IPPF processing, thus the last argument - * to all the fanout calls are B_FALSE. - */ -void -ip_wput_local(queue_t *q, ill_t *ill, ipha_t *ipha, mblk_t *mp, ire_t *ire, - int fanout_flags, zoneid_t zoneid) -{ - uint32_t protocol; - mblk_t *first_mp; - boolean_t mctl_present; - int ire_type; -#define rptr ((uchar_t *)ipha) - ip_stack_t *ipst = ill->ill_ipst; - - TRACE_1(TR_FAC_IP, TR_IP_WPUT_LOCAL_START, - "ip_wput_local_start: q %p", q); - - if (ire != NULL) { - ire_type = ire->ire_type; - } else { - /* - * Only ip_multicast_loopback() calls us with a NULL ire. If the - * packet is not multicast, we can't tell the ire type. - */ - ASSERT(CLASSD(ipha->ipha_dst)); - ire_type = IRE_BROADCAST; - } - - first_mp = mp; - if (first_mp->b_datap->db_type == M_CTL) { - ipsec_out_t *io = (ipsec_out_t *)first_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * This ipsec_out_t was allocated in ip_wput - * for multicast packets to store the ill_index. - * As this is being delivered locally, we don't - * need this anymore. - */ - mp = first_mp->b_cont; - freeb(first_mp); - first_mp = mp; - mctl_present = B_FALSE; - } else { - /* - * Convert IPSEC_OUT to IPSEC_IN, preserving all - * security properties for the looped-back packet. 
- */ - mctl_present = B_TRUE; - mp = first_mp->b_cont; - ASSERT(mp != NULL); - ipsec_out_to_in(first_mp); - } - } else { - mctl_present = B_FALSE; - } - - DTRACE_PROBE4(ip4__loopback__in__start, - ill_t *, ill, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, first_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_in_event, - ipst->ips_ipv4firewall_loopback_in, - ill, NULL, ipha, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES; - zoneid_t stackzoneid = netstackid_to_zoneid( - ipst->ips_netstack->netstack_stackid); - - dzone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid; - /* - * 127.0.0.1 is special, as we cannot lookup its zoneid by - * address. Restrict the lookup below to the destination zone. - */ - if (ipha->ipha_src == ntohl(INADDR_LOOPBACK)) - lookup_zoneid = zoneid; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, ipst, - lookup_zoneid); - ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); - } - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, - int, 1); - - ipst->ips_loopback_packets++; - - ip2dbg(("ip_wput_local: from 0x%x to 0x%x in zone %d\n", - ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst), zoneid)); - if (!IS_SIMPLE_IPH(ipha)) { - ip_wput_local_options(ipha, ipst); - } - - protocol = ipha->ipha_protocol; - switch (protocol) { - case IPPROTO_ICMP: { - ire_t *ire_zone; - ilm_t *ilm; - mblk_t *mp1; - zoneid_t last_zoneid; - ilm_walker_t ilw; - - if (CLASSD(ipha->ipha_dst) && !IS_LOOPBACK(ill)) { - ASSERT(ire_type == IRE_BROADCAST); - /* - * In the multicast case, applications may have joined - * the group from different zones, so we need to deliver - * the packet to each of them. 
Loop through the - * multicast memberships structures (ilm) on the receive - * ill and send a copy of the packet up each matching - * one. However, we don't do this for multicasts sent on - * the loopback interface (PHYI_LOOPBACK flag set) as - * they must stay in the sender's zone. - * - * ilm_add_v6() ensures that ilms in the same zone are - * contiguous in the ill_ilm list. We use this property - * to avoid sending duplicates needed when two - * applications in the same zone join the same group on - * different logical interfaces: we ignore the ilm if - * it's zoneid is the same as the last matching one. - * In addition, the sending of the packet for - * ire_zoneid is delayed until all of the other ilms - * have been exhausted. - */ - last_zoneid = -1; - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ipha->ipha_dst != ilm->ilm_addr || - ilm->ilm_zoneid == last_zoneid || - ilm->ilm_zoneid == zoneid || - !(ilm->ilm_ipif->ipif_flags & IPIF_UP)) - continue; - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - icmp_inbound(q, mp1, B_TRUE, ilw.ilw_walk_ill, - 0, 0, mctl_present, B_FALSE, ill, - ilm->ilm_zoneid); - last_zoneid = ilm->ilm_zoneid; - } - ilm_walker_finish(&ilw); - /* - * Loopback case: the sending endpoint has - * IP_MULTICAST_LOOP disabled, therefore we don't - * dispatch the multicast packet to the sending zone. - */ - if (fanout_flags & IP_FF_NO_MCAST_LOOP) { - freemsg(first_mp); - return; - } - } else if (ire_type == IRE_BROADCAST) { - /* - * In the broadcast case, there may be many zones - * which need a copy of the packet delivered to them. - * There is one IRE_BROADCAST per broadcast address - * and per zone; we walk those using a helper function. - * In addition, the sending of the packet for zoneid is - * delayed until all of the other ires have been - * processed. 
- */ - IRB_REFHOLD(ire->ire_bucket); - ire_zone = NULL; - while ((ire_zone = ire_get_next_bcast_ire(ire_zone, - ire)) != NULL) { - mp1 = ip_copymsg(first_mp); - if (mp1 == NULL) - continue; - - UPDATE_IB_PKT_COUNT(ire_zone); - ire_zone->ire_last_used_time = lbolt; - icmp_inbound(q, mp1, B_TRUE, ill, 0, 0, - mctl_present, B_FALSE, ill, - ire_zone->ire_zoneid); - } - IRB_REFRELE(ire->ire_bucket); - } - icmp_inbound(q, first_mp, (ire_type == IRE_BROADCAST), ill, 0, - 0, mctl_present, B_FALSE, ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", - q, "icmp"); - return; - } - case IPPROTO_IGMP: - if ((mp = igmp_input(q, mp, ill)) == NULL) { - /* Bad packet - discarded by igmp_input */ - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", - q, "igmp_input--bad packet"); - if (mctl_present) - freeb(first_mp); - return; - } - /* - * igmp_input() may have returned the pulled up message. - * So first_mp and ipha need to be reinitialized. - */ - ipha = (ipha_t *)mp->b_rptr; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - /* deliver to local raw users */ - break; - case IPPROTO_ENCAP: - /* - * This case is covered by either ip_fanout_proto, or by - * the above security processing for self-tunneled packets. - */ - break; - case IPPROTO_UDP: { - uint16_t *up; - uint32_t ports; - - up = (uint16_t *)(rptr + IPH_HDR_LENGTH(ipha) + - UDP_PORTS_OFFSET); - /* Force a 'valid' checksum. */ - up[3] = 0; - - ports = *(uint32_t *)up; - ip_fanout_udp(q, first_mp, ill, ipha, ports, - (ire_type == IRE_BROADCAST), - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_SEND_SLLA | IP_FF_IPINFO, mctl_present, B_FALSE, - ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_udp"); - return; - } - case IPPROTO_TCP: { - - /* - * For TCP, discard broadcast packets. 
- */ - if ((ushort_t)ire_type == IRE_BROADCAST) { - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip2dbg(("ip_wput_local: discard broadcast\n")); - return; - } - - if (mp->b_datap->db_type == M_DATA) { - /* - * M_DATA mblk, so init mblk (chain) for no struio(). - */ - mblk_t *mp1 = mp; - - do { - mp1->b_datap->db_struioflag = 0; - } while ((mp1 = mp1->b_cont) != NULL); - } - ASSERT((rptr + IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET + 4) - <= mp->b_wptr); - ip_fanout_tcp(q, first_mp, ill, ipha, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_SYN_ADDIRE | IP_FF_IPINFO, - mctl_present, B_FALSE, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_tcp"); - return; - } - case IPPROTO_SCTP: - { - uint32_t ports; - - bcopy(rptr + IPH_HDR_LENGTH(ipha), &ports, sizeof (ports)); - ip_fanout_sctp(first_mp, ill, ipha, ports, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | - IP_FF_IPINFO, mctl_present, B_FALSE, zoneid); - return; - } - - default: - break; - } - /* - * Find a client for some other protocol. We give - * copies to multiple clients, if more than one is - * bound. - */ - ip_fanout_proto(q, first_mp, ill, ipha, - fanout_flags | IP_FF_SEND_ICMP | IP_FF_HDR_COMPLETE | IP_FF_RAWIP, - mctl_present, B_FALSE, ill, zoneid); - TRACE_2(TR_FAC_IP, TR_IP_WPUT_LOCAL_END, - "ip_wput_local_end: q %p (%S)", q, "ip_fanout_proto"); -#undef rptr -} - -/* - * Update any source route, record route, or timestamp options. + * Update any source route, record route, or timestamp options when + * sending a packet back to ourselves. * Check that we are at end of strict source route. - * The options have been sanity checked by ip_wput_options(). + * The options have been sanity checked by ip_output_options(). 
*/ -static void -ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) +void +ip_output_local_options(ipha_t *ipha, ip_stack_t *ipst) { ipoptp_t opts; uchar_t *opt; @@ -25219,10 +12000,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) uint8_t optlen; ipaddr_t dst; uint32_t ts; - ire_t *ire; timestruc_t now; - ip2dbg(("ip_wput_local_options\n")); for (optval = ipoptp_first(&opts, ipha); optval != IPOPT_EOL; optval = ipoptp_next(&opts)) { @@ -25246,7 +12025,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) * it is a packet with a loose source route which * reaches us before consuming the whole source route */ - ip1dbg(("ip_wput_local_options: not end of SR\n")); + if (optval == IPOPT_SSRR) { return; } @@ -25267,7 +12046,7 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) off > optlen - IP_ADDR_LEN) { /* No more room - ignore */ ip1dbg(( - "ip_wput_forward_options: end of RR\n")); + "ip_output_local_options: end of RR\n")); break; } dst = htonl(INADDR_LOOPBACK); @@ -25285,14 +12064,10 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) /* Verify that the address matched */ off = opt[IPOPT_OFFSET] - 1; bcopy((char *)opt + off, &dst, IP_ADDR_LEN); - ire = ire_ctable_lookup(dst, 0, IRE_LOCAL, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (ire == NULL) { + if (ip_type_v4(dst, ipst) != IRE_LOCAL) { /* Not for us */ break; } - ire_refrele(ire); /* FALLTHRU */ case IPOPT_TS_TSANDADDR: off = IP_ADDR_LEN + IPOPT_TS_TIMELEN; @@ -25302,8 +12077,8 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) * ip_*put_options should have already * dropped this packet. 
*/ - cmn_err(CE_PANIC, "ip_wput_local_options: " - "unknown IT - bug in ip_wput_options?\n"); + cmn_err(CE_PANIC, "ip_output_local_options: " + "unknown IT - bug in ip_output_options?\n"); return; /* Keep "lint" happy */ } if (opt[IPOPT_OFFSET] - 1 + off > optlen) { @@ -25339,1098 +12114,240 @@ ip_wput_local_options(ipha_t *ipha, ip_stack_t *ipst) } /* - * Send out a multicast packet on interface ipif. - * The sender does not have an conn. - * Caller verifies that this isn't a PHYI_LOOPBACK. - */ -void -ip_wput_multicast(queue_t *q, mblk_t *mp, ipif_t *ipif, zoneid_t zoneid) -{ - ipha_t *ipha; - ire_t *ire; - ipaddr_t dst; - mblk_t *first_mp; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* igmp_sendpkt always allocates a ipsec_out_t */ - ASSERT(mp->b_datap->db_type == M_CTL); - ASSERT(!ipif->ipif_isv6); - ASSERT(!IS_LOOPBACK(ipif->ipif_ill)); - - first_mp = mp; - mp = first_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_DATA); - ipha = (ipha_t *)mp->b_rptr; - - /* - * Find an IRE which matches the destination and the outgoing - * queue (i.e. the outgoing interface.) - */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) - dst = ipif->ipif_pp_dst_addr; - else - dst = ipha->ipha_dst; - /* - * The source address has already been initialized by the - * caller and hence matching on ILL (MATCH_IRE_ILL) would - * be sufficient rather than MATCH_IRE_IPIF. - * - * This function is used for sending IGMP packets. For IPMP, - * we sidestep IGMP snooping issues by sending all multicast - * traffic on a single interface in the IPMP group. - */ - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, NULL, - MATCH_IRE_ILL, ipst); - if (!ire) { - /* - * Mark this packet to make it be delivered to - * ip_wput_ire after the new ire has been - * created. 
- */ - mp->b_prev = NULL; - mp->b_next = NULL; - ip_newroute_ipif(q, first_mp, ipif, dst, NULL, RTF_SETSRC, - zoneid, &zero_info); - return; - } - - /* - * Honor the RTF_SETSRC flag; this is the only case - * where we force this addr whatever the current src addr is, - * because this address is set by igmp_sendpkt(), and - * cannot be specified by any user. - */ - if (ire->ire_flags & RTF_SETSRC) { - ipha->ipha_src = ire->ire_src_addr; - } - - ip_wput_ire(q, first_mp, ire, NULL, B_FALSE, zoneid); -} - -/* - * NOTE : This function does not ire_refrele the ire argument passed in. + * Prepend an M_DATA fastpath header, and if none present prepend a + * DL_UNITDATA_REQ. Frees the mblk on failure. + * + * nce_dlur_mp and nce_fp_mp can not disappear once they have been set. + * If there is a change to them, the nce will be deleted (condemned) and + * a new nce_t will be created when packets are sent. Thus we need no locks + * to access those fields. * - * Copy the link layer header and do IPQoS if needed. Frees the mblk on - * failure. The nce_fp_mp can vanish any time in the case of - * IRE_BROADCAST due to DL_NOTE_FASTPATH_FLUSH. Hence we have to hold - * the ire_lock to access the nce_fp_mp in this case. - * IPQoS assumes that the first M_DATA contains the IP header. So, if we are - * prepending a fastpath message IPQoS processing must precede it, we also set - * the b_band of the fastpath message to that of the mblk returned by IPQoS - * (IPQoS might have set the b_band for CoS marking). - * However, if we are prepending DL_UNITDATA_REQ message, IPQoS processing - * must follow it so that IPQoS can mark the dl_priority field for CoS - * marking, if needed. + * We preserve b_band to support IPQoS. If a DL_UNITDATA_REQ is prepended + * we place b_band in dl_priority.dl_max. 
*/ static mblk_t * -ip_wput_attach_llhdr(mblk_t *mp, ire_t *ire, ip_proc_t proc, - uint32_t ill_index, ipha_t **iphap) +ip_xmit_attach_llhdr(mblk_t *mp, nce_t *nce) { uint_t hlen; - ipha_t *ipha; mblk_t *mp1; - boolean_t qos_done = B_FALSE; - uchar_t *ll_hdr; - ip_stack_t *ipst = ire->ire_ipst; + uint_t priority; + uchar_t *rptr; -#define rptr ((uchar_t *)ipha) + rptr = mp->b_rptr; - ipha = (ipha_t *)mp->b_rptr; - hlen = 0; - LOCK_IRE_FP_MP(ire); - if ((mp1 = ire->ire_nce->nce_fp_mp) != NULL) { - ASSERT(DB_TYPE(mp1) == M_DATA); - /* Initiate IPPF processing */ - if ((proc != 0) && IPP_ENABLED(proc, ipst)) { - UNLOCK_IRE_FP_MP(ire); - ip_process(proc, &mp, ill_index); - if (mp == NULL) - return (NULL); + ASSERT(DB_TYPE(mp) == M_DATA); + priority = mp->b_band; - ipha = (ipha_t *)mp->b_rptr; - LOCK_IRE_FP_MP(ire); - if ((mp1 = ire->ire_nce->nce_fp_mp) == NULL) { - qos_done = B_TRUE; - goto no_fp_mp; - } - ASSERT(DB_TYPE(mp1) == M_DATA); - } + ASSERT(nce != NULL); + if ((mp1 = nce->nce_fp_mp) != NULL) { hlen = MBLKL(mp1); /* * Check if we have enough room to prepend fastpath * header */ if (hlen != 0 && (rptr - mp->b_datap->db_base) >= hlen) { - ll_hdr = rptr - hlen; - bcopy(mp1->b_rptr, ll_hdr, hlen); + rptr -= hlen; + bcopy(mp1->b_rptr, rptr, hlen); /* * Set the b_rptr to the start of the link layer * header */ - mp->b_rptr = ll_hdr; - mp1 = mp; - } else { - mp1 = copyb(mp1); - if (mp1 == NULL) - goto unlock_err; - mp1->b_band = mp->b_band; - mp1->b_cont = mp; - /* - * XXX disable ICK_VALID and compute checksum - * here; can happen if nce_fp_mp changes and - * it can't be copied now due to insufficient - * space. 
(unlikely, fp mp can change, but it - * does not increase in length) - */ + mp->b_rptr = rptr; + return (mp); } - UNLOCK_IRE_FP_MP(ire); - } else { -no_fp_mp: - mp1 = copyb(ire->ire_nce->nce_res_mp); + mp1 = copyb(mp1); if (mp1 == NULL) { -unlock_err: - UNLOCK_IRE_FP_MP(ire); + ill_t *ill = nce->nce_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); return (NULL); } - UNLOCK_IRE_FP_MP(ire); + mp1->b_band = priority; mp1->b_cont = mp; - if (!qos_done && (proc != 0) && IPP_ENABLED(proc, ipst)) { - ip_process(proc, &mp1, ill_index); - if (mp1 == NULL) - return (NULL); - - if (mp1->b_cont == NULL) - ipha = NULL; - else - ipha = (ipha_t *)mp1->b_cont->b_rptr; - } - } - - *iphap = ipha; - return (mp1); -#undef rptr -} - -/* - * Finish the outbound IPsec processing for an IPv6 packet. This function - * is called from ipsec_out_process() if the IPsec packet was processed - * synchronously, or from {ah,esp}_kcf_callback() if it was processed - * asynchronously. - */ -void -ip_wput_ipsec_out_v6(queue_t *q, mblk_t *ipsec_mp, ip6_t *ip6h, ill_t *ill, - ire_t *ire_arg) -{ - in6_addr_t *v6dstp; - ire_t *ire; - mblk_t *mp; - ip6_t *ip6h1; - uint_t ill_index; - ipsec_out_t *io; - boolean_t hwaccel; - uint32_t flags = IP6_NO_IPPOLICY; - int match_flags; - zoneid_t zoneid; - boolean_t ill_need_rele = B_FALSE; - boolean_t ire_need_rele = B_FALSE; - ip_stack_t *ipst; - - mp = ipsec_mp->b_cont; - ip6h1 = (ip6_t *)mp->b_rptr; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_ns != NULL); - ipst = io->ipsec_out_ns->netstack_ip; - ill_index = io->ipsec_out_ill_index; - if (io->ipsec_out_reachable) { - flags |= IPV6_REACHABILITY_CONFIRMATION; - } - hwaccel = io->ipsec_out_accelerated; - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - /* Multicast addresses should have non-zero ill_index. 
*/ - v6dstp = &ip6h->ip6_dst; - ASSERT(ip6h->ip6_nxt != IPPROTO_RAW); - ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp) || ill_index != 0); - - if (ill == NULL && ill_index != 0) { - ill = ip_grab_ill(ipsec_mp, ill_index, B_TRUE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; - - ill_need_rele = B_TRUE; - } - ASSERT(mp != NULL); - - if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - boolean_t unspec_src; - ipif_t *ipif; - - /* - * Use the ill_index to get the right ill. - */ - unspec_src = io->ipsec_out_unspec_src; - (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); - if (ipif == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - ire_need_rele = B_TRUE; - } - if (ire != NULL) { - ipif_refrele(ipif); - /* - * XXX Do the multicast forwarding now, as the IPsec - * processing has been done. - */ - goto send; - } - - ip0dbg(("ip_wput_ipsec_out_v6: multicast: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - ipif_refrele(ipif); - return; - } - - ip_newroute_ipif_v6(q, ipsec_mp, ipif, v6dstp, &ip6h->ip6_src, - unspec_src, zoneid); - ipif_refrele(ipif); - } else { - if (ire_arg != NULL) { - ire = ire_arg; - } else { - ire = ire_cache_lookup_v6(v6dstp, zoneid, NULL, ipst); - ire_need_rele = B_TRUE; - } - if (ire != NULL) - goto send; - /* - * ire disappeared underneath. - * - * What we need to do here is the ip_newroute - * logic to get the ire without doing the IPsec - * processing. Follow the same old path. But this - * time, ip_wput or ire_add_then_send will call us - * directly as all the IPsec operations are done. 
- */ - ip1dbg(("ip_wput_ipsec_out_v6: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - if (ill_need_rele) - ill_refrele(ill); - freemsg(ipsec_mp); - return; - } - - ip_newroute_v6(q, ipsec_mp, v6dstp, &ip6h->ip6_src, ill, - zoneid, ipst); - } - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - return; -send: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - - /* Local delivery */ - if (ire->ire_stq == NULL) { - ill_t *out_ill; - ASSERT(q != NULL); - - /* PFHooks: LOOPBACK_OUT */ - out_ill = ire_to_ill(ire); - + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + DTRACE_PROBE1(ip__xmit__copyb, (mblk_t *), mp1); /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. + * XXX disable ICK_VALID and compute checksum + * here; can happen if nce_fp_mp changes and + * it can't be copied now due to insufficient + * space. (unlikely, fp mp can change, but it + * does not increase in length) */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, NULL, ip6_t *, ip6h, int, 1); - - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ip6_t *, ip6h1, mblk_t *, ipsec_mp); - - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, out_ill, ip6h1, ipsec_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, ipsec_mp); - - if (ipsec_mp != NULL) { - ip_wput_local_v6(RD(q), out_ill, - ip6h, ipsec_mp, ire, 0, zoneid); - } - if (ire_need_rele) - ire_refrele(ire); - return; - } - /* - * Everything is done. Send it out on the wire. 
- * We force the insertion of a fragment header using the - * IPH_FRAG_HDR flag in two cases: - * - after reception of an ICMPv6 "packet too big" message - * with a MTU < 1280 (cf. RFC 2460 section 5) - * - for multirouted IPv6 packets, so that the receiver can - * discard duplicates according to their fragment identifier - */ - /* XXX fix flow control problems. */ - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > ire->ire_max_frag || - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - if (hwaccel) { - /* - * hardware acceleration does not handle these - * "slow path" cases. - */ - /* IPsec KSTATS: should bump bean counter here. */ - if (ire_need_rele) - ire_refrele(ire); - freemsg(ipsec_mp); - return; - } - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != - (mp->b_cont ? msgdsize(mp) : - mp->b_wptr - (uchar_t *)ip6h)) { - /* IPsec KSTATS: should bump bean counter here. */ - ip0dbg(("Packet length mismatch: %d, %ld\n", - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - msgdsize(mp))); - if (ire_need_rele) - ire_refrele(ire); - freemsg(ipsec_mp); - return; - } - ASSERT(mp->b_prev == NULL); - ip2dbg(("Fragmenting Size = %d, mtu = %d\n", - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN, ire->ire_max_frag)); - ip_wput_frag_v6(mp, ire, flags, NULL, B_FALSE, - ire->ire_max_frag); - } else { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ip_xmit_v6(mp, ire, flags, NULL, B_FALSE, hwaccel ? io : NULL); + return (mp1); } - if (ire_need_rele) - ire_refrele(ire); - freeb(ipsec_mp); -} + mp1 = copyb(nce->nce_dlur_mp); -void -ipsec_hw_putnext(queue_t *q, mblk_t *mp) -{ - mblk_t *hada_mp; /* attributes M_CTL mblk */ - da_ipsec_t *hada; /* data attributes */ - ill_t *ill = (ill_t *)q->q_ptr; - - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_hw_putnext: accelerated packet\n")); + if (mp1 == NULL) { + ill_t *ill = nce->nce_ill; - if ((ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) == 0) { - /* IPsec KSTATS: Bump lose counter here! 
*/ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); freemsg(mp); - return; + return (NULL); } - - /* - * It's an IPsec packet that must be - * accelerated by the Provider, and the - * outbound ill is IPsec acceleration capable. - * Prepends the mblk with an IPHADA_M_CTL, and ship it - * to the ill. - * IPsec KSTATS: should bump packet counter here. - */ - - hada_mp = allocb(sizeof (da_ipsec_t), BPRI_HI); - if (hada_mp == NULL) { - /* IPsec KSTATS: should bump packet counter here. */ - freemsg(mp); - return; + mp1->b_cont = mp; + if (priority != 0) { + mp1->b_band = priority; + ((dl_unitdata_req_t *)(mp1->b_rptr))->dl_priority.dl_max = + priority; } - - hada_mp->b_datap->db_type = M_CTL; - hada_mp->b_wptr = hada_mp->b_rptr + sizeof (*hada); - hada_mp->b_cont = mp; - - hada = (da_ipsec_t *)hada_mp->b_rptr; - bzero(hada, sizeof (da_ipsec_t)); - hada->da_type = IPHADA_M_CTL; - - putnext(q, hada_mp); + return (mp1); +#undef rptr } /* * Finish the outbound IPsec processing. This function is called from * ipsec_out_process() if the IPsec packet was processed - * synchronously, or from {ah,esp}_kcf_callback() if it was processed + * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed * asynchronously. + * + * This is common to IPv4 and IPv6. 
*/ -void -ip_wput_ipsec_out(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, ill_t *ill, - ire_t *ire_arg) +int +ip_output_post_ipsec(mblk_t *mp, ip_xmit_attr_t *ixa) { - uint32_t v_hlen_tos_len; - ipaddr_t dst; - ipif_t *ipif = NULL; - ire_t *ire; - ire_t *ire1 = NULL; - mblk_t *next_mp = NULL; - uint32_t max_frag; - boolean_t multirt_send = B_FALSE; - mblk_t *mp; - ipha_t *ipha1; - uint_t ill_index; - ipsec_out_t *io; - int match_flags; - irb_t *irb = NULL; - boolean_t ill_need_rele = B_FALSE, ire_need_rele = B_TRUE; - zoneid_t zoneid; - ipxmit_state_t pktxmit_state; - ip_stack_t *ipst; - -#ifdef _BIG_ENDIAN -#define LENGTH (v_hlen_tos_len & 0xFFFF) -#else -#define LENGTH ((v_hlen_tos_len >> 24) | ((v_hlen_tos_len >> 8) & 0xFF00)) -#endif + iaflags_t ixaflags = ixa->ixa_flags; + uint_t pktlen; - mp = ipsec_mp->b_cont; - ipha1 = (ipha_t *)mp->b_rptr; - ASSERT(mp != NULL); - v_hlen_tos_len = ((uint32_t *)ipha)[0]; - dst = ipha->ipha_dst; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ill_index = io->ipsec_out_ill_index; - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - ipst = io->ipsec_out_ns->netstack_ip; - ASSERT(io->ipsec_out_ns != NULL); - - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - if (ill == NULL && ill_index != 0) { - ill = ip_grab_ill(ipsec_mp, ill_index, B_FALSE, ipst); - /* Failure case frees things for us. */ - if (ill == NULL) - return; + /* AH/ESP don't update ixa_pktlen when they modify the packet */ + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; - ill_need_rele = B_TRUE; - } - - if (CLASSD(dst)) { - boolean_t conn_dontroute; - /* - * Use the ill_index to get the right ipif. 
- */ - conn_dontroute = io->ipsec_out_dontroute; - if (ill_index == 0) - ipif = ipif_lookup_group(dst, zoneid, ipst); - else - (void) ipif_lookup_zoneid(ill, zoneid, 0, &ipif); - if (ipif == NULL) { - ip1dbg(("ip_wput_ipsec_out: No ipif for" - " multicast\n")); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); - freemsg(ipsec_mp); - goto done; - } - /* - * ipha_src has already been intialized with the - * value of the ipif in ip_wput. All we need now is - * an ire to send this downstream. - */ - ire = ire_ctable_lookup(dst, 0, 0, ipif, zoneid, - msg_getlabel(mp), match_flags, ipst); - if (ire != NULL) { - ill_t *ill1; - /* - * Do the multicast forwarding now, as the IPsec - * processing has been done. - */ - if (ipst->ips_ip_g_mrouter && !conn_dontroute && - (ill1 = ire_to_ill(ire))) { - if (ip_mforward(ill1, ipha, mp)) { - freemsg(ipsec_mp); - ip1dbg(("ip_wput_ipsec_out: mforward " - "failed\n")); - ire_refrele(ire); - goto done; - } - } - goto send; - } - - ip0dbg(("ip_wput_ipsec_out: multicast: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; - - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - freemsg(ipsec_mp); - goto done; - } - - /* - * We may be using a wrong ipif to create the ire. - * But it is okay as the source address is assigned - * for the packet already. Next outbound packet would - * create the IRE with the right IPIF in ip_wput. - * - * Also handle RTF_MULTIRT routes. - */ - ip_newroute_ipif(q, ipsec_mp, ipif, dst, NULL, RTF_MULTIRT, - zoneid, &zero_info); + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + pktlen = ntohs(ipha->ipha_length); } else { - if (ire_arg != NULL) { - ire = ire_arg; - ire_need_rele = B_FALSE; - } else { - ire = ire_cache_lookup(dst, zoneid, - msg_getlabel(mp), ipst); - } - if (ire != NULL) { - goto send; - } - - /* - * ire disappeared underneath. 
- * - * What we need to do here is the ip_newroute - * logic to get the ire without doing the IPsec - * processing. Follow the same old path. But this - * time, ip_wput or ire_add_then_put will call us - * directly as all the IPsec operations are done. - */ - ip1dbg(("ip_wput_ipsec_out: IRE disappeared\n")); - mp->b_prev = NULL; - mp->b_next = NULL; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; - /* - * If the IPsec packet was processed asynchronously, - * drop it now. - */ - if (q == NULL) { - freemsg(ipsec_mp); - goto done; - } - - /* - * Since we're going through ip_newroute() again, we - * need to make sure we don't: - * - * 1.) Trigger the ASSERT() with the ipha_ident - * overloading. - * 2.) Redo transport-layer checksumming, since we've - * already done all that to get this far. - * - * The easiest way not do either of the above is to set - * the ipha_ident field to IP_HDR_INCLUDED. - */ - ipha->ipha_ident = IP_HDR_INCLUDED; - ip_newroute(q, ipsec_mp, dst, (CONN_Q(q) ? Q_TO_CONN(q) : NULL), - zoneid, ipst); - } - goto done; -send: - if (ire->ire_stq == NULL) { - ill_t *out_ill; - /* - * Loopbacks go through ip_wput_local except for one case. - * We come here if we generate a icmp_frag_needed message - * after IPsec processing is over. When this function calls - * ip_wput_ire_fragmentit, ip_wput_frag might end up calling - * icmp_frag_needed. The message generated comes back here - * through icmp_frag_needed -> icmp_pkt -> ip_wput -> - * ipsec_out_process -> ip_wput_ipsec_out. We need to set the - * source address as it is usually set in ip_wput_ire. As - * ipsec_out_proc_begin is set, ip_wput calls ipsec_out_process - * and we end up here. We can't enter ip_wput_ire once the - * IPsec processing is over and hence we need to do it here. 
- */ - ASSERT(q != NULL); - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - if (ipha->ipha_src == 0) - ipha->ipha_src = ire->ire_src_addr; - - /* PFHooks: LOOPBACK_OUT */ - out_ill = ire_to_ill(ire); - - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, out_ill, - ipha_t *, ipha1, mblk_t *, ipsec_mp); - - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, out_ill, ipha1, ipsec_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); - - if (ipsec_mp != NULL) - ip_wput_local(RD(q), out_ill, - ipha, ipsec_mp, ire, 0, zoneid); - if (ire_need_rele) - ire_refrele(ire); - goto done; + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; } - if (ire->ire_max_frag < (unsigned int)LENGTH) { - /* - * We are through with IPsec processing. - * Fragment this and send it on the wire. - */ - if (io->ipsec_out_accelerated) { - /* - * The packet has been accelerated but must - * be fragmented. This should not happen - * since AH and ESP must not accelerate - * packets that need fragmentation, however - * the configuration could have changed - * since the AH or ESP processing. - * Drop packet. - * IPsec KSTATS: bump bean counter here. - */ - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_wput_ipsec_out: " - "fragmented accelerated packet!\n")); - freemsg(ipsec_mp); - } else { - ip_wput_ire_fragmentit(ipsec_mp, ire, - zoneid, ipst, NULL); - } - if (ire_need_rele) - ire_refrele(ire); - goto done; - } - - ip2dbg(("ip_wput_ipsec_out: ipsec_mp %p, ire %p, ire_ipif %p, " - "ipif %p\n", (void *)ipsec_mp, (void *)ire, - (void *)ire->ire_ipif, (void *)ipif)); - /* - * Multiroute the secured packet. 
+ * We release any hard reference on the SAs here to make + * sure the SAs can be garbage collected. ipsr_sa has a soft reference + * on the SAs. + * If in the future we want the hard latching of the SAs in the + * ip_xmit_attr_t then we should remove this. */ - if (ire->ire_flags & RTF_MULTIRT) { - ire_t *first_ire; - irb = ire->ire_bucket; - ASSERT(irb != NULL); - /* - * This ire has been looked up as the one that - * goes through the given ipif; - * make sure we do not omit any other multiroute ire - * that may be present in the bucket before this one. - */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (first_ire->ire_addr == ire->ire_addr) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - /* - * Don't change the ire if the packet must - * be fragmented if sent via this new one. - */ - if (first_ire->ire_max_frag >= (unsigned int)LENGTH) { - IRE_REFHOLD(first_ire); - if (ire_need_rele) - ire_refrele(ire); - else - ire_need_rele = B_TRUE; - ire = first_ire; - } - } - IRB_REFRELE(irb); - - multirt_send = B_TRUE; - max_frag = ire->ire_max_frag; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; + } + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - /* - * In most cases, the emission loop below is entered only once. - * Only in the case where the ire holds the RTF_MULTIRT - * flag, we loop to process all RTF_MULTIRT ires in the - * bucket, and send the packet through all crossed - * RTF_MULTIRT routes. - */ - do { - if (multirt_send) { + /* Do we need to fragment? 
*/ + if ((ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR) || + pktlen > ixa->ixa_fragsize) { + if (ixaflags & IXAF_IS_IPV4) { + ASSERT(!(ixa->ixa_flags & IXAF_IPV6_ADD_FRAGHDR)); /* - * ire1 holds here the next ire to process in the - * bucket. If multirouting is expected, - * any non-RTF_MULTIRT ire that has the - * right destination address is ignored. + * We check for the DF case in ipsec_out_process + * hence this only handles the non-DF case. */ - ASSERT(irb != NULL); - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if ((ire1->ire_flags & RTF_MULTIRT) == 0) - continue; - if (ire1->ire_addr != ire->ire_addr) - continue; - if (ire1->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN)) - continue; - /* No loopback here */ - if (ire1->ire_stq == NULL) - continue; - /* - * Ensure we do not exceed the MTU - * of the next route. - */ - if (ire1->ire_max_frag < (unsigned int)LENGTH) { - ip_multirt_bad_mtu(ire1, max_frag); - continue; - } - - IRE_REFHOLD(ire1); - break; - } - IRB_REFRELE(irb); - if (ire1 != NULL) { - /* - * We are in a multiple send case, need to - * make a copy of the packet. - */ - next_mp = copymsg(ipsec_mp); - if (next_mp == NULL) { - ire_refrele(ire1); - ire1 = NULL; - } + return (ip_fragment_v4(mp, ixa->ixa_nce, ixa->ixa_flags, + pktlen, ixa->ixa_fragsize, + ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn, + &ixa->ixa_cookie)); + } else { + mp = ip_fraghdr_add_v6(mp, ixa->ixa_ident, ixa); + if (mp == NULL) { + /* MIB and ip_drop_output already done */ + return (ENOMEM); } - } - /* - * Everything is done. Send it out on the wire - * - * ip_xmit_v4 will call ip_wput_attach_llhdr and then - * either send it on the wire or, in the case of - * HW acceleration, call ipsec_hw_putnext. 
- */ - if (ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE) { - DTRACE_PROBE2(ip__wput__ipsec__bail, - (ire_t *), ire, (mblk_t *), ipsec_mp); - /* - * If ire's link-layer is unresolved (this - * would only happen if the incomplete ire - * was added to cachetable via forwarding path) - * don't bother going to ip_xmit_v4. Just drop the - * packet. - * There is a slight risk here, in that, if we - * have the forwarding path create an incomplete - * IRE, then until the IRE is completed, any - * transmitted IPsec packets will be dropped - * instead of being queued waiting for resolution. - * - * But the likelihood of a forwarding packet and a wput - * packet sending to the same dst at the same time - * and there not yet be an ARP entry for it is small. - * Furthermore, if this actually happens, it might - * be likely that wput would generate multiple - * packets (and forwarding would also have a train - * of packets) for that destination. If this is - * the case, some of them would have been dropped - * anyway, since ARP only queues a few packets while - * waiting for resolution - * - * NOTE: We should really call ip_xmit_v4, - * and let it queue the packet and send the - * ARP query and have ARP come back thus: - * <ARP> ip_wput->ip_output->ip-wput_nondata-> - * ip_xmit_v4->ip_wput_attach_llhdr + ipsec - * hw accel work. But it's too complex to get - * the IPsec hw acceleration approach to fit - * well with ip_xmit_v4 doing ARP without - * doing IPsec simplification. For now, we just - * poke ip_xmit_v4 to trigger the arp resolve, so - * that we can continue with the send on the next - * attempt. 
- * - * XXX THis should be revisited, when - * the IPsec/IP interaction is cleaned up - */ - ip1dbg(("ip_wput_ipsec_out: ire is incomplete" - " - dropping packet\n")); - freemsg(ipsec_mp); - /* - * Call ip_xmit_v4() to trigger ARP query - * in case the nce_state is ND_INITIAL - */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - goto drop_pkt; - } - - DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, - ill_t *, ire->ire_ipif->ipif_ill, ipha_t *, ipha1, - mblk_t *, ipsec_mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, - ire->ire_ipif->ipif_ill, ipha1, ipsec_mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, ipsec_mp); - if (ipsec_mp == NULL) - goto drop_pkt; - - ip1dbg(("ip_wput_ipsec_out: calling ip_xmit_v4\n")); - pktxmit_state = ip_xmit_v4(mp, ire, - (io->ipsec_out_accelerated ? io : NULL), B_FALSE, NULL); - - if ((pktxmit_state == SEND_FAILED) || - (pktxmit_state == LLHDR_RESLV_FAILED)) { - - freeb(ipsec_mp); /* ip_xmit_v4 frees the mp */ -drop_pkt: - BUMP_MIB(((ill_t *)ire->ire_stq->q_ptr)->ill_ip_mib, - ipIfStatsOutDiscards); - if (ire_need_rele) - ire_refrele(ire); - if (ire1 != NULL) { - ire_refrele(ire1); - freemsg(next_mp); + pktlen += sizeof (ip6_frag_t); + if (pktlen > ixa->ixa_fragsize) { + return (ip_fragment_v6(mp, ixa->ixa_nce, + ixa->ixa_flags, pktlen, + ixa->ixa_fragsize, ixa->ixa_xmit_hint, + ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, + ixa->ixa_postfragfn, &ixa->ixa_cookie)); } - goto done; } - - freeb(ipsec_mp); - if (ire_need_rele) - ire_refrele(ire); - - if (ire1 != NULL) { - ire = ire1; - ire_need_rele = B_TRUE; - ASSERT(next_mp); - ipsec_mp = next_mp; - mp = ipsec_mp->b_cont; - ire1 = NULL; - next_mp = NULL; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - } else { - multirt_send = B_FALSE; - } - } while (multirt_send); -done: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - if (ipif != NULL) - ipif_refrele(ipif); + } + return ((ixa->ixa_postfragfn)(mp, 
ixa->ixa_nce, ixa->ixa_flags, + pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, NULL)); } /* - * Get the ill corresponding to the specified ire, and compare its - * capabilities with the protocol and algorithms specified by the - * the SA obtained from ipsec_out. If they match, annotate the - * ipsec_out structure to indicate that the packet needs acceleration. - * - * - * A packet is eligible for outbound hardware acceleration if the - * following conditions are satisfied: - * - * 1. the packet will not be fragmented - * 2. the provider supports the algorithm - * 3. there is no pending control message being exchanged - * 4. snoop is not attached - * 5. the destination address is not a broadcast or multicast address. - * - * Rationale: - * - Hardware drivers do not support fragmentation with - * the current interface. - * - snoop, multicast, and broadcast may result in exposure of - * a cleartext datagram. - * We check all five of these conditions here. + * Finish the inbound IPsec processing. This function is called from + * ipsec_out_process() if the IPsec packet was processed + * synchronously, or from {ah,esp}_kcf_callback_outbound() if it was processed + * asynchronously. * - * XXX would like to nuke "ire_t *" parameter here; problem is that - * IRE is only way to figure out if a v4 address is a broadcast and - * thus ineligible for acceleration... + * This is common to IPv4 and IPv6. */ -static void -ipsec_out_is_accelerated(mblk_t *ipsec_mp, ipsa_t *sa, ill_t *ill, ire_t *ire) +void +ip_input_post_ipsec(mblk_t *mp, ip_recv_attr_t *ira) { - ipsec_out_t *io; - mblk_t *data_mp; - uint_t plen, overhead; - ip_stack_t *ipst; - phyint_t *phyint; - - if ((sa->ipsa_flags & IPSA_F_HW) == 0) - return; - - if (ill == NULL) - return; - ipst = ill->ill_ipst; - phyint = ill->ill_phyint; - - /* - * Destination address is a broadcast or multicast. Punt. 
- */ - if ((ire != NULL) && (ire->ire_type & (IRE_BROADCAST|IRE_LOOPBACK| - IRE_LOCAL))) - return; - - data_mp = ipsec_mp->b_cont; + iaflags_t iraflags = ira->ira_flags; - if (ill->ill_isv6) { - ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; + /* Length might have changed */ + if (iraflags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) - return; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + ira->ira_pktlen = ntohs(ipha->ipha_length); + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + ira->ira_protocol = ipha->ipha_protocol; - plen = ip6h->ip6_plen; + ip_fanout_v4(mp, ipha, ira); } else { - ipha_t *ipha = (ipha_t *)data_mp->b_rptr; - - if (CLASSD(ipha->ipha_dst)) + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint8_t *nexthdrp; + + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); + ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ira->ira_ip_hdr_length, + &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ira->ira_ill); + freemsg(mp); return; - - plen = ipha->ipha_length; - } - /* - * Is there a pending DLPI control message being exchanged - * between IP/IPsec and the DLS Provider? If there is, it - * could be a SADB update, and the state of the DLS Provider - * SADB might not be in sync with the SADB maintained by - * IPsec. To avoid dropping packets or using the wrong keying - * material, we do not accelerate this packet. - */ - if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " - "ill_dlpi_pending! don't accelerate packet\n")); - return; - } - - /* - * Is the Provider in promiscous mode? If it does, we don't - * accelerate the packet since it will bounce back up to the - * listeners in the clear. 
- */ - if (phyint->phyint_flags & PHYI_PROMISC) { - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_out_check_is_accelerated: " - "ill in promiscous mode, don't accelerate packet\n")); - return; - } - - /* - * Will the packet require fragmentation? - */ - - /* - * IPsec ESP note: this is a pessimistic estimate, but the same - * as is used elsewhere. - * SPI + sequence + MAC + IV(blocksize) + padding(blocksize-1) - * + 2-byte trailer - */ - overhead = (sa->ipsa_type == SADB_SATYPE_AH) ? IPSEC_MAX_AH_HDR_SIZE : - IPSEC_BASE_ESP_HDR_SIZE(sa); - - if ((plen + overhead) > ill->ill_max_mtu) - return; - - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - /* - * Can the ill accelerate this IPsec protocol and algorithm - * specified by the SA? - */ - if (!ipsec_capab_match(ill, io->ipsec_out_capab_ill_index, - ill->ill_isv6, sa, ipst->ips_netstack)) { - return; + } + ira->ira_protocol = *nexthdrp; + ip_fanout_v6(mp, ip6h, ira); } - - /* - * Tell AH or ESP that the outbound ill is capable of - * accelerating this packet. - */ - io->ipsec_out_is_capab_ill = B_TRUE; } /* * Select which AH & ESP SA's to use (if any) for the outbound packet. * * If this function returns B_TRUE, the requested SA's have been filled - * into the ipsec_out_*_sa pointers. + * into the ixa_ipsec_*_sa pointers. * * If the function returns B_FALSE, the packet has been "consumed", most * likely by an ACQUIRE sent up via PF_KEY to a key management daemon. * * The SA references created by the protocol-specific "select" - * function will be released when the ipsec_mp is freed, thanks to the - * ipsec_out_free destructor -- see spd.c. + * function will be released in ip_output_post_ipsec. 
*/ static boolean_t -ipsec_out_select_sa(mblk_t *ipsec_mp) +ipsec_out_select_sa(mblk_t *mp, ip_xmit_attr_t *ixa) { boolean_t need_ah_acquire = B_FALSE, need_esp_acquire = B_FALSE; - ipsec_out_t *io; ipsec_policy_t *pp; ipsec_action_t *ap; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - if (!io->ipsec_out_secure) { - /* - * We came here by mistake. - * Don't bother with ipsec processing - * We should "discourage" this path in the future. - */ - ASSERT(io->ipsec_out_proc_begin == B_FALSE); - return (B_FALSE); - } - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); + ASSERT((ixa->ixa_ipsec_policy != NULL) || + (ixa->ixa_ipsec_action != NULL)); - ASSERT(io->ipsec_out_failed == B_FALSE); - - /* - * IPsec processing has started. - */ - io->ipsec_out_proc_begin = B_TRUE; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - pp = io->ipsec_out_policy; + pp = ixa->ixa_ipsec_policy; ASSERT(pp != NULL); ap = pp->ipsp_act; ASSERT(ap != NULL); @@ -26438,22 +12355,23 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) /* * We have an action. now, let's select SA's. - * (In the future, we can cache this in the conn_t..) + * A side effect of setting ixa_ipsec_*_sa is that it will + * be cached in the conn_t. 
*/ if (ap->ipa_want_esp) { - if (io->ipsec_out_esp_sa == NULL) { - need_esp_acquire = !ipsec_outbound_sa(ipsec_mp, + if (ixa->ixa_ipsec_esp_sa == NULL) { + need_esp_acquire = !ipsec_outbound_sa(mp, ixa, IPPROTO_ESP); } - ASSERT(need_esp_acquire || io->ipsec_out_esp_sa != NULL); + ASSERT(need_esp_acquire || ixa->ixa_ipsec_esp_sa != NULL); } if (ap->ipa_want_ah) { - if (io->ipsec_out_ah_sa == NULL) { - need_ah_acquire = !ipsec_outbound_sa(ipsec_mp, + if (ixa->ixa_ipsec_ah_sa == NULL) { + need_ah_acquire = !ipsec_outbound_sa(mp, ixa, IPPROTO_AH); } - ASSERT(need_ah_acquire || io->ipsec_out_ah_sa != NULL); + ASSERT(need_ah_acquire || ixa->ixa_ipsec_ah_sa != NULL); /* * The ESP and AH processing order needs to be preserved * when both protocols are required (ESP should be applied @@ -26471,16 +12389,16 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) * acquire _all_ of the SAs we need. */ if (need_ah_acquire || need_esp_acquire) { - if (io->ipsec_out_ah_sa != NULL) { - IPSA_REFRELE(io->ipsec_out_ah_sa); - io->ipsec_out_ah_sa = NULL; + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - if (io->ipsec_out_esp_sa != NULL) { - IPSA_REFRELE(io->ipsec_out_esp_sa); - io->ipsec_out_esp_sa = NULL; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; } - sadb_acquire(ipsec_mp, io, need_ah_acquire, need_esp_acquire); + sadb_acquire(mp, ixa, need_ah_acquire, need_esp_acquire); return (B_FALSE); } @@ -26488,110 +12406,64 @@ ipsec_out_select_sa(mblk_t *ipsec_mp) } /* - * Process an IPSEC_OUT message and see what you can - * do with it. - * IPQoS Notes: - * We do IPPF processing if IPP_LOCAL_OUT is enabled before processing for - * IPsec. - * XXX would like to nuke ire_t. - * XXX ill_index better be "real" + * Handle IPsec output processing. + * This function is only entered once for a given packet. 
+ * We try to do things synchronously, but if we need to have user-level + * set up SAs, or ESP or AH uses asynchronous kEF, then the operation + * will be completed + * - when the SAs are added in esp_add_sa_finish/ah_add_sa_finish + * - when asynchronous ESP is done it will do AH + * + * In all cases we come back in ip_output_post_ipsec() to fragment and + * send out the packet. */ -void -ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) +int +ipsec_out_process(mblk_t *mp, ip_xmit_attr_t *ixa) { - ipsec_out_t *io; - ipsec_policy_t *pp; - ipsec_action_t *ap; - ipha_t *ipha; - ip6_t *ip6h; - mblk_t *mp; - ill_t *ill; - zoneid_t zoneid; - ipsec_status_t ipsec_rc; - boolean_t ill_need_rele = B_FALSE; - ip_stack_t *ipst; + ill_t *ill = ixa->ixa_nce->nce_ill; + ip_stack_t *ipst = ixa->ixa_ipst; ipsec_stack_t *ipss; + ipsec_policy_t *pp; + ipsec_action_t *ap; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - ipst = io->ipsec_out_ns->netstack_ip; - mp = ipsec_mp->b_cont; - - /* - * Initiate IPPF processing. We do it here to account for packets - * coming here that don't have any policy (i.e. !io->ipsec_out_secure). - * We can check for ipsec_out_proc_begin even for such packets, as - * they will always be false (asserted below). - */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && !io->ipsec_out_proc_begin) { - ip_process(IPP_LOCAL_OUT, &mp, io->ipsec_out_ill_index != 0 ? - io->ipsec_out_ill_index : ill_index); - if (mp == NULL) { - ip2dbg(("ipsec_out_process: packet dropped "\ - "during IPPF processing\n")); - freeb(ipsec_mp); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - return; - } - } + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); - if (!io->ipsec_out_secure) { - /* - * We came here by mistake. - * Don't bother with ipsec processing - * Should "discourage" this path in the future. 
- */ - ASSERT(io->ipsec_out_proc_begin == B_FALSE); - goto done; - } - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - ASSERT(io->ipsec_out_failed == B_FALSE); + ASSERT((ixa->ixa_ipsec_policy != NULL) || + (ixa->ixa_ipsec_action != NULL)); ipss = ipst->ips_netstack->netstack_ipsec; if (!ipsec_loaded(ipss)) { - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - ip_drop_packet(ipsec_mp, B_FALSE, NULL, ire, + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_packet(mp, B_TRUE, ill, DROPPER(ipss, ipds_ip_ipsec_not_loaded), &ipss->ipsec_dropper); - return; + return (ENOTSUP); } - /* - * IPsec processing has started. - */ - io->ipsec_out_proc_begin = B_TRUE; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - pp = io->ipsec_out_policy; + pp = ixa->ixa_ipsec_policy; ASSERT(pp != NULL); ap = pp->ipsp_act; ASSERT(ap != NULL); } - /* - * Save the outbound ill index. When the packet comes back - * from IPsec, we make sure the ill hasn't changed or disappeared - * before sending it the accelerated packet. - */ - if ((ire != NULL) && (io->ipsec_out_capab_ill_index == 0)) { - ill = ire_to_ill(ire); - io->ipsec_out_capab_ill_index = ill->ill_phyint->phyint_ifindex; + /* Handle explicit drop action and bypass. */ + switch (ap->ipa_act.ipa_type) { + case IPSEC_ACT_DISCARD: + case IPSEC_ACT_REJECT: + ip_drop_packet(mp, B_FALSE, ill, + DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); + return (EHOSTUNREACH); /* IPsec policy failure */ + case IPSEC_ACT_BYPASS: + return (ip_output_post_ipsec(mp, ixa)); } /* * The order of processing is first insert a IP header if needed. * Then insert the ESP header and then the AH header. 
*/ - if ((io->ipsec_out_se_done == B_FALSE) && - (ap->ipa_want_se)) { + if ((ixa->ixa_flags & IXAF_IS_IPV4) && ap->ipa_want_se) { /* * First get the outer IP header before sending * it to ESP. @@ -26600,19 +12472,16 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) mblk_t *outer_mp, *inner_mp; if ((outer_mp = allocb(sizeof (ipha_t), BPRI_HI)) == NULL) { - (void) mi_strlog(q, 0, SL_ERROR|SL_TRACE|SL_CONSOLE, + (void) mi_strlog(ill->ill_rq, 0, + SL_ERROR|SL_TRACE|SL_CONSOLE, "ipsec_out_process: " "Self-Encapsulation failed: Out of memory\n"); - freemsg(ipsec_mp); - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - return; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (ENOBUFS); } - inner_mp = ipsec_mp->b_cont; + inner_mp = mp; ASSERT(inner_mp->b_datap->db_type == M_DATA); oipha = (ipha_t *)outer_mp->b_rptr; iipha = (ipha_t *)inner_mp->b_rptr; @@ -26626,139 +12495,51 @@ ipsec_out_process(queue_t *q, mblk_t *ipsec_mp, ire_t *ire, uint_t ill_index) oipha->ipha_hdr_checksum = 0; oipha->ipha_hdr_checksum = ip_csum_hdr(oipha); outer_mp->b_cont = inner_mp; - ipsec_mp->b_cont = outer_mp; + mp = outer_mp; - io->ipsec_out_se_done = B_TRUE; - io->ipsec_out_tunnel = B_TRUE; + ixa->ixa_flags |= IXAF_IPSEC_TUNNEL; } - if (((ap->ipa_want_ah && (io->ipsec_out_ah_sa == NULL)) || - (ap->ipa_want_esp && (io->ipsec_out_esp_sa == NULL))) && - !ipsec_out_select_sa(ipsec_mp)) - return; + /* If we need to wait for a SA then we can't return any errno */ + if (((ap->ipa_want_ah && (ixa->ixa_ipsec_ah_sa == NULL)) || + (ap->ipa_want_esp && (ixa->ixa_ipsec_esp_sa == NULL))) && + !ipsec_out_select_sa(mp, ixa)) + return (0); /* * By now, we know what SA's to use. Toss over to ESP & AH * to do the heavy lifting. 
*/ - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - if ((io->ipsec_out_esp_done == B_FALSE) && (ap->ipa_want_esp)) { - ASSERT(io->ipsec_out_esp_sa != NULL); - io->ipsec_out_esp_done = B_TRUE; - /* - * Note that since hw accel can only apply one transform, - * not two, we skip hw accel for ESP if we also have AH - * This is an design limitation of the interface - * which should be revisited. - */ - ASSERT(ire != NULL); - if (io->ipsec_out_ah_sa == NULL) { - ill = (ill_t *)ire->ire_stq->q_ptr; - ipsec_out_is_accelerated(ipsec_mp, - io->ipsec_out_esp_sa, ill, ire); - } + if (ap->ipa_want_esp) { + ASSERT(ixa->ixa_ipsec_esp_sa != NULL); - ipsec_rc = io->ipsec_out_esp_sa->ipsa_output_func(ipsec_mp); - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; + mp = ixa->ixa_ipsec_esp_sa->ipsa_output_func(mp, ixa); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. 
+ */ + return (0); } } - if ((io->ipsec_out_ah_done == B_FALSE) && (ap->ipa_want_ah)) { - ASSERT(io->ipsec_out_ah_sa != NULL); - io->ipsec_out_ah_done = B_TRUE; - if (ire == NULL) { - int idx = io->ipsec_out_capab_ill_index; - ill = ill_lookup_on_ifindex(idx, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - ill_need_rele = B_TRUE; - } else { - ill = (ill_t *)ire->ire_stq->q_ptr; - } - ipsec_out_is_accelerated(ipsec_mp, io->ipsec_out_ah_sa, ill, - ire); + if (ap->ipa_want_ah) { + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); - ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip_mib, - ipIfStatsOutDiscards); - } - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - if (ill != NULL && ill_need_rele) - ill_refrele(ill); - return; + mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(mp, ixa); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return (0); } } /* - * We are done with IPsec processing. Send it over the wire. - */ -done: - mp = ipsec_mp->b_cont; - ipha = (ipha_t *)mp->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(q, ipsec_mp, ipha, ire->ire_ipif->ipif_ill, - ire); - } else { - ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, ire->ire_ipif->ipif_ill, - ire); - } - if (ill != NULL && ill_need_rele) - ill_refrele(ill); -} - -/* ARGSUSED */ -void -ip_restart_optmgmt(ipsq_t *dummy_sq, queue_t *q, mblk_t *first_mp, void *dummy) -{ - opt_restart_t *or; - int err; - conn_t *connp; - cred_t *cr; - - ASSERT(CONN_Q(q)); - connp = Q_TO_CONN(q); - - ASSERT(first_mp->b_datap->db_type == M_CTL); - or = (opt_restart_t *)first_mp->b_rptr; - /* - * We checked for a db_credp the first time svr4_optcom_req - * was called (from ip_wput_nondata). So we can just ASSERT here. 
+ * We are done with IPsec processing. Send it over + * the wire. */ - cr = msg_getcred(first_mp, NULL); - ASSERT(cr != NULL); - - if (or->or_type == T_SVR4_OPTMGMT_REQ) { - err = svr4_optcom_req(q, first_mp, cr, - &ip_opt_obj, B_FALSE); - } else { - ASSERT(or->or_type == T_OPTMGMT_REQ); - err = tpi_optcom_req(q, first_mp, cr, - &ip_opt_obj, B_FALSE); - } - if (err != EINPROGRESS) { - /* operation is done */ - CONN_OPER_PENDING_DONE(connp); - } + return (ip_output_post_ipsec(mp, ixa)); } /* @@ -26811,6 +12592,11 @@ ip_reprocess_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) err = (*ipip->ipi_func_restart)(ipsq->ipsq_xop->ipx_current_ipif, sin, q, mp, ipip, mp1->b_rptr); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_reprocess_ioctl finish", + int, ipip->ipi_cmd, + ill_t *, ipsq->ipsq_xop->ipx_current_ipif->ipif_ill, + ipif_t *, ipsq->ipsq_xop->ipx_current_ipif); + ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); } @@ -26865,12 +12651,16 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ if (ipip->ipi_cmd == SIOCLIFADDIF) { err = ip_sioctl_addif(NULL, NULL, q, mp, NULL, NULL); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } ci.ci_ipif = NULL; - if (ipip->ipi_cmd_type == MISC_CMD) { + switch (ipip->ipi_cmd_type) { + case MISC_CMD: + case MSFILT_CMD: /* * All MISC_CMD ioctls come in here -- e.g. SIOCGLIFCONF. 
*/ @@ -26883,28 +12673,29 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ci.ci_sin = NULL; ci.ci_sin6 = NULL; ci.ci_lifr = NULL; - } else { - switch (ipip->ipi_cmd_type) { - case IF_CMD: - case LIF_CMD: - extract_funcp = ip_extract_lifreq; - break; + extract_funcp = NULL; + break; - case ARP_CMD: - case XARP_CMD: - extract_funcp = ip_extract_arpreq; - break; + case IF_CMD: + case LIF_CMD: + extract_funcp = ip_extract_lifreq; + break; - case MSFILT_CMD: - extract_funcp = ip_extract_msfilter; - break; + case ARP_CMD: + case XARP_CMD: + extract_funcp = ip_extract_arpreq; + break; - default: - ASSERT(0); - } + default: + ASSERT(0); + } - err = (*extract_funcp)(q, mp, ipip, &ci, ip_process_ioctl); + if (extract_funcp != NULL) { + err = (*extract_funcp)(q, mp, ipip, &ci); if (err != 0) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish err", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } @@ -26923,8 +12714,17 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); - if (ci.ci_ipif != NULL) + if (ci.ci_ipif != NULL) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish RD", + int, ipip->ipi_cmd, ill_t *, ci.ci_ipif->ipif_ill, + ipif_t *, ci.ci_ipif); ipif_refrele(ci.ci_ipif); + } else { + DTRACE_PROBE4(ipif__ioctl, + char *, "ip_process_ioctl finish RD", + int, ipip->ipi_cmd, ill_t *, NULL, ipif_t *, NULL); + } ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), NULL); return; } @@ -26932,7 +12732,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ASSERT(ci.ci_ipif != NULL); /* - * If ipsq is non-NULL, we are already being called exclusively. 
+ * If ipsq is non-NULL, we are already being called exclusively */ ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); if (ipsq == NULL) { @@ -26944,7 +12744,6 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) } entered_ipsq = B_TRUE; } - /* * Release the ipif so that ipif_down and friends that wait for * references to go away are not misled about the current ipif_refcnt @@ -26962,6 +12761,10 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) */ err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", + int, ipip->ipi_cmd, + ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, + ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) @@ -27012,31 +12815,21 @@ ip_ioctl_finish(queue_t *q, mblk_t *mp, int err, int mode, ipsq_t *ipsq) ipsq_current_finish(ipsq); } -/* Called from ip_wput for all non data messages */ -/* ARGSUSED */ +/* Handles all non data messages */ void -ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +ip_wput_nondata(queue_t *q, mblk_t *mp) { mblk_t *mp1; - ire_t *ire, *fake_ire; - ill_t *ill; struct iocblk *iocp; ip_ioctl_cmd_t *ipip; - cred_t *cr; conn_t *connp; - int err; - nce_t *nce; - ipif_t *ipif; - ip_stack_t *ipst; + cred_t *cr; char *proto_str; - if (CONN_Q(q)) { + if (CONN_Q(q)) connp = Q_TO_CONN(q); - ipst = connp->conn_netstack->netstack_ip; - } else { + else connp = NULL; - ipst = ILLQ_TO_IPST(q); - } switch (DB_TYPE(mp)) { case M_IOCTL: @@ -27064,17 +12857,10 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { /* - * the ioctl is one we recognise, but is not - * consumed by IP as a module, pass M_IOCDATA - * for processing downstream, but only for - * common Streams ioctls. 
+ * The ioctl is one we recognise, but is not consumed + * by IP as a module and we are a module, so we drop */ - if (ipip->ipi_flags & IPI_PASS_DOWN) { - putnext(q, mp); - return; - } else { - goto nak; - } + goto nak; } /* IOCTL continuation following copyin or copyout. */ @@ -27110,8 +12896,8 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) /* * Refhold the conn, till the ioctl completes. This is * needed in case the ioctl ends up in the pending mp - * list. Every mp in the ill_pending_mp list and - * the ipx_pending_mp must have a refhold on the conn + * list. Every mp in the ipx_pending_mp list + * must have a refhold on the conn * to resume processing. The refhold is released when * the ioctl completes. (normally or abnormally) * In all cases ip_ioctl_finish is called to finish @@ -27119,7 +12905,6 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ if (connp != NULL) { /* This is not a reentry */ - ASSERT(ipsq == NULL); CONN_INC_REF(connp); } else { if (!(ipip->ipi_flags & IPI_MODOK)) { @@ -27128,18 +12913,12 @@ ip_wput_nondata(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - ip_process_ioctl(ipsq, q, mp, ipip); + ip_process_ioctl(NULL, q, mp, ipip); } else { mi_copyout(q, mp); } return; -nak: - iocp->ioc_error = EINVAL; - mp->b_datap->db_type = M_IOCNAK; - iocp->ioc_count = 0; - qreply(q, mp); - return; case M_IOCNAK: /* @@ -27147,35 +12926,13 @@ nak: * an IOCTL we sent it. This shouldn't happen. */ (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_wput: unexpected M_IOCNAK, ioc_cmd 0x%x", + "ip_wput_nondata: unexpected M_IOCNAK, ioc_cmd 0x%x", ((struct iocblk *)mp->b_rptr)->ioc_cmd); freemsg(mp); return; case M_IOCACK: /* /dev/ip shouldn't see this */ - if (CONN_Q(q)) - goto nak; - - /* - * Finish socket ioctls passed through to ARP. We use the - * ioc_cmd values we set in ip_sioctl_arp() to decide whether - * we need to become writer before calling ip_sioctl_iocack(). 
- * Note that qwriter_ip() will release the refhold, and that a - * refhold is OK without ILL_CAN_LOOKUP() since we're on the - * ill stream. - */ - iocp = (struct iocblk *)mp->b_rptr; - if (iocp->ioc_cmd == AR_ENTRY_SQUERY) { - ip_sioctl_iocack(NULL, q, mp, NULL); - return; - } - - ASSERT(iocp->ioc_cmd == AR_ENTRY_DELETE || - iocp->ioc_cmd == AR_ENTRY_ADD); - ill = q->q_ptr; - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_sioctl_iocack, CUR_OP, B_FALSE); - return; + goto nak; case M_FLUSH: if (*mp->b_rptr & FLUSHW) flushq(q, FLUSHALL); @@ -27190,117 +12947,17 @@ nak: } freemsg(mp); return; - case IRE_DB_REQ_TYPE: - if (connp == NULL) { - proto_str = "IRE_DB_REQ_TYPE"; - goto protonak; - } - /* An Upper Level Protocol wants a copy of an IRE. */ - ip_ire_req(q, mp); - return; case M_CTL: - if (mp->b_wptr - mp->b_rptr < sizeof (uint32_t)) - break; - - /* M_CTL messages are used by ARP to tell us things. */ - if ((mp->b_wptr - mp->b_rptr) < sizeof (arc_t)) - break; - switch (((arc_t *)mp->b_rptr)->arc_cmd) { - case AR_ENTRY_SQUERY: - putnext(q, mp); - return; - case AR_CLIENT_NOTIFY: - ip_arp_news(q, mp); - return; - case AR_DLPIOP_DONE: - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - /* qwriter_ip releases the refhold */ - /* refhold on ill stream is ok without ILL_CAN_LOOKUP */ - ill_refhold(ill); - qwriter_ip(ill, q, mp, ip_arp_done, CUR_OP, B_FALSE); - return; - case AR_ARP_CLOSING: - /* - * ARP (above us) is closing. If no ARP bringup is - * currently pending, ack the message so that ARP - * can complete its close. Also mark ill_arp_closing - * so that new ARP bringups will fail. If any - * ARP bringup is currently in progress, we will - * ack this when the current ARP bringup completes. 
- */ - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - mutex_enter(&ill->ill_lock); - ill->ill_arp_closing = 1; - if (!ill->ill_arp_bringup_pending) { - mutex_exit(&ill->ill_lock); - qreply(q, mp); - } else { - mutex_exit(&ill->ill_lock); - freemsg(mp); - } - return; - case AR_ARP_EXTEND: - /* - * The ARP module above us is capable of duplicate - * address detection. Old ATM drivers will not send - * this message. - */ - ASSERT(q->q_next != NULL); - ill = (ill_t *)q->q_ptr; - ill->ill_arp_extend = B_TRUE; - freemsg(mp); - return; - default: - break; - } break; case M_PROTO: case M_PCPROTO: /* - * The only PROTO messages we expect are copies of option - * negotiation acknowledgements, AH and ESP bind requests - * are also expected. + * The only PROTO messages we expect are SNMP-related. */ switch (((union T_primitives *)mp->b_rptr)->type) { - case O_T_BIND_REQ: - case T_BIND_REQ: { - /* Request can get queued in bind */ - if (connp == NULL) { - proto_str = "O_T_BIND_REQ/T_BIND_REQ"; - goto protonak; - } - /* - * The transports except SCTP call ip_bind_{v4,v6}() - * directly instead of a a putnext. SCTP doesn't - * generate any T_BIND_REQ since it has its own - * fanout data structures. However, ESP and AH - * come in for regular binds; all other cases are - * bind retries. - */ - ASSERT(!IPCL_IS_SCTP(connp)); - - /* Don't increment refcnt if this is a re-entry */ - if (ipsq == NULL) - CONN_INC_REF(connp); - - mp = connp->conn_af_isv6 ? 
ip_bind_v6(q, mp, - connp, NULL) : ip_bind_v4(q, mp, connp); - ASSERT(mp != NULL); - - ASSERT(!IPCL_IS_TCP(connp)); - ASSERT(!IPCL_IS_UDP(connp)); - ASSERT(!IPCL_IS_RAWIP(connp)); - ASSERT(!IPCL_IS_IPTUN(connp)); - - /* The case of AH and ESP */ - qreply(q, mp); - CONN_OPER_PENDING_DONE(connp); - return; - } case T_SVR4_OPTMGMT_REQ: - ip2dbg(("ip_wput: T_SVR4_OPTMGMT_REQ flags %x\n", + ip2dbg(("ip_wput_nondata: T_SVR4_OPTMGMT_REQ " + "flags %x\n", ((struct T_optmgmt_req *)mp->b_rptr)->MGMT_flags)); if (connp == NULL) { @@ -27324,460 +12981,17 @@ nak: return; } - if (!snmpcom_req(q, mp, ip_snmp_set, - ip_snmp_get, cr)) { - /* - * Call svr4_optcom_req so that it can - * generate the ack. We don't come here - * if this operation is being restarted. - * ip_restart_optmgmt will drop the conn ref. - * In the case of ipsec option after the ipsec - * load is complete conn_restart_ipsec_waiter - * drops the conn ref. - */ - ASSERT(ipsq == NULL); - CONN_INC_REF(connp); - if (ip_check_for_ipsec_opt(q, mp)) - return; - err = svr4_optcom_req(q, mp, cr, &ip_opt_obj, - B_FALSE); - if (err != EINPROGRESS) { - /* Operation is done */ - CONN_OPER_PENDING_DONE(connp); - } - } - return; - case T_OPTMGMT_REQ: - ip2dbg(("ip_wput: T_OPTMGMT_REQ\n")); - /* - * Note: No snmpcom_req support through new - * T_OPTMGMT_REQ. - * Call tpi_optcom_req so that it can - * generate the ack. - */ - if (connp == NULL) { - proto_str = "T_OPTMGMT_REQ"; - goto protonak; - } - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); - if (mp != NULL) - qreply(q, mp); - return; - } - ASSERT(ipsq == NULL); - /* - * We don't come here for restart. ip_restart_optmgmt - * will drop the conn ref. 
In the case of ipsec option - * after the ipsec load is complete - * conn_restart_ipsec_waiter drops the conn ref. - */ - CONN_INC_REF(connp); - if (ip_check_for_ipsec_opt(q, mp)) - return; - err = tpi_optcom_req(q, mp, cr, &ip_opt_obj, B_FALSE); - if (err != EINPROGRESS) { - /* Operation is done */ - CONN_OPER_PENDING_DONE(connp); - } - return; - case T_UNBIND_REQ: - if (connp == NULL) { - proto_str = "T_UNBIND_REQ"; + if (!snmpcom_req(q, mp, ip_snmp_set, ip_snmp_get, cr)) { + proto_str = "Bad SNMPCOM request?"; goto protonak; } - ip_unbind(Q_TO_CONN(q)); - mp = mi_tpi_ok_ack_alloc(mp); - qreply(q, mp); return; default: - /* - * Have to drop any DLPI messages coming down from - * arp (such as an info_req which would cause ip - * to receive an extra info_ack if it was passed - * through. - */ - ip1dbg(("ip_wput_nondata: dropping M_PROTO %d\n", + ip1dbg(("ip_wput_nondata: dropping M_PROTO prim %u\n", (int)*(uint_t *)mp->b_rptr)); freemsg(mp); return; } - /* NOTREACHED */ - case IRE_DB_TYPE: { - nce_t *nce; - ill_t *ill; - in6_addr_t gw_addr_v6; - - /* - * This is a response back from a resolver. It - * consists of a message chain containing: - * IRE_MBLK-->LL_HDR_MBLK->pkt - * The IRE_MBLK is the one we allocated in ip_newroute. - * The LL_HDR_MBLK is the DLPI header to use to get - * the attached packet, and subsequent ones for the - * same destination, transmitted. - */ - if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* ire */ - break; - /* - * First, check to make sure the resolution succeeded. - * If it failed, the second mblk will be empty. - * If it is, free the chain, dropping the packet. - * (We must ire_delete the ire; that frees the ire mblk) - * We're doing this now to support PVCs for ATM; it's - * a partial xresolv implementation. When we fully implement - * xresolv interfaces, instead of freeing everything here - * we'll initiate neighbor discovery. 
- * - * For v4 (ARP and other external resolvers) the resolver - * frees the message, so no check is needed. This check - * is required, though, for a full xresolve implementation. - * Including this code here now both shows how external - * resolvers can NACK a resolution request using an - * existing design that has no specific provisions for NACKs, - * and also takes into account that the current non-ARP - * external resolver has been coded to use this method of - * NACKing for all IPv6 (xresolv) cases, - * whether our xresolv implementation is complete or not. - * - */ - ire = (ire_t *)mp->b_rptr; - ill = ire_to_ill(ire); - mp1 = mp->b_cont; /* dl_unitdata_req */ - if (mp1->b_rptr == mp1->b_wptr) { - if (ire->ire_ipversion == IPV6_VERSION) { - /* - * XRESOLV interface. - */ - ASSERT(ill->ill_flags & ILLF_XRESOLV); - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, B_FALSE, - &ire->ire_addr_v6, B_FALSE); - } else { - nce = ndp_lookup_v6(ill, B_FALSE, - &gw_addr_v6, B_FALSE); - } - if (nce != NULL) { - nce_resolv_failed(nce); - ndp_delete(nce); - NCE_REFRELE(nce); - } - } - mp->b_cont = NULL; - freemsg(mp1); /* frees the pkt as well */ - ASSERT(ire->ire_nce == NULL); - ire_delete((ire_t *)mp->b_rptr); - return; - } - - /* - * Split them into IRE_MBLK and pkt and feed it into - * ire_add_then_send. Then in ire_add_then_send - * the IRE will be added, and then the packet will be - * run back through ip_wput. This time it will make - * it to the wire. - */ - mp->b_cont = NULL; - mp = mp1->b_cont; /* now, mp points to pkt */ - mp1->b_cont = NULL; - ip1dbg(("ip_wput_nondata: reply from external resolver \n")); - if (ire->ire_ipversion == IPV6_VERSION) { - /* - * XRESOLV interface. 
Find the nce and put a copy - * of the dl_unitdata_req in nce_res_mp - */ - ASSERT(ill->ill_flags & ILLF_XRESOLV); - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - nce = ndp_lookup_v6(ill, B_FALSE, - &ire->ire_addr_v6, B_FALSE); - } else { - nce = ndp_lookup_v6(ill, B_FALSE, - &gw_addr_v6, B_FALSE); - } - if (nce != NULL) { - /* - * We have to protect nce_res_mp here - * from being accessed by other threads - * while we change the mblk pointer. - * Other functions will also lock the nce when - * accessing nce_res_mp. - * - * The reason we change the mblk pointer - * here rather than copying the resolved address - * into the template is that, unlike with - * ethernet, we have no guarantee that the - * resolved address length will be - * smaller than or equal to the lla length - * with which the template was allocated, - * (for ethernet, they're equal) - * so we have to use the actual resolved - * address mblk - which holds the real - * dl_unitdata_req with the resolved address. - * - * Doing this is the same behavior as was - * previously used in the v4 ARP case. - */ - mutex_enter(&nce->nce_lock); - if (nce->nce_res_mp != NULL) - freemsg(nce->nce_res_mp); - nce->nce_res_mp = mp1; - mutex_exit(&nce->nce_lock); - /* - * We do a fastpath probe here because - * we have resolved the address without - * using Neighbor Discovery. - * In the non-XRESOLV v6 case, the fastpath - * probe is done right after neighbor - * discovery completes. - */ - if (nce->nce_res_mp != NULL) { - int res; - nce_fastpath_list_add(nce); - res = ill_fastpath_probe(ill, - nce->nce_res_mp); - if (res != 0 && res != EAGAIN) - nce_fastpath_list_delete(nce); - } - - ire_add_then_send(q, ire, mp); - /* - * Now we have to clean out any packets - * that may have been queued on the nce - * while it was waiting for address resolution - * to complete. 
- */ - mutex_enter(&nce->nce_lock); - mp1 = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - while (mp1 != NULL) { - mblk_t *nxt_mp; - queue_t *fwdq = NULL; - ill_t *inbound_ill; - uint_t ifindex; - - nxt_mp = mp1->b_next; - mp1->b_next = NULL; - /* - * Retrieve ifindex stored in - * ip_rput_data_v6() - */ - ifindex = - (uint_t)(uintptr_t)mp1->b_prev; - inbound_ill = - ill_lookup_on_ifindex(ifindex, - B_TRUE, NULL, NULL, NULL, - NULL, ipst); - mp1->b_prev = NULL; - if (inbound_ill != NULL) - fwdq = inbound_ill->ill_rq; - - if (fwdq != NULL) { - put(fwdq, mp1); - ill_refrele(inbound_ill); - } else - put(WR(ill->ill_rq), mp1); - mp1 = nxt_mp; - } - NCE_REFRELE(nce); - } else { /* nce is NULL; clean up */ - ire_delete(ire); - freemsg(mp); - freemsg(mp1); - return; - } - } else { - nce_t *arpce; - /* - * Link layer resolution succeeded. Recompute the - * ire_nce. - */ - ASSERT(ire->ire_type & (IRE_CACHE|IRE_BROADCAST)); - if ((arpce = ndp_lookup_v4(ill, - (ire->ire_gateway_addr != INADDR_ANY ? - &ire->ire_gateway_addr : &ire->ire_addr), - B_FALSE)) == NULL) { - freeb(ire->ire_mp); - freeb(mp1); - freemsg(mp); - return; - } - mutex_enter(&arpce->nce_lock); - arpce->nce_last = TICK_TO_MSEC(lbolt64); - if (arpce->nce_state == ND_REACHABLE) { - /* - * Someone resolved this before us; - * cleanup the res_mp. Since ire has - * not been added yet, the call to ire_add_v4 - * from ire_add_then_send (when a dup is - * detected) will clean up the ire. - */ - freeb(mp1); - } else { - ASSERT(arpce->nce_res_mp == NULL); - arpce->nce_res_mp = mp1; - arpce->nce_state = ND_REACHABLE; - } - mutex_exit(&arpce->nce_lock); - if (ire->ire_marks & IRE_MARK_NOADD) { - /* - * this ire will not be added to the ire - * cache table, so we can set the ire_nce - * here, as there are no atomicity constraints. 
- */ - ire->ire_nce = arpce; - /* - * We are associating this nce with the ire - * so change the nce ref taken in - * ndp_lookup_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } else { - NCE_REFRELE(arpce); - } - ire_add_then_send(q, ire, mp); - } - return; /* All is well, the packet has been sent. */ - } - case IRE_ARPRESOLVE_TYPE: { - - if ((mp->b_wptr - mp->b_rptr) != sizeof (ire_t)) /* fake_ire */ - break; - mp1 = mp->b_cont; /* dl_unitdata_req */ - mp->b_cont = NULL; - /* - * First, check to make sure the resolution succeeded. - * If it failed, the second mblk will be empty. - */ - if (mp1->b_rptr == mp1->b_wptr) { - /* cleanup the incomplete ire, free queued packets */ - freemsg(mp); /* fake ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - /* - * Update any incomplete nce_t found. We search the ctable - * and find the nce from the ire->ire_nce because we need - * to pass the ire to ip_xmit_v4 later, and can find both - * ire and nce in one lookup. - */ - fake_ire = (ire_t *)mp->b_rptr; - - /* - * By the time we come back here from ARP the logical outgoing - * interface of the incomplete ire we added in ire_forward() - * could have disappeared, causing the incomplete ire to also - * disappear. So we need to retreive the proper ipif for the - * ire before looking in ctable. In the case of IPMP, the - * ipif may be on the IPMP ill, so look it up based on the - * ire_ipif_ifindex we stashed back in ire_init_common(). - * Then, we can verify that ire_ipif_seqid still exists. 
- */ - ill = ill_lookup_on_ifindex(fake_ire->ire_ipif_ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip1dbg(("ill for incomplete ire vanished\n")); - freemsg(mp); /* fake ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - /* Get the outgoing ipif */ - mutex_enter(&ill->ill_lock); - ipif = ipif_lookup_seqid(ill, fake_ire->ire_ipif_seqid); - if (ipif == NULL) { - mutex_exit(&ill->ill_lock); - ill_refrele(ill); - ip1dbg(("logical intrf to incomplete ire vanished\n")); - freemsg(mp); /* fake_ire */ - freeb(mp1); /* dl_unitdata response */ - return; - } - - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - ill_refrele(ill); - ire = ire_arpresolve_lookup(fake_ire->ire_addr, - fake_ire->ire_gateway_addr, ipif, fake_ire->ire_zoneid, - ipst, ((ill_t *)q->q_ptr)->ill_wq); - ipif_refrele(ipif); - if (ire == NULL) { - /* - * no ire was found; check if there is an nce - * for this lookup; if it has no ire's pointing at it - * cleanup. - */ - if ((nce = ndp_lookup_v4(q->q_ptr, - (fake_ire->ire_gateway_addr != INADDR_ANY ? - &fake_ire->ire_gateway_addr : &fake_ire->ire_addr), - B_FALSE)) != NULL) { - /* - * cleanup: - * We check for refcnt 2 (one for the nce - * hash list + 1 for the ref taken by - * ndp_lookup_v4) to check that there are - * no ire's pointing at the nce. - */ - if (nce->nce_refcnt == 2) - ndp_delete(nce); - NCE_REFRELE(nce); - } - freeb(mp1); /* dl_unitdata response */ - freemsg(mp); /* fake ire */ - return; - } - - nce = ire->ire_nce; - DTRACE_PROBE2(ire__arpresolve__type, - ire_t *, ire, nce_t *, nce); - mutex_enter(&nce->nce_lock); - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state == ND_REACHABLE) { - /* - * Someone resolved this before us; - * our response is not needed any more. 
- */ - mutex_exit(&nce->nce_lock); - freeb(mp1); /* dl_unitdata response */ - } else { - ASSERT(nce->nce_res_mp == NULL); - nce->nce_res_mp = mp1; - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - } - /* - * The cached nce_t has been updated to be reachable; - * Clear the IRE_MARK_UNCACHED flag and free the fake_ire. - */ - fake_ire->ire_marks &= ~IRE_MARK_UNCACHED; - freemsg(mp); - /* - * send out queued packets. - */ - (void) ip_xmit_v4(NULL, ire, NULL, B_FALSE, NULL); - - IRE_REFRELE(ire); - return; - } default: break; } @@ -27787,6 +13001,13 @@ nak: freemsg(mp); return; +nak: + iocp->ioc_error = EINVAL; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); + return; + protonak: cmn_err(CE_NOTE, "IP doesn't process %s as a module", proto_str); if ((mp = mi_tpi_err_ack_alloc(mp, TPROTO, EINVAL)) != NULL) @@ -27794,14 +13015,15 @@ protonak: } /* - * Process IP options in an outbound packet. Modify the destination if there - * is a source route option. + * Process IP options in an outbound packet. Verify that the nexthop in a + * strict source route is onlink. * Returns non-zero if something fails in which case an ICMP error has been * sent and mp freed. + * + * Assumes the ULP has called ip_massage_options to move nexthop into ipha_dst. 
*/ -static int -ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +int +ip_output_options(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa, ill_t *ill) { ipoptp_t opts; uchar_t *opt; @@ -27809,14 +13031,11 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, uint8_t optlen; ipaddr_t dst; intptr_t code = 0; - mblk_t *mp; - ire_t *ire = NULL; + ire_t *ire; + ip_stack_t *ipst = ixa->ixa_ipst; + ip_recv_attr_t iras; - ip2dbg(("ip_wput_options\n")); - mp = ipsec_mp; - if (mctl_present) { - mp = ipsec_mp->b_cont; - } + ip2dbg(("ip_output_options\n")); dst = ipha->ipha_dst; for (optval = ipoptp_first(&opts, ipha); @@ -27824,7 +13043,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, optval = ipoptp_next(&opts)) { opt = opts.ipoptp_cur; optlen = opts.ipoptp_len; - ip2dbg(("ip_wput_options: opt %d, len %d\n", + ip2dbg(("ip_output_options: opt %d, len %d\n", optval, optlen)); switch (optval) { uint32_t off; @@ -27832,25 +13051,25 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, case IPOPT_LSRR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; } off = opt[IPOPT_OFFSET]; - ip1dbg(("ip_wput_options: next hop 0x%x\n", + ip1dbg(("ip_output_options: next hop 0x%x\n", ntohl(dst))); /* * For strict: verify that dst is directly * reachable. 
*/ if (optval == IPOPT_SSRR) { - ire = ire_ftable_lookup(dst, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, - msg_getlabel(mp), - MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); + ire = ire_ftable_lookup_v4(dst, 0, 0, + IRE_IF_ALL, NULL, ALL_ZONES, ixa->ixa_tsl, + MATCH_IRE_TYPE | MATCH_IRE_SECATTR, 0, ipst, + NULL); if (ire == NULL) { - ip1dbg(("ip_wput_options: SSRR not" + ip1dbg(("ip_output_options: SSRR not" " directly reachable: 0x%x\n", ntohl(dst))); goto bad_src_route; @@ -27861,7 +13080,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, case IPOPT_RR: if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OLEN] - (char *)ipha; goto param_prob; @@ -27879,7 +13098,7 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, } if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { ip1dbg(( - "ip_wput_options: bad option offset\n")); + "ip_output_options: bad option offset\n")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; goto param_prob; @@ -27913,33 +13132,31 @@ ip_wput_options(queue_t *q, mblk_t *ipsec_mp, ipha_t *ipha, if ((opts.ipoptp_flags & IPOPTP_ERROR) == 0) return (0); - ip1dbg(("ip_wput_options: error processing IP options.")); + ip1dbg(("ip_output_options: error processing IP options.")); code = (char *)&opt[IPOPT_OFFSET] - (char *)ipha; param_prob: - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. 
- */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(ipsec_mp); - return (-1); - } - icmp_param_problem(q, ipsec_mp, (uint8_t)code, zoneid, ipst); + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_flags = IRAF_IS_IPV4; + + ip_drop_output("ip_output_options", mp, ill); + icmp_param_problem(mp, (uint8_t)code, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); return (-1); bad_src_route: - /* - * Since ip_wput() isn't close to finished, we fill - * in enough of the header for credible error reporting. - */ - if (ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst)) { - /* Failed */ - freemsg(ipsec_mp); - return (-1); - } - icmp_unreachable(q, ipsec_mp, ICMP_SOURCE_ROUTE_FAILED, zoneid, ipst); + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_flags = IRAF_IS_IPV4; + + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); return (-1); } @@ -28082,29 +13299,60 @@ conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list) /* * For non streams based sockets assert flow control. */ - if (IPCL_IS_NONSTR(connp)) { - DTRACE_PROBE1(su__txq__full, conn_t *, connp); - (*connp->conn_upcalls->su_txq_full) - (connp->conn_upper_handle, B_TRUE); - } else { - conn_setqfull(connp); - noenable(connp->conn_wq); - } + conn_setqfull(connp, NULL); mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } +static void +conn_idl_remove(conn_t *connp) +{ + idl_t *idl = connp->conn_idl; + + if (idl != NULL) { + /* + * Remove ourself from the drain list, if we did not do + * a putq, or if the conn is closing. + * Note: It is possible that q->q_first is non-null. 
It means + * that these messages landed after we did a enableok() in + * ip_wsrv. Thus STREAMS will call ip_wsrv once again to + * service them. + */ + if (connp->conn_drain_next == connp) { + /* Singleton in the list */ + ASSERT(connp->conn_drain_prev == connp); + idl->idl_conn = NULL; + } else { + connp->conn_drain_prev->conn_drain_next = + connp->conn_drain_next; + connp->conn_drain_next->conn_drain_prev = + connp->conn_drain_prev; + if (idl->idl_conn == connp) + idl->idl_conn = connp->conn_drain_next; + } + } + connp->conn_drain_next = NULL; + connp->conn_drain_prev = NULL; + + conn_clrqfull(connp, NULL); + /* + * For streams based sockets open up flow control. + */ + if (!IPCL_IS_NONSTR(connp)) + enableok(connp->conn_wq); +} + /* * This conn is closing, and we are called from ip_close. OR - * This conn has been serviced by ip_wsrv, and we need to do the tail - * processing. - * If this conn is part of the drain list, we may need to sustain the drain - * process by qenabling the next conn in the drain list. We may also need to - * remove this conn from the list, if it is done. + * this conn is draining because flow-control on the ill has been relieved. + * + * We must also need to remove conn's on this idl from the list, and also + * inform the sockfs upcalls about the change in flow-control. */ static void conn_drain_tail(conn_t *connp, boolean_t closing) { idl_t *idl; + conn_t *next_connp; /* * connp->conn_idl is stable at this point, and no lock is needed @@ -28116,24 +13364,21 @@ conn_drain_tail(conn_t *connp, boolean_t closing) * instance of service trying to call conn_drain_insert on this conn * now. */ - ASSERT(!closing || (connp->conn_idl != NULL)); + ASSERT(!closing || connp == NULL || connp->conn_idl != NULL); /* * If connp->conn_idl is null, the conn has not been inserted into any * drain list even once since creation of the conn. Just return. 
*/ - if (connp->conn_idl == NULL) + if (connp == NULL || connp->conn_idl == NULL) return; - mutex_enter(CONN_DRAIN_LIST_LOCK(connp)); - if (connp->conn_drain_prev == NULL) { /* This conn is currently not in the drain list. */ - mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); return; } idl = connp->conn_idl; - if (idl->idl_conn_draining == connp) { + if (!closing) { /* * This conn is the current drainer. If this is the last conn * in the drain list, we need to do more checks, in the 'if' @@ -28141,186 +13386,45 @@ conn_drain_tail(conn_t *connp, boolean_t closing) * to sustain the draining, and is handled in the 'else' * below. */ - if (connp->conn_drain_next == idl->idl_conn) { - /* - * This conn is the last in this list. This round - * of draining is complete. If idl_repeat is set, - * it means another flow enabling has happened from - * the driver/streams and we need to another round - * of draining. - * If there are more than 2 conns in the drain list, - * do a left rotate by 1, so that all conns except the - * conn at the head move towards the head by 1, and the - * the conn at the head goes to the tail. This attempts - * a more even share for all queues that are being - * drained. - */ - if ((connp->conn_drain_next != connp) && - (idl->idl_conn->conn_drain_next != connp)) { - idl->idl_conn = idl->idl_conn->conn_drain_next; - } - if (idl->idl_repeat) { - qenable(idl->idl_conn->conn_wq); - idl->idl_conn_draining = idl->idl_conn; - idl->idl_repeat = 0; - } else { - idl->idl_conn_draining = NULL; - } - } else { - /* - * If the next queue that we are now qenable'ing, - * is closing, it will remove itself from this list - * and qenable the subsequent queue in ip_close(). - * Serialization is acheived thru idl_lock. - */ - qenable(connp->conn_drain_next->conn_wq); - idl->idl_conn_draining = connp->conn_drain_next; - } - } - if (!connp->conn_did_putbq || closing) { - /* - * Remove ourself from the drain list, if we did not do - * a putbq, or if the conn is closing. 
- * Note: It is possible that q->q_first is non-null. It means - * that these messages landed after we did a enableok() in - * ip_wsrv. Thus STREAMS will call ip_wsrv once again to - * service them. - */ - if (connp->conn_drain_next == connp) { - /* Singleton in the list */ - ASSERT(connp->conn_drain_prev == connp); - idl->idl_conn = NULL; - idl->idl_conn_draining = NULL; - } else { - connp->conn_drain_prev->conn_drain_next = - connp->conn_drain_next; - connp->conn_drain_next->conn_drain_prev = - connp->conn_drain_prev; - if (idl->idl_conn == connp) - idl->idl_conn = connp->conn_drain_next; - ASSERT(idl->idl_conn_draining != connp); - - } - connp->conn_drain_next = NULL; - connp->conn_drain_prev = NULL; + next_connp = connp->conn_drain_next; + while (next_connp != connp) { + conn_t *delconnp = next_connp; - /* - * For non streams based sockets open up flow control. - */ - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (connp->conn_upper_handle, B_FALSE); - } else { - conn_clrqfull(connp); - enableok(connp->conn_wq); + next_connp = next_connp->conn_drain_next; + conn_idl_remove(delconnp); } + ASSERT(connp->conn_drain_next == idl->idl_conn); } + conn_idl_remove(connp); - mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } /* * Write service routine. Shared perimeter entry point. - * ip_wsrv can be called in any of the following ways. - * 1. The device queue's messages has fallen below the low water mark - * and STREAMS has backenabled the ill_wq. We walk thru all the - * the drain lists and backenable the first conn in each list. - * 2. The above causes STREAMS to run ip_wsrv on the conn_wq of the - * qenabled non-tcp upper layers. We start dequeing messages and call - * ip_wput for each message. + * The device queue's messages has fallen below the low water mark and STREAMS + * has backenabled the ill_wq. Send sockfs notification about flow-control onx + * each waiting conn. 
*/ - void ip_wsrv(queue_t *q) { - conn_t *connp; ill_t *ill; - mblk_t *mp; - - if (q->q_next) { - ill = (ill_t *)q->q_ptr; - if (ill->ill_state_flags == 0) { - ip_stack_t *ipst = ill->ill_ipst; - /* - * The device flow control has opened up. - * Walk through conn drain lists and qenable the - * first conn in each list. This makes sense only - * if the stream is fully plumbed and setup. - * Hence the if check above. - */ - ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); - } - return; - } - - connp = Q_TO_CONN(q); - ip1dbg(("ip_wsrv: %p %p\n", (void *)q, (void *)connp)); + ill = (ill_t *)q->q_ptr; + if (ill->ill_state_flags == 0) { + ip_stack_t *ipst = ill->ill_ipst; - /* - * 1. Set conn_draining flag to signal that service is active. - * - * 2. ip_output determines whether it has been called from service, - * based on the last parameter. If it is IP_WSRV it concludes it - * has been called from service. - * - * 3. Message ordering is preserved by the following logic. - * i. A directly called ip_output (i.e. not thru service) will queue - * the message at the tail, if conn_draining is set (i.e. service - * is running) or if q->q_first is non-null. - * - * ii. If ip_output is called from service, and if ip_output cannot - * putnext due to flow control, it does a putbq. - * - * 4. noenable the queue so that a putbq from ip_wsrv does not reenable - * (causing an infinite loop). - */ - ASSERT(!connp->conn_did_putbq); - - while ((q->q_first != NULL) && !connp->conn_did_putbq) { - connp->conn_draining = 1; - noenable(q); - while ((mp = getq(q)) != NULL) { - ASSERT(CONN_Q(q)); - - DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp); - ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); - if (connp->conn_did_putbq) { - /* ip_wput did a putbq */ - break; - } - } /* - * At this point, a thread coming down from top, calling - * ip_wput, may end up queueing the message. We have not yet - * enabled the queue, so ip_wsrv won't be called again. 
- * To avoid this race, check q->q_first again (in the loop) - * If the other thread queued the message before we call - * enableok(), we will catch it in the q->q_first check. - * If the other thread queues the message after we call - * enableok(), ip_wsrv will be called again by STREAMS. + * The device flow control has opened up. + * Walk through conn drain lists and qenable the + * first conn in each list. This makes sense only + * if the stream is fully plumbed and setup. + * Hence the ill_state_flags check above. */ - connp->conn_draining = 0; - enableok(q); + ip1dbg(("ip_wsrv: walking\n")); + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); + enableok(ill->ill_wq); } - - /* Enable the next conn for draining */ - conn_drain_tail(connp, B_FALSE); - - /* - * conn_direct_blocked is used to indicate blocked - * condition for direct path (ILL_DIRECT_CAPABLE()). - * This is the only place where it is set without - * checking for ILL_DIRECT_CAPABLE() and setting it - * to 0 is ok even if it is not ILL_DIRECT_CAPABLE(). - */ - if (!connp->conn_did_putbq && connp->conn_direct_blocked) { - DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp); - connp->conn_direct_blocked = B_FALSE; - } - - connp->conn_did_putbq = 0; } /* @@ -28369,21 +13473,7 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { idl = &tx_list->txl_drain_list[i]; mutex_enter(&idl->idl_lock); - if (idl->idl_conn == NULL) { - mutex_exit(&idl->idl_lock); - continue; - } - /* - * If this list is not being drained currently by - * an ip_wsrv thread, start the process. - */ - if (idl->idl_conn_draining == NULL) { - ASSERT(idl->idl_repeat == 0); - qenable(idl->idl_conn->conn_wq); - idl->idl_conn_draining = idl->idl_conn; - } else { - idl->idl_repeat = 1; - } + conn_drain_tail(idl->idl_conn, B_FALSE); mutex_exit(&idl->idl_lock); } } @@ -28393,240 +13483,190 @@ conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) * "matches" the conn. 
*/ boolean_t -conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, - zoneid_t zoneid) +conn_wantpacket(conn_t *connp, ip_recv_attr_t *ira, ipha_t *ipha) { - ill_t *bound_ill; - boolean_t found; - ipif_t *ipif; - ire_t *ire; - ipaddr_t dst, src; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + ill_t *ill = ira->ira_rill; + zoneid_t zoneid = ira->ira_zoneid; + uint_t in_ifindex; + ipaddr_t dst, src; dst = ipha->ipha_dst; src = ipha->ipha_src; /* - * conn_incoming_ill is set by IP_BOUND_IF which limits + * conn_incoming_ifindex is set by IP_BOUND_IF which limits * unicast, broadcast and multicast reception to - * conn_incoming_ill. conn_wantpacket itself is called - * only for BROADCAST and multicast. + * conn_incoming_ifindex. + * conn_wantpacket is called for unicast, broadcast and + * multicast packets. */ - bound_ill = connp->conn_incoming_ill; - if (bound_ill != NULL) { - if (IS_IPMP(bound_ill)) { - if (bound_ill->ill_grp != ill->ill_grp) - return (B_FALSE); - } else { - if (bound_ill != ill) - return (B_FALSE); - } - } + in_ifindex = connp->conn_incoming_ifindex; - if (!CLASSD(dst)) { - if (IPCL_ZONE_MATCH(connp, zoneid)) - return (B_TRUE); - /* - * The conn is in a different zone; we need to check that this - * broadcast address is configured in the application's zone. 
- */ - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) + /* mpathd can bind to the under IPMP interface, which we allow */ + if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) { + if (!IS_UNDER_IPMP(ill)) return (B_FALSE); - ire = ire_ctable_lookup(dst, 0, IRE_BROADCAST, ipif, - connp->conn_zoneid, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); - ipif_refrele(ipif); - if (ire != NULL) { - ire_refrele(ire); - return (B_TRUE); - } else { + + if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill)) return (B_FALSE); - } } - if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && - connp->conn_zoneid == zoneid) { - /* - * Loopback case: the sending endpoint has IP_MULTICAST_LOOP - * disabled, therefore we don't dispatch the multicast packet to - * the sending zone. - */ + if (!IPCL_ZONE_MATCH(connp, zoneid)) return (B_FALSE); - } - if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid) { - /* - * Multicast packet on the loopback interface: we only match - * conns who joined the group in the specified zone. 
- */ - return (B_FALSE); - } + if (!(ira->ira_flags & IRAF_MULTICAST)) + return (B_TRUE); if (connp->conn_multi_router) { /* multicast packet and multicast router socket: send up */ return (B_TRUE); } - mutex_enter(&connp->conn_lock); - found = (ilg_lookup_ill_withsrc(connp, dst, src, ill) != NULL); - mutex_exit(&connp->conn_lock); - return (found); + if (ipha->ipha_protocol == IPPROTO_PIM || + ipha->ipha_protocol == IPPROTO_RSVP) + return (B_TRUE); + + return (conn_hasmembers_ill_withsrc_v4(connp, dst, src, ira->ira_ill)); } -static void -conn_setqfull(conn_t *connp) +void +conn_setqfull(conn_t *connp, boolean_t *flow_stopped) { - queue_t *q = connp->conn_wq; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_TRUE); + if (flow_stopped != NULL) + *flow_stopped = B_TRUE; + } else { + queue_t *q = connp->conn_wq; - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); + ASSERT(q != NULL); if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - mutex_exit(QLOCK(q)); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + /* set flow_stopped to true under QLOCK */ + if (flow_stopped != NULL) + *flow_stopped = B_TRUE; + mutex_exit(QLOCK(q)); + } else { + /* flow_stopped is left unchanged */ + mutex_exit(QLOCK(q)); + } } } } -static void -conn_clrqfull(conn_t *connp) +void +conn_clrqfull(conn_t *connp, boolean_t *flow_stopped) { - queue_t *q = connp->conn_wq; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_txq_full) + (connp->conn_upper_handle, B_FALSE); + if (flow_stopped != NULL) + *flow_stopped = B_FALSE; + } else { + queue_t *q = connp->conn_wq; - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); + ASSERT(q != NULL); if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); + mutex_enter(QLOCK(q)); 
+ if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + /* set flow_stopped to false under QLOCK */ + if (flow_stopped != NULL) + *flow_stopped = B_FALSE; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + /* flow_stopped is left unchanged */ + mutex_exit(QLOCK(q)); + } } } + connp->conn_direct_blocked = B_FALSE; } /* - * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. + * Return the length in bytes of the IPv4 headers (base header, label, and + * other IP options) that will be needed based on the + * ip_pkt_t structure passed by the caller. + * + * The returned length does not include the length of the upper level + * protocol (ULP) header. + * The caller needs to check that the length doesn't exceed the max for IPv4. */ -/* ARGSUSED */ -static void -ip_arp_done(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy_arg) +int +ip_total_hdrs_len_v4(const ip_pkt_t *ipp) { - ill_t *ill = (ill_t *)q->q_ptr; - mblk_t *mp1, *mp2; - ipif_t *ipif; - int err = 0; - conn_t *connp = NULL; - ipsq_t *ipsq; - arc_t *arc; - - ip1dbg(("ip_arp_done(%s)\n", ill->ill_name)); - - ASSERT((mp->b_wptr - mp->b_rptr) >= sizeof (arc_t)); - ASSERT(((arc_t *)mp->b_rptr)->arc_cmd == AR_DLPIOP_DONE); - - ASSERT(IAM_WRITER_ILL(ill)); - mp2 = mp->b_cont; - mp->b_cont = NULL; + int len; - /* - * We have now received the arp bringup completion message - * from ARP. Mark the arp bringup as done. Also if the arp - * stream has already started closing, send up the AR_ARP_CLOSING - * ack now since ARP is waiting in close for this ack. 
- */ - mutex_enter(&ill->ill_lock); - ill->ill_arp_bringup_pending = 0; - if (ill->ill_arp_closing) { - mutex_exit(&ill->ill_lock); - /* Let's reuse the mp for sending the ack */ - arc = (arc_t *)mp->b_rptr; - mp->b_wptr = mp->b_rptr + sizeof (arc_t); - arc->arc_cmd = AR_ARP_CLOSING; - qreply(q, mp); - } else { - mutex_exit(&ill->ill_lock); - freeb(mp); + len = IP_SIMPLE_HDR_LENGTH; + if (ipp->ipp_fields & IPPF_LABEL_V4) { + ASSERT(ipp->ipp_label_len_v4 != 0); + /* We need to round up here */ + len += (ipp->ipp_label_len_v4 + 3) & ~3; } - ipsq = ill->ill_phyint->phyint_ipsq; - ipif = ipsq->ipsq_xop->ipx_pending_ipif; - mp1 = ipsq_pending_mp_get(ipsq, &connp); - ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); - if (mp1 == NULL) { - /* bringup was aborted by the user */ - freemsg(mp2); - return; + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + ASSERT(ipp->ipp_ipv4_options_len != 0); + ASSERT((ipp->ipp_ipv4_options_len & 3) == 0); + len += ipp->ipp_ipv4_options_len; } + return (len); +} - /* - * If an IOCTL is waiting on this (ipx_current_ioctl != 0), then we - * must have an associated conn_t. Otherwise, we're bringing this - * interface back up as part of handling an asynchronous event (e.g., - * physical address change). - */ - if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { - ASSERT(connp != NULL); - q = CONNP_TO_WQ(connp); - } else { - ASSERT(connp == NULL); - q = ill->ill_rq; - } +/* + * All-purpose routine to build an IPv4 header with options based + * on the abstract ip_pkt_t. + * + * The caller has to set the source and destination address as well as + * ipha_length. The caller has to massage any source route and compensate + * for the ULP pseudo-header checksum due to the source route. + */ +void +ip_build_hdrs_v4(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp, + uint8_t protocol) +{ + ipha_t *ipha = (ipha_t *)buf; + uint8_t *cp; - /* - * If the DL_BIND_REQ fails, it is noted - * in arc_name_offset. 
- */ - err = *((int *)mp2->b_rptr); - if (err == 0) { - if (ipif->ipif_isv6) { - if ((err = ipif_up_done_v6(ipif)) != 0) - ip0dbg(("ip_arp_done: init failed\n")); - } else { - if ((err = ipif_up_done(ipif)) != 0) - ip0dbg(("ip_arp_done: init failed\n")); - } - } else { - ip0dbg(("ip_arp_done: DL_BIND_REQ failed\n")); - } + /* Initialize IPv4 header */ + ipha->ipha_type_of_service = ipp->ipp_type_of_service; + ipha->ipha_length = 0; /* Caller will set later */ + ipha->ipha_ident = 0; + ipha->ipha_fragment_offset_and_flags = 0; + ipha->ipha_ttl = ipp->ipp_unicast_hops; + ipha->ipha_protocol = protocol; + ipha->ipha_hdr_checksum = 0; - freemsg(mp2); + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + ipha->ipha_src = ipp->ipp_addr_v4; - if ((err == 0) && (ill->ill_up_ipifs)) { - err = ill_up_ipifs(ill, q, mp1); - if (err == EINPROGRESS) - return; + cp = (uint8_t *)&ipha[1]; + if (ipp->ipp_fields & IPPF_LABEL_V4) { + ASSERT(ipp->ipp_label_len_v4 != 0); + bcopy(ipp->ipp_label_v4, cp, ipp->ipp_label_len_v4); + cp += ipp->ipp_label_len_v4; + /* We need to round up here */ + while ((uintptr_t)cp & 0x3) { + *cp++ = IPOPT_NOP; + } } - /* - * If we have a moved ipif to bring up, and everything has succeeded - * to this point, bring it up on the IPMP ill. Otherwise, leave it - * down -- the admin can try to bring it up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - if (err == 0) { - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) - return; - } + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) { + ASSERT(ipp->ipp_ipv4_options_len != 0); + ASSERT((ipp->ipp_ipv4_options_len & 3) == 0); + bcopy(ipp->ipp_ipv4_options, cp, ipp->ipp_ipv4_options_len); + cp += ipp->ipp_ipv4_options_len; } + ipha->ipha_version_and_hdr_length = + (uint8_t)((IP_VERSION << 4) + buf_len / 4); - /* - * The operation must complete without EINPROGRESS since - * ipsq_pending_mp_get() has removed the mblk. 
Otherwise, the - * operation will be stuck forever in the ipsq. - */ - ASSERT(err != EINPROGRESS); - if (ipsq->ipsq_xop->ipx_current_ioctl != 0) - ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); - else - ipsq_current_finish(ipsq); + ASSERT((int)(cp - buf) == buf_len); } /* Allocate the private structure */ @@ -28659,47 +13699,43 @@ ip_priv_free(void *buf) * which holds the state information for this packet and invokes the * the classifier (via ipp_packet_process). The classification, depending on * configured filters, results in a list of actions for this packet. Invoking - * an action may cause the packet to be dropped, in which case the resulting - * mblk (*mpp) is NULL. proc indicates the callout position for - * this packet and ill_index is the interface this packet on or will leave + * an action may cause the packet to be dropped, in which case we return NULL. + * proc indicates the callout position for + * this packet and ill is the interface this packet arrived on or will leave * on (inbound and outbound resp.). + * + * We do the processing on the rill (mapped to the upper if ipmp), but MIB + * on the ill corrsponding to the destination IP address. 
*/ -void -ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) +mblk_t * +ip_process(ip_proc_t proc, mblk_t *mp, ill_t *rill, ill_t *ill) { - mblk_t *mp; ip_priv_t *priv; ipp_action_id_t aid; int rc = 0; ipp_packet_t *pp; -#define IP_CLASS "ip" /* If the classifier is not loaded, return */ if ((aid = ipp_action_lookup(IPGPC_CLASSIFY)) == IPP_ACTION_INVAL) { - return; + return (mp); } - mp = *mpp; ASSERT(mp != NULL); /* Allocate the packet structure */ - rc = ipp_packet_alloc(&pp, IP_CLASS, aid); - if (rc != 0) { - *mpp = NULL; - freemsg(mp); - return; - } + rc = ipp_packet_alloc(&pp, "ip", aid); + if (rc != 0) + goto drop; /* Allocate the private structure */ rc = ip_priv_alloc((void **)&priv); if (rc != 0) { - *mpp = NULL; - freemsg(mp); ipp_packet_free(pp); - return; + goto drop; } priv->proc = proc; - priv->ill_index = ill_index; + priv->ill_index = ill_get_upper_ifindex(rill); + ipp_packet_set_private(pp, priv, ip_priv_free); ipp_packet_set_data(pp, mp); @@ -28708,14 +13744,23 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) if (pp != NULL) { mp = ipp_packet_get_data(pp); ipp_packet_free(pp); - if (rc != 0) { - freemsg(mp); - *mpp = NULL; - } + if (rc != 0) + goto drop; + return (mp); } else { - *mpp = NULL; + /* No mp to trace in ip_drop_input/ip_drop_output */ + mp = NULL; } -#undef IP_CLASS +drop: + if (proc == IPP_LOCAL_IN || proc == IPP_FWD_IN) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_process", mp, ill); + } else { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ip_process", mp, ill); + } + freemsg(mp); + return (NULL); } /* @@ -28723,102 +13768,92 @@ ip_process(ip_proc_t proc, mblk_t **mpp, uint32_t ill_index) * all the interfaces crossed by the related multirt routes. * The call is considered successful if the operation succeeds * on at least one interface. 
+ * + * This assumes that a set of IRE_HOST/RTF_MULTIRT has been created for the + * multicast addresses with the ire argument being the first one. + * We walk the bucket to find all the of those. + * + * Common to IPv4 and IPv6. */ static int -ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, ipaddr_t, ipaddr_t, - uint_t *, mcast_record_t, ipaddr_t, mblk_t *), ire_t *ire, conn_t *connp, - boolean_t checkonly, ipaddr_t group, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) +ip_multirt_apply_membership(int (*fn)(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *), + ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6group, + mcast_record_t fmode, const in6_addr_t *v6src) { ire_t *ire_gw; irb_t *irb; + int ifindex; int error = 0; - opt_restart_t *or; + int result; ip_stack_t *ipst = ire->ire_ipst; + ipaddr_t group; + boolean_t isv6; + int match_flags; + + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + IN6_V4MAPPED_TO_IPADDR(v6group, group); + isv6 = B_FALSE; + } else { + isv6 = B_TRUE; + } irb = ire->ire_bucket; ASSERT(irb != NULL); - ASSERT(DB_TYPE(first_mp) == M_CTL); - - or = (opt_restart_t *)first_mp->b_rptr; - IRB_REFHOLD(irb); + result = 0; + irb_refhold(irb); for (; ire != NULL; ire = ire->ire_next) { if ((ire->ire_flags & RTF_MULTIRT) == 0) continue; - if (ire->ire_addr != group) - continue; - ire_gw = ire_ftable_lookup(ire->ire_gateway_addr, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); - /* No resolver exists for the gateway; skip this ire. 
*/ + /* We handle -ifp routes by matching on the ill if set */ + match_flags = MATCH_IRE_TYPE; + if (ire->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; + + if (isv6) { + if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6group)) + continue; + + ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, + 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } else { + if (ire->ire_addr != group) + continue; + + ire_gw = ire_ftable_lookup_v4(ire->ire_gateway_addr, + 0, 0, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } + /* No interface route exists for the gateway; skip this ire. */ if (ire_gw == NULL) continue; + if (ire_gw->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire_gw); + continue; + } + ASSERT(ire_gw->ire_ill != NULL); /* IRE_INTERFACE */ + ifindex = ire_gw->ire_ill->ill_phyint->phyint_ifindex; /* - * This function can return EINPROGRESS. If so the operation - * will be restarted from ip_restart_optmgmt which will - * call ip_opt_set and option processing will restart for - * this option. So we may end up calling 'fn' more than once. - * This requires that 'fn' is idempotent except for the - * return value. The operation is considered a success if + * The operation is considered a success if * it succeeds at least once on any one interface. */ - error = fn(connp, checkonly, group, ire_gw->ire_src_addr, - NULL, fmode, src, first_mp); + error = fn(connp, checkonly, v6group, INADDR_ANY, ifindex, + fmode, v6src); if (error == 0) - or->or_private = CGTP_MCAST_SUCCESS; - - if (ip_debug > 0) { - ulong_t off; - char *ksym; - ksym = kobj_getsymname((uintptr_t)fn, &off); - ip2dbg(("ip_multirt_apply_membership: " - "called %s, multirt group 0x%08x via itf 0x%08x, " - "error %d [success %u]\n", - ksym ? 
ksym : "?", - ntohl(group), ntohl(ire_gw->ire_src_addr), - error, or->or_private)); - } + result = CGTP_MCAST_SUCCESS; ire_refrele(ire_gw); - if (error == EINPROGRESS) { - IRB_REFRELE(irb); - return (error); - } } - IRB_REFRELE(irb); + irb_refrele(irb); /* * Consider the call as successful if we succeeded on at least * one interface. Otherwise, return the last encountered error. */ - return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); -} - -/* - * Issue a warning regarding a route crossing an interface with an - * incorrect MTU. Only one message every 'ip_multirt_log_interval' - * amount of time is logged. - */ -static void -ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) -{ - hrtime_t current = gethrtime(); - char buf[INET_ADDRSTRLEN]; - ip_stack_t *ipst = ire->ire_ipst; - - /* Convert interval in ms to hrtime in ns */ - if (ipst->ips_multirt_bad_mtu_last_time + - ((hrtime_t)ipst->ips_ip_multirt_log_interval * (hrtime_t)1000000) <= - current) { - cmn_err(CE_WARN, "ip: ignoring multiroute " - "to %s, incorrect MTU %u (expected %u)\n", - ip_dot_addr(ire->ire_addr, buf), - ire->ire_max_frag, max_frag); - - ipst->ips_multirt_bad_mtu_last_time = current; - } + return (result == CGTP_MCAST_SUCCESS ? 
0 : error); } /* @@ -28882,6 +13917,7 @@ ip_cgtp_filter_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, *ip_cgtp_filter_value = (boolean_t)new_value; + ill_set_inputfn_all(ipst); return (0); } @@ -28919,6 +13955,9 @@ ip_cgtp_filter_register(netstackid_t stackid, cgtp_filter_ops_t *ops) } ipst->ips_ip_cgtp_filter_ops = ops; + + ill_set_inputfn_all(ipst); + netstack_rele(ns); return (0); } @@ -28950,6 +13989,9 @@ ip_cgtp_filter_unregister(netstackid_t stackid) return (ENXIO); } ipst->ips_ip_cgtp_filter_ops = NULL; + + ill_set_inputfn_all(ipst); + netstack_rele(ns); return (0); } @@ -28984,7 +14026,7 @@ ip_cgtp_filter_is_registered(netstackid_t stackid) static int ip_squeue_switch(int val) { - int rval = SQ_FILL; + int rval; switch (val) { case IP_SQUEUE_ENTER_NODRAIN: @@ -28993,7 +14035,9 @@ ip_squeue_switch(int val) case IP_SQUEUE_ENTER: rval = SQ_PROCESS; break; + case IP_SQUEUE_FILL: default: + rval = SQ_FILL; break; } return (rval); @@ -29046,52 +14090,45 @@ ip_kstat2_init(netstackid_t stackid, ip_stat_t *ip_statisticsp) kstat_t *ksp; ip_stat_t template = { - { "ipsec_fanout_proto", KSTAT_DATA_UINT64 }, { "ip_udp_fannorm", KSTAT_DATA_UINT64 }, { "ip_udp_fanmb", KSTAT_DATA_UINT64 }, - { "ip_udp_fanothers", KSTAT_DATA_UINT64 }, - { "ip_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_udp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_udp_input_err", KSTAT_DATA_UINT64 }, - { "ip_tcppullup", KSTAT_DATA_UINT64 }, - { "ip_tcpoptions", KSTAT_DATA_UINT64 }, - { "ip_multipkttcp", KSTAT_DATA_UINT64 }, - { "ip_tcp_fast_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_slow_path", KSTAT_DATA_UINT64 }, - { "ip_tcp_input_error", KSTAT_DATA_UINT64 }, + { "ip_recv_pullup", KSTAT_DATA_UINT64 }, { "ip_db_ref", KSTAT_DATA_UINT64 }, - { "ip_notaligned1", KSTAT_DATA_UINT64 }, - { "ip_notaligned2", KSTAT_DATA_UINT64 }, - { "ip_multimblk3", KSTAT_DATA_UINT64 }, - { "ip_multimblk4", KSTAT_DATA_UINT64 }, - { "ip_ipoptions", KSTAT_DATA_UINT64 }, - { "ip_classify_fail", KSTAT_DATA_UINT64 }, + { 
"ip_notaligned", KSTAT_DATA_UINT64 }, + { "ip_multimblk", KSTAT_DATA_UINT64 }, { "ip_opt", KSTAT_DATA_UINT64 }, - { "ip_udp_rput_local", KSTAT_DATA_UINT64 }, { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, { "ip_conn_flputbq", KSTAT_DATA_UINT64 }, { "ip_conn_walk_drain", KSTAT_DATA_UINT64 }, { "ip_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip_in_sw_cksum", KSTAT_DATA_UINT64 }, - { "ip_trash_ire_reclaim_calls", KSTAT_DATA_UINT64 }, - { "ip_trash_ire_reclaim_success", KSTAT_DATA_UINT64 }, - { "ip_ire_arp_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_ire_redirect_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_ire_pmtu_timer_expired", KSTAT_DATA_UINT64 }, - { "ip_input_multi_squeue", KSTAT_DATA_UINT64 }, + { "ip_ire_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_ire_reclaim_deleted", KSTAT_DATA_UINT64 }, + { "ip_nce_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_nce_reclaim_deleted", KSTAT_DATA_UINT64 }, + { "ip_dce_reclaim_calls", KSTAT_DATA_UINT64 }, + { "ip_dce_reclaim_deleted", KSTAT_DATA_UINT64 }, { "ip_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_discarded", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "ip_frag_mdt_allocd", KSTAT_DATA_UINT64 }, + { "ip_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, + { "conn_in_recvdstaddr", KSTAT_DATA_UINT64 }, + { "conn_in_recvopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvif", KSTAT_DATA_UINT64 }, + { "conn_in_recvslla", KSTAT_DATA_UINT64 }, + { "conn_in_recvucred", KSTAT_DATA_UINT64 }, 
+ { "conn_in_recvttl", KSTAT_DATA_UINT64 }, + { "conn_in_recvhopopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvhoplimit", KSTAT_DATA_UINT64 }, + { "conn_in_recvdstopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvrthdrdstopts", KSTAT_DATA_UINT64 }, + { "conn_in_recvrthdr", KSTAT_DATA_UINT64 }, + { "conn_in_recvpktinfo", KSTAT_DATA_UINT64 }, + { "conn_in_recvtclass", KSTAT_DATA_UINT64 }, + { "conn_in_timestamp", KSTAT_DATA_UINT64 }, }; ksp = kstat_create_netstack("ip", 0, "ipstat", "net", @@ -29420,323 +14457,457 @@ icmp_kstat_update(kstat_t *kp, int rw) * a port. This is assured in ipcl_sctp_hash_insert(); */ void -ip_fanout_sctp_raw(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, boolean_t isv4, - uint32_t ports, boolean_t mctl_present, uint_t flags, boolean_t ip_policy, - zoneid_t zoneid) +ip_fanout_sctp_raw(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports, + ip_recv_attr_t *ira) { conn_t *connp; queue_t *rq; - mblk_t *first_mp; boolean_t secure; - ip6_t *ip6h; - ip_stack_t *ipst = recv_ill->ill_ipst; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; - boolean_t sctp_csum_err = B_FALSE; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; - if (flags & IP_FF_SCTP_CSUM_ERR) { - sctp_csum_err = B_TRUE; - flags &= ~IP_FF_SCTP_CSUM_ERR; - } + secure = iraflags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - ip6h = (isv4) ? NULL : (ip6_t *)ipha; - - connp = ipcl_classify_raw(mp, IPPROTO_SCTP, zoneid, ports, ipha, ipst); + connp = ipcl_classify_raw(mp, IPPROTO_SCTP, ports, ipha, ip6h, + ira, ipst); if (connp == NULL) { /* * Although raw sctp is not summed, OOB chunks must be. * Drop the packet here if the sctp checksum failed. 
*/ - if (sctp_csum_err) { + if (iraflags & IRAF_SCTP_CSUM_ERR) { BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); - freemsg(first_mp); + freemsg(mp); return; } - sctp_ootb_input(first_mp, recv_ill, zoneid, mctl_present); + ira->ira_ill = ira->ira_rill = NULL; + sctp_ootb_input(mp, ira, ipst); + ira->ira_ill = ill; + ira->ira_rill = rill; return; } rq = connp->conn_rq; - if (!canputnext(rq)) { + if (IPCL_IS_NONSTR(connp) ? connp->conn_flow_cntrld : !canputnext(rq)) { CONN_DEC_REF(connp); - BUMP_MIB(recv_ill->ill_ip_mib, rawipIfStatsInOverflows); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); + freemsg(mp); return; } - if ((isv4 ? CONN_INBOUND_POLICY_PRESENT(connp, ipss) : - CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - (isv4 ? ipha : NULL), ip6h, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); CONN_DEC_REF(connp); return; } } - /* - * We probably should not send M_CTL message up to - * raw socket. - */ - if (mctl_present) - freeb(first_mp); - /* Initiate IPPF processing here if needed. */ - if ((isv4 && IPP_ENABLED(IPP_LOCAL_IN, ipst) && ip_policy) || - (!isv4 && IP6_IN_IPP(flags, ipst))) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - return; - } + if (iraflags & IRAF_ICMP_ERROR) { + (connp->conn_recvicmp)(connp, mp, NULL, ira); + } else { + ill_t *rill = ira->ira_rill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* This is the SOCK_RAW, IPPROTO_SCTP case. 
*/ + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } + CONN_DEC_REF(connp); +} - if (connp->conn_recvif || connp->conn_recvslla || - ((connp->conn_ip_recvpktinfo || - (!isv4 && IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) && - (flags & IP_FF_IPINFO))) { - int in_flags = 0; +/* + * Free a packet that has the link-layer dl_unitdata_req_t or fast-path + * header before the ip payload. + */ +static void +ip_xmit_flowctl_drop(ill_t *ill, mblk_t *mp, boolean_t is_fp_mp, int fp_mp_len) +{ + int len = (mp->b_wptr - mp->b_rptr); + mblk_t *ip_mp; - /* - * Since sctp does not support IP_RECVPKTINFO for v4, only pass - * IPF_RECVIF. - */ - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (is_fp_mp || len != fp_mp_len) { + if (len > fp_mp_len) { + /* + * fastpath header and ip header in the first mblk + */ + mp->b_rptr += fp_mp_len; } else { - mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } + /* + * ip_xmit_attach_llhdr had to prepend an mblk to + * attach the fastpath header before ip header. + */ + ip_mp = mp->b_cont; + freeb(mp); + mp = ip_mp; + mp->b_rptr += (fp_mp_len - len); } + } else { + ip_mp = mp->b_cont; + freeb(mp); + mp = ip_mp; } - - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* - * We are sending the IPSEC_IN message also up. Refer - * to comments above this function. - * This is the SOCK_RAW, IPPROTO_SCTP case. 
- */ - (connp->conn_recv)(connp, mp, NULL); - CONN_DEC_REF(connp); + ip_drop_output("ipIfStatsOutDiscards - flow ctl", mp, ill); + freemsg(mp); } -#define UPDATE_IP_MIB_OB_COUNTERS(ill, len) \ -{ \ - BUMP_MIB((ill)->ill_ip_mib, ipIfStatsHCOutTransmits); \ - UPDATE_MIB((ill)->ill_ip_mib, ipIfStatsHCOutOctets, (len)); \ -} /* - * This function should be called only if all packet processing - * including fragmentation is complete. Callers of this function - * must set mp->b_prev to one of these values: - * {0, IPP_FWD_OUT, IPP_LOCAL_OUT} - * prior to handing over the mp as first argument to this function. + * Normal post fragmentation function. + * + * Send a packet using the passed in nce. This handles both IPv4 and IPv6 + * using the same state machine. * - * If the ire passed by caller is incomplete, this function + * We return an error on failure. In particular we return EWOULDBLOCK + * when the driver flow controls. In that case this ensures that ip_wsrv runs + * (currently by canputnext failure resulting in backenabling from GLD.) + * This allows the callers of conn_ip_output() to use EWOULDBLOCK as an + * indication that they can flow control until ip_wsrv() tells then to restart. + * + * If the nce passed by caller is incomplete, this function * queues the packet and if necessary, sends ARP request and bails. - * If the ire passed is fully resolved, we simply prepend + * If the Neighbor Cache passed is fully resolved, we simply prepend * the link-layer header to the packet, do ipsec hw acceleration * work if necessary, and send the packet out on the wire. - * - * NOTE: IPsec will only call this function with fully resolved - * ires if hw acceleration is involved. - * TODO list : - * a Handle M_MULTIDATA so that - * tcp_multisend->tcp_multisend_data can - * call ip_xmit_v4 directly - * b Handle post-ARP work for fragments so that - * ip_wput_frag can call this function. 
*/ -ipxmit_state_t -ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, - boolean_t flow_ctl_enabled, conn_t *connp) +/* ARGSUSED6 */ +int +ip_xmit(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, + uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, uintptr_t *ixacookie) { - nce_t *arpce; - ipha_t *ipha; - queue_t *q; - int ill_index; - mblk_t *nxt_mp, *first_mp; - boolean_t xmit_drop = B_FALSE; - ip_proc_t proc; - ill_t *out_ill; - int pkt_len; + queue_t *wq; + ill_t *ill = nce->nce_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint64_t delta; + boolean_t isv6 = ill->ill_isv6; + boolean_t fp_mp; + ncec_t *ncec = nce->nce_common; - arpce = ire->ire_nce; - ASSERT(arpce != NULL); + DTRACE_PROBE1(ip__xmit, nce_t *, nce); - DTRACE_PROBE2(ip__xmit__v4, ire_t *, ire, nce_t *, arpce); + ASSERT(mp != NULL); + ASSERT(mp->b_datap->db_type == M_DATA); + ASSERT(pkt_len == msgdsize(mp)); - mutex_enter(&arpce->nce_lock); - switch (arpce->nce_state) { - case ND_REACHABLE: - /* If there are other queued packets, queue this packet */ - if (arpce->nce_qd_mp != NULL) { - if (mp != NULL) - nce_queue_mp_common(arpce, mp, B_FALSE); - mp = arpce->nce_qd_mp; + /* + * If we have already been here and are coming back after ARP/ND. + * the IXAF_NO_TRACE flag is set. We skip FW_HOOKS, DTRACE and ipobs + * in that case since they have seen the packet when it came here + * the first time. 
+ */ + if (ixaflags & IXAF_NO_TRACE) + goto sendit; + + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(!isv6); + ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); + if (HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) && + !(ixaflags & IXAF_NO_PFHOOK)) { + int error; + + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__physical__out__end, + mblk_t *, mp); + if (mp == NULL) + return (error); + + /* The length could have changed */ + pkt_len = msgdsize(mp); + } + if (ipst->ips_ip4_observe.he_interested) { + /* + * Note that for TX the zoneid is the sending + * zone, whether or not MLP is in play. + * Since the szone argument is the IP zoneid (i.e., + * zero for exclusive-IP zones) and ipobs wants + * the system zoneid, we map it here. + */ + szone = IP_REAL_ZONEID(szone, ipst); + + /* + * On the outbound path the destination zone will be + * unknown as we're sending this packet out on the + * wire. 
+ */ + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, + ill, ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(isv6); + ASSERT(pkt_len == + ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN); + if (HOOKS6_INTERESTED_PHYSICAL_OUT(ipst) && + !(ixaflags & IXAF_NO_PFHOOK)) { + int error; + + FW_HOOKS6(ipst->ips_ip6_physical_out_event, + ipst->ips_ipv6firewall_physical_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__physical__out__end, + mblk_t *, mp); + if (mp == NULL) + return (error); + + /* The length could have changed */ + pkt_len = msgdsize(mp); + } + if (ipst->ips_ip6_observe.he_interested) { + /* See above */ + szone = IP_REAL_ZONEID(szone, ipst); + + ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, + ill, ipst); } - arpce->nce_qd_mp = NULL; - mutex_exit(&arpce->nce_lock); + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, + ipha_t *, NULL, ip6_t *, ip6h, int, 0); + } +sendit: + /* + * We check the state without a lock because the state can never + * move "backwards" to initial or incomplete. + */ + switch (ncec->ncec_state) { + case ND_REACHABLE: + case ND_STALE: + case ND_DELAY: + case ND_PROBE: + mp = ip_xmit_attach_llhdr(mp, nce); + if (mp == NULL) { + /* + * ip_xmit_attach_llhdr has increased + * ipIfStatsOutDiscards and called ip_drop_output() + */ + return (ENOBUFS); + } /* - * Flush the queue. In the common case, where the - * ARP is already resolved, it will go through the - * while loop only once. + * check if nce_fastpath completed and we tagged on a + * copy of nce_fp_mp in ip_xmit_attach_llhdr(). 
*/ - while (mp != NULL) { + fp_mp = (mp->b_datap->db_type == M_DATA); - nxt_mp = mp->b_next; - mp->b_next = NULL; - ASSERT(mp->b_datap->db_type != M_CTL); - pkt_len = ntohs(((ipha_t *)mp->b_rptr)->ipha_length); + if (fp_mp && + (ill->ill_capabilities & ILL_CAPAB_DLD_DIRECT)) { + ill_dld_direct_t *idd; + + idd = &ill->ill_dld_capab->idc_direct; /* - * This info is needed for IPQOS to do COS marking - * in ip_wput_attach_llhdr->ip_process. + * Send the packet directly to DLD, where it + * may be queued depending on the availability + * of transmit resources at the media layer. + * Return value should be taken into + * account and flow control the TCP. */ - proc = (ip_proc_t)(uintptr_t)mp->b_prev; - mp->b_prev = NULL; - - /* set up ill index for outbound qos processing */ - out_ill = ire_to_ill(ire); - ill_index = out_ill->ill_phyint->phyint_ifindex; - first_mp = ip_wput_attach_llhdr(mp, ire, proc, - ill_index, &ipha); - if (first_mp == NULL) { - xmit_drop = B_TRUE; - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - goto next_mp; - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, + pkt_len); - /* non-ipsec hw accel case */ - if (io == NULL || !io->ipsec_out_accelerated) { - /* send it */ - q = ire->ire_stq; - if (proc == IPP_FWD_OUT) { - UPDATE_IB_PKT_COUNT(ire); - } else { - UPDATE_OB_PKT_COUNT(ire); - } - ire->ire_last_used_time = lbolt; + if (ixaflags & IXAF_NO_DEV_FLOW_CTL) { + (void) idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)xmit_hint, IP_DROP_ON_NO_DESC); + } else { + uintptr_t cookie; - if (flow_ctl_enabled || canputnext(q)) { - if (proc == IPP_FWD_OUT) { + if ((cookie = idd->idd_tx_df(idd->idd_tx_dh, + mp, (uintptr_t)xmit_hint, 0)) != 0) { + if (ixacookie != NULL) + *ixacookie = cookie; + return (EWOULDBLOCK); + } + } + } else { + wq = ill->ill_wq; + + if (!(ixaflags & IXAF_NO_DEV_FLOW_CTL) && + !canputnext(wq)) { + if (ixacookie != NULL) + *ixacookie = 0; + ip_xmit_flowctl_drop(ill, mp, 
fp_mp, + nce->nce_fp_mp != NULL ? + MBLKL(nce->nce_fp_mp) : 0); + return (EWOULDBLOCK); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, + pkt_len); + putnext(wq, mp); + } - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsHCOutForwDatagrams); + /* + * The rest of this function implements Neighbor Unreachability + * detection. Determine if the ncec is eligible for NUD. + */ + if (ncec->ncec_flags & NCE_F_NONUD) + return (0); - } - UPDATE_IP_MIB_OB_COUNTERS(out_ill, - pkt_len); + ASSERT(ncec->ncec_state != ND_INCOMPLETE); - DTRACE_IP7(send, mblk_t *, first_mp, - conn_t *, NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, ipha, ip6_t *, NULL, int, - 0); + /* + * Check for upper layer advice + */ + if (ixaflags & IXAF_REACH_CONF) { + timeout_id_t tid; - ILL_SEND_TX(out_ill, - ire, connp, first_mp, 0, connp); - } else { - BUMP_MIB(out_ill->ill_ip_mib, - ipIfStatsOutDiscards); - xmit_drop = B_TRUE; - freemsg(first_mp); + /* + * It should be o.k. to check the state without + * a lock here, at most we lose an advice. 
+ */ + ncec->ncec_last = TICK_TO_MSEC(lbolt64); + if (ncec->ncec_state != ND_REACHABLE) { + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_REACHABLE; + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + (void) untimeout(tid); + if (ip_debug > 2) { + /* ip1dbg */ + pr_addr_dbg("ip_xmit: state" + " for %s changed to" + " REACHABLE\n", AF_INET6, + &ncec->ncec_addr); } - } else { + } + return (0); + } + + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + ip1dbg(("ip_xmit: delta = %" PRId64 + " ill_reachable_time = %d \n", delta, + ill->ill_reachable_time)); + if (delta > (uint64_t)ill->ill_reachable_time) { + mutex_enter(&ncec->ncec_lock); + switch (ncec->ncec_state) { + case ND_REACHABLE: + ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); + /* FALLTHROUGH */ + case ND_STALE: /* - * Safety Pup says: make sure this - * is going to the right interface! + * ND_REACHABLE is identical to + * ND_STALE in this specific case. If + * reachable time has expired for this + * neighbor (delta is greater than + * reachable time), conceptually, the + * neighbor cache is no longer in + * REACHABLE state, but already in + * STALE state. So the correct + * transition here is to ND_DELAY. 
*/ - ill_t *ill1 = - (ill_t *)ire->ire_stq->q_ptr; - int ifindex = - ill1->ill_phyint->phyint_ifindex; - if (ifindex != - io->ipsec_out_capab_ill_index) { - xmit_drop = B_TRUE; - freemsg(mp); - } else { - UPDATE_IP_MIB_OB_COUNTERS(ill1, - pkt_len); - - DTRACE_IP7(send, mblk_t *, first_mp, - conn_t *, NULL, void_ip_t *, ipha, - __dtrace_ipsr_ill_t *, ill1, - ipha_t *, ipha, ip6_t *, NULL, - int, 0); - - ipsec_hw_putnext(ire->ire_stq, mp); + ncec->ncec_state = ND_DELAY; + mutex_exit(&ncec->ncec_lock); + nce_restart_timer(ncec, + ipst->ips_delay_first_probe_time); + if (ip_debug > 3) { + /* ip2dbg */ + pr_addr_dbg("ip_xmit: state" + " for %s changed to" + " DELAY\n", AF_INET6, + &ncec->ncec_addr); } + break; + case ND_DELAY: + case ND_PROBE: + mutex_exit(&ncec->ncec_lock); + /* Timers have already started */ + break; + case ND_UNREACHABLE: + /* + * nce_timer has detected that this ncec + * is unreachable and initiated deleting + * this ncec. + * This is a harmless race where we found the + * ncec before it was deleted and have + * just sent out a packet using this + * unreachable ncec. + */ + mutex_exit(&ncec->ncec_lock); + break; + default: + ASSERT(0); + mutex_exit(&ncec->ncec_lock); } -next_mp: - mp = nxt_mp; - } /* while (mp != NULL) */ - if (xmit_drop) - return (SEND_FAILED); - else - return (SEND_PASSED); + } + return (0); - case ND_INITIAL: case ND_INCOMPLETE: - /* - * While we do send off packets to dests that - * use fully-resolved CGTP routes, we do not - * handle unresolved CGTP routes. + * the state could have changed since we didn't hold the lock. + * Re-verify state under lock. 
*/ - ASSERT(!(ire->ire_flags & RTF_MULTIRT)); - ASSERT(io == NULL || !io->ipsec_out_accelerated); - - if (mp != NULL) { - /* queue the packet */ - nce_queue_mp_common(arpce, mp, B_FALSE); + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) { + mutex_exit(&ncec->ncec_lock); + goto sendit; } + /* queue the packet */ + nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill)); + mutex_exit(&ncec->ncec_lock); + DTRACE_PROBE2(ip__xmit__incomplete, + (ncec_t *), ncec, (mblk_t *), mp); + return (0); - if (arpce->nce_state == ND_INCOMPLETE) { - mutex_exit(&arpce->nce_lock); - DTRACE_PROBE3(ip__xmit__incomplete, - (ire_t *), ire, (mblk_t *), mp, - (ipsec_out_t *), io); - return (LOOKUP_IN_PROGRESS); + case ND_INITIAL: + /* + * State could have changed since we didn't hold the lock, so + * re-verify state. + */ + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) { + mutex_exit(&ncec->ncec_lock); + goto sendit; + } + nce_queue_mp(ncec, mp, ipmp_packet_is_probe(mp, nce->nce_ill)); + if (ncec->ncec_state == ND_INITIAL) { + ncec->ncec_state = ND_INCOMPLETE; + mutex_exit(&ncec->ncec_lock); + /* + * figure out the source we want to use + * and resolve it. + */ + ip_ndp_resolve(ncec); + } else { + mutex_exit(&ncec->ncec_lock); } + return (0); - arpce->nce_state = ND_INCOMPLETE; - mutex_exit(&arpce->nce_lock); + case ND_UNREACHABLE: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - ND_UNREACHABLE", + mp, ill); + freemsg(mp); + return (0); - /* - * Note that ire_add() (called from ire_forward()) - * holds a ref on the ire until ARP is completed. 
- */ - ire_arpresolve(ire); - return (LOOKUP_IN_PROGRESS); default: ASSERT(0); - mutex_exit(&arpce->nce_lock); - return (LLHDR_RESLV_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - ND_other", + mp, ill); + freemsg(mp); + return (ENETUNREACH); } } -#undef UPDATE_IP_MIB_OB_COUNTERS - /* * Return B_TRUE if the buffers differ in length or content. * This is used for comparing extension header buffers. @@ -29803,52 +14974,300 @@ ip_savebuf(void **dstp, uint_t *dstlenp, boolean_t src_valid, } /* - * Free the storage pointed to by the members of an ip6_pkt_t. + * Free the storage pointed to by the members of an ip_pkt_t. */ void -ip6_pkt_free(ip6_pkt_t *ipp) +ip_pkt_free(ip_pkt_t *ipp) { - ASSERT(ipp->ipp_pathmtu == NULL && !(ipp->ipp_fields & IPPF_PATHMTU)); + uint_t fields = ipp->ipp_fields; - if (ipp->ipp_fields & IPPF_HOPOPTS) { + if (fields & IPPF_HOPOPTS) { kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); ipp->ipp_hopopts = NULL; ipp->ipp_hopoptslen = 0; } - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - kmem_free(ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; + if (fields & IPPF_RTHDRDSTOPTS) { + kmem_free(ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); + ipp->ipp_rthdrdstopts = NULL; + ipp->ipp_rthdrdstoptslen = 0; } - if (ipp->ipp_fields & IPPF_DSTOPTS) { + if (fields & IPPF_DSTOPTS) { kmem_free(ipp->ipp_dstopts, ipp->ipp_dstoptslen); ipp->ipp_dstopts = NULL; ipp->ipp_dstoptslen = 0; } - if (ipp->ipp_fields & IPPF_RTHDR) { + if (fields & IPPF_RTHDR) { kmem_free(ipp->ipp_rthdr, ipp->ipp_rthdrlen); ipp->ipp_rthdr = NULL; ipp->ipp_rthdrlen = 0; } - ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | - IPPF_RTHDR); + if (fields & IPPF_IPV4_OPTIONS) { + kmem_free(ipp->ipp_ipv4_options, ipp->ipp_ipv4_options_len); + ipp->ipp_ipv4_options = NULL; + ipp->ipp_ipv4_options_len = 0; + } + if (fields & IPPF_LABEL_V4) { + kmem_free(ipp->ipp_label_v4, 
ipp->ipp_label_len_v4); + ipp->ipp_label_v4 = NULL; + ipp->ipp_label_len_v4 = 0; + } + if (fields & IPPF_LABEL_V6) { + kmem_free(ipp->ipp_label_v6, ipp->ipp_label_len_v6); + ipp->ipp_label_v6 = NULL; + ipp->ipp_label_len_v6 = 0; + } + ipp->ipp_fields &= ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS | + IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6); +} + +/* + * Copy from src to dst and allocate as needed. + * Returns zero or ENOMEM. + * + * The caller must initialize dst to zero. + */ +int +ip_pkt_copy(ip_pkt_t *src, ip_pkt_t *dst, int kmflag) +{ + uint_t fields = src->ipp_fields; + + /* Start with fields that don't require memory allocation */ + dst->ipp_fields = fields & + ~(IPPF_HOPOPTS | IPPF_RTHDRDSTOPTS | IPPF_DSTOPTS | + IPPF_RTHDR | IPPF_IPV4_OPTIONS | IPPF_LABEL_V4 | IPPF_LABEL_V6); + + dst->ipp_addr = src->ipp_addr; + dst->ipp_unicast_hops = src->ipp_unicast_hops; + dst->ipp_hoplimit = src->ipp_hoplimit; + dst->ipp_tclass = src->ipp_tclass; + dst->ipp_type_of_service = src->ipp_type_of_service; + + if (fields & IPPF_HOPOPTS) { + dst->ipp_hopopts = kmem_alloc(src->ipp_hopoptslen, kmflag); + if (dst->ipp_hopopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_HOPOPTS; + bcopy(src->ipp_hopopts, dst->ipp_hopopts, + src->ipp_hopoptslen); + dst->ipp_hopoptslen = src->ipp_hopoptslen; + } + if (fields & IPPF_RTHDRDSTOPTS) { + dst->ipp_rthdrdstopts = kmem_alloc(src->ipp_rthdrdstoptslen, + kmflag); + if (dst->ipp_rthdrdstopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_RTHDRDSTOPTS; + bcopy(src->ipp_rthdrdstopts, dst->ipp_rthdrdstopts, + src->ipp_rthdrdstoptslen); + dst->ipp_rthdrdstoptslen = src->ipp_rthdrdstoptslen; + } + if (fields & IPPF_DSTOPTS) { + dst->ipp_dstopts = kmem_alloc(src->ipp_dstoptslen, kmflag); + if (dst->ipp_dstopts == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_DSTOPTS; + bcopy(src->ipp_dstopts, dst->ipp_dstopts, + 
src->ipp_dstoptslen); + dst->ipp_dstoptslen = src->ipp_dstoptslen; + } + if (fields & IPPF_RTHDR) { + dst->ipp_rthdr = kmem_alloc(src->ipp_rthdrlen, kmflag); + if (dst->ipp_rthdr == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_RTHDR; + bcopy(src->ipp_rthdr, dst->ipp_rthdr, + src->ipp_rthdrlen); + dst->ipp_rthdrlen = src->ipp_rthdrlen; + } + if (fields & IPPF_IPV4_OPTIONS) { + dst->ipp_ipv4_options = kmem_alloc(src->ipp_ipv4_options_len, + kmflag); + if (dst->ipp_ipv4_options == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_IPV4_OPTIONS; + bcopy(src->ipp_ipv4_options, dst->ipp_ipv4_options, + src->ipp_ipv4_options_len); + dst->ipp_ipv4_options_len = src->ipp_ipv4_options_len; + } + if (fields & IPPF_LABEL_V4) { + dst->ipp_label_v4 = kmem_alloc(src->ipp_label_len_v4, kmflag); + if (dst->ipp_label_v4 == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_LABEL_V4; + bcopy(src->ipp_label_v4, dst->ipp_label_v4, + src->ipp_label_len_v4); + dst->ipp_label_len_v4 = src->ipp_label_len_v4; + } + if (fields & IPPF_LABEL_V6) { + dst->ipp_label_v6 = kmem_alloc(src->ipp_label_len_v6, kmflag); + if (dst->ipp_label_v6 == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_LABEL_V6; + bcopy(src->ipp_label_v6, dst->ipp_label_v6, + src->ipp_label_len_v6); + dst->ipp_label_len_v6 = src->ipp_label_len_v6; + } + if (fields & IPPF_FRAGHDR) { + dst->ipp_fraghdr = kmem_alloc(src->ipp_fraghdrlen, kmflag); + if (dst->ipp_fraghdr == NULL) { + ip_pkt_free(dst); + return (ENOMEM); + } + dst->ipp_fields |= IPPF_FRAGHDR; + bcopy(src->ipp_fraghdr, dst->ipp_fraghdr, + src->ipp_fraghdrlen); + dst->ipp_fraghdrlen = src->ipp_fraghdrlen; + } + return (0); +} + +/* + * Returns INADDR_ANY if no source route + */ +ipaddr_t +ip_pkt_source_route_v4(const ip_pkt_t *ipp) +{ + ipaddr_t nexthop = INADDR_ANY; + ipoptp_t opts; + uchar_t *opt; + uint8_t optval; + uint8_t optlen; + uint32_t totallen; + + 
if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return (INADDR_ANY); + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return (INADDR_ANY); + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); + optval != IPOPT_EOL; + optval = ipoptp_next(&opts)) { + opt = opts.ipoptp_cur; + switch (optval) { + uint8_t off; + case IPOPT_SSRR: + case IPOPT_LSRR: + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } + optlen = opts.ipoptp_len; + off = opt[IPOPT_OFFSET]; + off--; + if (optlen < IP_ADDR_LEN || + off > optlen - IP_ADDR_LEN) { + /* End of source route */ + break; + } + bcopy((char *)opt + off, &nexthop, IP_ADDR_LEN); + if (nexthop == htonl(INADDR_LOOPBACK)) { + /* Ignore */ + nexthop = INADDR_ANY; + break; + } + break; + } + } + return (nexthop); +} + +/* + * Reverse a source route. + */ +void +ip_pkt_source_route_reverse_v4(ip_pkt_t *ipp) +{ + ipaddr_t tmp; + ipoptp_t opts; + uchar_t *opt; + uint8_t optval; + uint32_t totallen; + + if (!(ipp->ipp_fields & IPPF_IPV4_OPTIONS)) + return; + + totallen = ipp->ipp_ipv4_options_len; + if (totallen & 0x3) + return; + + for (optval = ipoptp_first2(&opts, totallen, ipp->ipp_ipv4_options); + optval != IPOPT_EOL; + optval = ipoptp_next(&opts)) { + uint8_t off1, off2; + + opt = opts.ipoptp_cur; + switch (optval) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if ((opts.ipoptp_flags & IPOPTP_ERROR) != 0) { + break; + } + off1 = IPOPT_MINOFF_SR - 1; + off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; + while (off2 > off1) { + bcopy(opt + off2, &tmp, IP_ADDR_LEN); + bcopy(opt + off1, opt + off2, IP_ADDR_LEN); + bcopy(&tmp, opt + off2, IP_ADDR_LEN); + off2 -= IP_ADDR_LEN; + off1 += IP_ADDR_LEN; + } + opt[IPOPT_OFFSET] = IPOPT_MINOFF_SR; + break; + } + } +} + +/* + * Returns NULL if no routing header + */ +in6_addr_t * +ip_pkt_source_route_v6(const ip_pkt_t *ipp) +{ + in6_addr_t *nexthop = NULL; + ip6_rthdr0_t *rthdr; + + if (!(ipp->ipp_fields & IPPF_RTHDR)) + return (NULL); + + rthdr = (ip6_rthdr0_t 
*)ipp->ipp_rthdr; + if (rthdr->ip6r0_segleft == 0) + return (NULL); + + nexthop = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); + return (nexthop); } zoneid_t -ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst, +ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_recv_attr_t *ira, zoneid_t lookup_zoneid) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ire_t *ire; int ire_flags = MATCH_IRE_TYPE; zoneid_t zoneid = ALL_ZONES; - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) + if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE)) return (ALL_ZONES); if (lookup_zoneid != ALL_ZONES) ire_flags |= MATCH_IRE_ZONEONLY; - ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, NULL, - lookup_zoneid, NULL, ire_flags, ipst); + ire = ire_ftable_lookup_v4(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK, + NULL, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL); if (ire != NULL) { zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst); ire_refrele(ire); @@ -29858,24 +15277,23 @@ ip_get_zoneid_v4(ipaddr_t addr, mblk_t *mp, ip_stack_t *ipst, zoneid_t ip_get_zoneid_v6(in6_addr_t *addr, mblk_t *mp, const ill_t *ill, - ip_stack_t *ipst, zoneid_t lookup_zoneid) + ip_recv_attr_t *ira, zoneid_t lookup_zoneid) { + ip_stack_t *ipst = ira->ira_ill->ill_ipst; ire_t *ire; int ire_flags = MATCH_IRE_TYPE; zoneid_t zoneid = ALL_ZONES; - ipif_t *ipif_arg = NULL; - if (is_system_labeled() && !tsol_can_accept_raw(mp, B_FALSE)) + if (is_system_labeled() && !tsol_can_accept_raw(mp, ira, B_FALSE)) return (ALL_ZONES); - if (IN6_IS_ADDR_LINKLOCAL(addr)) { + if (IN6_IS_ADDR_LINKLOCAL(addr)) ire_flags |= MATCH_IRE_ILL; - ipif_arg = ill->ill_ipif; - } + if (lookup_zoneid != ALL_ZONES) ire_flags |= MATCH_IRE_ZONEONLY; - ire = ire_ctable_lookup_v6(addr, NULL, IRE_LOCAL | IRE_LOOPBACK, - ipif_arg, lookup_zoneid, NULL, ire_flags, ipst); + ire = ire_ftable_lookup_v6(addr, NULL, NULL, IRE_LOCAL | IRE_LOOPBACK, + ill, lookup_zoneid, NULL, ire_flags, 0, ipst, NULL); if (ire != NULL) { 
zoneid = IP_REAL_ZONEID(ire->ire_zoneid, ipst); ire_refrele(ire); @@ -29964,3 +15382,29 @@ ipobs_hook(mblk_t *mp, int htype, zoneid_t zsrc, zoneid_t zdst, imp->b_cont = NULL; freemsg(imp); } + +/* + * Utility routine that checks if `v4srcp' is a valid address on underlying + * interface `ill'. If `ipifp' is non-NULL, it's set to a held ipif + * associated with `v4srcp' on success. NOTE: if this is not called from + * inside the IPSQ (ill_g_lock is not held), `ill' may be removed from the + * group during or after this lookup. + */ +boolean_t +ipif_lookup_testaddr_v4(ill_t *ill, const in_addr_t *v4srcp, ipif_t **ipifp) +{ + ipif_t *ipif; + + ipif = ipif_lookup_addr_exact(*v4srcp, ill, ill->ill_ipst); + if (ipif != NULL) { + if (ipifp != NULL) + *ipifp = ipif; + else + ipif_refrele(ipif); + return (B_TRUE); + } + + ip1dbg(("ipif_lookup_testaddr_v4: cannot find ipif for src %x\n", + *v4srcp)); + return (B_FALSE); +} diff --git a/usr/src/uts/common/inet/ip/ip2mac.c b/usr/src/uts/common/inet/ip/ip2mac.c index e232a5bb63..55a17f762a 100644 --- a/usr/src/uts/common/inet/ip/ip2mac.c +++ b/usr/src/uts/common/inet/ip/ip2mac.c @@ -18,6 +18,7 @@ * * CDDL HEADER END */ + /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -29,7 +30,6 @@ #include <inet/ip2mac.h> #include <inet/ip2mac_impl.h> #include <sys/zone.h> -#include <sys/dlpi.h> #include <inet/ip_ndp.h> #include <inet/ip_if.h> #include <inet/ip6.h> @@ -38,18 +38,18 @@ * dispatch pending callbacks. 
*/ void -nce_cb_dispatch(nce_t *nce) +ncec_cb_dispatch(ncec_t *ncec) { - nce_cb_t *nce_cb = list_head(&nce->nce_cb); + ncec_cb_t *ncec_cb; ip2mac_t ip2m; - mutex_enter(&nce->nce_lock); - if (list_is_empty(&nce->nce_cb)) { - mutex_exit(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + if (list_is_empty(&ncec->ncec_cb)) { + mutex_exit(&ncec->ncec_lock); return; } - nce_ip2mac_response(&ip2m, nce); - nce_cb_refhold_locked(nce); + ncec_ip2mac_response(&ip2m, ncec); + ncec_cb_refhold_locked(ncec); /* * IP does not hold internal locks like nce_lock across calls to * other subsystems for fear of recursive lock entry and lock @@ -58,75 +58,82 @@ nce_cb_dispatch(nce_t *nce) * across calls into another subsystem, especially if calls can * happen in either direction). */ - nce_cb = list_head(&nce->nce_cb); - for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) { - if (nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) + ncec_cb = list_head(&ncec->ncec_cb); + for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) { + if (ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) continue; - nce_cb->nce_cb_flags |= NCE_CB_DISPATCHED; - mutex_exit(&nce->nce_lock); - (*nce_cb->nce_cb_func)(&ip2m, nce_cb->nce_cb_arg); - mutex_enter(&nce->nce_lock); + ncec_cb->ncec_cb_flags |= NCE_CB_DISPATCHED; + mutex_exit(&ncec->ncec_lock); + (*ncec_cb->ncec_cb_func)(&ip2m, ncec_cb->ncec_cb_arg); + mutex_enter(&ncec->ncec_lock); } - nce_cb_refrele(nce); - mutex_exit(&nce->nce_lock); + ncec_cb_refrele(ncec); + mutex_exit(&ncec->ncec_lock); } /* * fill up the ip2m response fields with inforamation from the nce. 
*/ void -nce_ip2mac_response(ip2mac_t *ip2m, nce_t *nce) +ncec_ip2mac_response(ip2mac_t *ip2m, ncec_t *ncec) { - boolean_t isv6 = (nce->nce_ipversion == IPV6_VERSION); + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + sin_t *sin; sin6_t *sin6; struct sockaddr_dl *sdl; - uchar_t *nce_lladdr; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); bzero(ip2m, sizeof (*ip2m)); - if (NCE_ISREACHABLE(nce) && (nce->nce_flags & NCE_F_CONDEMNED) == 0) + if (NCE_ISREACHABLE(ncec) && !NCE_ISCONDEMNED(ncec)) ip2m->ip2mac_err = 0; else ip2m->ip2mac_err = ESRCH; if (isv6) { sin6 = (sin6_t *)&ip2m->ip2mac_pa; sin6->sin6_family = AF_INET6; - sin6->sin6_addr = nce->nce_addr; + sin6->sin6_addr = ncec->ncec_addr; + } else { + sin = (sin_t *)&ip2m->ip2mac_pa; + sin->sin_family = AF_INET; + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &sin->sin_addr); } if (ip2m->ip2mac_err == 0) { sdl = &ip2m->ip2mac_ha; sdl->sdl_family = AF_LINK; - sdl->sdl_type = nce->nce_ill->ill_type; + sdl->sdl_type = ncec->ncec_ill->ill_type; + /* + * should we put ncec_ill->ill_name in there? why? 
+ * likewise for the sdl_index + */ sdl->sdl_nlen = 0; - sdl->sdl_alen = nce->nce_ill->ill_phys_addr_length; - nce_lladdr = nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(nce->nce_ill); - bcopy(nce_lladdr, LLADDR(sdl), sdl->sdl_alen); + sdl->sdl_alen = ncec->ncec_ill->ill_phys_addr_length; + if (ncec->ncec_lladdr != NULL) + bcopy(ncec->ncec_lladdr, LLADDR(sdl), sdl->sdl_alen); } } void -nce_cb_refhold_locked(nce_t *nce) +ncec_cb_refhold_locked(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); - nce->nce_cb_walker_cnt++; + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + ncec->ncec_cb_walker_cnt++; } void -nce_cb_refrele(nce_t *nce) +ncec_cb_refrele(ncec_t *ncec) { - nce_cb_t *nce_cb, *nce_cb_next = NULL; + ncec_cb_t *ncec_cb, *ncec_cb_next = NULL; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - if (--nce->nce_cb_walker_cnt == 0) { - for (nce_cb = list_head(&nce->nce_cb); nce_cb != NULL; - nce_cb = nce_cb_next) { + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + if (--ncec->ncec_cb_walker_cnt == 0) { + for (ncec_cb = list_head(&ncec->ncec_cb); ncec_cb != NULL; + ncec_cb = ncec_cb_next) { - nce_cb_next = list_next(&nce->nce_cb, nce_cb); - if ((nce_cb->nce_cb_flags & NCE_CB_DISPATCHED) == 0) + ncec_cb_next = list_next(&ncec->ncec_cb, ncec_cb); + if ((ncec_cb->ncec_cb_flags & NCE_CB_DISPATCHED) == 0) continue; - list_remove(&nce->nce_cb, nce_cb); - kmem_free(nce_cb, sizeof (*nce_cb)); + list_remove(&ncec->ncec_cb, ncec_cb); + kmem_free(ncec_cb, sizeof (*ncec_cb)); } } } @@ -136,25 +143,25 @@ nce_cb_refrele(nce_t *nce) * after address resolution succeeds/fails. 
*/ static ip2mac_id_t -nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg) +ncec_add_cb(ncec_t *ncec, ip2mac_callback_t *cb, void *cbarg) { - nce_cb_t *nce_cb; + ncec_cb_t *nce_cb; ip2mac_id_t ip2mid = NULL; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); if ((nce_cb = kmem_zalloc(sizeof (*nce_cb), KM_NOSLEEP)) == NULL) return (ip2mid); - nce_cb->nce_cb_func = cb; - nce_cb->nce_cb_arg = cbarg; + nce_cb->ncec_cb_func = cb; + nce_cb->ncec_cb_arg = cbarg; /* - * We identify the nce_cb_t during cancellation by the address + * We identify the ncec_cb_t during cancellation by the address * of the nce_cb_t itself, and, as a short-cut for eliminating - * clear mismatches, only look in the callback list of nce's + * clear mismatches, only look in the callback list of ncec's * whose address is equal to the nce_cb_id. */ - nce_cb->nce_cb_id = nce; /* no refs! just an address */ - list_insert_tail(&nce->nce_cb, nce_cb); - ip2mid = nce; /* this is the id to be used in ip2mac_cancel */ + nce_cb->ncec_cb_id = ncec; /* no refs! just an address */ + list_insert_tail(&ncec->ncec_cb, nce_cb); + ip2mid = ncec; /* this is the id to be used in ip2mac_cancel */ return (nce_cb); } @@ -167,29 +174,24 @@ nce_add_cb(nce_t *nce, ip2mac_callback_t *cb, void *cbarg) * the resolution completes. */ ip2mac_id_t -ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, +ip2mac(uint_t op, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, zoneid_t zoneid) { - nce_t *nce; + ncec_t *ncec; + nce_t *nce = NULL; boolean_t isv6; ill_t *ill; netstack_t *ns; ip_stack_t *ipst; ip2mac_id_t ip2mid = NULL; + sin_t *sin; sin6_t *sin6; int err; uint64_t delta; + boolean_t need_resolve = B_FALSE; isv6 = (ip2m->ip2mac_pa.ss_family == AF_INET6); - if (!isv6) { - /* - * IPv4 is not currently supported. 
- */ - ip2m->ip2mac_err = ENOTSUP; - return (NULL); - } - ns = netstack_find_by_zoneid(zoneid); if (ns == NULL) { ip2m->ip2mac_err = EINVAL; @@ -205,8 +207,7 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, /* * find the ill from the ip2m->ip2mac_ifindex */ - ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, NULL, - NULL, NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(ip2m->ip2mac_ifindex, isv6, ipst); if (ill == NULL) { ip2m->ip2mac_err = ENXIO; netstack_rele(ns); @@ -214,32 +215,39 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, } if (isv6) { sin6 = (sin6_t *)&ip2m->ip2mac_pa; - if (flags == IP2MAC_LOOKUP) { - nce = ndp_lookup_v6(ill, B_FALSE, &sin6->sin6_addr, - B_FALSE); + if (op == IP2MAC_LOOKUP) { + nce = nce_lookup_v6(ill, &sin6->sin6_addr); } else { - err = ndp_lookup_then_add_v6(ill, B_FALSE, NULL, - &sin6->sin6_addr, &ipv6_all_ones, &ipv6_all_zeros, - 0, 0, ND_INCOMPLETE, &nce); + err = nce_lookup_then_add_v6(ill, NULL, + ill->ill_phys_addr_length, + &sin6->sin6_addr, 0, ND_UNCHANGED, &nce); } } else { - ip2m->ip2mac_err = ENOTSUP; /* yet. 
*/ - goto done; + sin = (sin_t *)&ip2m->ip2mac_pa; + if (op == IP2MAC_LOOKUP) { + nce = nce_lookup_v4(ill, &sin->sin_addr.s_addr); + } else { + err = nce_lookup_then_add_v4(ill, NULL, + ill->ill_phys_addr_length, + &sin->sin_addr.s_addr, 0, ND_UNCHANGED, &nce); + } } - if (flags == IP2MAC_LOOKUP) { + if (op == IP2MAC_LOOKUP) { if (nce == NULL) { ip2m->ip2mac_err = ESRCH; goto done; } - mutex_enter(&nce->nce_lock); - if (NCE_ISREACHABLE(nce)) { - nce_ip2mac_response(ip2m, nce); + ncec = nce->nce_common; + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec) && + delta < (uint64_t)ill->ill_reachable_time) { + ncec_ip2mac_response(ip2m, ncec); ip2m->ip2mac_err = 0; } else { ip2m->ip2mac_err = ESRCH; } - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); goto done; } else { if (err != 0 && err != EEXIST) { @@ -247,13 +255,20 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, goto done; } } - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { + ncec = nce->nce_common; + delta = TICK_TO_MSEC(lbolt64) - ncec->ncec_last; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { ip2m->ip2mac_err = ESRCH; - } else if (!NCE_ISREACHABLE(nce) || - delta > (uint64_t)ill->ill_reachable_time) { - if (NCE_ISREACHABLE(nce)) { + } else { + if (NCE_ISREACHABLE(ncec)) { + if (NCE_MYADDR(ncec) || + delta < (uint64_t)ill->ill_reachable_time) { + ncec_ip2mac_response(ip2m, ncec); + ip2m->ip2mac_err = 0; + mutex_exit(&ncec->ncec_lock); + goto done; + } /* * Since we do not control the packet output * path for ip2mac() callers, we need to verify @@ -268,39 +283,48 @@ ip2mac(uint_t flags, ip2mac_t *ip2m, ip2mac_callback_t *cb, void *cbarg, * so that we can return the stale information but * also update the caller if the lladdr changes. 
*/ - nce->nce_rcnt = ill->ill_xmit_count; - nce->nce_state = ND_PROBE; - err = 0; /* treat this nce as a new one */ + ncec->ncec_rcnt = ill->ill_xmit_count; + ncec->ncec_state = ND_PROBE; + need_resolve = B_TRUE; /* reachable but very old nce */ + } else if (ncec->ncec_state == ND_INITIAL) { + need_resolve = B_TRUE; /* ND_INITIAL nce */ + ncec->ncec_state = ND_INCOMPLETE; } - if (nce->nce_rcnt > 0) { + /* + * NCE not known to be reachable in the recent past. We must + * reconfirm the information before returning it to the caller + */ + if (ncec->ncec_rcnt > 0) { /* - * Still resolving this nce, so we can - * queue the callback information in nce->nce_cb + * Still resolving this ncec, so we can queue the + * callback information in ncec->ncec_cb */ - ip2mid = nce_add_cb(nce, cb, cbarg); + ip2mid = ncec_add_cb(ncec, cb, cbarg); ip2m->ip2mac_err = EINPROGRESS; } else { /* - * Resolution failed. + * No more retransmits allowed -- resolution failed. */ ip2m->ip2mac_err = ESRCH; } - } else { - nce_ip2mac_response(ip2m, nce); - ip2m->ip2mac_err = 0; } - if (ip2m->ip2mac_err == EINPROGRESS && err != EEXIST) - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); done: + /* + * if NCE_ISREACHABLE(ncec) but very old, or if it is ND_INITIAL, + * trigger resolve. + */ + if (need_resolve) + ip_ndp_resolve(ncec); + if (nce != NULL) + nce_refrele(nce); netstack_rele(ns); ill_refrele(ill); return (ip2mid); } /* - * data passed to nce_walk for canceling outstanding callbacks. + * data passed to ncec_walk for canceling outstanding callbacks. */ typedef struct ip2mac_cancel_data_s { ip2mac_id_t ip2m_cancel_id; @@ -308,23 +332,23 @@ typedef struct ip2mac_cancel_data_s { } ip2mac_cancel_data_t; /* - * callback invoked for each active nce. If the ip2mac_id_t corresponds - * to an active nce_cb_t in the nce's callback list, we want to remove + * callback invoked for each active ncec. 
If the ip2mac_id_t corresponds + * to an active nce_cb_t in the ncec's callback list, we want to remove * the callback (if there are no walkers) or return EBUSY to the caller */ static int -ip2mac_cancel_callback(nce_t *nce, void *arg) +ip2mac_cancel_callback(ncec_t *ncec, void *arg) { ip2mac_cancel_data_t *ip2m_wdata = arg; - nce_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id; - nce_cb_t *nce_cb; + ncec_cb_t *ip2m_nce_cb = ip2m_wdata->ip2m_cancel_id; + ncec_cb_t *ncec_cb; - if (ip2m_nce_cb->nce_cb_id != nce) + if (ip2m_nce_cb->ncec_cb_id != ncec) return (0); - mutex_enter(&nce->nce_lock); - if (list_is_empty(&nce->nce_cb)) { - mutex_exit(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + if (list_is_empty(&ncec->ncec_cb)) { + mutex_exit(&ncec->ncec_lock); return (0); } /* @@ -335,22 +359,22 @@ ip2mac_cancel_callback(nce_t *nce, void *arg) * across calls into another subsystem, especially if calls can * happen in either direction). */ - nce_cb = list_head(&nce->nce_cb); - for (; nce_cb != NULL; nce_cb = list_next(&nce->nce_cb, nce_cb)) { - if (nce_cb != ip2m_nce_cb) + ncec_cb = list_head(&ncec->ncec_cb); + for (; ncec_cb != NULL; ncec_cb = list_next(&ncec->ncec_cb, ncec_cb)) { + if (ncec_cb != ip2m_nce_cb) continue; /* * If there are no walkers we can remove the nce_cb. * Otherwise the exiting walker will clean up. 
*/ - if (nce->nce_cb_walker_cnt == 0) { - list_remove(&nce->nce_cb, nce_cb); + if (ncec->ncec_cb_walker_cnt == 0) { + list_remove(&ncec->ncec_cb, ncec_cb); } else { ip2m_wdata->ip2m_cancel_err = EBUSY; } break; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return (0); } @@ -379,7 +403,7 @@ ip2mac_cancel(ip2mac_id_t ip2mid, zoneid_t zoneid) ip2m_wdata.ip2m_cancel_id = ip2mid; ip2m_wdata.ip2m_cancel_err = 0; - ndp_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst); + ncec_walk(NULL, ip2mac_cancel_callback, &ip2m_wdata, ipst); /* * We may return EBUSY if a walk to dispatch callbacks is * in progress, in which case the caller needs to synchronize diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 38fe7b2562..ed54c08884 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -53,8 +53,8 @@ #include <sys/vtrace.h> #include <sys/isa_defs.h> #include <sys/atomic.h> -#include <sys/iphada.h> #include <sys/policy.h> +#include <sys/mac.h> #include <net/if.h> #include <net/if_types.h> #include <net/route.h> @@ -79,9 +79,7 @@ #include <inet/tcp.h> #include <inet/tcp_impl.h> #include <inet/udp_impl.h> -#include <inet/sctp/sctp_impl.h> #include <inet/ipp_common.h> -#include <inet/ilb_ip.h> #include <inet/ip_multi.h> #include <inet/ip_if.h> @@ -89,7 +87,6 @@ #include <inet/ip_rts.h> #include <inet/ip_ndp.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> #include <inet/iptun/iptun_impl.h> @@ -110,8 +107,6 @@ /* Temporary; for CR 6451644 work-around */ #include <sys/ethernet.h> -extern int ip_squeue_flag; - /* * Naming conventions: * These rules should be judiciously applied @@ -179,154 +174,75 @@ const in6_addr_t ipv6_solicited_node_mcast = { 0x000002ffU, 0, 0x01000000U, 0x000000ffU }; #endif /* _BIG_ENDIAN */ -/* Leave room for ip_newroute to tack on the src and target addresses */ -#define OK_RESOLVER_MP_V6(mp) \ - ((mp) && 
((mp)->b_wptr - (mp)->b_rptr) >= (2 * IPV6_ADDR_LEN)) - -#define IP6_MBLK_OK 0 -#define IP6_MBLK_HDR_ERR 1 -#define IP6_MBLK_LEN_ERR 2 - -static void icmp_inbound_too_big_v6(queue_t *, mblk_t *, ill_t *, ill_t *, - boolean_t, zoneid_t); -static void icmp_pkt_v6(queue_t *, mblk_t *, void *, size_t, - const in6_addr_t *, boolean_t, zoneid_t, ip_stack_t *); -static void icmp_redirect_v6(queue_t *, mblk_t *, ill_t *ill); -static int ip_bind_connected_v6(conn_t *, mblk_t **, uint8_t, in6_addr_t *, - uint16_t, const in6_addr_t *, ip6_pkt_t *, uint16_t, - boolean_t, boolean_t, cred_t *); -static boolean_t ip_bind_get_ire_v6(mblk_t **, ire_t *, const in6_addr_t *, - iulp_t *, ip_stack_t *); -static int ip_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, - const in6_addr_t *, uint16_t, boolean_t); -static void ip_fanout_proto_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ill_t *, uint8_t, uint_t, uint_t, boolean_t, zoneid_t); -static void ip_fanout_tcp_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ill_t *, uint_t, uint_t, boolean_t, zoneid_t); -static void ip_fanout_udp_v6(queue_t *, mblk_t *, ip6_t *, uint32_t, - ill_t *, ill_t *, uint_t, boolean_t, zoneid_t); -static int ip_process_options_v6(queue_t *, mblk_t *, ip6_t *, - uint8_t *, uint_t, uint8_t, ip_stack_t *); -static mblk_t *ip_rput_frag_v6(ill_t *, ill_t *, mblk_t *, ip6_t *, - ip6_frag_t *, uint_t, uint_t *, uint32_t *, uint16_t *); +static boolean_t icmp_inbound_verify_v6(mblk_t *, icmp6_t *, ip_recv_attr_t *); +static void icmp_inbound_too_big_v6(icmp6_t *, ip_recv_attr_t *); +static void icmp_pkt_v6(mblk_t *, void *, size_t, const in6_addr_t *, + ip_recv_attr_t *); +static void icmp_redirect_v6(mblk_t *, ip6_t *, nd_redirect_t *, + ip_recv_attr_t *); +static void icmp_send_redirect_v6(mblk_t *, in6_addr_t *, + in6_addr_t *, ip_recv_attr_t *); +static void icmp_send_reply_v6(mblk_t *, ip6_t *, icmp6_t *, + ip_recv_attr_t *); static boolean_t ip_source_routed_v6(ip6_t *, mblk_t *, ip_stack_t *); -static void 
ip_wput_ire_v6(queue_t *, mblk_t *, ire_t *, int, int, - conn_t *, int, int, zoneid_t); -static boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, - ipif_t **); - -/* - * A template for an IPv6 AR_ENTRY_QUERY - */ -static areq_t ipv6_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t)+(2*IPV6_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ - ETHERTYPE_IPV6, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IPV6_ADDR_LEN, /* target addr_length */ - 0, /* flags */ - sizeof (areq_t) + IPV6_ADDR_LEN, /* sender addr offset */ - IPV6_ADDR_LEN, /* sender addr length */ - 6, /* xmit_count */ - 1000, /* (re)xmit_interval in milliseconds */ - 4 /* max # of requests to buffer */ - /* anything else filled in by the code */ -}; /* - * Handle IPv6 ICMP packets sent to us. Consume the mblk passed in. - * The message has already been checksummed and if needed, - * a copy has been made to be sent any interested ICMP client (conn) - * Note that this is different than icmp_inbound() which does the fanout - * to conn's as well as local processing of the ICMP packets. + * icmp_inbound_v6 deals with ICMP messages that are handled by IP. + * If the ICMP message is consumed by IP, i.e., it should not be delivered + * to any IPPROTO_ICMP raw sockets, then it returns NULL. + * Likewise, if the ICMP error is misformed (too short, etc), then it + * returns NULL. The caller uses this to determine whether or not to send + * to raw sockets. * * All error messages are passed to the matching transport stream. * - * Zones notes: - * The packet is only processed in the context of the specified zone: typically - * only this zone will reply to an echo request. This means that the caller must - * call icmp_inbound_v6() for each relevant zone. + * See comment for icmp_inbound_v4() on how IPsec is handled. 
*/ -static void -icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, - uint_t hdr_length, boolean_t mctl_present, uint_t flags, zoneid_t zoneid, - mblk_t *dl_mp) +mblk_t * +icmp_inbound_v6(mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; - ip6_t *ip6h; + ip6_t *ip6h; /* Outer header */ + int ip_hdr_length; /* Outer header length */ boolean_t interested; - in6_addr_t origsrc; - mblk_t *first_mp; - ipsec_in_t *ii; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(ill != NULL); - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - } + mblk_t *mp_ret = NULL; ip6h = (ip6_t *)mp->b_rptr; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); - if ((mp->b_wptr - mp->b_rptr) < (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("icmp_inbound_v6: pullupmsg failed\n")); - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - if (ipst->ips_icmp_accept_clear_messages == 0) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - NULL, ip6h, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return; - } + /* Make sure ira_l2src is set for ndp_input */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); - /* - * On a labeled system, we have to check whether the zone itself is - * permitted to receive raw traffic. 
- */ - if (is_system_labeled()) { - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (!tsol_can_accept_raw(mp, B_FALSE)) { - ip1dbg(("icmp_inbound_v6: zone %d can't receive raw", - zoneid)); + ip_hdr_length = ira->ira_ip_hdr_length; + if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) { + if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return (NULL); + } + ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira); + if (ip6h == NULL) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); - freemsg(first_mp); - return; + freemsg(mp); + return (NULL); } } - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); + DTRACE_PROBE2(icmp__inbound__v6, ip6_t *, ip6h, icmp6_t *, icmp6); ip2dbg(("icmp_inbound_v6: type %d code %d\n", icmp6->icmp6_type, icmp6->icmp6_code)); - interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK); - /* Initiate IPPF processing here */ - if (IP6_IN_IPP(flags, ipst)) { - - /* - * If the ifindex changes due to SIOCSLIFINDEX - * packet may return to IP on the wrong ill. - */ - ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - if (mctl_present) { - freeb(first_mp); - } - return; - } - } + /* + * We will set "interested" to "true" if we should pass a copy to + * the transport i.e., if it is an error message. 
+ */ + interested = !(icmp6->icmp6_type & ICMP6_INFOMSG_MASK); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: @@ -344,9 +260,9 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, break; case ICMP6_PACKET_TOO_BIG: - icmp_inbound_too_big_v6(q, first_mp, ill, inill, mctl_present, - zoneid); - return; + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInPktTooBigs); + break; + case ICMP6_ECHO_REQUEST: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchos); if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && @@ -362,93 +278,22 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, mblk_t *mp1; mp1 = copymsg(mp); - freemsg(mp); if (mp1 == NULL) { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - if (mctl_present) - freeb(first_mp); - return; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", + mp, ill); + freemsg(mp); + return (NULL); } + freemsg(mp); mp = mp1; ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); } - /* - * Turn the echo into an echo reply. - * Remove any extension headers (do not reverse a source route) - * and clear the flow id (keep traffic class for now). - */ - if (hdr_length != IPV6_HDR_LEN) { - int i; - - for (i = 0; i < IPV6_HDR_LEN; i++) - mp->b_rptr[hdr_length - i - 1] = - mp->b_rptr[IPV6_HDR_LEN - i - 1]; - mp->b_rptr += (hdr_length - IPV6_HDR_LEN); - ip6h = (ip6_t *)mp->b_rptr; - ip6h->ip6_nxt = IPPROTO_ICMPV6; - hdr_length = IPV6_HDR_LEN; - } - ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; icmp6->icmp6_type = ICMP6_ECHO_REPLY; - - ip6h->ip6_plen = - htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN)); - origsrc = ip6h->ip6_src; - /* - * Reverse the source and destination addresses. - * If the return address is a multicast, zero out the source - * (ip_wput_v6 will set an address). 
- */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ip6h->ip6_src = ipv6_all_zeros; - ip6h->ip6_dst = origsrc; - } else { - ip6h->ip6_src = ip6h->ip6_dst; - ip6h->ip6_dst = origsrc; - } - - /* set the hop limit */ - ip6h->ip6_hops = ipst->ips_ipv6_def_hops; - - /* - * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. - */ - icmp6->icmp6_cksum = ip6h->ip6_plen; - - if (!mctl_present) { - /* - * This packet should go out the same way as it - * came in i.e in clear. To make sure that global - * policy will not be applied to this in ip_wput, - * we attach a IPSEC_IN mp and clear ipsec_in_secure. - */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(mp); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - } - if (!ipsec_in_to_out(first_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - put(WR(q), first_mp); - return; + icmp_send_reply_v6(mp, ip6h, icmp6, ira); + return (NULL); case ICMP6_ECHO_REPLY: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInEchoReplies); @@ -464,343 +309,478 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, case ND_NEIGHBOR_SOLICIT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborSolicits); - if (mctl_present) - freeb(first_mp); - /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(inill, mp, dl_mp); - return; + ndp_input(mp, ira); + return (NULL); case ND_NEIGHBOR_ADVERT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInNeighborAdvertisements); - if (mctl_present) - freeb(first_mp); - /* XXX may wish to pass first_mp up to ndp_input someday. 
*/ - ndp_input(inill, mp, dl_mp); - return; + ndp_input(mp, ira); + return (NULL); - case ND_REDIRECT: { + case ND_REDIRECT: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInRedirects); if (ipst->ips_ipv6_ignore_redirect) break; - /* - * As there is no upper client to deliver, we don't - * need the first_mp any more. - */ - if (mctl_present) - freeb(first_mp); - if (!pullupmsg(mp, -1)) { - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - break; - } - icmp_redirect_v6(q, mp, ill); - return; - } + /* We now allow a RAW socket to receive this. */ + interested = B_TRUE; + break; /* * The next three icmp messages will be handled by MLD. * Pass all valid MLD packets up to any process(es) - * listening on a raw ICMP socket. MLD messages are - * freed by mld_input function. + * listening on a raw ICMP socket. */ case MLD_LISTENER_QUERY: case MLD_LISTENER_REPORT: case MLD_LISTENER_REDUCTION: - if (mctl_present) - freeb(first_mp); - mld_input(q, mp, ill); - return; + mp = mld_input(mp, ira); + return (mp); default: break; } - if (interested) { - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, - inill, mctl_present, zoneid); - } else { - freemsg(first_mp); - } -} + /* + * See if there is an ICMP client to avoid an extra copymsg/freemsg + * if there isn't one. + */ + if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_ICMPV6].connf_head != NULL) { + /* If there is an ICMP client and we want one too, copy it. */ -/* - * Process received IPv6 ICMP Packet too big. - * After updating any IRE it does the fanout to any matching transport streams. - * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. 
- */ -/* ARGSUSED */ -static void -icmp_inbound_too_big_v6(queue_t *q, mblk_t *mp, ill_t *ill, ill_t *inill, - boolean_t mctl_present, zoneid_t zoneid) -{ - ip6_t *ip6h; - ip6_t *inner_ip6h; - icmp6_t *icmp6; - uint16_t hdr_length; - uint32_t mtu; - ire_t *ire, *first_ire; - mblk_t *first_mp; - ip_stack_t *ipst = ill->ill_ipst; + if (!interested) { + /* Caller will deliver to RAW sockets */ + return (mp); + } + mp_ret = copymsg(mp); + if (mp_ret == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + } + } else if (!interested) { + /* Neither we nor raw sockets are interested. Drop packet now */ + freemsg(mp); + return (NULL); + } - first_mp = mp; - if (mctl_present) - mp = first_mp->b_cont; /* - * We must have exclusive use of the mblk to update the MTU - * in the packet. - * If not, we copy it. - * - * If there's an M_CTL present, we know that allocated first_mp - * earlier in this function, so we know first_mp has refcnt of one. + * ICMP error or redirect packet. Make sure we have enough of + * the header and that db_ref == 1 since we might end up modifying + * the packet. */ - ASSERT(!mctl_present || first_mp->b_datap->db_ref == 1); + if (mp->b_cont != NULL) { + if (ip_pullup(mp, -1, ira) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - ip_pullup", + mp, ill); + freemsg(mp); + return (mp_ret); + } + } + if (mp->b_datap->db_ref > 1) { mblk_t *mp1; mp1 = copymsg(mp); - freemsg(mp); if (mp1 == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - if (mctl_present) - freeb(first_mp); - return; + ip_drop_input("ipIfStatsInDiscards - copymsg", mp, ill); + freemsg(mp); + return (mp_ret); } + freemsg(mp); mp = mp1; - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; } + + /* + * In case mp has changed, verify the message before any further + * processes. 
+ */ ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; + icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return (mp_ret); + } - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - ASSERT((size_t)(mp->b_wptr - mp->b_rptr) >= hdr_length + ICMP6_MINLEN); - inner_ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */ - if ((uchar_t *)&inner_ip6h[1] > mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)&inner_ip6h[1] - mp->b_rptr)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - return; + switch (icmp6->icmp6_type) { + case ND_REDIRECT: + icmp_redirect_v6(mp, ip6h, (nd_redirect_t *)icmp6, ira); + break; + case ICMP6_PACKET_TOO_BIG: + /* Update DCE and adjust MTU is icmp header if needed */ + icmp_inbound_too_big_v6(icmp6, ira); + /* FALLTHRU */ + default: + icmp_inbound_error_fanout_v6(mp, icmp6, ira); + break; + } + + return (mp_ret); +} + +/* + * Send an ICMP echo reply. + * The caller has already updated the payload part of the packet. + * We handle the ICMP checksum, IP source address selection and feed + * the packet into ip_output_simple. + */ +static void +icmp_send_reply_v6(mblk_t *mp, ip6_t *ip6h, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + uint_t ip_hdr_length = ira->ira_ip_hdr_length; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; + in6_addr_t origsrc; + + /* + * Remove any extension headers (do not reverse a source route) + * and clear the flow id (keep traffic class for now). 
+ */ + if (ip_hdr_length != IPV6_HDR_LEN) { + int i; + + for (i = 0; i < IPV6_HDR_LEN; i++) { + mp->b_rptr[ip_hdr_length - i - 1] = + mp->b_rptr[IPV6_HDR_LEN - i - 1]; } + mp->b_rptr += (ip_hdr_length - IPV6_HDR_LEN); ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)&mp->b_rptr[hdr_length]; - inner_ip6h = (ip6_t *)&icmp6[1]; + ip6h->ip6_nxt = IPPROTO_ICMPV6; + i = ntohs(ip6h->ip6_plen); + i -= (ip_hdr_length - IPV6_HDR_LEN); + ip6h->ip6_plen = htons(i); + ip_hdr_length = IPV6_HDR_LEN; + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == msgdsize(mp)); } + ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL; + + /* Reverse the source and destination addresses. */ + origsrc = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = origsrc; + + /* set the hop limit */ + ip6h->ip6_hops = ipst->ips_ipv6_def_hops; /* - * For link local destinations matching simply on IRE type is not - * sufficient. Same link local addresses for different ILL's is - * possible. + * Prepare for checksum by putting icmp length in the icmp + * checksum field. 
The checksum is calculated in ip_output */ - if (IN6_IS_ADDR_LINKLOCAL(&inner_ip6h->ip6_dst)) { - first_ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, - IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - - if (first_ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("icmp_inbound_too_big_v6:" - "no ire for dst %s\n", AF_INET6, - &inner_ip6h->ip6_dst); - } - freemsg(first_mp); - return; - } + icmp6->icmp6_cksum = ip6h->ip6_plen; - mtu = ntohl(icmp6->icmp6_mtu); - rw_enter(&first_ire->ire_bucket->irb_lock, RW_READER); - for (ire = first_ire; ire != NULL && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &inner_ip6h->ip6_dst); - ire = ire->ire_next) { - mutex_enter(&ire->ire_lock); - if (mtu < IPV6_MIN_MTU) { - ip1dbg(("Received mtu less than IPv6 " - "min mtu %d: %d\n", IPV6_MIN_MTU, mtu)); - mtu = IPV6_MIN_MTU; - /* - * If an mtu less than IPv6 min mtu is received, - * we must include a fragment header in - * subsequent packets. - */ - ire->ire_frag_flag |= IPH_FRAG_HDR; - } - ip1dbg(("Received mtu from router: %d\n", mtu)); - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; - } - /* Record the new max frag size for the ULP. */ - if (ire->ire_frag_flag & IPH_FRAG_HDR) { - /* - * If we need a fragment header in every packet - * (above case or multirouting), make sure the - * ULP takes it into account when computing the - * payload size. 
- */ - icmp6->icmp6_mtu = htonl(ire->ire_max_frag - - sizeof (ip6_frag_t)); - } else { - icmp6->icmp6_mtu = htonl(ire->ire_max_frag); - } - mutex_exit(&ire->ire_lock); - } - rw_exit(&first_ire->ire_bucket->irb_lock); - ire_refrele(first_ire); - } else { - irb_t *irb = NULL; + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* - * for non-link local destinations we match only on the IRE type + * This packet should go out the same way as it + * came in i.e in clear, independent of the IPsec + * policy for transmitting packets. */ - ire = ire_ctable_lookup_v6(&inner_ip6h->ip6_dst, NULL, - IRE_CACHE, ill->ill_ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("icmp_inbound_too_big_v6:" - "no ire for dst %s\n", - AF_INET6, &inner_ip6h->ip6_dst); - } - freemsg(first_mp); + ixas.ixa_flags |= IXAF_NO_IPSEC; + } else { + if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note: mp already consumed and ip_drop_packet done */ return; } - irb = ire->ire_bucket; - ire_refrele(ire); - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, - &inner_ip6h->ip6_dst)) { - mtu = ntohl(icmp6->icmp6_mtu); - mutex_enter(&ire->ire_lock); - if (mtu < IPV6_MIN_MTU) { - ip1dbg(("Received mtu less than IPv6" - "min mtu %d: %d\n", - IPV6_MIN_MTU, mtu)); - mtu = IPV6_MIN_MTU; - /* - * If an mtu less than IPv6 min mtu is - * received, we must include a fragment - * header in subsequent packets. 
- */ - ire->ire_frag_flag |= IPH_FRAG_HDR; - } + } - ip1dbg(("Received mtu from router: %d\n", mtu)); - ire->ire_max_frag = MIN(ire->ire_max_frag, mtu); - if (ire->ire_max_frag == mtu) { - /* Decreased it */ - ire->ire_marks |= IRE_MARK_PMTU; - } - /* Record the new max frag size for the ULP. */ - if (ire->ire_frag_flag & IPH_FRAG_HDR) { - /* - * If we need a fragment header in - * every packet (above case or - * multirouting), make sure the ULP - * takes it into account when computing - * the payload size. - */ - icmp6->icmp6_mtu = - htonl(ire->ire_max_frag - - sizeof (ip6_frag_t)); - } else { - icmp6->icmp6_mtu = - htonl(ire->ire_max_frag); - } - mutex_exit(&ire->ire_lock); - } - } - rw_exit(&irb->irb_lock); + /* Was the destination (now source) link-local? Send out same group */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + if (IS_UNDER_IPMP(ill)) + ixas.ixa_scopeid = ill_get_upper_ifindex(ill); + else + ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex; + } + + if (ira->ira_flags & IRAF_MULTIBROADCAST) { + /* + * Not one or our addresses (IRE_LOCALs), thus we let + * ip_output_simple pick the source. + */ + ip6h->ip6_src = ipv6_all_zeros; + ixas.ixa_flags |= IXAF_SET_SOURCE; } - icmp_inbound_error_fanout_v6(q, first_mp, ip6h, icmp6, ill, inill, - mctl_present, zoneid); + + /* Should we send using dce_pmtu? */ + if (ipst->ips_ipv6_icmp_return_pmtu) + ixas.ixa_flags |= IXAF_PMTU_DISCOVERY; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + } /* - * Fanout for ICMPv6 errors containing IP-in-IPv6 packets. Returns B_TRUE if a - * tunnel consumed the message, and B_FALSE otherwise. + * Verify the ICMP messages for either for ICMP error or redirect packet. + * The caller should have fully pulled up the message. If it's a redirect + * packet, only basic checks on IP header will be done; otherwise, verify + * the packet by looking at the included ULP header. 
+ * + * Called before icmp_inbound_error_fanout_v6 is called. */ static boolean_t -icmp_inbound_iptun_fanout_v6(mblk_t *first_mp, ip6_t *rip6h, ill_t *ill, - ip_stack_t *ipst) +icmp_inbound_verify_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira) { - conn_t *connp; + ill_t *ill = ira->ira_ill; + uint16_t hdr_length; + uint8_t *nexthdrp; + uint8_t nexthdr; + ip_stack_t *ipst = ill->ill_ipst; + conn_t *connp; + ip6_t *ip6h; /* Inner header */ - if ((connp = ipcl_iptun_classify_v6(&rip6h->ip6_src, &rip6h->ip6_dst, - ipst)) == NULL) - return (B_FALSE); + ip6h = (ip6_t *)&icmp6[1]; + if ((uchar_t *)ip6h + IPV6_HDR_LEN > mp->b_wptr) + goto truncated; + + if (icmp6->icmp6_type == ND_REDIRECT) { + hdr_length = sizeof (nd_redirect_t); + } else { + if ((IPH_HDR_VERSION(ip6h) != IPV6_VERSION)) + goto discard_pkt; + hdr_length = IPV6_HDR_LEN; + } + + if ((uchar_t *)ip6h + hdr_length > mp->b_wptr) + goto truncated; + + /* + * Stop here for ICMP_REDIRECT. + */ + if (icmp6->icmp6_type == ND_REDIRECT) + return (B_TRUE); + + /* + * ICMP errors only. + */ + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) + goto discard_pkt; + nexthdr = *nexthdrp; + + /* Try to pass the ICMP message to clients who need it */ + switch (nexthdr) { + case IPPROTO_UDP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_TCP: { + tcpha_t *tcpha; + + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + + tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length); + /* + * With IPMP we need to match across group, which we do + * since we have the upper ill from ira_ill. 
+ */ + connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN, + ill->ill_phyint->phyint_ifindex, ipst); + if (connp == NULL) + goto discard_pkt; + + if ((connp->conn_verifyicmp != NULL) && + !connp->conn_verifyicmp(connp, tcpha, NULL, icmp6, ira)) { + CONN_DEC_REF(connp); + goto discard_pkt; + } + CONN_DEC_REF(connp); + break; + } + case IPPROTO_SCTP: + /* + * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of + * transport header. + */ + if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > + mp->b_wptr) + goto truncated; + break; + case IPPROTO_ESP: + case IPPROTO_AH: + break; + case IPPROTO_ENCAP: + case IPPROTO_IPV6: { + /* Look for self-encapsulated packets that caused an error */ + ip6_t *in_ip6h; + + in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length); + if ((uint8_t *)in_ip6h + (nexthdr == IPPROTO_ENCAP ? + sizeof (ipha_t) : sizeof (ip6_t)) > mp->b_wptr) + goto truncated; + break; + } + default: + break; + } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, first_mp, NULL); - CONN_DEC_REF(connp); return (B_TRUE); + +discard_pkt: + /* Bogus ICMP error. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + return (B_FALSE); + +truncated: + /* We pulled up everthing already. Must be truncated */ + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); + return (B_FALSE); } /* - * Fanout received ICMPv6 error packets to the transports. - * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. + * Process received IPv6 ICMP Packet too big. + * The caller is responsible for validating the packet before passing it in + * and also to fanout the ICMP error to any matching transport conns. Assumes + * the message has been fully pulled up. + * + * Before getting here, the caller has called icmp_inbound_verify_v6() + * that should have verified with ULP to prevent undoing the changes we're + * going to make to DCE. 
For example, TCP might have verified that the packet + * which generated error is in the send window. + * + * In some cases modified this MTU in the ICMP header packet; the caller + * should pass to the matching ULP after this returns. */ -void -icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - icmp6_t *icmp6, ill_t *ill, ill_t *inill, boolean_t mctl_present, - zoneid_t zoneid) +static void +icmp_inbound_too_big_v6(icmp6_t *icmp6, ip_recv_attr_t *ira) { - uint16_t *up; /* Pointer to ports in ULP header */ - uint32_t ports; /* reversed ports for fanout */ - ip6_t rip6h; /* With reversed addresses */ - uint16_t hdr_length; - uint8_t *nexthdrp; - uint8_t nexthdr; - mblk_t *first_mp; - ipsec_in_t *ii; - tcpha_t *tcpha; - conn_t *connp; + uint32_t mtu; + dce_t *dce; + ill_t *ill = ira->ira_ill; /* Upper ill if IPMP */ ip_stack_t *ipst = ill->ill_ipst; + int old_max_frag; + in6_addr_t final_dst; + ip6_t *ip6h; /* Inner IP header */ - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); + /* Caller has already pulled up everything. */ + ip6h = (ip6_t *)&icmp6[1]; + final_dst = ip_get_dst_v6(ip6h, NULL, NULL); - ii = (ipsec_in_t *)first_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); + /* + * For link local destinations matching simply on address is not + * sufficient. Same link local addresses for different ILL's is + * possible. 
+ */ + if (IN6_IS_ADDR_LINKSCOPE(&final_dst)) { + dce = dce_lookup_and_add_v6(&final_dst, + ill->ill_phyint->phyint_ifindex, ipst); } else { - ii = NULL; + dce = dce_lookup_and_add_v6(&final_dst, 0, ipst); + } + if (dce == NULL) { + /* Couldn't add a unique one - ENOMEM */ + if (ip_debug > 2) { + /* ip1dbg */ + pr_addr_dbg("icmp_inbound_too_big_v6:" + "no dce for dst %s\n", AF_INET6, + &final_dst); + } + return; } - hdr_length = (uint16_t)((uchar_t *)icmp6 - (uchar_t *)ip6h); - ASSERT((size_t)(mp->b_wptr - (uchar_t *)icmp6) >= ICMP6_MINLEN); + mtu = ntohl(icmp6->icmp6_mtu); + mutex_enter(&dce->dce_lock); + if (dce->dce_flags & DCEF_PMTU) + old_max_frag = dce->dce_pmtu; + else + old_max_frag = ill->ill_mtu; + + if (mtu < IPV6_MIN_MTU) { + ip1dbg(("Received mtu less than IPv6 " + "min mtu %d: %d\n", IPV6_MIN_MTU, mtu)); + mtu = IPV6_MIN_MTU; + /* + * If an mtu less than IPv6 min mtu is received, + * we must include a fragment header in + * subsequent packets. + */ + dce->dce_flags |= DCEF_TOO_SMALL_PMTU; + } else { + dce->dce_flags &= ~DCEF_TOO_SMALL_PMTU; + } + ip1dbg(("Received mtu from router: %d\n", mtu)); + dce->dce_pmtu = MIN(old_max_frag, mtu); + + /* Prepare to send the new max frag size for the ULP. */ + if (dce->dce_flags & DCEF_TOO_SMALL_PMTU) { + /* + * If we need a fragment header in every packet + * (above case or multirouting), make sure the + * ULP takes it into account when computing the + * payload size. + */ + icmp6->icmp6_mtu = htonl(dce->dce_pmtu - sizeof (ip6_frag_t)); + } else { + icmp6->icmp6_mtu = htonl(dce->dce_pmtu); + } + /* We now have a PMTU for sure */ + dce->dce_flags |= DCEF_PMTU; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); /* - * Need to pullup everything in order to use - * ip_hdr_length_nexthdr_v6() + * After dropping the lock the new value is visible to everyone. + * Then we bump the generation number so any cached values reinspect + * the dce_t. 
*/ - if (mp->b_cont != NULL) { - if (!pullupmsg(mp, -1)) { - ip1dbg(("icmp_inbound_error_fanout_v6: " - "pullupmsg failed\n")); - goto drop_pkt; - } - ip6h = (ip6_t *)mp->b_rptr; - icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); - } + dce_increment_generation(dce); + dce_refrele(dce); +} - ip6h = (ip6_t *)&icmp6[1]; /* Packet in error */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto drop_pkt; +/* + * Fanout received ICMPv6 error packets to the transports. + * Assumes the IPv6 plus ICMPv6 headers have been pulled up but nothing else. + * + * The caller must have called icmp_inbound_verify_v6. + */ +void +icmp_inbound_error_fanout_v6(mblk_t *mp, icmp6_t *icmp6, ip_recv_attr_t *ira) +{ + uint16_t *up; /* Pointer to ports in ULP header */ + uint32_t ports; /* reversed ports for fanout */ + ip6_t rip6h; /* With reversed addresses */ + ip6_t *ip6h; /* Inner IP header */ + uint16_t hdr_length; /* Inner IP header length */ + uint8_t *nexthdrp; + uint8_t nexthdr; + tcpha_t *tcpha; + conn_t *connp; + ill_t *ill = ira->ira_ill; /* Upper in the case of IPMP */ + ip_stack_t *ipst = ill->ill_ipst; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + + /* Caller has already pulled up everything. */ + ip6h = (ip6_t *)&icmp6[1]; + ASSERT(mp->b_cont == NULL); + ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) goto drop_pkt; nexthdr = *nexthdrp; - - /* Set message type, must be done after pullups */ - mp->b_datap->db_type = M_CTL; + ira->ira_protocol = nexthdr; /* * We need a separate IP header with the source and destination @@ -814,174 +794,128 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, /* Try to pass the ICMP message to clients who need it */ switch (nexthdr) { case IPPROTO_UDP: { - /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * UDP header to get the port information. 
- */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - break; - } /* Attempt to find a client stream based on port. */ up = (uint16_t *)((uchar_t *)ip6h + hdr_length); - ((uint16_t *)&ports)[0] = up[1]; - ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_udp_v6(q, first_mp, &rip6h, ports, ill, inill, - IP6_NO_IPPOLICY, mctl_present, zoneid); + /* Note that we send error to all matches. */ + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_udp_multi_v6(mp, &rip6h, up[0], up[1], ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_TCP: { /* - * Verify we have at least ICMP_MIN_TP_HDR_LEN bytes of - * the TCP header to get the port information. - */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_TP_HDR_LEN > - mp->b_wptr) { - break; - } - - /* * Attempt to find a client stream based on port. * Note that we do a reverse lookup since the header is * in the form we sent it out. */ - tcpha = (tcpha_t *)((char *)ip6h + hdr_length); + tcpha = (tcpha_t *)((uchar_t *)ip6h + hdr_length); + /* + * With IPMP we need to match across group, which we do + * since we have the upper ill from ira_ill. 
+ */ connp = ipcl_tcp_lookup_reversed_ipv6(ip6h, tcpha, TCPS_LISTEN, ill->ill_phyint->phyint_ifindex, ipst); if (connp == NULL) { goto drop_pkt; } - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, tcp_input, connp, - SQ_FILL, SQTAG_TCP6_INPUT_ICMP_ERR); + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + ira->ira_flags |= IRAF_ICMP_ERROR; + if (IPCL_IS_TCP(connp)) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recvicmp, connp, ira, SQ_FILL, + SQTAG_TCP6_INPUT_ICMP_ERR); + } else { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + ill_t *rill = ira->ira_rill; + + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + } + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } case IPPROTO_SCTP: - /* - * Verify we have at least ICMP_MIN_SCTP_HDR_LEN bytes of - * transport header to get the port information. - */ - if ((uchar_t *)ip6h + hdr_length + ICMP_MIN_SCTP_HDR_LEN > - mp->b_wptr) { - if (!pullupmsg(mp, (uchar_t *)ip6h + hdr_length + - ICMP_MIN_SCTP_HDR_LEN - mp->b_rptr)) { - goto drop_pkt; - } - } - up = (uint16_t *)((uchar_t *)ip6h + hdr_length); + /* Find a SCTP client stream for this packet. */ ((uint16_t *)&ports)[0] = up[1]; ((uint16_t *)&ports)[1] = up[0]; - ip_fanout_sctp(first_mp, inill, (ipha_t *)ip6h, ports, 0, - mctl_present, IP6_NO_IPPOLICY, zoneid); - return; - case IPPROTO_ESP: - case IPPROTO_AH: { - int ipsec_rc; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - /* - * We need a IPSEC_IN in the front to fanout to AH/ESP. 
- * We will re-use the IPSEC_IN if it is already present as - * AH/ESP will not affect any fields in the IPSEC_IN for - * ICMP errors. If there is no IPSEC_IN, allocate a new - * one and attach it in the front. - */ - if (ii != NULL) { - /* - * ip_fanout_proto_again converts the ICMP errors - * that come back from AH/ESP to M_DATA so that - * if it is non-AH/ESP and we do a pullupmsg in - * this function, it would work. Convert it back - * to M_CTL before we send up as this is a ICMP - * error. This could have been generated locally or - * by some router. Validate the inner IPSEC - * headers. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. - */ - ASSERT(ill != NULL); - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - first_mp->b_cont->b_datap->db_type = M_CTL; - } else { - /* - * IPSEC_IN is not present. We attach a ipsec_in - * message and send up to IPSEC for validating - * and removing the IPSEC headers. Clear - * ipsec_in_secure so that when we return - * from IPSEC, we don't mistakenly think that this - * is a secure packet came from the network. - * - * NOTE : ill_index is used by ip_fanout_proto_again - * to locate the ill. 
- */ - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - ASSERT(ill != NULL); - if (first_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; - } - ii = (ipsec_in_t *)first_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - first_mp->b_cont = mp; - mp->b_datap->db_type = M_CTL; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - } + ira->ira_flags |= IRAF_ICMP_ERROR; + ip_fanout_sctp(mp, NULL, &rip6h, ports, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; + return; + case IPPROTO_ESP: + case IPPROTO_AH: if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, 0, zoneid, ipst); + ip_proto_not_sup(mp, ira); return; } if (nexthdr == IPPROTO_ESP) - ipsec_rc = ipsecesp_icmp_error(first_mp); + mp = ipsecesp_icmp_error(mp, ira); else - ipsec_rc = ipsecah_icmp_error(first_mp); - if (ipsec_rc == IPSEC_STATUS_FAILED) + mp = ipsecah_icmp_error(mp, ira); + if (mp == NULL) return; - ip_fanout_proto_again(first_mp, ill, inill, NULL); - return; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if ((uint8_t *)ip6h + hdr_length + - (nexthdr == IPPROTO_ENCAP ? sizeof (ipha_t) : - sizeof (ip6_t)) > mp->b_wptr) { + /* Just in case ipsec didn't preserve the NULL b_cont */ + if (mp->b_cont != NULL) { + if (!pullupmsg(mp, -1)) + goto drop_pkt; + } + + /* + * If succesful, the mp has been modified to not include + * the ESP/AH header so we can fanout to the ULP's icmp + * error handler. + */ + if (mp->b_wptr - mp->b_rptr < IPV6_HDR_LEN) goto drop_pkt; + + ip6h = (ip6_t *)mp->b_rptr; + /* Don't call hdr_length_v6() unless you have to. */ + if (ip6h->ip6_nxt != IPPROTO_ICMPV6) + hdr_length = ip_hdr_length_v6(mp, ip6h); + else + hdr_length = IPV6_HDR_LEN; + + /* Verify the modified message before any further processes. 
*/ + icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return; } - if (nexthdr == IPPROTO_ENCAP || - !IN6_ARE_ADDR_EQUAL( - &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_src, - &ip6h->ip6_src) || - !IN6_ARE_ADDR_EQUAL( - &((ip6_t *)(((uint8_t *)ip6h) + hdr_length))->ip6_dst, - &ip6h->ip6_dst)) { - /* - * For tunnels that have used IPsec protection, - * we need to adjust the MTU to take into account - * the IPsec overhead. - */ - if (ii != NULL) { - icmp6->icmp6_mtu = htonl( - ntohl(icmp6->icmp6_mtu) - - ipsec_in_extra_length(first_mp)); - } - } else { + icmp_inbound_error_fanout_v6(mp, icmp6, ira); + return; + + case IPPROTO_IPV6: { + /* Look for self-encapsulated packets that caused an error */ + ip6_t *in_ip6h; + + in_ip6h = (ip6_t *)((uint8_t *)ip6h + hdr_length); + + if (IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_src, &ip6h->ip6_src) && + IN6_ARE_ADDR_EQUAL(&in_ip6h->ip6_dst, &ip6h->ip6_dst)) { /* * Self-encapsulated case. As in the ipv4 case, * we need to strip the 2nd IP header. Since mp @@ -989,126 +923,124 @@ icmp_inbound_error_fanout_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * the 3rd header + data over the 2nd header. */ uint16_t unused_len; - ip6_t *inner_ip6h = (ip6_t *) - ((uchar_t *)ip6h + hdr_length); /* * Make sure we don't do recursion more than once. */ - if (!ip_hdr_length_nexthdr_v6(mp, inner_ip6h, + if (!ip_hdr_length_nexthdr_v6(mp, in_ip6h, &unused_len, &nexthdrp) || *nexthdrp == IPPROTO_IPV6) { goto drop_pkt; } /* - * We are about to modify the packet. Make a copy if - * someone else has a reference to it. 
- */ - if (DB_REF(mp) > 1) { - mblk_t *mp1; - uint16_t icmp6_offset; - - mp1 = copymsg(mp); - if (mp1 == NULL) { - goto drop_pkt; - } - icmp6_offset = (uint16_t) - ((uchar_t *)icmp6 - mp->b_rptr); - freemsg(mp); - mp = mp1; - - icmp6 = (icmp6_t *)(mp->b_rptr + icmp6_offset); - ip6h = (ip6_t *)&icmp6[1]; - inner_ip6h = (ip6_t *) - ((uchar_t *)ip6h + hdr_length); - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - } - - /* - * Need to set db_type back to M_DATA before - * refeeding mp into this function. - */ - DB_TYPE(mp) = M_DATA; - - /* * Copy the 3rd header + remaining data on top * of the 2nd header. */ - bcopy(inner_ip6h, ip6h, - mp->b_wptr - (uchar_t *)inner_ip6h); + bcopy(in_ip6h, ip6h, mp->b_wptr - (uchar_t *)in_ip6h); /* * Subtract length of the 2nd header. */ mp->b_wptr -= hdr_length; + ip6h = (ip6_t *)mp->b_rptr; + /* Don't call hdr_length_v6() unless you have to. */ + if (ip6h->ip6_nxt != IPPROTO_ICMPV6) + hdr_length = ip_hdr_length_v6(mp, ip6h); + else + hdr_length = IPV6_HDR_LEN; + + /* + * Verify the modified message before any further + * processes. + */ + icmp6 = (icmp6_t *)(&mp->b_rptr[hdr_length]); + if (!icmp_inbound_verify_v6(mp, icmp6, ira)) { + freemsg(mp); + return; + } + /* * Now recurse, and see what I _really_ should be * doing here. */ - icmp_inbound_error_fanout_v6(q, first_mp, - (ip6_t *)mp->b_rptr, icmp6, ill, inill, - mctl_present, zoneid); + icmp_inbound_error_fanout_v6(mp, icmp6, ira); return; } - if (icmp_inbound_iptun_fanout_v6(first_mp, &rip6h, ill, ipst)) + /* FALLTHRU */ + } + case IPPROTO_ENCAP: + if ((connp = ipcl_iptun_classify_v6(&rip6h.ip6_src, + &rip6h.ip6_dst, ipst)) != NULL) { + ira->ira_flags |= IRAF_ICMP_ERROR; + connp->conn_recvicmp(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; + } /* - * No IP tunnel is associated with this error. Perhaps a raw - * socket will want it. 
+ * No IP tunnel is interested, fallthrough and see + * if a raw socket will want it. */ /* FALLTHRU */ default: - ip_fanout_proto_v6(q, first_mp, &rip6h, ill, inill, nexthdr, 0, - IP6_NO_IPPOLICY, mctl_present, zoneid); + ira->ira_flags |= IRAF_ICMP_ERROR; + ASSERT(ira->ira_protocol == nexthdr); + ip_fanout_proto_v6(mp, &rip6h, ira); + ira->ira_flags &= ~IRAF_ICMP_ERROR; return; } /* NOTREACHED */ drop_pkt: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); ip1dbg(("icmp_inbound_error_fanout_v6: drop pkt\n")); - freemsg(first_mp); + freemsg(mp); } /* * Process received IPv6 ICMP Redirect messages. + * Assumes the caller has verified that the headers are in the pulled up mblk. + * Consumes mp. */ /* ARGSUSED */ static void -icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) +icmp_redirect_v6(mblk_t *mp, ip6_t *ip6h, nd_redirect_t *rd, + ip_recv_attr_t *ira) { - ip6_t *ip6h; - uint16_t hdr_length; - nd_redirect_t *rd; - ire_t *ire; - ire_t *prev_ire; + ire_t *ire, *nire; + ire_t *prev_ire = NULL; ire_t *redir_ire; in6_addr_t *src, *dst, *gateway; nd_opt_hdr_t *opt; nce_t *nce; - int nce_flags = 0; + int ncec_flags = 0; int err = 0; boolean_t redirect_to_router = B_FALSE; int len; int optlen; - iulp_t ulp_info = { 0 }; - ill_t *prev_ire_ill; - ipif_t *ipif; + ill_t *ill = ira->ira_rill; + ill_t *rill = ira->ira_rill; ip_stack_t *ipst = ill->ill_ipst; - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_ICMPV6) - hdr_length = ip_hdr_length_v6(mp, ip6h); - else - hdr_length = IPV6_HDR_LEN; + /* + * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill + * and make it be the IPMP upper so avoid being confused by a packet + * addressed to a unicast address on a different ill. 
+ */ + if (IS_UNDER_IPMP(rill)) { + rill = ipmp_ill_hold_ipmp_ill(rill); + if (rill == NULL) { + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); + ip_drop_input("ipv6IfIcmpInBadRedirects - IPMP ill", + mp, ill); + freemsg(mp); + return; + } + ASSERT(rill != ira->ira_rill); + } - rd = (nd_redirect_t *)&mp->b_rptr[hdr_length]; - len = mp->b_wptr - mp->b_rptr - hdr_length; + len = mp->b_wptr - (uchar_t *)rd; src = &ip6h->ip6_src; dst = &rd->nd_rd_dst; gateway = &rd->nd_rd_target; @@ -1121,37 +1053,35 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) (IN6_IS_ADDR_V4MAPPED(dst)) || (IN6_IS_ADDR_MULTICAST(dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - addr/len", mp, ill); + goto fail_redirect; } if (!(IN6_IS_ADDR_LINKLOCAL(gateway) || IN6_ARE_ADDR_EQUAL(gateway, dst))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - bad gateway", + mp, ill); + goto fail_redirect; } - if (len > sizeof (nd_redirect_t)) { - if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], - len - sizeof (nd_redirect_t))) { + optlen = len - sizeof (nd_redirect_t); + if (optlen != 0) { + if (!ndp_verify_optlen((nd_opt_hdr_t *)&rd[1], optlen)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - freemsg(mp); - return; + ip_drop_input("ipv6IfIcmpInBadRedirects - options", + mp, ill); + goto fail_redirect; } } if (!IN6_ARE_ADDR_EQUAL(gateway, dst)) { redirect_to_router = B_TRUE; - nce_flags |= NCE_F_ISROUTER; + ncec_flags |= NCE_F_ISROUTER; + } else { + gateway = dst; /* Add nce for dst */ } - /* ipif will be refreleased afterwards */ - ipif = ipif_get_next_ipif(NULL, ill); - if (ipif == NULL) { - freemsg(mp); - return; - } /* * Verify that the IP source address of the redirect is @@ -1160,10 +1090,11 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * Also, Make sure we had a route for the dest in question and * that route 
was pointing to the old gateway (the source of the * redirect packet.) + * Note: this merely says that there is some IRE which matches that + * gateway; not that the longest match matches that gateway. */ - - prev_ire = ire_route_lookup_v6(dst, 0, src, 0, ipif, NULL, ALL_ZONES, - NULL, MATCH_IRE_GW | MATCH_IRE_ILL | MATCH_IRE_DEFAULT, ipst); + prev_ire = ire_ftable_lookup_v6(dst, 0, src, 0, rill, + ALL_ZONES, NULL, MATCH_IRE_GW | MATCH_IRE_ILL, 0, ipst, NULL); /* * Check that @@ -1171,92 +1102,44 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * old gateway is still directly reachable */ if (prev_ire == NULL || - prev_ire->ire_type == IRE_LOCAL) { + (prev_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) || + (prev_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); - ipif_refrele(ipif); + ip_drop_input("ipv6IfIcmpInBadRedirects - ire", mp, ill); goto fail_redirect; } - prev_ire_ill = ire_to_ill(prev_ire); - ASSERT(prev_ire_ill != NULL); - if (prev_ire_ill->ill_flags & ILLF_NONUD) - nce_flags |= NCE_F_NONUD; - - /* - * Should we use the old ULP info to create the new gateway? From - * a user's perspective, we should inherit the info so that it - * is a "smooth" transition. If we do not do that, then new - * connections going thru the new gateway will have no route metrics, - * which is counter-intuitive to user. From a network point of - * view, this may or may not make sense even though the new gateway - * is still directly connected to us so the route metrics should not - * change much. - * - * But if the old ire_uinfo is not initialized, we do another - * recursive lookup on the dest using the new gateway. There may - * be a route to that. If so, use it to initialize the redirect - * route. - */ - if (prev_ire->ire_uinfo.iulp_set) { - bcopy(&prev_ire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - } else if (redirect_to_router) { - /* - * Only do the following if the redirection is really to - * a router. 
- */ - ire_t *tmp_ire; - ire_t *sire; - tmp_ire = ire_ftable_lookup_v6(dst, 0, gateway, 0, NULL, &sire, - ALL_ZONES, 0, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_GW | MATCH_IRE_DEFAULT), - ipst); - if (sire != NULL) { - bcopy(&sire->ire_uinfo, &ulp_info, sizeof (iulp_t)); - ASSERT(tmp_ire != NULL); - ire_refrele(tmp_ire); - ire_refrele(sire); - } else if (tmp_ire != NULL) { - bcopy(&tmp_ire->ire_uinfo, &ulp_info, - sizeof (iulp_t)); - ire_refrele(tmp_ire); - } - } + ASSERT(prev_ire->ire_ill != NULL); + if (prev_ire->ire_ill->ill_flags & ILLF_NONUD) + ncec_flags |= NCE_F_NONUD; - optlen = mp->b_wptr - mp->b_rptr - hdr_length - sizeof (nd_redirect_t); opt = (nd_opt_hdr_t *)&rd[1]; opt = ndp_get_option(opt, optlen, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* don't match across illgrp */ + err = nce_lookup_then_add_v6(rill, (uchar_t *)&opt[1], /* Link layer address */ - gateway, - &ipv6_all_ones, /* prefix mask */ - &ipv6_all_zeros, /* Mapping mask */ - 0, - nce_flags, - ND_STALE, - &nce); + rill->ill_phys_addr_length, + gateway, ncec_flags, ND_STALE, &nce); switch (err) { case 0: - NCE_REFRELE(nce); + nce_refrele(nce); break; case EEXIST: /* * Check to see if link layer address has changed and - * process the nce_state accordingly. + * process the ncec_state accordingly. 
*/ - ndp_process(nce, (uchar_t *)&opt[1], 0, B_FALSE); - NCE_REFRELE(nce); + nce_process(nce->nce_common, + (uchar_t *)&opt[1], 0, B_FALSE); + nce_refrele(nce); break; default: ip1dbg(("icmp_redirect_v6: NCE create failed %d\n", err)); - ipif_refrele(ipif); goto fail_redirect; } } if (redirect_to_router) { - /* icmp_redirect_ok_v6() must have already verified this */ ASSERT(IN6_IS_ADDR_LINKLOCAL(gateway)); /* @@ -1266,65 +1149,68 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) ire = ire_create_v6( dst, &ipv6_all_ones, /* mask */ - &prev_ire->ire_src_addr_v6, /* source addr */ gateway, /* gateway addr */ - &prev_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* no rfq */ - NULL, /* no stq */ IRE_HOST, - prev_ire->ire_ipif, - NULL, - 0, - 0, + prev_ire->ire_ill, + ALL_ZONES, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), - &ulp_info, - NULL, NULL, ipst); } else { - queue_t *stq; - - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; + ipif_t *ipif; + in6_addr_t gw; /* * Just create an on link entry, i.e. interface route. + * The gateway field is our link-local on the ill. */ + mutex_enter(&rill->ill_lock); + for (ipif = rill->ill_ipif; ipif != NULL; + ipif = ipif->ipif_next) { + if (!(ipif->ipif_state_flags & IPIF_CONDEMNED) && + IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6lcl_addr)) + break; + } + if (ipif == NULL) { + /* We have no link-local address! 
*/ + mutex_exit(&rill->ill_lock); + goto fail_redirect; + } + gw = ipif->ipif_v6lcl_addr; + mutex_exit(&rill->ill_lock); + ire = ire_create_v6( dst, /* gateway == dst */ &ipv6_all_ones, /* mask */ - &prev_ire->ire_src_addr_v6, /* source addr */ - &ipv6_all_zeros, /* gateway addr */ - &prev_ire->ire_max_frag, /* max frag */ - NULL, /* no src nce */ - NULL, /* ire rfq */ - stq, /* ire stq */ - ipif->ipif_net_type, /* IF_[NO]RESOLVER */ - prev_ire->ire_ipif, - &ipv6_all_ones, - 0, - 0, + &gw, /* gateway addr */ + rill->ill_net_type, /* IF_[NO]RESOLVER */ + prev_ire->ire_ill, + ALL_ZONES, (RTF_DYNAMIC | RTF_HOST), - &ulp_info, - NULL, NULL, ipst); } - /* Release reference from earlier ipif_get_next_ipif() */ - ipif_refrele(ipif); - if (ire == NULL) goto fail_redirect; - if (ire_add(&ire, NULL, NULL, NULL, B_FALSE) == 0) { + nire = ire_add(ire); + /* Check if it was a duplicate entry */ + if (nire != NULL && nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + nire = NULL; + } + ire = nire; + if (ire != NULL) { + ire_refrele(ire); /* Held in ire_add */ /* tell routing sockets that we received a redirect */ ip_rts_change_v6(RTM_REDIRECT, &rd->nd_rd_dst, &rd->nd_rd_target, - &ipv6_all_ones, 0, &ire->ire_src_addr_v6, + &ipv6_all_ones, 0, src, (RTF_DYNAMIC | RTF_GATEWAY | RTF_HOST), 0, (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_AUTHOR), ipst); @@ -1334,10 +1220,9 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) * modifying an existing redirect. 
*/ redir_ire = ire_ftable_lookup_v6(dst, 0, src, IRE_HOST, - ire->ire_ipif, NULL, ALL_ZONES, 0, NULL, - (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); - - ire_refrele(ire); /* Held in ire_add_v6 */ + prev_ire->ire_ill, ALL_ZONES, NULL, + (MATCH_IRE_GW | MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, + NULL); if (redir_ire != NULL) { if (redir_ire->ire_flags & RTF_DYNAMIC) @@ -1346,8 +1231,6 @@ icmp_redirect_v6(queue_t *q, mblk_t *mp, ill_t *ill) } } - if (prev_ire->ire_type == IRE_CACHE) - ire_delete(prev_ire); ire_refrele(prev_ire); prev_ire = NULL; @@ -1355,101 +1238,8 @@ fail_redirect: if (prev_ire != NULL) ire_refrele(prev_ire); freemsg(mp); -} - -static ill_t * -ip_queue_to_ill_v6(queue_t *q, ip_stack_t *ipst) -{ - ill_t *ill; - - ASSERT(WR(q) == q); - - if (q->q_next != NULL) { - ill = (ill_t *)q->q_ptr; - if (ILL_CAN_LOOKUP(ill)) - ill_refhold(ill); - else - ill = NULL; - } else { - ill = ill_lookup_on_name(ipif_loopback_name, B_FALSE, B_TRUE, - NULL, NULL, NULL, NULL, NULL, ipst); - } - if (ill == NULL) - ip0dbg(("ip_queue_to_ill_v6: no ill\n")); - return (ill); -} - -/* - * Assigns an appropriate source address to the packet. - * If origdst is one of our IP addresses that use it as the source. - * If the queue is an ill queue then select a source from that ill. - * Otherwise pick a source based on a route lookup back to the origsrc. - * - * src is the return parameter. Returns a pointer to src or NULL if failure. 
- */ -static in6_addr_t * -icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, - in6_addr_t *src, zoneid_t zoneid, ip_stack_t *ipst) -{ - ill_t *ill; - ire_t *ire; - ipif_t *ipif; - - ASSERT(!(wq->q_flag & QREADR)); - if (wq->q_next != NULL) { - ill = (ill_t *)wq->q_ptr; - } else { - ill = NULL; - } - - ire = ire_route_lookup_v6(origdst, 0, 0, (IRE_LOCAL|IRE_LOOPBACK), - NULL, NULL, zoneid, NULL, (MATCH_IRE_TYPE|MATCH_IRE_ZONEONLY), - ipst); - if (ire != NULL) { - /* Destined to one of our addresses */ - *src = *origdst; - ire_refrele(ire); - return (src); - } - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - if (ill == NULL) { - /* What is the route back to the original source? */ - ire = ire_route_lookup_v6(origsrc, 0, 0, 0, - NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - return (NULL); - } - ASSERT(ire->ire_ipif != NULL); - ill = ire->ire_ipif->ipif_ill; - ire_refrele(ire); - } - ipif = ipif_select_source_v6(ill, origsrc, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (ipif != NULL) { - *src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - return (src); - } - /* - * Unusual case - can't find a usable source address to reach the - * original source. Use what in the route to the source. - */ - ire = ire_route_lookup_v6(origsrc, 0, 0, 0, - NULL, NULL, zoneid, NULL, - (MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE), ipst); - if (ire == NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - return (NULL); - } - ASSERT(ire != NULL); - *src = ire->ire_src_addr_v6; - ire_refrele(ire); - return (src); + if (rill != ira->ira_rill) + ill_refrele(rill); } /* @@ -1459,17 +1249,12 @@ icmp_pick_source_v6(queue_t *wq, in6_addr_t *origsrc, in6_addr_t *origdst, * Note: assumes that icmp_pkt_err_ok_v6 has been called to * verify that an icmp error packet can be sent. 
* - * If q is an ill write side queue (which is the case when packets - * arrive from ip_rput) then ip_wput code will ensure that packets to - * link-local destinations are sent out that ill. - * * If v6src_ptr is set use it as a source. Otherwise select a reasonable * source address (see above function). */ static void -icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, - const in6_addr_t *v6src_ptr, boolean_t mctl_present, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_pkt_v6(mblk_t *mp, void *stuff, size_t len, + const in6_addr_t *v6src_ptr, ip_recv_attr_t *ira) { ip6_t *ip6h; in6_addr_t v6dst; @@ -1477,98 +1262,82 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, size_t msg_len; mblk_t *mp1; icmp6_t *icmp6; - ill_t *ill; in6_addr_t v6src; - mblk_t *ipsec_mp; - ipsec_out_t *io; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ip_xmit_attr_t ixas; - ill = ip_queue_to_ill_v6(q, ipst); - if (ill == NULL) { - freemsg(mp); - return; + ip6h = (ip6_t *)mp->b_rptr; + + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + /* + * If the source of the original packet was link-local, then + * make sure we send on the same ill (group) as we received it on. + */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + if (IS_UNDER_IPMP(ill)) + ixas.ixa_scopeid = ill_get_upper_ifindex(ill); + else + ixas.ixa_scopeid = ill->ill_phyint->phyint_ifindex; } - if (mctl_present) { + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* - * If it is : - * - * 1) a IPSEC_OUT, then this is caused by outbound - * datagram originating on this host. IPSEC processing - * may or may not have been done. 
Refer to comments above - * icmp_inbound_error_fanout for details. + * Apply IPsec based on how IPsec was applied to + * the packet that had the error. * - * 2) a IPSEC_IN if we are generating a icmp_message - * for an incoming datagram destined for us i.e called - * from ip_fanout_send_icmp. + * If it was an outbound packet that caused the ICMP + * error, then the caller will have setup the IRA + * appropriately. */ - ipsec_info_t *in; - - ipsec_mp = mp; - mp = ipsec_mp->b_cont; - - in = (ipsec_info_t *)ipsec_mp->b_rptr; - ip6h = (ip6_t *)mp->b_rptr; - - ASSERT(in->ipsec_info_type == IPSEC_OUT || - in->ipsec_info_type == IPSEC_IN); - - if (in->ipsec_info_type == IPSEC_IN) { - /* - * Convert the IPSEC_IN to IPSEC_OUT. - */ - if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } - } else { - ASSERT(in->ipsec_info_type == IPSEC_OUT); - io = (ipsec_out_t *)in; - /* - * Clear out ipsec_out_proc_begin, so we do a fresh - * ire lookup. - */ - io->ipsec_out_proc_begin = B_FALSE; + if (!ipsec_in_to_out(ira, &ixas, mp, NULL, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; } } else { /* * This is in clear. The icmp message we are building - * here should go out in clear. - */ - ipsec_in_t *ii; - ASSERT(mp->b_datap->db_type == M_DATA); - ipsec_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (ipsec_mp == NULL) { - freemsg(mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - /* This is not a secure packet */ - ii->ipsec_in_secure = B_FALSE; - ipsec_mp->b_cont = mp; - ip6h = (ip6_t *)mp->b_rptr; - /* - * Convert the IPSEC_IN to IPSEC_OUT. + * here should go out in clear, independent of our policy. 
*/ - if (!ipsec_in_to_out(ipsec_mp, NULL, ip6h, zoneid)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ill_refrele(ill); - return; - } + ixas.ixa_flags |= IXAF_NO_IPSEC; } - io = (ipsec_out_t *)ipsec_mp->b_rptr; + /* + * If the caller specified the source we use that. + * Otherwise, if the packet was for one of our unicast addresses, make + * sure we respond with that as the source. Otherwise + * have ip_output_simple pick the source address. + */ if (v6src_ptr != NULL) { v6src = *v6src_ptr; } else { - if (icmp_pick_source_v6(q, &ip6h->ip6_src, &ip6h->ip6_dst, - &v6src, zoneid, ipst) == NULL) { - freemsg(ipsec_mp); - ill_refrele(ill); - return; + ire_t *ire; + uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY; + + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) + match_flags |= MATCH_IRE_ILL; + + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, + (IRE_LOCAL|IRE_LOOPBACK), ill, ira->ira_zoneid, NULL, + match_flags, 0, ipst, NULL); + if (ire != NULL) { + v6src = ip6h->ip6_dst; + ire_refrele(ire); + } else { + v6src = ipv6_all_zeros; + ixas.ixa_flags |= IXAF_SET_SOURCE; } } v6dst = ip6h->ip6_src; @@ -1577,34 +1346,28 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, if (msg_len > len_needed) { if (!adjmsg(mp, len_needed - msg_len)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); - freemsg(ipsec_mp); - ill_refrele(ill); + freemsg(mp); return; } msg_len = len_needed; } - mp1 = allocb_tmpl(IPV6_HDR_LEN + len, mp); + mp1 = allocb(IPV6_HDR_LEN + len, BPRI_MED); if (mp1 == NULL) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); - freemsg(ipsec_mp); - ill_refrele(ill); + freemsg(mp); return; } - ill_refrele(ill); mp1->b_cont = mp; mp = mp1; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL && - io->ipsec_out_type == IPSEC_OUT); - ipsec_mp->b_cont = mp; /* - * Set ipsec_out_icmp_loopback so we can let the ICMP messages this + * Set IXAF_TRUSTED_ICMP so we can let the ICMP messages this * node generates be 
accepted in peace by all on-host destinations. * If we do NOT assume that all on-host destinations trust - * self-generated ICMP messages, then rework here, ip.c, and spd.c. - * (Look for ipsec_out_icmp_loopback). + * self-generated ICMP messages, then rework here, ip6.c, and spd.c. + * (Look for IXAF_TRUSTED_ICMP). */ - io->ipsec_out_icmp_loopback = B_TRUE; + ixas.ixa_flags |= IXAF_TRUSTED_ICMP; ip6h = (ip6_t *)mp->b_rptr; mp1->b_wptr = (uchar_t *)ip6h + (IPV6_HDR_LEN + len); @@ -1624,20 +1387,21 @@ icmp_pkt_v6(queue_t *q, mblk_t *mp, void *stuff, size_t len, bcopy(stuff, (char *)icmp6, len); /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output_wire_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; if (icmp6->icmp6_type == ND_REDIRECT) { ip6h->ip6_hops = IPV6_MAX_HOPS; } - /* Send to V6 writeside put routine */ - put(q, ipsec_mp); + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /* * Update the output mib when ICMPv6 packets are sent. */ -static void +void icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutMsgs); @@ -1712,14 +1476,19 @@ icmp_update_out_mib_v6(ill_t *ill, icmp6_t *icmp6) * ICMP error packet should be sent. */ static mblk_t * -icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, - boolean_t llbcast, boolean_t mcast_ok, ip_stack_t *ipst) +icmp_pkt_err_ok_v6(mblk_t *mp, boolean_t mcast_ok, ip_recv_attr_t *ira) { - ip6_t *ip6h; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t llbcast; + ip6_t *ip6h; if (!mp) return (NULL); + /* We view multicast and broadcast as the same.. 
*/ + llbcast = (ira->ira_flags & + (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) != 0; ip6h = (ip6_t *)mp->b_rptr; /* Check if source address uniquely identifies the host */ @@ -1737,17 +1506,8 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, if (mp->b_wptr - mp->b_rptr < len_needed) { if (!pullupmsg(mp, len_needed)) { - ill_t *ill; - - ill = ip_queue_to_ill_v6(q, ipst); - if (ill == NULL) { - BUMP_MIB(&ipst->ips_icmp6_mib, - ipv6IfIcmpInErrors); - } else { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - ill_refrele(ill); - } + BUMP_MIB(ill->ill_icmp6_mib, + ipv6IfIcmpInErrors); freemsg(mp); return (NULL); } @@ -1771,6 +1531,16 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, freemsg(mp); return (NULL); } + /* + * If this is a labeled system, then check to see if we're allowed to + * send a response to this particular sender. If not, then just drop. + */ + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpOutErrors); + freemsg(mp); + return (NULL); + } + if (icmp_err_rate_limit(ipst)) { /* * Only send ICMP error packets every so often. @@ -1784,37 +1554,117 @@ icmp_pkt_err_ok_v6(queue_t *q, mblk_t *mp, } /* + * Called when a packet was sent out the same link that it arrived on. + * Check if it is ok to send a redirect and then send it. + */ +void +ip_send_potential_redirect_v6(mblk_t *mp, ip6_t *ip6h, ire_t *ire, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t *v6targ; + ire_t *src_ire_v6 = NULL; + mblk_t *mp1; + ire_t *nhop_ire = NULL; + + /* + * Don't send a redirect when forwarding a source + * routed packet. 
+ */ + if (ip_source_routed_v6(ip6h, mp, ipst)) + return; + + if (ire->ire_type & IRE_ONLINK) { + /* Target is directly connected */ + v6targ = &ip6h->ip6_dst; + } else { + /* Determine the most specific IRE used to send the packets */ + nhop_ire = ire_nexthop(ire); + if (nhop_ire == NULL) + return; + + /* + * We won't send redirects to a router + * that doesn't have a link local + * address, but will forward. + */ + if (!IN6_IS_ADDR_LINKLOCAL(&nhop_ire->ire_addr_v6)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + ire_refrele(nhop_ire); + return; + } + v6targ = &nhop_ire->ire_addr_v6; + } + src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src, + NULL, NULL, IRE_INTERFACE, ire->ire_ill, ALL_ZONES, NULL, + MATCH_IRE_ILL | MATCH_IRE_TYPE, 0, ipst, NULL); + + if (src_ire_v6 == NULL) { + if (nhop_ire != NULL) + ire_refrele(nhop_ire); + return; + } + + /* + * The source is directly connected. + */ + mp1 = copymsg(mp); + if (mp1 != NULL) + icmp_send_redirect_v6(mp1, v6targ, &ip6h->ip6_dst, ira); + + if (nhop_ire != NULL) + ire_refrele(nhop_ire); + ire_refrele(src_ire_v6); +} + +/* * Generate an ICMPv6 redirect message. * Include target link layer address option if it exits. * Always include redirect header. */ static void -icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, - in6_addr_t *dest, ill_t *ill, boolean_t llbcast) +icmp_send_redirect_v6(mblk_t *mp, in6_addr_t *targetp, in6_addr_t *dest, + ip_recv_attr_t *ira) { nd_redirect_t *rd; nd_opt_rd_hdr_t *rdh; uchar_t *buf; - nce_t *nce = NULL; + ncec_t *ncec = NULL; nd_opt_hdr_t *opt; int len; int ll_opt_len = 0; int max_redir_hdr_data_len; int pkt_len; in6_addr_t *srcp; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * We are called from ip_rput where we could - * not have attached an IPSEC_IN. 
- */ - ASSERT(mp->b_datap->db_type == M_DATA); + ill_t *ill; + boolean_t need_refrele; + ip_stack_t *ipst = ira->ira_ill->ill_ipst; - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, B_FALSE, ipst); + mp = icmp_pkt_err_ok_v6(mp, B_FALSE, ira); if (mp == NULL) return; - nce = ndp_lookup_v6(ill, B_TRUE, targetp, B_FALSE); - if (nce != NULL && nce->nce_state != ND_INCOMPLETE) { + + if (IS_UNDER_IPMP(ira->ira_ill)) { + ill = ipmp_ill_hold_ipmp_ill(ira->ira_ill); + if (ill == NULL) { + ill = ira->ira_ill; + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInBadRedirects); + ip_drop_output("no IPMP ill for sending redirect", + mp, ill); + freemsg(mp); + return; + } + need_refrele = B_TRUE; + } else { + ill = ira->ira_ill; + need_refrele = B_FALSE; + } + + ncec = ncec_lookup_illgrp_v6(ill, targetp); + if (ncec != NULL && ncec->ncec_state != ND_INCOMPLETE && + ncec->ncec_lladdr != NULL) { ll_opt_len = (sizeof (nd_opt_hdr_t) + ill->ill_phys_addr_length + 7)/8 * 8; } @@ -1822,8 +1672,10 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, ASSERT(len % 4 == 0); buf = kmem_alloc(len, KM_NOSLEEP); if (buf == NULL) { - if (nce != NULL) - NCE_REFRELE(nce); + if (ncec != NULL) + ncec_refrele(ncec); + if (need_refrele) + ill_refrele(ill); freemsg(mp); return; } @@ -1836,15 +1688,14 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, rd->nd_rd_dst = *dest; opt = (nd_opt_hdr_t *)(buf + sizeof (nd_redirect_t)); - if (nce != NULL && ll_opt_len != 0) { + if (ncec != NULL && ll_opt_len != 0) { opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; opt->nd_opt_len = ll_opt_len/8; - bcopy((char *)nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), &opt[1], + bcopy((char *)ncec->ncec_lladdr, &opt[1], ill->ill_phys_addr_length); } - if (nce != NULL) - NCE_REFRELE(nce); + if (ncec != NULL) + ncec_refrele(ncec); rdh = (nd_opt_rd_hdr_t *)(buf + sizeof (nd_redirect_t) + ll_opt_len); rdh->nd_opt_rh_type = (uint8_t)ND_OPT_REDIRECTED_HEADER; /* max_redir_hdr_data_len and nd_opt_rh_len must be 
multiple of 8 */ @@ -1862,321 +1713,136 @@ icmp_send_redirect_v6(queue_t *q, mblk_t *mp, in6_addr_t *targetp, } rdh->nd_opt_rh_reserved1 = 0; rdh->nd_opt_rh_reserved2 = 0; - /* ipif_v6src_addr contains the link-local source address */ - srcp = &ill->ill_ipif->ipif_v6src_addr; + /* ipif_v6lcl_addr contains the link-local source address */ + srcp = &ill->ill_ipif->ipif_v6lcl_addr; /* Redirects sent by router, and router is global zone */ - icmp_pkt_v6(q, mp, buf, len, srcp, B_FALSE, GLOBAL_ZONEID, ipst); + ASSERT(ira->ira_zoneid == ALL_ZONES); + ira->ira_zoneid = GLOBAL_ZONEID; + icmp_pkt_v6(mp, buf, len, srcp, ira); kmem_free(buf, len); + if (need_refrele) + ill_refrele(ill); } /* Generate an ICMP time exceeded message. (May be called as writer.) */ void -icmp_time_exceeded_v6(queue_t *q, mblk_t *mp, uint8_t code, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_time_exceeded_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_TIME_EXCEEDED; icmp6.icmp6_code = code; - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP unreachable message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. 
*/ void -icmp_unreachable_v6(queue_t *q, mblk_t *mp, uint8_t code, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_unreachable_v6(mblk_t *mp, uint8_t code, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_DST_UNREACH; icmp6.icmp6_code = code; - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP pkt too big message. + * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ -static void -icmp_pkt2big_v6(queue_t *q, mblk_t *mp, uint32_t mtu, - boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, ip_stack_t *ipst) +void +icmp_pkt2big_v6(mblk_t *mp, uint32_t mtu, boolean_t mcast_ok, + ip_recv_attr_t *ira) { icmp6_t icmp6; - mblk_t *first_mp; - boolean_t mctl_present; - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero(&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PACKET_TOO_BIG; icmp6.icmp6_code = 0; icmp6.icmp6_mtu = htonl(mtu); - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } /* * Generate an ICMP parameter problem message. (May be called as writer.) * 'offset' is the offset from the beginning of the packet in error. 
+ * When called from ip_output side a minimal ip_recv_attr_t needs to be + * constructed by the caller. */ static void -icmp_param_problem_v6(queue_t *q, mblk_t *mp, uint8_t code, - uint32_t offset, boolean_t llbcast, boolean_t mcast_ok, zoneid_t zoneid, - ip_stack_t *ipst) +icmp_param_problem_v6(mblk_t *mp, uint8_t code, uint32_t offset, + boolean_t mcast_ok, ip_recv_attr_t *ira) { icmp6_t icmp6; - boolean_t mctl_present; - mblk_t *first_mp; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - mp = icmp_pkt_err_ok_v6(q, mp, llbcast, mcast_ok, ipst); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); + mp = icmp_pkt_err_ok_v6(mp, mcast_ok, ira); + if (mp == NULL) return; - } + bzero((char *)&icmp6, sizeof (icmp6_t)); icmp6.icmp6_type = ICMP6_PARAM_PROB; icmp6.icmp6_code = code; icmp6.icmp6_pptr = htonl(offset); - icmp_pkt_v6(q, first_mp, &icmp6, sizeof (icmp6_t), NULL, mctl_present, - zoneid, ipst); + icmp_pkt_v6(mp, &icmp6, sizeof (icmp6_t), NULL, ira); } -/* - * This code will need to take into account the possibility of binding - * to a link local address on a multi-homed host, in which case the - * outgoing interface (from the conn) will need to be used when getting - * an ire for the dst. Going through proper outgoing interface and - * choosing the source address corresponding to the outgoing interface - * is necessary when the destination address is a link-local address and - * IPV6_BOUND_IF or IPV6_PKTINFO or scope_id has been set. - * This can happen when active connection is setup; thus ipp pointer - * is passed here from tcp_connect_*() routines, in non-TCP cases NULL - * pointer is passed as ipp pointer. 
- */ -mblk_t * -ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) +void +icmp_param_problem_nexthdr_v6(mblk_t *mp, boolean_t mcast_ok, + ip_recv_attr_t *ira) { - ssize_t len; - int protocol; - struct T_bind_req *tbr; - sin6_t *sin6; - ipa6_conn_t *ac6; - in6_addr_t *v6srcp; - in6_addr_t *v6dstp; - uint16_t lport; - uint16_t fport; - uchar_t *ucp; - int error = 0; - boolean_t local_bind; - ipa6_conn_x_t *acx6; - boolean_t verify_dst; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - error = EINVAL; - goto bad_addr; - } - - ASSERT(connp->conn_af_isv6); - len = mp->b_wptr - mp->b_rptr; - if (len < (sizeof (*tbr) + 1)) { - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "ip_bind_v6: bogus msg, len %ld", len); - goto bad_addr; - } - /* Back up and extract the protocol identifier. */ - mp->b_wptr--; - tbr = (struct T_bind_req *)mp->b_rptr; - /* Reset the message type in preparation for shipping it back. */ - mp->b_datap->db_type = M_PCPROTO; - - protocol = *mp->b_wptr & 0xFF; - connp->conn_ulp = (uint8_t)protocol; - - /* - * Check for a zero length address. This is from a protocol that - * wants to register to receive all packets of its type. - */ - if (tbr->ADDR_length == 0) { - if ((protocol == IPPROTO_TCP || protocol == IPPROTO_SCTP || - protocol == IPPROTO_ESP || protocol == IPPROTO_AH) && - ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head != - NULL) { - /* - * TCP, SCTP, AH, and ESP have single protocol fanouts. - * Do not allow others to bind to these. 
- */ - goto bad_addr; - } - - /* - * - * The udp module never sends down a zero-length address, - * and allowing this on a labeled system will break MLP - * functionality. - */ - if (is_system_labeled() && protocol == IPPROTO_UDP) - goto bad_addr; - - /* Allow ipsec plumbing */ - if ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && - (protocol != IPPROTO_AH) && (protocol != IPPROTO_ESP)) - goto bad_addr; - - connp->conn_srcv6 = ipv6_all_zeros; - ipcl_proto_insert_v6(connp, protocol); - - tbr->PRIM_type = T_BIND_ACK; - return (mp); - } - - /* Extract the address pointer from the message. */ - ucp = (uchar_t *)mi_offset_param(mp, tbr->ADDR_offset, - tbr->ADDR_length); - if (ucp == NULL) { - ip1dbg(("ip_bind_v6: no address\n")); - goto bad_addr; - } - if (!OK_32PTR(ucp)) { - ip1dbg(("ip_bind_v6: unaligned address\n")); - goto bad_addr; - } - - switch (tbr->ADDR_length) { - default: - ip1dbg(("ip_bind_v6: bad address length %d\n", - (int)tbr->ADDR_length)); - goto bad_addr; - - case IPV6_ADDR_LEN: - /* Verification of local address only */ - v6srcp = (in6_addr_t *)ucp; - lport = 0; - local_bind = B_TRUE; - break; - - case sizeof (sin6_t): - sin6 = (sin6_t *)ucp; - v6srcp = &sin6->sin6_addr; - lport = sin6->sin6_port; - local_bind = B_TRUE; - break; - - case sizeof (ipa6_conn_t): - /* - * Verify that both the source and destination addresses - * are valid. - */ - ac6 = (ipa6_conn_t *)ucp; - v6srcp = &ac6->ac6_laddr; - v6dstp = &ac6->ac6_faddr; - fport = ac6->ac6_fport; - /* For raw socket, the local port is not set. */ - lport = ac6->ac6_lport != 0 ? ac6->ac6_lport : - connp->conn_lport; - local_bind = B_FALSE; - /* Always verify destination reachability. */ - verify_dst = B_TRUE; - break; - - case sizeof (ipa6_conn_x_t): - /* - * Verify that the source address is valid. 
- */ - acx6 = (ipa6_conn_x_t *)ucp; - ac6 = &acx6->ac6x_conn; - v6srcp = &ac6->ac6_laddr; - v6dstp = &ac6->ac6_faddr; - fport = ac6->ac6_fport; - lport = ac6->ac6_lport; - local_bind = B_FALSE; - /* - * Client that passed ipa6_conn_x_t to us specifies whether to - * verify destination reachability. - */ - verify_dst = (acx6->ac6x_flags & ACX_VERIFY_DST) != 0; - break; - } - if (local_bind) { - error = ip_proto_bind_laddr_v6(connp, &mp->b_cont, protocol, - v6srcp, lport, tbr->ADDR_length != IPV6_ADDR_LEN); - } else { - error = ip_proto_bind_connected_v6(connp, &mp->b_cont, protocol, - v6srcp, lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr); - } + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint16_t hdr_length; + uint8_t *nexthdrp; + uint32_t offset; + ill_t *ill = ira->ira_ill; - if (error == 0) { - /* Send it home. */ - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - return (mp); + /* Determine the offset of the bad nexthdr value */ + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; } -bad_addr: - ASSERT(error != EINPROGRESS); - if (error > 0) - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - else - mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0); - return (mp); + offset = nexthdrp - mp->b_rptr; + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_NEXTHEADER, offset, + mcast_ok, ira); } /* - * Here address is verified to be a valid local address. - * If the IRE_DB_REQ_TYPE mp is present, a multicast - * address is also considered a valid local address. + * Verify whether or not the IP address is a valid local address. + * Could be a unicast, including one for a down interface. + * If allow_mcbc then a multicast or broadcast address is also + * acceptable. 
+ * * In the case of a multicast address, however, the * upper protocol is expected to reset the src address - * to 0 if it sees an ire with IN6_IS_ADDR_MULTICAST returned so that + * to zero when we return IPVL_MCAST so that * no packets are emitted with multicast address as * source address. * The addresses valid for bind are: @@ -2193,855 +1859,418 @@ bad_addr: * When the address is loopback or multicast, there might be many matching IREs * so bind has to look up based on the zone. */ -/* - * Verify the local IP address. Does not change the conn_t except - * conn_fully_bound and conn_policy_cached. - */ -static int -ip_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - const in6_addr_t *v6src, uint16_t lport, boolean_t fanout_insert) +ip_laddr_t +ip_laddr_verify_v6(const in6_addr_t *v6src, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t allow_mcbc, uint_t scopeid) { - int error = 0; - ire_t *src_ire = NULL; - zoneid_t zoneid; - mblk_t *mp = NULL; - boolean_t ire_requested; - boolean_t ipsec_policy_set; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - if (mpp) - mp = *mpp; - - ire_requested = (mp != NULL && DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (mp != NULL && DB_TYPE(mp) == IPSEC_POLICY_SET); - - /* - * If it was previously connected, conn_fully_bound would have - * been set. - */ - connp->conn_fully_bound = B_FALSE; - - zoneid = IPCL_ZONEID(connp); + ire_t *src_ire; + uint_t match_flags; + ill_t *ill = NULL; - if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { - src_ire = ire_route_lookup_v6(v6src, 0, 0, - 0, NULL, NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - /* - * If an address other than in6addr_any is requested, - * we verify that it is a valid address for bind - * Note: Following code is in if-else-if form for - * readability compared to a condition check. 
- */ - ASSERT(src_ire == NULL || !(src_ire->ire_type & IRE_BROADCAST)); - /* LINTED - statement has no consequent */ - if (IRE_IS_LOCAL(src_ire)) { - /* - * (2) Bind to address of local UP interface - */ - } else if (IN6_IS_ADDR_MULTICAST(v6src)) { - ipif_t *multi_ipif = NULL; - ire_t *save_ire; - /* - * (4) bind to multicast address. - * Fake out the IRE returned to upper - * layer to be a broadcast IRE in - * ip_bind_insert_ire_v6(). - * Pass other information that matches - * the ipif (e.g. the source address). - * conn_multicast_ill is only used for - * IPv6 packets - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill != NULL) { - (void) ipif_lookup_zoneid( - connp->conn_multicast_ill, zoneid, 0, - &multi_ipif); - } else { - /* - * Look for default like - * ip_wput_v6 - */ - multi_ipif = ipif_lookup_group_v6( - &ipv6_unspecified_group, zoneid, ipst); - } - mutex_exit(&connp->conn_lock); - save_ire = src_ire; - src_ire = NULL; - if (multi_ipif == NULL || !ire_requested || - (src_ire = ipif_to_ire_v6(multi_ipif)) == NULL) { - src_ire = save_ire; - error = EADDRNOTAVAIL; - } else { - ASSERT(src_ire != NULL); - if (save_ire != NULL) - ire_refrele(save_ire); - } - if (multi_ipif != NULL) - ipif_refrele(multi_ipif); - } else { - if (!ip_addr_exists_v6(v6src, zoneid, ipst)) { - /* - * Not a valid address for bind - */ - error = EADDRNOTAVAIL; - } - } + ASSERT(!IN6_IS_ADDR_V4MAPPED(v6src)); + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(v6src)); - if (error != 0) { - /* Red Alert! Attempting to be a bogon! 
*/ - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_laddr_v6: bad src" - " address %s\n", AF_INET6, v6src); - } - goto bad_addr; - } + match_flags = MATCH_IRE_ZONEONLY; + if (scopeid != 0) { + ill = ill_lookup_on_ifindex(scopeid, B_TRUE, ipst); + if (ill == NULL) + return (IPVL_BAD); + match_flags |= MATCH_IRE_ILL; } + src_ire = ire_ftable_lookup_v6(v6src, NULL, NULL, 0, + ill, zoneid, NULL, match_flags, 0, ipst, NULL); + if (ill != NULL) + ill_refrele(ill); + /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. + * If an address other than in6addr_any is requested, + * we verify that it is a valid address for bind + * Note: Following code is in if-else-if form for + * readability compared to a condition check. */ - connp->conn_policy_cached = B_FALSE; - - /* If not fanout_insert this was just an address verification */ - if (fanout_insert) { + if (src_ire != NULL && (src_ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK))) { /* - * The addresses have been verified. Time to insert in - * the correct fanout list. 
+ * (2) Bind to address of local UP interface */ - connp->conn_srcv6 = *v6src; - connp->conn_remv6 = ipv6_all_zeros; - connp->conn_lport = lport; - connp->conn_fport = 0; - error = ipcl_bind_insert_v6(connp, protocol, v6src, lport); - } - if (error == 0) { - if (ire_requested) { - if (!ip_bind_get_ire_v6(mpp, src_ire, v6src, NULL, - ipst)) { - error = -1; - goto bad_addr; - } - mp = *mpp; - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; - goto bad_addr; - } - } - } -bad_addr: - if (error != 0) { - if (connp->conn_anon_port) { - (void) tsol_mlp_anon(crgetzone(connp->conn_cred), - connp->conn_mlp_type, connp->conn_ulp, ntohs(lport), - B_FALSE); - } - connp->conn_mlp_type = mlptSingle; - } - - if (src_ire != NULL) ire_refrele(src_ire); + return (IPVL_UNICAST_UP); + } else if (IN6_IS_ADDR_MULTICAST(v6src)) { + /* (4) bind to multicast address. */ + if (src_ire != NULL) + ire_refrele(src_ire); - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. + * Note: caller should take IPV6_MULTICAST_IF + * into account when selecting a real source address. */ - *mpp = NULL; - } - - return (error); -} -int -ip_proto_bind_laddr_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - const in6_addr_t *v6srcp, uint16_t lport, boolean_t fanout_insert) -{ - int error; - boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = protocol; + if (allow_mcbc) + return (IPVL_MCAST); + else + return (IPVL_BAD); + } else { + ipif_t *ipif; - if (IN6_IS_ADDR_V4MAPPED(v6srcp) && !connp->conn_ipv6_v6only) { - /* Bind to IPv4 address */ - ipaddr_t v4src; + /* + * (3) Bind to address of local DOWN interface? 
+ * (ipif_lookup_addr() looks up all interfaces + * but we do not get here for UP interfaces + * - case (2) above) + */ + if (src_ire != NULL) + ire_refrele(src_ire); - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); + ipif = ipif_lookup_addr_v6(v6src, NULL, zoneid, ipst); + if (ipif == NULL) + return (IPVL_BAD); - error = ip_bind_laddr_v4(connp, mpp, protocol, v4src, lport, - fanout_insert); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_FALSE; - } else { - if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - error = 0; - goto bad_addr; + /* Not a useful source? */ + if (ipif->ipif_flags & (IPIF_NOLOCAL | IPIF_ANYCAST)) { + ipif_refrele(ipif); + return (IPVL_BAD); } - error = ip_bind_laddr_v6(connp, mpp, protocol, v6srcp, - lport, fanout_insert); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; + ipif_refrele(ipif); + return (IPVL_UNICAST_DOWN); } - - if (orig_pkt_isv6 != connp->conn_pkt_isv6) - ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); - return (0); - -bad_addr: - if (error < 0) - error = -TBADADDR; - return (error); } /* - * Verify that both the source and destination addresses - * are valid. If verify_dst, then destination address must also be reachable, - * i.e. have a route. Protocols like TCP want this. Tunnels do not. - * It takes ip6_pkt_t * as one of the arguments to determine correct - * source address when IPV6_PKTINFO or scope_id is set along with a link-local - * destination address. Note that parameter ipp is only useful for TCP connect - * when scope_id is set or IPV6_PKTINFO option is set with an ifindex. For all - * non-TCP cases, it is NULL and for all other tcp cases it is not useful. + * Verify that both the source and destination addresses are valid. If + * IPDF_VERIFY_DST is not set, then the destination address may be unreachable, + * i.e. have no route to it. Protocols like TCP want to verify destination + * reachability, while tunnels do not. 
+ * + * Determine the route, the interface, and (optionally) the source address + * to use to reach a given destination. + * Note that we allow connect to broadcast and multicast addresses when + * IPDF_ALLOW_MCBC is set. + * first_hop and dst_addr are normally the same, but if source routing + * they will differ; in that case the first_hop is what we'll use for the + * routing lookup but the dce and label checks will be done on dst_addr, + * + * If uinfo is set, then we fill in the best available information + * we have for the destination. This is based on (in priority order) any + * metrics and path MTU stored in a dce_t, route metrics, and finally the + * ill_mtu. + * + * Tsol note: If we have a source route then dst_addr != firsthop. But we + * always do the label check on dst_addr. * + * Assumes that the caller has set ixa_scopeid for link-local communication. */ int -ip_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - in6_addr_t *v6src, uint16_t lport, const in6_addr_t *v6dst, - ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, - boolean_t verify_dst, cred_t *cr) +ip_set_destination_v6(in6_addr_t *src_addrp, const in6_addr_t *dst_addr, + const in6_addr_t *firsthop, ip_xmit_attr_t *ixa, iulp_t *uinfo, + uint32_t flags, uint_t mac_mode) { - ire_t *src_ire; - ire_t *dst_ire; + ire_t *ire; int error = 0; - ire_t *sire = NULL; - ire_t *md_dst_ire = NULL; - ill_t *md_ill = NULL; - ill_t *dst_ill = NULL; - ipif_t *src_ipif = NULL; - zoneid_t zoneid; - boolean_t ill_held = B_FALSE; - mblk_t *mp = NULL; - boolean_t ire_requested = B_FALSE; - boolean_t ipsec_policy_set = B_FALSE; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - ts_label_t *tsl = NULL; - cred_t *effective_cred = NULL; - - if (mpp) - mp = *mpp; - - if (mp != NULL) { - ire_requested = (DB_TYPE(mp) == IRE_DB_REQ_TYPE); - ipsec_policy_set = (DB_TYPE(mp) == IPSEC_POLICY_SET); - } - - src_ire = dst_ire = NULL; - /* - * If we never got a disconnect before, clear it now. 
- */ - connp->conn_fully_bound = B_FALSE; + in6_addr_t setsrc; /* RTF_SETSRC */ + zoneid_t zoneid = ixa->ixa_zoneid; /* Honors SO_ALLZONES */ + ip_stack_t *ipst = ixa->ixa_ipst; + dce_t *dce; + uint_t pmtu; + uint_t ifindex; + uint_t generation; + nce_t *nce; + ill_t *ill = NULL; + boolean_t multirt = B_FALSE; + + ASSERT(!IN6_IS_ADDR_V4MAPPED(dst_addr)); - zoneid = connp->conn_zoneid; + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); /* - * Check whether Trusted Solaris policy allows communication with this - * host, and pretend that the destination is unreachable if not. - * - * This is never a problem for TCP, since that transport is known to - * compute the label properly as part of the tcp_rput_other T_BIND_ACK - * handling. If the remote is unreachable, it will be detected at that - * point, so there's no reason to check it here. - * - * Note that for sendto (and other datagram-oriented friends), this - * check is done as part of the data path label computation instead. - * The check here is just to make non-TCP connect() report the right - * error. + * We never send to zero; the ULPs map it to the loopback address. + * We can't allow it since we use zero to mean unitialized in some + * places. */ - if (is_system_labeled() && !IPCL_IS_TCP(connp)) { - if ((error = tsol_check_dest(cr, v6dst, IPV6_VERSION, - connp->conn_mac_mode, &effective_cred)) != 0) { - if (ip_debug > 2) { - pr_addr_dbg( - "ip_bind_connected: no label for dst %s\n", - AF_INET6, v6dst); - } - goto bad_addr; - } + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(dst_addr)); - /* - * tsol_check_dest() may have created a new cred with - * a modified security label. Use that cred if it exists - * for ire lookups. 
- */ - if (effective_cred == NULL) { - tsl = crgetlabel(cr); - } else { - tsl = crgetlabel(effective_cred); + if (is_system_labeled()) { + ts_label_t *tsl = NULL; + + error = tsol_check_dest(ixa->ixa_tsl, dst_addr, IPV6_VERSION, + mac_mode, (flags & IPDF_ZONE_IS_GLOBAL) != 0, &tsl); + if (error != 0) + return (error); + if (tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, tsl); } } - if (IN6_IS_ADDR_MULTICAST(v6dst)) { - ipif_t *ipif; + setsrc = ipv6_all_zeros; + /* + * Select a route; For IPMP interfaces, we would only select + * a "hidden" route (i.e., going through a specific under_ill) + * if ixa_ifindex has been specified. + */ + ire = ip_select_route_v6(firsthop, ixa, &generation, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) + goto bad_addr; + /* + * ire can't be a broadcast or multicast unless IPDF_ALLOW_MCBC is set. + * If IPDF_VERIFY_DST is set, the destination must be reachable. + * Otherwise the destination needn't be reachable. + * + * If we match on a reject or black hole, then we've got a + * local failure. May as well fail out the connect() attempt, + * since it's never going to succeed. + */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { /* - * Use an "emulated" IRE_BROADCAST to tell the transport it - * is a multicast. - * Pass other information that matches - * the ipif (e.g. the source address). + * If we're verifying destination reachability, we always want + * to complain here. 
* - * conn_multicast_ill is only used for IPv6 packets - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill != NULL) { - (void) ipif_lookup_zoneid(connp->conn_multicast_ill, - zoneid, 0, &ipif); - } else { - /* Look for default like ip_wput_v6 */ - ipif = ipif_lookup_group_v6(v6dst, zoneid, ipst); - } - mutex_exit(&connp->conn_lock); - if (ipif == NULL || ire_requested || - (dst_ire = ipif_to_ire_v6(ipif)) == NULL) { - if (ipif != NULL) - ipif_refrele(ipif); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad " - "connected multicast %s\n", AF_INET6, - v6dst); - } - error = ENETUNREACH; - goto bad_addr; - } - if (ipif != NULL) - ipif_refrele(ipif); - } else { - dst_ire = ire_route_lookup_v6(v6dst, NULL, NULL, 0, - NULL, &sire, zoneid, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR, - ipst); - /* - * We also prevent ire's with src address INADDR_ANY to - * be used, which are created temporarily for - * sending out packets from endpoints that have - * conn_unspec_src set. + * If we're not verifying destination reachability but the + * destination has a route, we still want to fail on the + * temporary address and broadcast address tests. + * + * In both cases do we let the code continue so some reasonable + * information is returned to the caller. That enables the + * caller to use (and even cache) the IRE. conn_ip_ouput will + * use the generation mismatch path to check for the unreachable + * case thereby avoiding any specific check in the main path. */ - if (dst_ire == NULL || - (dst_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || - IN6_IS_ADDR_UNSPECIFIED(&dst_ire->ire_src_addr_v6)) { + ASSERT(generation == IRE_GENERATION_VERIFY); + if (flags & IPDF_VERIFY_DST) { /* - * When verifying destination reachability, we always - * complain. - * - * When not verifying destination reachability but we - * found an IRE, i.e. 
the destination is reachable, - * then the other tests still apply and we complain. + * Set errno but continue to set up ixa_ire to be + * the RTF_REJECT|RTF_BLACKHOLE IRE. + * That allows callers to use ip_output to get an + * ICMP error back. */ - if (verify_dst || (dst_ire != NULL)) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad" - " connected dst %s\n", AF_INET6, - v6dst); - } - if (dst_ire == NULL || - !(dst_ire->ire_type & IRE_HOST)) { - error = ENETUNREACH; - } else { - error = EHOSTUNREACH; - } - goto bad_addr; - } + if (!(ire->ire_type & IRE_HOST)) + error = ENETUNREACH; + else + error = EHOSTUNREACH; } } - /* - * If the app does a connect(), it means that it will most likely - * send more than 1 packet to the destination. It makes sense - * to clear the temporary flag. - */ - if (dst_ire != NULL && dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb_t *irb = dst_ire->ire_bucket; - - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * We need to recheck for IRE_MARK_TEMPORARY after acquiring - * the lock in order to guarantee irb_tmp_ire_cnt. 
- */ - if (dst_ire->ire_marks & IRE_MARK_TEMPORARY) { - dst_ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + if ((ire->ire_type & (IRE_BROADCAST|IRE_MULTICAST)) && + !(flags & IPDF_ALLOW_MCBC)) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + generation = IRE_GENERATION_VERIFY; + error = ENETUNREACH; } - ASSERT(dst_ire == NULL || dst_ire->ire_ipversion == IPV6_VERSION); + /* Cache things */ + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = generation; /* - * See if we should notify ULP about MDT; we do this whether or not - * ire_requested is TRUE, in order to handle active connects; MDT - * eligibility tests for passive connects are handled separately - * through tcp_adapt_ire(). We do this before the source address - * selection, because dst_ire may change after a call to - * ipif_select_source_v6(). This is a best-effort check, as the - * packet for this connection may not actually go through - * dst_ire->ire_stq, and the exact IRE can only be known after - * calling ip_newroute_v6(). This is why we further check on the - * IRE during Multidata packet transmission in tcp_multisend(). + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v6 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. 
*/ - if (ipst->ips_ip_multidata_outbound && !ipsec_policy_set && - dst_ire != NULL && - !(dst_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST)) && - (md_ill = ire_to_ill(dst_ire), md_ill != NULL) && - ILL_MDT_CAPABLE(md_ill)) { - md_dst_ire = dst_ire; - IRE_REFHOLD(md_dst_ire); - } - - if (dst_ire != NULL && - dst_ire->ire_type == IRE_LOCAL && - dst_ire->ire_zoneid != zoneid && - dst_ire->ire_zoneid != ALL_ZONES) { - src_ire = ire_ftable_lookup_v6(v6dst, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE, ipst); - if (src_ire == NULL) { - error = EHOSTUNREACH; - goto bad_addr; - } else if (src_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - if (!(src_ire->ire_type & IRE_HOST)) - error = ENETUNREACH; - else - error = EHOSTUNREACH; - goto bad_addr; - } - if (IN6_IS_ADDR_UNSPECIFIED(v6src)) { - src_ipif = src_ire->ire_ipif; - ipif_refhold(src_ipif); - *v6src = src_ipif->ipif_v6lcl_addr; - } - ire_refrele(src_ire); - src_ire = NULL; - } else if (IN6_IS_ADDR_UNSPECIFIED(v6src) && dst_ire != NULL) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - *v6src = sire->ire_src_addr_v6; - ire_refrele(dst_ire); - dst_ire = sire; - sire = NULL; - } else if (dst_ire->ire_type == IRE_CACHE && - (dst_ire->ire_flags & RTF_SETSRC)) { - ASSERT(dst_ire->ire_zoneid == zoneid || - dst_ire->ire_zoneid == ALL_ZONES); - *v6src = dst_ire->ire_src_addr_v6; + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Get an nce to cache. */ + nce = ire_to_nce(ire, NULL, firsthop); + if (nce == NULL) { + /* Allocation failure? */ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } else { - /* - * Pick a source address so that a proper inbound load - * spreading would happen. 
Use dst_ill specified by the - * app. when socket option or scopeid is set. - */ - int err; - - if (ipp != NULL && ipp->ipp_ifindex != 0) { - uint_t if_index; - - /* - * Scope id or IPV6_PKTINFO - */ - - if_index = ipp->ipp_ifindex; - dst_ill = ill_lookup_on_ifindex( - if_index, B_TRUE, NULL, NULL, NULL, NULL, - ipst); - if (dst_ill == NULL) { - ip1dbg(("ip_bind_connected_v6:" - " bad ifindex %d\n", if_index)); - error = EADDRNOTAVAIL; - goto bad_addr; - } - ill_held = B_TRUE; - } else if (connp->conn_outgoing_ill != NULL) { - /* - * For IPV6_BOUND_IF socket option, - * conn_outgoing_ill should be set - * already in TCP or UDP/ICMP. - */ - dst_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_bind_connected_v6:" - "no ill for bound_if\n")); - error = EADDRNOTAVAIL; - goto bad_addr; - } - ill_held = B_TRUE; - } else if (dst_ire->ire_stq != NULL) { - /* No need to hold ill here */ - dst_ill = (ill_t *)dst_ire->ire_stq->q_ptr; - } else { - /* No need to hold ill here */ - dst_ill = dst_ire->ire_ipif->ipif_ill; - } - if (ip6_asp_can_lookup(ipst)) { - src_ipif = ipif_select_source_v6(dst_ill, - v6dst, B_FALSE, connp->conn_src_preferences, - zoneid); - ip6_asp_table_refrele(ipst); - if (src_ipif == NULL) { - pr_addr_dbg("ip_bind_connected_v6: " - "no usable source address for " - "connection to %s\n", - AF_INET6, v6dst); - error = EADDRNOTAVAIL; - goto bad_addr; - } - *v6src = src_ipif->ipif_v6lcl_addr; - } else { - error = EADDRNOTAVAIL; - goto bad_addr; - } + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; } } /* - * We do ire_route_lookup_v6() here (and not an interface lookup) - * as we assert that v6src should only come from an - * UP interface for hard binding. + * We use use ire_nexthop_ill to avoid the under ipmp + * interface for source address selection. 
Note that for ipmp + probe packets, ixa_ifindex would have been specified, and + the ip_select_route() invocation would have picked an ire + with ire_ill pointing at an under interface. */ - src_ire = ire_route_lookup_v6(v6src, 0, 0, 0, NULL, - NULL, zoneid, NULL, MATCH_IRE_ZONEONLY, ipst); - - /* src_ire must be a local|loopback */ - if (!IRE_IS_LOCAL(src_ire)) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_bind_connected_v6: bad " - "connected src %s\n", AF_INET6, v6src); - } - error = EADDRNOTAVAIL; - goto bad_addr; - } + ill = ire_nexthop_ill(ire); /* * If the source address is a loopback address, the * destination had best be local or multicast. - * The transports that can't handle multicast will reject - * those addresses. + * If we are sending to an IRE_LOCAL using a loopback source then + * it had better be the same zoneid. */ - if (src_ire->ire_type == IRE_LOOPBACK && - !(IRE_IS_LOCAL(dst_ire) || IN6_IS_ADDR_MULTICAST(v6dst) || - IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst))) { - ip1dbg(("ip_bind_connected_v6: bad connected loopback\n")); - error = -1; - goto bad_addr; + if (IN6_IS_ADDR_LOOPBACK(src_addrp)) { + if ((ire->ire_type & IRE_LOCAL) && ire->ire_zoneid != zoneid) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } + if (!(ire->ire_type & (IRE_LOOPBACK|IRE_LOCAL|IRE_MULTICAST))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; + goto bad_addr; + } } - /* - * Allow setting new policies. For example, disconnects come - * down as ipa_t bind. As we would have set conn_policy_cached - * to B_TRUE before, we should set it to B_FALSE, so that policy - * can change after the disconnect. - */ - connp->conn_policy_cached = B_FALSE; /* - * The addresses have been verified. Initialize the conn - * before calling the policy as they expect the conns - * initialized. + * Does the caller want us to pick a source address?
 */ - connp->conn_srcv6 = *v6src; - connp->conn_remv6 = *v6dst; - connp->conn_lport = lport; - connp->conn_fport = fport; - - ASSERT(!(ipsec_policy_set && ire_requested)); - if (ire_requested) { - iulp_t *ulp_info = NULL; + if (flags & IPDF_SELECT_SRC) { + in6_addr_t src_addr; + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src_addr = ipv6_loopback; + /* Make sure we look for a better source address */ + generation = SRC_GENERATION_VERIFY; + } else { + error = ip_select_source_v6(ill, &setsrc, dst_addr, + zoneid, ipst, B_FALSE, ixa->ixa_src_preferences, + &src_addr, &generation, NULL); + if (error != 0) { + ire = NULL; /* Stored in ixa_ire */ + goto bad_addr; + } + } /* - * Note that sire will not be NULL if this is an off-link - * connection and there is not cache for that dest yet. - * - * XXX Because of an existing bug, if there are multiple - * default routes, the IRE returned now may not be the actual - * default route used (default routes are chosen in a - * round robin fashion). So if the metrics for different - * default routes are different, we may return the wrong - * metrics. This will not be a problem if the existing - * bug is fixed. + * We allow the source address to be down. + * However, we check that we don't use the loopback address + * as a source when sending out on the wire. */ - if (sire != NULL) - ulp_info = &(sire->ire_uinfo); - - if (!ip_bind_get_ire_v6(mpp, dst_ire, v6dst, ulp_info, - ipst)) { - error = -1; - goto bad_addr; - } - } else if (ipsec_policy_set) { - if (!ip_bind_ipsec_policy_set(connp, mp)) { - error = -1; + if (IN6_IS_ADDR_LOOPBACK(&src_addr) && + !(ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_MULTICAST)) && + !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + ire = NULL; /* Stored in ixa_ire */ + error = EADDRNOTAVAIL; goto bad_addr; } + + *src_addrp = src_addr; + ixa->ixa_src_generation = generation; } /* - * Cache IPsec policy in this conn.
If we have per-socket policy, - * we'll cache that. If we don't, we'll inherit global policy. - * - * We can't insert until the conn reflects the policy. Note that - * conn_policy_cached is set by ipsec_conn_cache_policy() even for - * connections where we don't have a policy. This is to prevent - * global policy lookups in the inbound path. - * - * If we insert before we set conn_policy_cached, - * CONN_INBOUND_POLICY_PRESENT_V6() check can still evaluate true - * because global policy cound be non-empty. We normally call - * ipsec_check_policy() for conn_policy_cached connections only if - * conn_in_enforce_policy is set. But in this case, - * conn_policy_cached can get set anytime since we made the - * CONN_INBOUND_POLICY_PRESENT_V6() check and ipsec_check_policy() - * is called, which will make the above assumption false. Thus, we - * need to insert after we set conn_policy_cached. + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. */ - if ((error = ipsec_conn_cache_policy(connp, B_FALSE)) != 0) - goto bad_addr; + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } - /* If not fanout_insert this was just an address verification */ - if (fanout_insert) { - /* - * The addresses have been verified. Time to insert in - * the correct fanout list. - */ - error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst, - connp->conn_ports, - IPCL_IS_TCP(connp) ? 
connp->conn_tcp->tcp_bound_if : 0); + + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(dst_addr)) { + /* If we are creating a DCE we'd better have an ifindex */ + if (ill != NULL) + ifindex = ill->ill_phyint->phyint_ifindex; + else + flags &= ~IPDF_UNIQUE_DCE; } - if (error == 0) { - connp->conn_fully_bound = B_TRUE; - /* - * Our initial checks for MDT have passed; the IRE is not - * LOCAL/LOOPBACK/BROADCAST, and the link layer seems to - * be supporting MDT. Pass the IRE, IPC and ILL into - * ip_mdinfo_return(), which performs further checks - * against them and upon success, returns the MDT info - * mblk which we will attach to the bind acknowledgment. - */ - if (md_dst_ire != NULL) { - mblk_t *mdinfo_mp; - - ASSERT(md_ill != NULL); - ASSERT(md_ill->ill_mdt_capab != NULL); - if ((mdinfo_mp = ip_mdinfo_return(md_dst_ire, connp, - md_ill->ill_name, md_ill->ill_mdt_capab)) != NULL) { - if (mp == NULL) { - *mpp = mdinfo_mp; - } else { - linkb(mp, mdinfo_mp); - } - } + + if (flags & IPDF_UNIQUE_DCE) { + /* Fallback to the default dce if allocation fails */ + dce = dce_lookup_and_add_v6(dst_addr, ifindex, ipst); + if (dce != NULL) { + generation = dce->dce_generation; + } else { + dce = dce_lookup_v6(dst_addr, ifindex, ipst, + &generation); } + } else { + dce = dce_lookup_v6(dst_addr, ifindex, ipst, &generation); } -bad_addr: - if (ipsec_policy_set) { - ASSERT(mp != NULL); - freeb(mp); - /* - * As of now assume that nothing else accompanies - * IPSEC_POLICY_SET. 
- */ - *mpp = NULL; - } -refrele_and_quit: - if (src_ire != NULL) - IRE_REFRELE(src_ire); - if (dst_ire != NULL) - IRE_REFRELE(dst_ire); - if (sire != NULL) - IRE_REFRELE(sire); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (md_dst_ire != NULL) - IRE_REFRELE(md_dst_ire); - if (ill_held && dst_ill != NULL) - ill_refrele(dst_ill); - if (effective_cred != NULL) - crfree(effective_cred); - return (error); -} - -/* ARGSUSED */ -int -ip_proto_bind_connected_v6(conn_t *connp, mblk_t **mpp, uint8_t protocol, - in6_addr_t *v6srcp, uint16_t lport, const in6_addr_t *v6dstp, - ip6_pkt_t *ipp, uint16_t fport, boolean_t fanout_insert, - boolean_t verify_dst, cred_t *cr) -{ - int error = 0; - boolean_t orig_pkt_isv6 = connp->conn_pkt_isv6; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = protocol; + ASSERT(dce != NULL); + if (ixa->ixa_dce != NULL) + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = generation; - /* For raw socket, the local port is not set. */ - lport = lport != 0 ? lport : connp->conn_lport; + /* + * Note that IPv6 multicast supports PMTU discovery unlike IPv4 + * multicast. But pmtu discovery is only enabled for connected + * sockets in general. + */ /* - * Bind to local and remote address. Local might be - * unspecified in which case it will be extracted from - * ire_src_addr_v6 + * Set initial value for fragmentation limit. Either conn_ip_output + * or ULP might update it when there are routing changes. + * Handles a NULL ixa_ire->ire_ill or a NULL ixa_nce for RTF_REJECT. */ - if (IN6_IS_ADDR_V4MAPPED(v6dstp) && !connp->conn_ipv6_v6only) { - /* Connect to IPv4 address */ - ipaddr_t v4src; - ipaddr_t v4dst; - - /* Is the source unspecified or mapped?
*/ - if (!IN6_IS_ADDR_V4MAPPED(v6srcp) && - !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { - ip1dbg(("ip_proto_bind_connected_v6: " - "dst is mapped, but not the src\n")); - goto bad_addr; - } - IN6_V4MAPPED_TO_IPADDR(v6srcp, v4src); - IN6_V4MAPPED_TO_IPADDR(v6dstp, v4dst); + pmtu = ip_get_pmtu(ixa); + ixa->ixa_fragsize = pmtu; + /* Make sure ixa_fragsize and ixa_pmtu remain identical */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) + ixa->ixa_pmtu = pmtu; - /* Always verify destination reachability. */ - error = ip_bind_connected_v4(connp, mpp, protocol, &v4src, - lport, v4dst, fport, B_TRUE, B_TRUE, cr); - if (error != 0) - goto bad_addr; - IN6_IPADDR_TO_V4MAPPED(v4src, v6srcp); - connp->conn_pkt_isv6 = B_FALSE; - } else if (IN6_IS_ADDR_V4MAPPED(v6srcp)) { - ip1dbg(("ip_proto_bind_connected_v6: " - "src is mapped, but not the dst\n")); - goto bad_addr; - } else { - error = ip_bind_connected_v6(connp, mpp, protocol, v6srcp, - lport, v6dstp, ipp, fport, B_TRUE, verify_dst, cr); - if (error != 0) - goto bad_addr; - connp->conn_pkt_isv6 = B_TRUE; - } + /* + * Extract information useful for some transports. + * First we look for DCE metrics. Then we take what we have in + * the metrics in the route, where the offlink is used if we have + * one. + */ + if (uinfo != NULL) { + bzero(uinfo, sizeof (*uinfo)); - if (orig_pkt_isv6 != connp->conn_pkt_isv6) - ip_setpktversion(connp, connp->conn_pkt_isv6, B_TRUE, ipst); + if (dce->dce_flags & DCEF_UINFO) + *uinfo = dce->dce_uinfo; - /* Send it home. */ - return (0); + rts_merge_metrics(uinfo, &ire->ire_metrics); -bad_addr: - if (error == 0) - error = -TBADADDR; - return (error); -} + /* Allow ire_metrics to decrease the path MTU from above */ + if (uinfo->iulp_mtu == 0 || uinfo->iulp_mtu > pmtu) + uinfo->iulp_mtu = pmtu; -/* - * Get the ire in *mpp. Returns false if it fails (due to lack of space). - * Makes the IRE be IRE_BROADCAST if dst is a multicast address. 
- */ -/* ARGSUSED4 */ -static boolean_t -ip_bind_get_ire_v6(mblk_t **mpp, ire_t *ire, const in6_addr_t *dst, - iulp_t *ulp_info, ip_stack_t *ipst) -{ - mblk_t *mp = *mpp; - ire_t *ret_ire; + uinfo->iulp_localnet = (ire->ire_type & IRE_ONLINK) != 0; + uinfo->iulp_loopback = (ire->ire_type & IRE_LOOPBACK) != 0; + uinfo->iulp_local = (ire->ire_type & IRE_LOCAL) != 0; + } - ASSERT(mp != NULL); + if (ill != NULL) + ill_refrele(ill); - if (ire != NULL) { - /* - * mp initialized above to IRE_DB_REQ_TYPE - * appended mblk. Its <upper protocol>'s - * job to make sure there is room. - */ - if ((mp->b_datap->db_lim - mp->b_rptr) < sizeof (ire_t)) - return (B_FALSE); + return (error); - mp->b_datap->db_type = IRE_DB_TYPE; - mp->b_wptr = mp->b_rptr + sizeof (ire_t); - bcopy(ire, mp->b_rptr, sizeof (ire_t)); - ret_ire = (ire_t *)mp->b_rptr; - if (IN6_IS_ADDR_MULTICAST(dst) || - IN6_IS_ADDR_V4MAPPED_CLASSD(dst)) { - ret_ire->ire_type = IRE_BROADCAST; - ret_ire->ire_addr_v6 = *dst; - } - if (ulp_info != NULL) { - bcopy(ulp_info, &(ret_ire->ire_uinfo), - sizeof (iulp_t)); - } - ret_ire->ire_mp = mp; - } else { - /* - * No IRE was found. Remove IRE mblk. - */ - *mpp = mp->b_cont; - freeb(mp); - } - return (B_TRUE); -} +bad_addr: + if (ire != NULL) + ire_refrele(ire); -/* - * Add an ip6i_t header to the front of the mblk. - * Inline if possible else allocate a separate mblk containing only the ip6i_t. - * Returns NULL if allocation fails (and frees original message). - * Used in outgoing path when going through ip_newroute_*v6(). - * Used in incoming path to pass ifindex to transports. 
- */ -mblk_t * -ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) -{ - mblk_t *mp1; - ip6i_t *ip6i; - ip6_t *ip6h; + if (ill != NULL) + ill_refrele(ill); - ip6h = (ip6_t *)mp->b_rptr; - ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t)); - if ((uchar_t *)ip6i < mp->b_datap->db_base || - mp->b_datap->db_ref > 1) { - mp1 = allocb(sizeof (ip6i_t), BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return (NULL); - } - mp1->b_wptr = mp1->b_rptr = mp1->b_datap->db_lim; - mp1->b_cont = mp; - mp = mp1; - ip6i = (ip6i_t *)(mp->b_rptr - sizeof (ip6i_t)); - } - mp->b_rptr = (uchar_t *)ip6i; - ip6i->ip6i_vcf = ip6h->ip6_vcf; - ip6i->ip6i_nxt = IPPROTO_RAW; - if (ill != NULL) { - ip6i->ip6i_flags = IP6I_IFINDEX; - /* - * If `ill' is in an IPMP group, make sure we use the IPMP - * interface index so that e.g. IPV6_RECVPKTINFO will get the - * IPMP interface index and not an underlying interface index. - */ - if (IS_UNDER_IPMP(ill)) - ip6i->ip6i_ifindex = ipmp_ill_get_ipmp_ifindex(ill); - else - ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; - } else { - ip6i->ip6i_flags = 0; + /* + * Make sure we don't leave an unreachable ixa_nce in place + * since ip_select_route is used when we unplumb i.e., remove + * references on ixa_ire, ixa_nce, and ixa_dce. + */ + nce = ixa->ixa_nce; + if (nce != NULL && nce->nce_is_condemned) { + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; } - ip6i->ip6i_nexthop = *dst; - return (mp); + + return (error); } /* @@ -3051,53 +2280,29 @@ ip_add_info_v6(mblk_t *mp, ill_t *ill, const in6_addr_t *dst) * of any incoming packets. * * Zones notes: - * Packets will be distributed to streams in all zones. This is really only + * Packets will be distributed to conns in all zones. This is really only * useful for ICMPv6 as only applications in the global zone can create raw * sockets for other protocols. 
*/ -static void -ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, - ill_t *inill, uint8_t nexthdr, uint_t nexthdr_offset, uint_t flags, - boolean_t mctl_present, zoneid_t zoneid) +void +ip_fanout_proto_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) { - queue_t *rq; - mblk_t *mp1, *first_mp1; - in6_addr_t dst = ip6h->ip6_dst; - in6_addr_t src = ip6h->ip6_src; - mblk_t *first_mp = mp; - boolean_t secure, shared_addr; - conn_t *connp, *first_connp, *next_connp; - connf_t *connfp; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * We don't allow multilevel ports for raw IP, so no need to - * check for that here. - */ - zoneid = tsol_packet_to_zoneid(mp); - } + mblk_t *mp1; + in6_addr_t laddr = ip6h->ip6_dst; + conn_t *connp, *first_connp, *next_connp; + connf_t *connfp; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - connfp = &ipst->ips_ipcl_proto_fanout_v6[nexthdr]; + connfp = &ipst->ips_ipcl_proto_fanout_v6[ira->ira_protocol]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill, flags, - zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp))) + /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ + if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; } @@ -3108,96 +2313,52 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, * unclaimed datagrams? 
*/ mutex_exit(&connfp->connf_lock); - if (ip_fanout_send_icmp_v6(q, first_mp, flags, - ICMP6_PARAM_PROB, ICMP6_PARAMPROB_NEXTHEADER, - nexthdr_offset, mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); - } - + ip_fanout_send_icmp_v6(mp, ICMP6_PARAM_PROB, + ICMP6_PARAMPROB_NEXTHEADER, ira); return; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_upq != NULL); + ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); CONN_INC_REF(connp); first_connp = connp; /* * XXX: Fix the multiple protocol listeners case. We should not - * be walking the conn->next list here. + * be walking the conn->conn_next list here. */ connp = connp->conn_next; for (;;) { while (connp != NULL) { - if (IPCL_PROTO_MATCH_V6(connp, nexthdr, ip6h, ill, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, - shared_addr, connp))) + /* Note: IPCL_PROTO_MATCH_V6 includes conn_wantpacket */ + if (IPCL_PROTO_MATCH_V6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, + ira, connp))) break; connp = connp->conn_next; } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more intested clients or memory - * allocation failed - */ + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; + break; + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); connp = first_connp; break; } - ASSERT(IPCL_IS_NONSTR(connp) || connp->conn_rq != NULL); - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; + CONN_INC_REF(connp); mutex_exit(&connfp->connf_lock); - rq = connp->conn_rq; - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. - */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp1 = ip_add_info_v6(mp1, inill, &dst); - } - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else if ( - (IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - if (flags & IP_FF_RAWIP) { - BUMP_MIB(ill->ill_ip_mib, - rawipIfStatsInOverflows); - } else { - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInOverflows); - } - freemsg(mp1); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); + ip_fanout_proto_conn(connp, mp1, NULL, (ip6_t *)mp1->b_rptr, + ira); - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || - secure) { - first_mp1 = ipsec_check_inbound_policy( - first_mp1, connp, NULL, ip6h, mctl_present); - } - if (first_mp1 != NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp1, NULL); - } - } mutex_enter(&connfp->connf_lock); /* Follow the next pointer before releasing the conn. */ next_connp = connp->conn_next; @@ -3208,105 +2369,33 @@ ip_fanout_proto_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, /* Last one. Send it upstream. */ mutex_exit(&connfp->connf_lock); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; - - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - } - - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - - rq = connp->conn_rq; - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && !canputnext(rq))) { - - if (flags & IP_FF_RAWIP) { - BUMP_MIB(ill->ill_ip_mib, rawipIfStatsInOverflows); - } else { - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInOverflows); - } - - freemsg(first_mp); - } else { - ASSERT(!IPCL_IS_IPTUN(connp)); + ip_fanout_proto_conn(connp, mp, NULL, ip6h, ira); - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - (connp->conn_recv)(connp, mp, NULL); - if (mctl_present) - freeb(first_mp); - } CONN_DEC_REF(connp); } /* - * Send an ICMP error after patching up the packet appropriately. Returns - * non-zero if the appropriate MIB should be bumped; zero otherwise. + * Called when it is conceptually a ULP that would sent the packet + * e.g., port unreachable and nexthdr unknown. Check that the packet + * would have passed the IPsec global policy before sending the error. + * + * Send an ICMP error after patching up the packet appropriately. + * Uses ip_drop_input and bumps the appropriate MIB. + * For ICMP6_PARAMPROB_NEXTHEADER we determine the offset to use. 
*/ -int -ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags, - uint_t icmp_type, uint8_t icmp_code, uint_t nexthdr_offset, - boolean_t mctl_present, zoneid_t zoneid, ip_stack_t *ipst) +void +ip_fanout_send_icmp_v6(mblk_t *mp, uint_t icmp_type, uint8_t icmp_code, + ip_recv_attr_t *ira) { - ip6_t *ip6h; - mblk_t *first_mp; - boolean_t secure; - unsigned char db_type; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + ip6_t *ip6h; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - first_mp = mp; - if (mctl_present) { - mp = mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - /* - * If this is an ICMP error being reported - which goes - * up as M_CTLs, we need to convert them to M_DATA till - * we finish checking with global policy because - * ipsec_check_global_policy() assumes M_DATA as clear - * and M_CTL as secure. - */ - db_type = mp->b_datap->db_type; - mp->b_datap->db_type = M_DATA; - secure = B_FALSE; - } /* * We are generating an icmp error for some inbound packet. * Called from all ip_fanout_(udp, tcp, proto) functions. 
@@ -3316,572 +2405,155 @@ ip_fanout_send_icmp_v6(queue_t *q, mblk_t *mp, uint_t flags, */ ip6h = (ip6_t *)mp->b_rptr; if (secure || ipss->ipsec_inbound_v6_policy_present) { - first_mp = ipsec_check_global_policy(first_mp, NULL, - NULL, ip6h, mctl_present, ipst->ips_netstack); - if (first_mp == NULL) - return (0); - } - - if (!mctl_present) - mp->b_datap->db_type = db_type; - - if (flags & IP_FF_SEND_ICMP) { - if (flags & IP_FF_HDR_COMPLETE) { - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - freemsg(first_mp); - return (1); - } - } - switch (icmp_type) { - case ICMP6_DST_UNREACH: - icmp_unreachable_v6(WR(q), first_mp, icmp_code, - B_FALSE, B_FALSE, zoneid, ipst); - break; - case ICMP6_PARAM_PROB: - icmp_param_problem_v6(WR(q), first_mp, icmp_code, - nexthdr_offset, B_FALSE, B_FALSE, zoneid, ipst); - break; - default: -#ifdef DEBUG - panic("ip_fanout_send_icmp_v6: wrong type"); - /*NOTREACHED*/ -#else - freemsg(first_mp); - break; -#endif - } - } else { - freemsg(first_mp); - return (0); - } - - return (1); -} - -/* - * Fanout for TCP packets - * The caller puts <fport, lport> in the ports parameter. - */ -static void -ip_fanout_tcp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ill_t *ill, ill_t *inill, - uint_t flags, uint_t hdr_len, boolean_t mctl_present, zoneid_t zoneid) -{ - mblk_t *first_mp; - boolean_t secure; - conn_t *connp; - tcph_t *tcph; - boolean_t syn_present = B_FALSE; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } - - connp = ipcl_classify_v6(mp, IPPROTO_TCP, hdr_len, zoneid, ipst); - - if (connp == NULL || - !conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid)) { - /* - * No hard-bound match. Send Reset. 
- */ - dblk_t *dp = mp->b_datap; - uint32_t ill_index; - - ASSERT((dp->db_struioflag & STRUIO_IP) == 0); - - /* Initiate IPPf processing, if needed. */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && - (flags & IP6_NO_IPPOLICY)) { - ill_index = ill->ill_phyint->phyint_ifindex; - ip_process(IPP_LOCAL_IN, &first_mp, ill_index); - if (first_mp == NULL) { - if (connp != NULL) - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (connp != NULL) { - ip_xmit_reset_serialize(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); - } else { - tcp_xmit_listeners_reset(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, NULL); - } - - return; - } - - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - if (IPCL_IS_TCP(connp)) { - squeue_t *sqp; - - /* - * If the queue belongs to a conn, and fused tcp - * loopback is enabled, assign the eager's squeue - * to be that of the active connect's. - */ - if ((flags & IP_FF_LOOPBACK) && do_tcp_fusion && - CONN_Q(q) && IPCL_IS_TCP(Q_TO_CONN(q)) && - !CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) && - !secure && - !IP6_IN_IPP(flags, ipst)) { - ASSERT(Q_TO_CONN(q)->conn_sqp != NULL); - sqp = Q_TO_CONN(q)->conn_sqp; - } else { - sqp = IP_SQUEUE_GET(lbolt); - } - - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - - /* - * db_cksumstuff is unused in the incoming - * path; Thus store the ifindex here. It will - * be cleared in tcp_conn_create_v6(). 
- */ - DB_CKSUMSTUFF(mp) = - (intptr_t)ill->ill_phyint->phyint_ifindex; - syn_present = B_TRUE; - } - } - - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp) && !syn_present) { - uint_t flags = (unsigned int)tcph->th_flags[0] & 0xFF; - if ((flags & TH_RST) || (flags & TH_URG)) { - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (flags & TH_ACK) { - ip_xmit_reset_serialize(first_mp, hdr_len, zoneid, - ipst->ips_netstack->netstack_tcp, connp); - CONN_DEC_REF(connp); + mp = ipsec_check_global_policy(mp, NULL, NULL, ip6h, ira, ns); + if (mp == NULL) return; - } + } - CONN_DEC_REF(connp); - freemsg(first_mp); + /* We never send errors for protocols that we do implement */ + if (ira->ira_protocol == IPPROTO_ICMPV6) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_fanout_send_icmp_v6", mp, ill); + freemsg(mp); return; } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - if (IPCL_IS_TCP(connp) && IPCL_IS_BOUND(connp)) { - ASSERT(syn_present); - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } else { - ASSERT(first_mp == mp); - mp->b_datap->db_struioflag &= - ~STRUIO_EAGER; - mp->b_datap->db_struioflag |= - STRUIO_POLICY; - } - } else { - /* - * Discard first_mp early since we're dealing with a - * fully-connected conn_t and tcp doesn't do policy in - * this case. Also, if someone is bound to IPPROTO_TCP - * over raw IP, they don't expect to see a M_CTL. 
- */ - if (mctl_present) { - freeb(first_mp); - mctl_present = B_FALSE; - } - first_mp = mp; - } - } + switch (icmp_type) { + case ICMP6_DST_UNREACH: + ASSERT(icmp_code == ICMP6_DST_UNREACH_NOPORT); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; + BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); + ip_drop_input("ipIfStatsNoPorts", mp, ill); - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; - } else if (mctl_present) { - /* - * ip_add_info_v6 might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + icmp_unreachable_v6(mp, icmp_code, B_FALSE, ira); + break; + case ICMP6_PARAM_PROB: + ASSERT(icmp_code == ICMP6_PARAMPROB_NEXTHEADER); - /* - * For link-local always add ifindex so that TCP can bind to that - * interface. Avoid it for ICMP error fanout. 
- */ - if (!syn_present && ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) && - (flags & IP_FF_IPINFO))) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInUnknownProtos); + ip_drop_input("ipIfStatsInUnknownProtos", mp, ill); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - if (IPCL_IS_TCP(connp)) { - SQUEUE_ENTER_ONE(connp->conn_sqp, first_mp, connp->conn_recv, - connp, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); - } else { - /* SOCK_RAW, IPPROTO_TCP case */ - (connp->conn_recv)(connp, first_mp, NULL); - CONN_DEC_REF(connp); + /* Let the system determine the offset for this one */ + icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira); + break; + default: +#ifdef DEBUG + panic("ip_fanout_send_icmp_v6: wrong type"); + /*NOTREACHED*/ +#else + freemsg(mp); + break; +#endif } } /* + * Fanout for UDP packets that are multicast or ICMP errors. + * (Unicast fanout is handled in ip_input_v6.) + * + * If SO_REUSEADDR is set all multicast packets + * will be delivered to all conns bound to the same port. + * * Fanout for UDP packets. * The caller puts <fport, lport> in the ports parameter. * ire_type must be IRE_BROADCAST for multicast and broadcast packets. * * If SO_REUSEADDR is set all multicast and broadcast packets - * will be delivered to all streams bound to the same port. + * will be delivered to all conns bound to the same port. * * Zones notes: - * Multicast packets will be distributed to streams in all zones. + * Earlier in ip_input on a system with multiple shared-IP zones we + * duplicate the multicast and broadcast packets and send them up + * with each explicit zoneid that exists on that ill. 
+ * This means that here we can match the zoneid with SO_ALLZONES being special. */ -static void -ip_fanout_udp_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, uint32_t ports, - ill_t *ill, ill_t *inill, uint_t flags, boolean_t mctl_present, - zoneid_t zoneid) +void +ip_fanout_udp_multi_v6(mblk_t *mp, ip6_t *ip6h, uint16_t lport, uint16_t fport, + ip_recv_attr_t *ira) { - uint32_t dstport, srcport; - in6_addr_t dst; - mblk_t *first_mp; - boolean_t secure; + in6_addr_t laddr; conn_t *connp; connf_t *connfp; - conn_t *first_conn; - conn_t *next_conn; - mblk_t *mp1, *first_mp1; - in6_addr_t src; - boolean_t shared_addr; - ip_stack_t *ipst = inill->ill_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - secure = ipsec_in_is_secure(first_mp); - ASSERT(mp != NULL); - } else { - secure = B_FALSE; - } + in6_addr_t faddr; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; - /* Extract ports in net byte order */ - dstport = htons(ntohl(ports) & 0xFFFF); - srcport = htons(ntohl(ports) >> 16); - dst = ip6h->ip6_dst; - src = ip6h->ip6_src; + ASSERT(ira->ira_flags & (IRAF_MULTIBROADCAST|IRAF_ICMP_ERROR)); - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_UDP, dstport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * That will also return ALL_ZONES on failure, but - * we never allow conn_zoneid to be set to ALL_ZONES. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - } + laddr = ip6h->ip6_dst; + faddr = ip6h->ip6_src; /* Attempt to find a client stream based on destination port. 
*/ - connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(dstport, ipst)]; + connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); connp = connfp->connf_head; - if (!IN6_IS_ADDR_MULTICAST(&dst)) { - /* - * Not multicast. Send to the one (first) client we find. - */ - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, - src) && IPCL_ZONE_MATCH(connp, zoneid) && - conn_wantpacket_v6(connp, ill, ip6h, - flags, zoneid)) { - break; - } - connp = connp->conn_next; - } - if (connp == NULL || connp->conn_upq == NULL) - goto notfound; - - if (is_system_labeled() && - !tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp)) - goto notfound; - - /* Found a client */ - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(first_mp); - CONN_DEC_REF(connp); - return; - } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, - connp, NULL, ip6h, mctl_present); - if (first_mp == NULL) { - CONN_DEC_REF(connp); - return; - } - } - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; - - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - } - /* - * For link-local always add ifindex so that - * transport can set sin6_scope_id. Avoid it for - * ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - - IP6_STAT(ipst, ip6_udp_fannorm); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } - while (connp != NULL) { - if ((IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, src)) && - conn_wantpacket_v6(connp, ill, ip6h, flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, shared_addr, - connp))) + if ((IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr)) && + conn_wantpacket_v6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, ira, connp))) break; connp = connp->conn_next; } - if (connp == NULL || connp->conn_upq == NULL) + if (connp == NULL) goto notfound; - first_conn = connp; - CONN_INC_REF(connp); - connp = connp->conn_next; - for (;;) { - while (connp != NULL) { - if (IPCL_UDP_MATCH_V6(connp, dstport, dst, srcport, - src) && conn_wantpacket_v6(connp, ill, ip6h, - flags, zoneid) && - (!is_system_labeled() || - tsol_receive_local(mp, &dst, IPV6_VERSION, - shared_addr, connp))) - break; - connp = connp->conn_next; - } - /* - * Just copy the data part alone. The mctl part is - * needed just for verifying policy and it is never - * sent up. - */ - if (connp == NULL || - (((first_mp1 = dupmsg(first_mp)) == NULL) && - ((first_mp1 = ip_copymsg(first_mp)) == NULL))) { - /* - * No more interested clients or memory - * allocation failed - */ - connp = first_conn; - break; - } - mp1 = mctl_present ? 
first_mp1->b_cont : first_mp1; - CONN_INC_REF(connp); - mutex_exit(&connfp->connf_lock); - /* - * For link-local always add ifindex so that transport - * can set sin6_scope_id. Avoid it for ICMP error - * fanout. - */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && - (flags & IP_FF_IPINFO)) { - /* Add header */ - mp1 = ip_add_info_v6(mp1, inill, &dst); - } - /* mp1 could have changed */ - if (mctl_present) - first_mp1->b_cont = mp1; - else - first_mp1 = mp1; - if (mp1 == NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - goto next_one; - } - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(first_mp1); - goto next_one; - } - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp1 = ipsec_check_inbound_policy - (first_mp1, connp, NULL, ip6h, - mctl_present); - } - if (first_mp1 != NULL) { - if (mctl_present) - freeb(first_mp1); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + if (connp->conn_reuseaddr) { + conn_t *first_connp = connp; + conn_t *next_connp; + mblk_t *mp1; - /* Send it upstream */ - (connp->conn_recv)(connp, mp1, NULL); - } -next_one: - mutex_enter(&connfp->connf_lock); - /* Follow the next pointer before releasing the conn. 
*/ - next_conn = connp->conn_next; - IP6_STAT(ipst, ip6_udp_fanmb); - CONN_DEC_REF(connp); - connp = next_conn; - } + connp = connp->conn_next; + for (;;) { + while (connp != NULL) { + if (IPCL_UDP_MATCH_V6(connp, lport, laddr, + fport, faddr) && + conn_wantpacket_v6(connp, ira, ip6h) && + (!(ira->ira_flags & IRAF_SYSTEM_LABELED) || + tsol_receive_local(mp, &laddr, IPV6_VERSION, + ira, connp))) + break; + connp = connp->conn_next; + } + if (connp == NULL) { + /* No more interested clients */ + connp = first_connp; + break; + } + if (((mp1 = dupmsg(mp)) == NULL) && + ((mp1 = copymsg(mp)) == NULL)) { + /* Memory allocation failed */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + connp = first_connp; + break; + } - /* Last one. Send it upstream. */ - mutex_exit(&connfp->connf_lock); + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - uint_t ifindex; + IP6_STAT(ipst, ip6_udp_fanmb); + ip_fanout_udp_conn(connp, mp1, NULL, + (ip6_t *)mp1->b_rptr, ira); - mutex_enter(&ill->ill_lock); - ifindex = ill->ill_phyint->phyint_ifindex; - mutex_exit(&ill->ill_lock); - ip_process(IPP_LOCAL_IN, &mp, ifindex); - if (mp == NULL) { + mutex_enter(&connfp->connf_lock); + /* Follow the next pointer before releasing the conn. */ + next_connp = connp->conn_next; + IP6_STAT(ipst, ip6_udp_fanmb); CONN_DEC_REF(connp); - if (mctl_present) { - freeb(first_mp); - } - return; + connp = next_connp; } } - /* - * For link-local always add ifindex so that transport can set - * sin6_scope_id. Avoid it for ICMP error fanout. 
- */ - if ((connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&src)) && (flags & IP_FF_IPINFO)) { - /* Add header */ - mp = ip_add_info_v6(mp, inill, &dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - freemsg(mp); - } else { - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || secure) { - first_mp = ipsec_check_inbound_policy(first_mp, - connp, NULL, ip6h, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* Last one. Send it upstream. */ + mutex_exit(&connfp->connf_lock); - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - } IP6_STAT(ipst, ip6_udp_fanmb); + ip_fanout_udp_conn(connp, mp, NULL, ip6h, ira); CONN_DEC_REF(connp); - if (mctl_present) - freeb(first_mp); return; notfound: @@ -3892,28 +2564,26 @@ notfound: * unclaimed datagrams? 
*/ if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP].connf_head != NULL) { - ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill, IPPROTO_UDP, - 0, flags | IP_FF_RAWIP | IP_FF_IPINFO, mctl_present, - zoneid); + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v6(mp, ip6h, ira); } else { - if (ip_fanout_send_icmp_v6(q, first_mp, flags, - ICMP6_DST_UNREACH, ICMP6_DST_UNREACH_NOPORT, 0, - mctl_present, zoneid, ipst)) { - BUMP_MIB(ill->ill_ip_mib, udpIfStatsNoPorts); - } + ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_NOPORT, ira); } } /* * int ip_find_hdr_v6() * - * This routine is used by the upper layer protocols and the IP tunnel - * module to: + * This routine is used by the upper layer protocols, iptun, and IPsec: * - Set extension header pointers to appropriate locations * - Determine IPv6 header length and return it * - Return a pointer to the last nexthdr value * * The caller must initialize ipp_fields. + * The upper layer protocols normally set label_separate which makes the + * routine put the TX label in ipp_label_v6. If this is not set then + * the hop-by-hop options including the label are placed in ipp_hopopts. * * NOTE: If multiple extension headers of the same type are present, * ip_find_hdr_v6() will set the respective extension header pointers @@ -3923,7 +2593,8 @@ notfound: * malformed part. 
*/ int -ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) +ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, boolean_t label_separate, ip_pkt_t *ipp, + uint8_t *nexthdrp) { uint_t length, ehdrlen; uint8_t nexthdr; @@ -3933,6 +2604,11 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) ip6_hbh_t *tmphopopts; ip6_frag_t *tmpfraghdr; + ipp->ipp_fields |= IPPF_HOPLIMIT | IPPF_TCLASS | IPPF_ADDR; + ipp->ipp_hoplimit = ip6h->ip6_hops; + ipp->ipp_tclass = IPV6_FLOW_TCLASS(ip6h->ip6_flow); + ipp->ipp_addr = ip6h->ip6_dst; + length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; @@ -3944,19 +2620,48 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) goto done; switch (nexthdr) { - case IPPROTO_HOPOPTS: + case IPPROTO_HOPOPTS: { + /* We check for any CIPSO */ + uchar_t *secopt; + boolean_t hbh_needed; + uchar_t *after_secopt; + tmphopopts = (ip6_hbh_t *)whereptr; ehdrlen = 8 * (tmphopopts->ip6h_len + 1); if ((uchar_t *)tmphopopts + ehdrlen > endptr) goto done; nexthdr = tmphopopts->ip6h_nxt; + + if (!label_separate) { + secopt = NULL; + after_secopt = whereptr; + } else { + /* + * We have dropped packets with bad options in + * ip6_input. No need to check return value + * here. 
+ */ + (void) tsol_find_secopt_v6(whereptr, ehdrlen, + &secopt, &after_secopt, &hbh_needed); + } + if (secopt != NULL && after_secopt - whereptr > 0) { + ipp->ipp_fields |= IPPF_LABEL_V6; + ipp->ipp_label_v6 = secopt; + ipp->ipp_label_len_v6 = after_secopt - whereptr; + } else { + ipp->ipp_label_len_v6 = 0; + after_secopt = whereptr; + hbh_needed = B_TRUE; + } /* return only 1st hbh */ - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) { + if (hbh_needed && !(ipp->ipp_fields & IPPF_HOPOPTS)) { ipp->ipp_fields |= IPPF_HOPOPTS; - ipp->ipp_hopopts = tmphopopts; - ipp->ipp_hopoptslen = ehdrlen; + ipp->ipp_hopopts = (ip6_hbh_t *)after_secopt; + ipp->ipp_hopoptslen = ehdrlen - + ipp->ipp_label_len_v6; } break; + } case IPPROTO_DSTOPTS: tmpdstopts = (ip6_dest_t *)whereptr; ehdrlen = 8 * (tmpdstopts->ip6d_len + 1); @@ -3993,10 +2698,10 @@ ip_find_hdr_v6(mblk_t *mp, ip6_t *ip6h, ip6_pkt_t *ipp, uint8_t *nexthdrp) */ if (ipp->ipp_fields & IPPF_DSTOPTS) { ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_fields |= IPPF_RTDSTOPTS; - ipp->ipp_rtdstopts = ipp->ipp_dstopts; + ipp->ipp_fields |= IPPF_RTHDRDSTOPTS; + ipp->ipp_rthdrdstopts = ipp->ipp_dstopts; ipp->ipp_dstopts = NULL; - ipp->ipp_rtdstoptslen = ipp->ipp_dstoptslen; + ipp->ipp_rthdrdstoptslen = ipp->ipp_dstoptslen; ipp->ipp_dstoptslen = 0; } break; @@ -4025,25 +2730,6 @@ done: return (length); } -int -ip_hdr_complete_v6(ip6_t *ip6h, zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ire = ire_lookup_local_v6(zoneid, ipst); - if (ire == NULL) { - ip1dbg(("ip_hdr_complete_v6: no source IRE\n")); - return (1); - } - ip6h->ip6_src = ire->ire_addr_v6; - ire_refrele(ire); - } - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6h->ip6_hops = ipst->ips_ipv6_def_hops; - return (0); -} - /* * Try to determine where and what are the IPv6 header length and * pointer to nexthdr value for the upper layer protocol (or an @@ -4066,7 +2752,7 @@ ip_hdr_length_nexthdr_v6(mblk_t *mp, ip6_t *ip6h, 
uint16_t *hdr_length_ptr, ip6_rthdr_t *rthdr; ip6_frag_t *fraghdr; - ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION); + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ endptr = mp->b_wptr; @@ -4151,1905 +2837,6 @@ ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h) } /* - * IPv6 - - * ip_newroute_v6 is called by ip_rput_data_v6 or ip_wput_v6 whenever we need - * to send out a packet to a destination address for which we do not have - * specific routing information. - * - * Handle non-multicast packets. If ill is non-NULL the match is done - * for that ill. - * - * When a specific ill is specified (using IPV6_PKTINFO, - * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match - * on routing entries (ftable and ctable) that have a matching - * ire->ire_ipif->ipif_ill. Thus this can only be used - * for destinations that are on-link for the specific ill - * and that can appear on multiple links. Thus it is useful - * for multicast destinations, link-local destinations, and - * at some point perhaps for site-local destinations (if the - * node sits at a site boundary). - * We create the cache entries in the regular ctable since - * it can not "confuse" things for other destinations. - * - * NOTE : These are the scopes of some of the variables that point at IRE, - * which needs to be followed while making any future modifications - * to avoid memory leaks. - * - * - ire and sire are the entries looked up initially by - * ire_ftable_lookup_v6. - * - ipif_ire is used to hold the interface ire associated with - * the new cache ire. But it's scope is limited, so we always REFRELE - * it before branching out to error paths. - * - save_ire is initialized before ire_create, so that ire returned - * by ire_create will not over-write the ire. We REFRELE save_ire - * before breaking out of the switch. 
- * - * Thus on failures, we have to REFRELE only ire and sire, if they - * are not NULL. - */ -/* ARGSUSED */ -void -ip_newroute_v6(queue_t *q, mblk_t *mp, const in6_addr_t *v6dstp, - const in6_addr_t *v6srcp, ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) -{ - in6_addr_t v6gw; - in6_addr_t dst; - ire_t *ire = NULL; - ipif_t *src_ipif = NULL; - ill_t *dst_ill = NULL; - ire_t *sire = NULL; - ire_t *save_ire; - ip6_t *ip6h; - int err = 0; - mblk_t *first_mp; - ipsec_out_t *io; - ushort_t ire_marks = 0; - int match_flags; - ire_t *first_sire = NULL; - mblk_t *copy_mp = NULL; - mblk_t *xmit_mp = NULL; - in6_addr_t save_dst; - uint32_t multirt_flags = - MULTIRT_CACHEGW | MULTIRT_USESTAMP | MULTIRT_SETSTAMP; - boolean_t multirt_is_resolvable; - boolean_t multirt_resolve_next; - boolean_t need_rele = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - tsol_ire_gw_secattr_t *attrp = NULL; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - - ASSERT(!IN6_IS_ADDR_MULTICAST(v6dstp)); - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - io = NULL; - } - - ip6h = (ip6_t *)mp->b_rptr; - - if (IN6_IS_ADDR_LOOPBACK(v6dstp)) { - ip1dbg(("ip_newroute_v6: dst with loopback addr\n")); - goto icmp_err_ret; - } else if (IN6_IS_ADDR_LOOPBACK(v6srcp)) { - ip1dbg(("ip_newroute_v6: src with loopback addr\n")); - goto icmp_err_ret; - } - - /* - * If this IRE is created for forwarding or it is not for - * TCP traffic, mark it as temporary. - * - * Is it sufficient just to check the next header?? - */ - if (mp->b_prev != NULL || !IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt)) - ire_marks |= IRE_MARK_TEMPORARY; - - /* - * Get what we can from ire_ftable_lookup_v6 which will follow an IRE - * chain until it gets the most specific information available. 
- * For example, we know that there is no IRE_CACHE for this dest, - * but there may be an IRE_OFFSUBNET which specifies a gateway. - * ire_ftable_lookup_v6 will look up the gateway, etc. - */ - - if (ill == NULL) { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_PARENT | MATCH_IRE_RJ_BHOLE | MATCH_IRE_SECATTR; - ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, - NULL, &sire, zoneid, 0, msg_getlabel(mp), - match_flags, ipst); - } else { - match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_ILL; - match_flags |= MATCH_IRE_PARENT | MATCH_IRE_SECATTR; - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to an underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, use the packet's source address to - * determine whether the traffic is test traffic, and set - * MATCH_IRE_MARK_TESTHIDDEN if so. - */ - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(v6srcp)) { - if (ipif_lookup_testaddr_v6(ill, v6srcp, NULL)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - } - - ire = ire_ftable_lookup_v6(v6dstp, NULL, NULL, 0, ill->ill_ipif, - &sire, zoneid, 0, msg_getlabel(mp), match_flags, ipst); - } - - ip3dbg(("ip_newroute_v6: ire_ftable_lookup_v6() " - "returned ire %p, sire %p\n", (void *)ire, (void *)sire)); - - /* - * We enter a loop that will be run only once in most cases. - * The loop is re-entered in the case where the destination - * can be reached through multiple RTF_MULTIRT-flagged routes. - * The intention is to compute multiple routes to a single - * destination in a single ip_newroute_v6 call. - * The information is contained in sire->ire_flags. 
- */ - do { - multirt_resolve_next = B_FALSE; - - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) { - ip3dbg(("ip_newroute_v6: starting new resolution " - "with first_mp %p, tag %d\n", - (void *)first_mp, MULTIRT_DEBUG_TAGGED(first_mp))); - - /* - * We check if there are trailing unresolved routes for - * the destination contained in sire. - */ - multirt_is_resolvable = ire_multirt_lookup_v6(&ire, - &sire, multirt_flags, msg_getlabel(mp), ipst); - - ip3dbg(("ip_newroute_v6: multirt_is_resolvable %d, " - "ire %p, sire %p\n", - multirt_is_resolvable, (void *)ire, (void *)sire)); - - if (!multirt_is_resolvable) { - /* - * No more multirt routes to resolve; give up - * (all routes resolved or no more resolvable - * routes). - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - } else { - ASSERT(sire != NULL); - ASSERT(ire != NULL); - /* - * We simply use first_sire as a flag that - * indicates if a resolvable multirt route has - * already been found during the preceding - * loops. If it is not the case, we may have - * to send an ICMP error to report that the - * destination is unreachable. We do not - * IRE_REFHOLD first_sire. - */ - if (first_sire == NULL) { - first_sire = sire; - } - } - } - if ((ire == NULL) || (ire == sire)) { - /* - * either ire == NULL (the destination cannot be - * resolved) or ire == sire (the gateway cannot be - * resolved). At this point, there are no more routes - * to resolve for the destination, thus we exit. 
- */ - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: " - "can't resolve %s\n", AF_INET6, v6dstp); - } - ip3dbg(("ip_newroute_v6: " - "ire %p, sire %p, first_sire %p\n", - (void *)ire, (void *)sire, (void *)first_sire)); - - if (sire != NULL) { - ire_refrele(sire); - sire = NULL; - } - - if (first_sire != NULL) { - /* - * At least one multirt route has been found - * in the same ip_newroute() call; there is no - * need to report an ICMP error. - * first_sire was not IRE_REFHOLDed. - */ - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ip_rts_change_v6(RTM_MISS, v6dstp, 0, 0, 0, 0, 0, 0, - RTA_DST, ipst); - goto icmp_err_ret; - } - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - /* - * Verify that the returned IRE does not have either the - * RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) - goto icmp_err_ret; - - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST) - */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - if (sire != NULL) { - mutex_enter(&sire->ire_lock); - v6gw = sire->ire_gateway_addr_v6; - mutex_exit(&sire->ire_lock); - ASSERT((sire->ire_type & (IRE_CACHETABLE | - IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } else { - v6gw = ipv6_all_zeros; - } - - /* - * We have a route to reach the destination. Find the - * appropriate ill, then get a source address that matches the - * right scope via ipif_select_source_v6(). 
- * - * If we are here trying to create an IRE_CACHE for an offlink - * destination and have an IRE_CACHE entry for VNI, then use - * ire_stq instead since VNI's queue is a black hole. - * - * Note: While we pick a dst_ill we are really only interested - * in the ill for load spreading. The source ipif is - * determined by source address selection below. - */ - if ((ire->ire_type == IRE_CACHE) && - IS_VNI(ire->ire_ipif->ipif_ill)) { - dst_ill = ire->ire_stq->q_ptr; - ill_refhold(dst_ill); - } else { - ill_t *ill = ire->ire_ipif->ipif_ill; - - if (IS_IPMP(ill)) { - dst_ill = - ipmp_illgrp_hold_next_ill(ill->ill_grp); - } else { - dst_ill = ill; - ill_refhold(dst_ill); - } - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_v6 : no dst " - "ill for dst %s\n", AF_INET6, v6dstp); - } - goto icmp_err_ret; - } - - if (ill != NULL && dst_ill != ill && - !IS_IN_SAME_ILLGRP(dst_ill, ill)) { - /* - * We should have found a route matching "ill" - * as we called ire_ftable_lookup_v6 with - * MATCH_IRE_ILL. Rather than asserting when - * there is a mismatch, we just drop the packet. - */ - ip0dbg(("ip_newroute_v6: BOUND_IF failed: " - "dst_ill %s ill %s\n", dst_ill->ill_name, - ill->ill_name)); - goto icmp_err_ret; - } - - /* - * Pick a source address which matches the scope of the - * destination address. - * For RTF_SETSRC routes, the source address is imposed by the - * parent ire (sire). - */ - ASSERT(src_ipif == NULL); - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to the underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, see if the packet's source address is a - * test address, and if so use that test address's ipif for - * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in - * ire_add_v6() can work properly. 
- */ - if (ill != NULL && IS_UNDER_IPMP(ill)) - (void) ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); - - if (src_ipif == NULL && ire->ire_type == IRE_IF_RESOLVER && - !IN6_IS_ADDR_UNSPECIFIED(&v6gw) && - ip6_asp_can_lookup(ipst)) { - /* - * The ire cache entry we're adding is for the - * gateway itself. The source address in this case - * is relative to the gateway's address. - */ - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, &v6gw, - B_TRUE, IPV6_PREFER_SRC_DEFAULT, zoneid); - if (src_ipif != NULL) - ire_marks |= IRE_MARK_USESRC_CHECK; - } else if (src_ipif == NULL) { - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * Check that the ipif matching the requested - * source address still exists. - */ - src_ipif = ipif_lookup_addr_v6( - &sire->ire_src_addr_v6, NULL, zoneid, - NULL, NULL, NULL, NULL, ipst); - } - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, - v6dstp, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (src_ipif != NULL) - ire_marks |= IRE_MARK_USESRC_CHECK; - } - } - - if (src_ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_v6: no src for " - "dst %s\n", AF_INET6, v6dstp); - printf("ip_newroute_v6: interface name %s\n", - dst_ill->ill_name); - } - goto icmp_err_ret; - } - - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: first hop %s\n", - AF_INET6, &v6gw); - } - ip2dbg(("\tire type %s (%d)\n", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - - /* - * At this point in ip_newroute_v6(), ire is either the - * IRE_CACHE of the next-hop gateway for an off-subnet - * destination or an IRE_INTERFACE type that should be used - * to resolve an on-subnet destination or an on-subnet - * next-hop gateway. - * - * In the IRE_CACHE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. 
This - * means packets using this IRE_CACHE will go out on dst_ill. - * - * 3) The IRE sire will point to the prefix that is the longest - * matching route for the destination. These prefix types - * include IRE_DEFAULT, IRE_PREFIX, IRE_HOST. - * - * The newly created IRE_CACHE entry for the off-subnet - * destination is tied to both the prefix route and the - * interface route used to resolve the next-hop gateway - * via the ire_phandle and ire_ihandle fields, respectively. - * - * In the IRE_INTERFACE case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE that - * was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is off-link - * and we will first create the IRE_CACHE for the gateway. - * Next time through ip_newroute_v6, we will create the - * IRE_CACHE for the final destination as described above. - */ - save_ire = ire; - switch (ire->ire_type) { - case IRE_CACHE: { - ire_t *ipif_ire; - - ASSERT(sire != NULL); - if (IN6_IS_ADDR_UNSPECIFIED(&v6gw)) { - mutex_enter(&ire->ire_lock); - v6gw = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - } - /* - * We need 3 ire's to create a new cache ire for an - * off-link destination from the cache ire of the - * gateway. - * - * 1. The prefix ire 'sire' - * 2. The cache ire of the gateway 'ire' - * 3. The interface ire 'ipif_ire' - * - * We have (1) and (2). We lookup (3) below. - * - * If there is no interface route to the gateway, - * it is a race condition, where we found the cache - * but the inteface route has been deleted. 
- */ - ipif_ire = ire_ihandle_lookup_offlink_v6(ire, sire); - if (ipif_ire == NULL) { - ip1dbg(("ip_newroute_v6:" - "ire_ihandle_lookup_offlink_v6 failed\n")); - goto icmp_err_ret; - } - - /* - * Note: the new ire inherits RTF_SETSRC - * and RTF_MULTIRT to propagate these flags from prefix - * to cache. - */ - - /* - * Check cached gateway IRE for any security - * attributes; if found, associate the gateway - * credentials group to the destination IRE. - */ - if ((attrp = save_ire->ire_gw_secattr) != NULL) { - mutex_enter(&attrp->igsa_lock); - if ((gcgrp = attrp->igsa_gcgrp) != NULL) - GCGRP_REFHOLD(gcgrp); - mutex_exit(&attrp->igsa_lock); - } - - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &sire->ire_mask_v6, /* Parent mask */ - sire->ire_phandle, /* Parent handle */ - ipif_ire->ire_ihandle, /* Interface handle */ - sire->ire_flags & /* flags if any */ - (RTF_SETSRC | RTF_MULTIRT), - &(sire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - ire_refrele(ipif_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - /* - * Prevent sire and ipif_ire from getting deleted. The - * newly created ire is tied to both of them via the - * phandle and ihandle respectively. - */ - IRB_REFHOLD(sire->ire_bucket); - /* Has it been removed already ? */ - if (sire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - IRB_REFHOLD(ipif_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (ipif_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(ipif_ire->ire_bucket); - IRB_REFRELE(sire->ire_bucket); - ire_refrele(ipif_ire); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - if (ire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire_refrele(save_ire); - - /* Assert that sire is not deleted yet. */ - ASSERT(sire->ire_ptpn != NULL); - IRB_REFRELE(sire->ire_bucket); - - /* Assert that ipif_ire is not deleted yet. */ - ASSERT(ipif_ire->ire_ptpn != NULL); - IRB_REFRELE(ipif_ire->ire_bucket); - ire_refrele(ipif_ire); - - if (copy_mp != NULL) { - /* - * Search for the next unresolved - * multirt route. - */ - copy_mp = NULL; - ipif_ire = NULL; - ire = NULL; - /* re-enter the loop */ - multirt_resolve_next = B_TRUE; - continue; - } - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - case IRE_IF_NORESOLVER: - /* - * We have what we need to build an IRE_CACHE. - * - * handle the Gated case, where we create - * a NORESOLVER route for loopback. - */ - if (dst_ill->ill_net_type != IRE_IF_NORESOLVER) - break; - /* - * TSol note: We are creating the ire cache for the - * destination 'dst'. If 'dst' is offlink, going - * through the first hop 'gw', the security attributes - * of 'dst' must be set to point to the gateway - * credentials of gateway 'gw'. If 'dst' is onlink, it - * is possible that 'dst' is a potential gateway that is - * referenced by some route that has some security - * attributes. Thus in the former case, we need to do a - * gcgrp_lookup of 'gw' while in the latter case we - * need to do gcgrp_lookup of 'dst' itself. 
- */ - ga.ga_af = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) - ga.ga_addr = v6gw; - else - ga.ga_addr = *v6dstp; - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - /* - * Note: the new ire inherits sire flags RTF_SETSRC - * and RTF_MULTIRT to propagate those rules from prefix - * to cache. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - (sire != NULL) ? /* Parent handle */ - sire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (sire != NULL) ? /* flags if any */ - sire->ire_flags & - (RTF_SETSRC | RTF_MULTIRT) : 0, - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - ire->ire_marks |= ire_marks; - - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) - dst = v6gw; - else - dst = *v6dstp; - err = ndp_noresolver(dst_ill, &dst); - if (err != 0) { - ire_refrele(save_ire); - break; - } - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - xmit_mp = first_mp; - /* - * In case of MULTIRT, a copy of the current packet - * to send is made to further re-enter the - * loop and attempt another route resolution - */ - if ((sire != NULL) && sire->ire_flags & RTF_MULTIRT) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - xmit_mp = copy_mp; - MULTIRT_DEBUG_TAG(first_mp); - } - } - ire_add_then_send(q, ire, xmit_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - - if (copy_mp != NULL) { - /* - * If we found a (no)resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. This ensures that we do not - * omit any (no)resolver despite the priority - * in this call. - * IRE_CACHE, if any, will be processed - * by another thread entering ip_newroute(), - * (on resolver response, for example). - * We use this to force multiple parallel - * resolution as soon as a packet needs to be - * sent. The result is, after one packet - * emission all reachable routes are generally - * resolved. - * Otherwise, complete resolution of MULTIRT - * routes would require several emissions as - * side effect. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - - /* - * Search for the next unresolved multirt - * route. - */ - copy_mp = NULL; - save_ire = NULL; - ire = NULL; - /* re-enter the loop */ - multirt_resolve_next = B_TRUE; - continue; - } - - /* Don't need sire anymore */ - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - - case IRE_IF_RESOLVER: - /* - * We can't build an IRE_CACHE yet, but at least we - * found a resolver that can help. 
- */ - dst = *v6dstp; - - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. When - * ire_add_then_send() tries to put the IP dg to dst, - * it will reenter ip_newroute() at which time we will - * find the IRE_CACHE for the gw and create another - * IRE_CACHE above (for dst itself). - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&v6gw)) { - save_dst = dst; - dst = v6gw; - v6gw = ipv6_all_zeros; - } - if (dst_ill->ill_flags & ILLF_XRESOLV) { - /* - * Ask the external resolver to do its thing. - * Make an mblk chain in the following form: - * ARQ_REQ_MBLK-->IRE_MBLK-->packet - */ - mblk_t *ire_mp; - mblk_t *areq_mp; - areq_t *areq; - in6_addr_t *addrp; - - ip1dbg(("ip_newroute_v6:ILLF_XRESOLV\n")); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire = ire_create_mp_v6( - &dst, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, - /* source address */ - &v6gw, /* gateway address */ - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - 0, - save_ire->ire_ihandle, - /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - NULL, - ipst); - - ire_refrele(save_ire); - if (ire == NULL) { - ip1dbg(("ip_newroute_v6:" - "ire is NULL\n")); - break; - } - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - /* - * processing a copy of the packet to - * send for further resolution loops - */ - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - ire->ire_marks |= ire_marks; - ire_mp = ire->ire_mp; - /* - * Now create or find an nce for this interface. - * The hw addr will need to to be set from - * the reply to the AR_ENTRY_QUERY that - * we're about to send. 
This will be done in - * ire_add_v6(). - */ - err = ndp_resolver(dst_ill, &dst, mp, zoneid); - switch (err) { - case 0: - /* - * New cache entry created. - * Break, then ask the external - * resolver. - */ - break; - case EINPROGRESS: - /* - * Resolution in progress; - * packet has been queued by - * ndp_resolver(). - */ - ire_delete(ire); - ire = NULL; - /* - * Check if another multirt - * route must be resolved. - */ - if (copy_mp != NULL) { - /* - * If we found a resolver, we - * ignore any trailing top - * priority IRE_CACHE in - * further loops. The reason is - * the same as for noresolver. - */ - multirt_flags &= - ~MULTIRT_CACHEGW; - /* - * Search for the next - * unresolved multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == - M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = - B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - /* - * Transient error; packet will be - * freed. - */ - ire_delete(ire); - ire = NULL; - break; - } - if (err != 0) - break; - /* - * Now set up the AR_ENTRY_QUERY and send it. - */ - areq_mp = ill_arp_alloc(dst_ill, - (uchar_t *)&ipv6_areq_template, - (caddr_t)&dst); - if (areq_mp == NULL) { - ip1dbg(("ip_newroute_v6:" - "areq_mp is NULL\n")); - freemsg(ire_mp); - break; - } - areq = (areq_t *)areq_mp->b_rptr; - addrp = (in6_addr_t *)((char *)areq + - areq->areq_target_addr_offset); - *addrp = dst; - addrp = (in6_addr_t *)((char *)areq + - areq->areq_sender_addr_offset); - *addrp = src_ipif->ipif_v6src_addr; - /* - * link the chain, then send up to the resolver. - */ - linkb(areq_mp, ire_mp); - linkb(areq_mp, mp); - ip1dbg(("ip_newroute_v6:" - "putnext to resolver\n")); - putnext(dst_ill->ill_rq, areq_mp); - /* - * Check if another multirt route - * must be resolved. 
- */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - } - /* - * Non-external resolver case. - * - * TSol note: Please see the note above the - * IRE_IF_NORESOLVER case. - */ - ga.ga_af = AF_INET6; - ga.ga_addr = dst; - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - ire = ire_create_v6( - &dst, /* dest address */ - &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ - &v6gw, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - &save_ire->ire_mask_v6, /* Parent mask */ - 0, - save_ire->ire_ihandle, /* Interface handle */ - 0, /* flags if any */ - &(save_ire->ire_uinfo), - NULL, - gcgrp, - ipst); - - if (ire == NULL) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ire_refrele(save_ire); - break; - } - - /* reference now held by IRE */ - gcgrp = NULL; - - if ((sire != NULL) && - (sire->ire_flags & RTF_MULTIRT)) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) - MULTIRT_DEBUG_TAG(copy_mp); - } - - ire->ire_marks |= ire_marks; - err = ndp_resolver(dst_ill, &dst, first_mp, zoneid); - switch (err) { - case 0: - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - /* - * We have a resolved cache entry, - * add in the IRE. - */ - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - /* - * Check if another multirt route - * must be resolved. - */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. - */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - - case EINPROGRESS: - /* - * mp was consumed - presumably queued. - * No need for ire, presumably resolution is - * in progress, and ire will be added when the - * address is resolved. - */ - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ASSERT(ire->ire_nce == NULL); - ire_delete(ire); - ire_refrele(save_ire); - /* - * Check if another multirt route - * must be resolved. - */ - ire = NULL; - if (copy_mp != NULL) { - /* - * If we find a resolver, we ignore any - * trailing top priority IRE_CACHE in - * further loops. The reason is the - * same as for noresolver. - */ - multirt_flags &= ~MULTIRT_CACHEGW; - /* - * Search for the next unresolved - * multirt route. 
- */ - first_mp = copy_mp; - copy_mp = NULL; - mp = first_mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } - ASSERT(sire != NULL); - dst = save_dst; - /* - * re-enter the loop - */ - multirt_resolve_next = B_TRUE; - continue; - } - if (sire != NULL) - ire_refrele(sire); - ill_refrele(dst_ill); - ipif_refrele(src_ipif); - return; - default: - /* Some transient error */ - ASSERT(ire->ire_nce == NULL); - ire_refrele(save_ire); - break; - } - break; - default: - break; - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - } while (multirt_resolve_next); - -err_ret: - ip1dbg(("ip_newroute_v6: dropped\n")); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) { - need_rele = B_TRUE; - ill = dst_ill; - } - if (ill != NULL) { - if (mp->b_prev != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } - - if (need_rele) - ill_refrele(ill); - } else { - if (mp->b_prev != NULL) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - } - /* Did this packet originate externally? */ - if (mp->b_prev) { - mp->b_next = NULL; - mp->b_prev = NULL; - } - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return; - -icmp_err_ret: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) { - need_rele = B_TRUE; - ill = dst_ill; - } - ip1dbg(("ip_newroute_v6: no route\n")); - if (sire != NULL) - ire_refrele(sire); - /* - * We need to set sire to NULL to avoid double freeing if we - * ever goto err_ret from below. 
- */ - sire = NULL; - ip6h = (ip6_t *)mp->b_rptr; - /* Skip ip6i_t header if present */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* Make sure the IPv6 header is present */ - if ((mp->b_wptr - (uchar_t *)ip6h) < - sizeof (ip6i_t) + IPV6_HDR_LEN) { - if (!pullupmsg(mp, sizeof (ip6i_t) + IPV6_HDR_LEN)) { - ip1dbg(("ip_newroute_v6: pullupmsg failed\n")); - goto err_ret; - } - } - mp->b_rptr += sizeof (ip6i_t); - ip6h = (ip6_t *)mp->b_rptr; - } - /* Did this packet originate externally? */ - if (mp->b_prev) { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInNoRoutes); - } - mp->b_next = NULL; - mp->b_prev = NULL; - q = WR(q); - } else { - if (ill != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutNoRoutes); - } - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - /* Failed */ - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - if (ire != NULL) - ire_refrele(ire); - if (need_rele) - ill_refrele(ill); - return; - } - } - - if (need_rele) - ill_refrele(ill); - - /* - * At this point we will have ire only if RTF_BLACKHOLE - * or RTF_REJECT flags are set on the IRE. It will not - * generate ICMP6_DST_UNREACH_NOROUTE if RTF_BLACKHOLE is set. 
- */ - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(ire); - if (copy_mp != NULL) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - } - MULTIRT_DEBUG_UNTAG(first_mp); - freemsg(first_mp); - return; - } - ire_refrele(ire); - } - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_newroute_v6: no route to %s\n", - AF_INET6, v6dstp); - } - icmp_unreachable_v6(WR(q), first_mp, ICMP6_DST_UNREACH_NOROUTE, - B_FALSE, B_FALSE, zoneid, ipst); -} - -/* - * ip_newroute_ipif_v6 is called by ip_wput_v6 and ip_wput_ipsec_out_v6 whenever - * we need to send out a packet to a destination address for which we do not - * have specific routing information. It is only used for multicast packets. - * - * If unspec_src we allow creating an IRE with source address zero. - * ire_send_v6() will delete it after the packet is sent. - */ -void -ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, - const in6_addr_t *v6dstp, const in6_addr_t *v6srcp, int unspec_src, - zoneid_t zoneid) -{ - ire_t *ire = NULL; - ipif_t *src_ipif = NULL; - int err = 0; - ill_t *dst_ill = NULL; - ire_t *save_ire; - ipsec_out_t *io; - ill_t *ill; - mblk_t *first_mp; - ire_t *fire = NULL; - mblk_t *copy_mp = NULL; - const in6_addr_t *ire_v6srcp; - boolean_t probe = B_FALSE; - boolean_t multirt_resolve_next; - boolean_t ipif_held = B_FALSE; - boolean_t ill_held = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - /* - * This loop is run only once in most cases. - * We loop to resolve further routes only when the destination - * can be reached through multiple RTF_MULTIRT-flagged ires. 
- */ - do { - multirt_resolve_next = B_FALSE; - if (dst_ill != NULL) { - ill_refrele(dst_ill); - dst_ill = NULL; - } - - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - ASSERT(ipif != NULL); - ill = ipif->ipif_ill; - - ASSERT(!IN6_IS_ADDR_V4MAPPED(v6dstp)); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: v6dst %s\n", - AF_INET6, v6dstp); - printf("ip_newroute_ipif_v6: if %s, v6 %d\n", - ill->ill_name, ipif->ipif_isv6); - } - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - io = NULL; - } - - /* - * If the interface is a pt-pt interface we look for an - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER that matches both the - * local_address and the pt-pt destination address. - * Otherwise we just match the local address. - */ - if (!(ill->ill_flags & ILLF_MULTICAST)) { - goto err_ret; - } - - /* - * We check if an IRE_OFFSUBNET for the addr that goes through - * ipif exists. We need it to determine if the RTF_SETSRC and/or - * RTF_MULTIRT flags must be honored. - */ - fire = ipif_lookup_multi_ire_v6(ipif, v6dstp); - ip2dbg(("ip_newroute_ipif_v6: " - "ipif_lookup_multi_ire_v6(" - "ipif %p, dst %08x) = fire %p\n", - (void *)ipif, ntohl(V4_PART_OF_V6((*v6dstp))), - (void *)fire)); - - ASSERT(src_ipif == NULL); - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always - * tied to the underlying interface, IS_UNDER_IPMP() may be - * true even when building IREs that will be used for data - * traffic. As such, see if the packet's source address is a - * test address, and if so use that test address's ipif for - * the IRE so that the logic that sets IRE_MARK_TESTHIDDEN in - * ire_add_v6() can work properly. - */ - if (IS_UNDER_IPMP(ill)) - probe = ipif_lookup_testaddr_v6(ill, v6srcp, &src_ipif); - - /* - * Determine the outbound (destination) ill for this route. 
- * If IPMP is not in use, that's the same as our ill. If IPMP - * is in-use and we're on the IPMP interface, or we're on an - * underlying ill but sending data traffic, use a suitable - * destination ill from the group. The latter case covers a - * subtle edge condition with multicast: when we bring up an - * IPv6 data address, we will create an NCE on an underlying - * interface, and send solitications to ff02::1, which would - * take us through here, and cause us to create an IRE for - * ff02::1. To meet our defined semantics for multicast (and - * ensure there aren't unexpected echoes), that IRE needs to - * use the IPMP group's nominated multicast interface. - * - * Note: the source ipif is determined by source address - * selection later. - */ - if (IS_IPMP(ill) || (IS_UNDER_IPMP(ill) && !probe)) { - ill_t *ipmp_ill; - ipmp_illgrp_t *illg; - - if (IS_UNDER_IPMP(ill)) { - ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); - } else { - ipmp_ill = ill; - ill_refhold(ipmp_ill); /* for symmetry */ - } - - if (ipmp_ill == NULL) - goto err_ret; - - illg = ipmp_ill->ill_grp; - if (IN6_IS_ADDR_MULTICAST(v6dstp)) - dst_ill = ipmp_illgrp_hold_cast_ill(illg); - else - dst_ill = ipmp_illgrp_hold_next_ill(illg); - - ill_refrele(ipmp_ill); - } else { - dst_ill = ill; - ill_refhold(dst_ill); /* for symmetry */ - } - - if (dst_ill == NULL) { - if (ip_debug > 2) { - pr_addr_dbg("ip_newroute_ipif_v6: " - "no dst ill for dst %s\n", - AF_INET6, v6dstp); - } - goto err_ret; - } - - /* - * Pick a source address which matches the scope of the - * destination address. - * For RTF_SETSRC routes, the source address is imposed by the - * parent ire (fire). - */ - - if (src_ipif == NULL && fire != NULL && - (fire->ire_flags & RTF_SETSRC)) { - /* - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr_v6(&fire->ire_src_addr_v6, - NULL, zoneid, NULL, NULL, NULL, NULL, ipst); - } - - if (src_ipif == NULL && ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(dst_ill, v6dstp, - B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); - } - - if (src_ipif == NULL) { - if (!unspec_src) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n", - AF_INET6, v6dstp); - printf(" through interface %s\n", - dst_ill->ill_name); - } - goto err_ret; - } - ire_v6srcp = &ipv6_all_zeros; - src_ipif = ipif; - ipif_refhold(src_ipif); - } else { - ire_v6srcp = &src_ipif->ipif_v6src_addr; - } - - ire = ipif_to_ire_v6(ipif); - if (ire == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: v6src %s\n", - AF_INET6, &ipif->ipif_v6lcl_addr); - printf("ip_newroute_ipif_v6: " - "if %s\n", dst_ill->ill_name); - } - goto err_ret; - } - if (ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) - goto err_ret; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - ip1dbg(("ip_newroute_ipif_v6: interface type %s (%d),", - ip_nv_lookup(ire_nv_tbl, ire->ire_type), ire->ire_type)); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg(" address %s\n", - AF_INET6, &ire->ire_src_addr_v6); - } - save_ire = ire; - ip2dbg(("ip_newroute_ipif: ire %p, ipif %p\n", - (void *)ire, (void *)ipif)); - - if ((fire != NULL) && (fire->ire_flags & RTF_MULTIRT)) { - /* - * an IRE_OFFSUBET was looked up - * on that interface. - * this ire has RTF_MULTIRT flag, - * so the resolution loop - * will be re-entered to resolve - * additional routes on other - * interfaces. For that purpose, - * a copy of the packet is - * made at this point. - */ - fire->ire_last_used_time = lbolt; - copy_mp = copymsg(first_mp); - if (copy_mp) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: { - /* - * We have what we need to build an IRE_CACHE. 
- * - * handle the Gated case, where we create - * a NORESOLVER route for loopback. - */ - if (dst_ill->ill_net_type != IRE_IF_NORESOLVER) - break; - /* - * The newly created ire will inherit the flags of the - * parent ire, if any. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - ire_v6srcp, /* source address */ - NULL, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* no src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - NULL, - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (fire != NULL) ? - (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - err = ndp_noresolver(dst_ill, v6dstp); - if (err != 0) { - ire_refrele(save_ire); - break; - } - - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? */ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that uses - * ire_ftable_lookup_v6() to find - * an IRE_INTERFACE for the group. 
- * In the multirt case, - * ire_lookup_multi_v6() then invokes - * ire_multirt_lookup_v6() to find - * the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6(v6dstp, - zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, ipif %p\n", - ntohl(V4_PART_OF_V6((*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - } - case IRE_IF_RESOLVER: { - - ASSERT(dst_ill->ill_isv6); - - /* - * We obtain a partial IRE_CACHE which we will pass - * along with the resolver query. When the response - * comes back it will be there ready for us to add. - */ - /* - * the newly created ire will inherit the flags of the - * parent ire, if any. - */ - ire = ire_create_v6( - v6dstp, /* dest address */ - &ipv6_all_ones, /* mask */ - ire_v6srcp, /* source address */ - NULL, /* gateway address */ - &save_ire->ire_max_frag, - NULL, /* src nce */ - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, - src_ipif, - NULL, - (fire != NULL) ? /* Parent handle */ - fire->ire_phandle : 0, - save_ire->ire_ihandle, /* Interface handle */ - (fire != NULL) ? - (fire->ire_flags & (RTF_SETSRC | RTF_MULTIRT)) : - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire == NULL) { - ire_refrele(save_ire); - break; - } - - /* Resolve and add ire to the ctable */ - err = ndp_resolver(dst_ill, v6dstp, first_mp, zoneid); - switch (err) { - case 0: - /* Prevent save_ire from getting deleted */ - IRB_REFHOLD(save_ire->ire_bucket); - /* Has it been removed already ? 
*/ - if (save_ire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - break; - } - /* - * We have a resolved cache entry, - * add in the IRE. - */ - ire_add_then_send(q, ire, first_mp); - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Assert that it is not deleted yet. */ - ASSERT(save_ire->ire_ptpn != NULL); - IRB_REFRELE(save_ire->ire_bucket); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that - * uses ire_ftable_lookup_v6() - * to find an IRE_INTERFACE for - * the group. In the multirt - * case, ire_lookup_multi_v6() - * then invokes - * ire_multirt_lookup_v6() to - * find the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6( - v6dstp, zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, " - "ipif %p\n", - ntohl(V4_PART_OF_V6( - (*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - - case EINPROGRESS: - /* - * mp was consumed - presumably queued. - * No need for ire, presumably resolution is - * in progress, and ire will be added when the - * address is resolved. 
- */ - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - ire_delete(ire); - ire_refrele(save_ire); - if (fire != NULL) { - ire_refrele(fire); - fire = NULL; - } - - /* - * The resolution loop is re-entered if we - * actually are in a multirouting case. - */ - if (copy_mp != NULL) { - boolean_t need_resolve = - ire_multirt_need_resolve_v6(v6dstp, - msg_getlabel(copy_mp), ipst); - if (!need_resolve) { - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - copy_mp = NULL; - } else { - /* - * ipif_lookup_group_v6() calls - * ire_lookup_multi_v6() that - * uses ire_ftable_lookup_v6() - * to find an IRE_INTERFACE for - * the group. In the multirt - * case, ire_lookup_multi_v6() - * then invokes - * ire_multirt_lookup_v6() to - * find the next resolvable ire. - * As a result, we obtain a new - * interface, derived from the - * next ire. - */ - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - ipif = ipif_lookup_group_v6( - v6dstp, zoneid, ipst); - ip2dbg(("ip_newroute_ipif: " - "multirt dst %08x, " - "ipif %p\n", - ntohl(V4_PART_OF_V6( - (*v6dstp))), - (void *)ipif)); - if (ipif != NULL) { - ipif_held = B_TRUE; - mp = copy_mp; - copy_mp = NULL; - multirt_resolve_next = - B_TRUE; - continue; - } else { - freemsg(copy_mp); - } - } - } - ill_refrele(dst_ill); - if (ipif_held) { - ipif_refrele(ipif); - ipif_held = B_FALSE; - } - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return; - default: - /* Some transient error */ - ire_refrele(save_ire); - break; - } - break; - } - default: - break; - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - } while (multirt_resolve_next); - -err_ret: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (ire != NULL) - ire_refrele(ire); - if (fire != NULL) - ire_refrele(fire); - if (ipif != NULL && ipif_held) - ipif_refrele(ipif); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - - /* Multicast - no point in trying to 
generate ICMP error */ - if (dst_ill != NULL) { - ill = dst_ill; - ill_held = B_TRUE; - } - if (mp->b_prev || mp->b_next) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - } - ip1dbg(("ip_newroute_ipif_v6: dropped\n")); - mp->b_next = NULL; - mp->b_prev = NULL; - freemsg(first_mp); - if (ill_held) - ill_refrele(ill); -} - -/* * Parse and process any hop-by-hop or destination options. * * Assumes that q is an ill read queue so that ICMP errors for link-local @@ -6067,23 +2854,16 @@ err_ret: * Current code checks for each opt_type (other than pads) if it is in * the expected nexthdr (hbh or dest) */ -static int -ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, - uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_stack_t *ipst) +int +ip_process_options_v6(mblk_t *mp, ip6_t *ip6h, + uint8_t *optptr, uint_t optlen, uint8_t hdr_type, ip_recv_attr_t *ira) { uint8_t opt_type; uint_t optused; int ret = 0; - mblk_t *first_mp; const char *errtype; - zoneid_t zoneid; - ill_t *ill = q->q_ptr; - ipif_t *ipif; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - mp = mp->b_cont; - } + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; while (optlen != 0) { opt_type = *optptr; @@ -6178,13 +2958,9 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, * around (i.e. before AH processing). * If we've done AH... stop now. 
*/ - if (first_mp != mp) { - ipsec_in_t *ii; - - ii = (ipsec_in_t *)first_mp->b_rptr; - if (ii->ipsec_in_ah_sa != NULL) - break; - } + if ((ira->ira_flags & IRAF_IPSEC_SECURE) && + ira->ira_ipsec_ah_sa != NULL) + break; oh = (struct ip6_opt_home_address *)optptr; /* Check total length and alignment */ @@ -6217,8 +2993,6 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, /* FALLTHROUGH */ opt_error: /* Determine which zone should send error */ - zoneid = ipif_lookup_addr_zoneid_v6( - &ip6h->ip6_dst, ill, ipst); switch (IP6OPT_TYPE(opt_type)) { case IP6OPT_TYPE_SKIP: optused = 2 + optptr[1]; @@ -6232,48 +3006,33 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip1dbg(("ip_process_options_v6: %s " "opt 0x%x; packet dropped\n", errtype, opt_type)); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + freemsg(mp); return (-1); case IP6OPT_TYPE_ICMP: - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - return (-1); - } - icmp_param_problem_v6(WR(q), first_mp, + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_FALSE, zoneid, ipst); + B_FALSE, ira); return (-1); case IP6OPT_TYPE_FORCEICMP: - /* - * If we don't have a zone and the dst - * addr is multicast, then pick a zone - * based on the inbound interface. 
- */ - if (zoneid == ALL_ZONES && - IN6_IS_ADDR_MULTICAST( - &ip6h->ip6_dst)) { - ipif = ipif_select_source_v6( - ill, &ip6h->ip6_src, - B_TRUE, - IPV6_PREFER_SRC_DEFAULT, - ALL_ZONES); - if (ipif != NULL) { - zoneid = - ipif->ipif_zoneid; - ipif_refrele(ipif); - } - } - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - return (-1); - } - icmp_param_problem_v6(WR(q), first_mp, + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", + mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_TRUE, zoneid, ipst); + B_TRUE, ira); return (-1); default: ASSERT(0); @@ -6287,14 +3046,10 @@ ip_process_options_v6(queue_t *q, mblk_t *mp, ip6_t *ip6h, bad_opt: /* Determine which zone should send error */ - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(first_mp); - } else { - icmp_param_problem_v6(WR(q), first_mp, ICMP6_PARAMPROB_OPTION, - (uint32_t)(optptr - (uint8_t *)ip6h), - B_FALSE, B_FALSE, zoneid, ipst); - } + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_OPTION, + (uint32_t)(optptr - (uint8_t *)ip6h), + B_FALSE, ira); return (-1); } @@ -6302,10 +3057,11 @@ bad_opt: * Process a routing header that is not yet empty. * Because of RFC 5095, we now reject all route headers. */ -static void -ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, - ill_t *ill, mblk_t *hada_mp) +void +ip_process_rthdr(mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, + ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; ASSERT(rth->ip6r_segleft != 0); @@ -6314,19 +3070,15 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, /* XXX Check for source routed out same interface? 
*/ BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); - freemsg(hada_mp); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); freemsg(mp); return; } - if (hada_mp != NULL) { - freemsg(hada_mp); - freemsg(mp); - return; - } - /* Sent by forwarding path, and router is global zone */ - icmp_param_problem_v6(WR(q), mp, ICMP6_PARAMPROB_HEADER, - (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), B_FALSE, - B_FALSE, GLOBAL_ZONEID, ipst); + + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, + (uint32_t)((uchar_t *)&rth->ip6r_type - (uchar_t *)ip6h), + B_FALSE, ira); } /* @@ -6335,21 +3087,10 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, void ip_rput_v6(queue_t *q, mblk_t *mp) { - mblk_t *first_mp; - mblk_t *hada_mp = NULL; - ip6_t *ip6h; - boolean_t ll_multicast = B_FALSE; - boolean_t mctl_present = B_FALSE; ill_t *ill; - struct iocblk *iocp; - uint_t flags = 0; - mblk_t *dl_mp; - ip_stack_t *ipst; - int check; ill = (ill_t *)q->q_ptr; - ipst = ill->ill_ipst; - if (ill->ill_state_flags & ILL_CONDEMNED) { + if (ill->ill_state_flags & (ILL_CONDEMNED | ILL_LL_SUBNET_PENDING)) { union DL_primitives *dl; dl = (union DL_primitives *)mp->b_rptr; @@ -6367,241 +3108,14 @@ ip_rput_v6(queue_t *q, mblk_t *mp) return; } } + if (DB_TYPE(mp) == M_DATA) { + struct mac_header_info_s mhi; - dl_mp = NULL; - switch (mp->b_datap->db_type) { - case M_DATA: { - int hlen; - uchar_t *ucp; - struct ether_header *eh; - dl_unitdata_ind_t *dui; - - /* - * This is a work-around for CR 6451644, a bug in Nemo. It - * should be removed when that problem is fixed. 
- */ - if (ill->ill_mactype == DL_ETHER && - (hlen = MBLKHEAD(mp)) >= sizeof (struct ether_header) && - (ucp = mp->b_rptr)[-1] == (ETHERTYPE_IPV6 & 0xFF) && - ucp[-2] == (ETHERTYPE_IPV6 >> 8)) { - if (hlen >= sizeof (struct ether_vlan_header) && - ucp[-5] == 0 && ucp[-6] == 0x81) - ucp -= sizeof (struct ether_vlan_header); - else - ucp -= sizeof (struct ether_header); - /* - * If it's a group address, then fabricate a - * DL_UNITDATA_IND message. - */ - if ((ll_multicast = (ucp[0] & 1)) != 0 && - (dl_mp = allocb(DL_UNITDATA_IND_SIZE + 16, - BPRI_HI)) != NULL) { - eh = (struct ether_header *)ucp; - dui = (dl_unitdata_ind_t *)dl_mp->b_rptr; - DB_TYPE(dl_mp) = M_PROTO; - dl_mp->b_wptr = (uchar_t *)(dui + 1) + 16; - dui->dl_primitive = DL_UNITDATA_IND; - dui->dl_dest_addr_length = 8; - dui->dl_dest_addr_offset = DL_UNITDATA_IND_SIZE; - dui->dl_src_addr_length = 8; - dui->dl_src_addr_offset = DL_UNITDATA_IND_SIZE + - 8; - dui->dl_group_address = 1; - ucp = (uchar_t *)(dui + 1); - if (ill->ill_sap_length > 0) - ucp += ill->ill_sap_length; - bcopy(&eh->ether_dhost, ucp, 6); - bcopy(&eh->ether_shost, ucp + 8, 6); - ucp = (uchar_t *)(dui + 1); - if (ill->ill_sap_length < 0) - ucp += 8 + ill->ill_sap_length; - bcopy(&eh->ether_type, ucp, 2); - bcopy(&eh->ether_type, ucp + 8, 2); - } - } - break; - } - - case M_PROTO: - case M_PCPROTO: - if (((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive != - DL_UNITDATA_IND) { - /* Go handle anything other than data elsewhere. */ - ip_rput_dlpi(q, mp); - return; - } - ll_multicast = ip_get_dlpi_mbcast(ill, mp); - - /* Save the DLPI header. 
*/ - dl_mp = mp; - mp = mp->b_cont; - dl_mp->b_cont = NULL; - break; - case M_BREAK: - panic("ip_rput_v6: got an M_BREAK"); - /*NOTREACHED*/ - case M_IOCACK: - iocp = (struct iocblk *)mp->b_rptr; - switch (iocp->ioc_cmd) { - case DL_IOC_HDR_INFO: - ill = (ill_t *)q->q_ptr; - ill_fastpath_ack(ill, mp); - return; - default: - putnext(q, mp); - return; - } - /* FALLTHRU */ - case M_ERROR: - case M_HANGUP: - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - freemsg(mp); - return; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - qwriter_ip(ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); - return; - case M_CTL: - if ((MBLKL(mp) > sizeof (int)) && - ((da_ipsec_t *)mp->b_rptr)->da_type == IPHADA_M_CTL) { - ASSERT(MBLKL(mp) >= sizeof (da_ipsec_t)); - mctl_present = B_TRUE; - break; - } - putnext(q, mp); - return; - case M_IOCNAK: - iocp = (struct iocblk *)mp->b_rptr; - switch (iocp->ioc_cmd) { - case DL_IOC_HDR_INFO: - ip_rput_other(NULL, q, mp, NULL); - return; - default: - break; - } - /* FALLTHRU */ - default: - putnext(q, mp); - return; - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, - (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp)); - /* - * if db_ref > 1 then copymsg and free original. Packet may be - * changed and do not want other entity who has a reference to this - * message to trip over the changes. This is a blind change because - * trying to catch all places that might change packet is too - * difficult (since it may be a module above this one). 
- */ - if (mp->b_datap->db_ref > 1) { - mblk_t *mp1; - - mp1 = copymsg(mp); - freemsg(mp); - if (mp1 == NULL) { - first_mp = NULL; - goto discard; - } - mp = mp1; - } - first_mp = mp; - if (mctl_present) { - hada_mp = first_mp; - mp = first_mp->b_cont; - } - - if ((check = ip_check_v6_mblk(mp, ill)) == IP6_MBLK_HDR_ERR) { - freemsg(mp); - return; - } - - ip6h = (ip6_t *)mp->b_rptr; - - /* - * ip:::receive must see ipv6 packets with a full header, - * and so is placed after the IP6_MBLK_HDR_ERR check. - */ - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 0); - - if (check != IP6_MBLK_OK) { - freemsg(mp); - return; - } - - DTRACE_PROBE4(ip6__physical__in__start, - ill_t *, ill, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, first_mp); - - FW_HOOKS6(ipst->ips_ip6_physical_in_event, - ipst->ips_ipv6firewall_physical_in, - ill, NULL, ip6h, first_mp, mp, ll_multicast, ipst); - - DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - /* - * Attach any necessary label information to this packet. - */ - if (is_system_labeled() && !tsol_get_pkt_label(mp, IPV6_VERSION)) { - if (ip6opt_ls != 0) - ip0dbg(("tsol_get_pkt_label v6 failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - goto discard; - } - - /* IP observability hook. */ - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t dzone; - - dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst, - ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, - ill, ipst); - } - - if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) == - IPV6_DEFAULT_VERS_AND_FLOW) { - /* - * It may be a bit too expensive to do this mapped address - * check here, but in the interest of robustness, it seems - * like the correct place. - * TODO: Avoid this check for e.g. 
connected TCP sockets - */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src)) { - ip1dbg(("ip_rput_v6: pkt with mapped src addr\n")); - goto discard; - } - - if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src)) { - ip1dbg(("ip_rput_v6: pkt with loopback src")); - goto discard; - } else if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) { - ip1dbg(("ip_rput_v6: pkt with loopback dst")); - goto discard; - } - - flags |= (ll_multicast ? IP6_IN_LLMCAST : 0); - ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp, dl_mp); + ip_mdata_to_mhi(ill, mp, &mhi); + ip_input_v6(ill, NULL, mp, &mhi); } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); - goto discard; + ip_rput_notdata(ill, mp); } - freemsg(dl_mp); - return; - -discard: - if (dl_mp != NULL) - freeb(dl_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); } /* @@ -6703,1507 +3217,72 @@ ipsec_needs_processing_v6(mblk_t *mp, uint8_t *nexthdr) } /* - * Path for AH if options are present. If this is the first time we are - * sending a datagram to AH, allocate a IPSEC_IN message and prepend it. - * Otherwise, just fanout. Return value answers the boolean question: - * "Did I consume the mblk you sent me?" + * Path for AH if options are present. + * Returns NULL if the mblk was consumed. * * Sometimes AH needs to be done before other IPv6 headers for security * reasons. This function (and its ipsec_needs_processing_v6() above) * indicates if that is so, and fans out to the appropriate IPsec protocol * for the datagram passed in. 
*/ -static boolean_t -ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, - ill_t *ill, ill_t *inill, mblk_t *hada_mp, zoneid_t zoneid) +mblk_t * +ipsec_early_ah_v6(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *mp; uint8_t nexthdr; - ipsec_in_t *ii = NULL; ah_t *ah; - ipsec_status_t ipsec_rc; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT((hada_mp == NULL) || (!mctl_present)); + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - switch (ipsec_needs_processing_v6( - (mctl_present ? first_mp->b_cont : first_mp), &nexthdr)) { + switch (ipsec_needs_processing_v6(mp, &nexthdr)) { case IPSEC_MEMORY_ERROR: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return (B_TRUE); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return (NULL); case IPSEC_HDR_DONT_PROCESS: - return (B_FALSE); + return (mp); } /* Default means send it to AH! */ ASSERT(nexthdr == IPPROTO_AH); - if (!mctl_present) { - mp = first_mp; - first_mp = ipsec_in_alloc(B_FALSE, ipst->ips_netstack); - if (first_mp == NULL) { - ip1dbg(("ipsec_early_ah_v6: IPSEC_IN " - "allocation failure.\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(mp); - return (B_TRUE); - } - /* - * Store the ill_index so that when we come back - * from IPSEC we ride on the same queue. - */ - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = inill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - } - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - ASSERT(ii != NULL); - IPSECHW_DEBUG(IPSECHW_PKT, ("ipsec_early_ah_v6: " - "caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - } if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, zoneid, ipst); - return (B_TRUE); - } - - ah = ipsec_inbound_ah_sa(first_mp, ns); - if (ah == NULL) - return (B_TRUE); - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(first_mp, ah); - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, inill, NULL); - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - break; - case IPSEC_STATUS_PENDING: - /* no action needed */ - break; - } - return (B_TRUE); -} - -static boolean_t -ip_iptun_input_v6(mblk_t *ipsec_mp, mblk_t *data_mp, - size_t hdr_len, uint8_t nexthdr, zoneid_t zoneid, ill_t *ill, - ip_stack_t *ipst) -{ - conn_t *connp; - - ASSERT(ipsec_mp == NULL || ipsec_mp->b_cont == data_mp); - - connp = ipcl_classify_v6(data_mp, nexthdr, hdr_len, zoneid, ipst); - if (connp != NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - connp->conn_recv(connp, ipsec_mp != NULL ? ipsec_mp : data_mp, - NULL); - CONN_DEC_REF(connp); - return (B_TRUE); - } - return (B_FALSE); -} - -/* - * Validate the IPv6 mblk for alignment. 
- */ -int -ip_check_v6_mblk(mblk_t *mp, ill_t *ill) -{ - int pkt_len, ip6_len; - ip6_t *ip6h = (ip6_t *)mp->b_rptr; - - /* check for alignment and full IPv6 header */ - if (!OK_32PTR((uchar_t *)ip6h) || - (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) { - if (!pullupmsg(mp, IPV6_HDR_LEN)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("ip_rput_v6: pullupmsg failed\n")); - return (IP6_MBLK_HDR_ERR); - } - ip6h = (ip6_t *)mp->b_rptr; - } - - ASSERT(OK_32PTR((uchar_t *)ip6h) && - (mp->b_wptr - (uchar_t *)ip6h) >= IPV6_HDR_LEN); - - if (mp->b_cont == NULL) - pkt_len = mp->b_wptr - mp->b_rptr; - else - pkt_len = msgdsize(mp); - ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; - - /* - * Check for bogus (too short packet) and packet which - * was padded by the link layer. - */ - if (ip6_len != pkt_len) { - ssize_t diff; - - if (ip6_len > pkt_len) { - ip1dbg(("ip_rput_data_v6: packet too short %d %d\n", - ip6_len, pkt_len)); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); - return (IP6_MBLK_LEN_ERR); - } - diff = (ssize_t)(pkt_len - ip6_len); - - if (!adjmsg(mp, -diff)) { - ip1dbg(("ip_rput_data_v6: adjmsg failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return (IP6_MBLK_LEN_ERR); - } - - /* - * adjmsg may have freed an mblk from the chain, hence - * invalidate any hw checksum here. This will force IP to - * calculate the checksum in sw, but only for this packet. - */ - DB_CKSUMFLAGS(mp) = 0; - } - return (IP6_MBLK_OK); -} - -/* - * ip_rput_data_v6 -- received IPv6 packets in M_DATA messages show up here. - * ip_rput_v6 has already verified alignment, the min length, the version, - * and db_ref = 1. - * - * The ill passed in (the arg named inill) is the ill that the packet - * actually arrived on. We need to remember this when saving the - * input interface index into potential IPV6_PKTINFO data in - * ip_add_info_v6(). - * - * This routine doesn't free dl_mp; that's the caller's responsibility on - * return. 
(Note that the callers are complex enough that there's no tail - * recursion here anyway.) - */ -void -ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, - uint_t flags, mblk_t *hada_mp, mblk_t *dl_mp) -{ - ire_t *ire = NULL; - ill_t *ill = inill; - ill_t *outill; - uint8_t *whereptr; - uint8_t nexthdr; - uint16_t remlen; - uint_t prev_nexthdr_offset; - uint_t used; - size_t old_pkt_len; - size_t pkt_len; - uint16_t ip6_len; - uint_t hdr_len; - boolean_t mctl_present; - mblk_t *first_mp; - mblk_t *first_mp1; - boolean_t no_forward; - ip6_hbh_t *hbhhdr; - boolean_t ll_multicast = (flags & IP6_IN_LLMCAST); - conn_t *connp; - uint32_t ports; - zoneid_t zoneid = GLOBAL_ZONEID; - uint16_t hck_flags, reass_hck_flags; - uint32_t reass_sum; - boolean_t cksum_err; - mblk_t *mp1; - ip_stack_t *ipst = inill->ill_ipst; - ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; - in6_addr_t lb_dst; - int lb_ret = ILB_PASSED; - - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - - if (hada_mp != NULL) { - /* - * It's an IPsec accelerated packet. - * Keep a pointer to the data attributes around until - * we allocate the ipsecinfo structure. - */ - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_data_v6: inbound HW accelerated IPsec pkt\n")); - hada_mp->b_cont = NULL; - /* - * Since it is accelerated, it came directly from - * the ill. 
- */ - ASSERT(mctl_present == B_FALSE); - ASSERT(mp->b_datap->db_type != M_CTL); - } - - ip6h = (ip6_t *)mp->b_rptr; - ip6_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; - old_pkt_len = pkt_len = ip6_len; - - if (ILL_HCKSUM_CAPABLE(ill) && !mctl_present && dohwcksum) - hck_flags = DB_CKSUMFLAGS(mp); - else - hck_flags = 0; - - /* Clear checksum flags in case we need to forward */ - DB_CKSUMFLAGS(mp) = 0; - reass_sum = reass_hck_flags = 0; - - nexthdr = ip6h->ip6_nxt; - - prev_nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt - - (uchar_t *)ip6h); - whereptr = (uint8_t *)&ip6h[1]; - remlen = pkt_len - IPV6_HDR_LEN; /* Track how much is left */ - - /* Process hop by hop header options */ - if (nexthdr == IPPROTO_HOPOPTS) { - uint_t ehdrlen; - uint8_t *optptr; - - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + MIN_EHDR_LEN > mp->b_wptr) { - if (!pullupmsg(mp, IPV6_HDR_LEN + MIN_EHDR_LEN)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - } - hbhhdr = (ip6_hbh_t *)whereptr; - nexthdr = hbhhdr->ip6h_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - (uint8_t *)ip6h); - ehdrlen = 8 * (hbhhdr->ip6h_len + 1); - - if (remlen < ehdrlen) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + ehdrlen > mp->b_wptr) { - if (!pullupmsg(mp, IPV6_HDR_LEN + ehdrlen)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - hbhhdr = (ip6_hbh_t *)whereptr; - } - - optptr = whereptr + 2; - whereptr += ehdrlen; - remlen -= ehdrlen; - switch (ip_process_options_v6(q, first_mp, ip6h, optptr, - ehdrlen - 2, IPPROTO_HOPOPTS, ipst)) { - case -1: - /* - * Packet has been consumed and any - * needed ICMP messages sent. 
- */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - freemsg(hada_mp); - return; - case 0: - /* no action needed */ - break; - case 1: - /* Known router alert */ - goto ipv6forus; - } - } - - /* - * On incoming v6 multicast packets we will bypass the ire table, - * and assume that the read queue corresponds to the targetted - * interface. - * - * The effect of this is the same as the IPv4 original code, but is - * much cleaner I think. See ip_rput for how that was done. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, pkt_len); - - /* - * So that we don't end up with dups, only one ill in an IPMP - * group is nominated to receive multicast data traffic. - * However, link-locals on any underlying interfaces will have - * joined their solicited-node multicast addresses and we must - * accept those packets. (We don't attempt to precisely - * filter out duplicate solicited-node multicast packets since - * e.g. an IPMP interface and underlying interface may have - * the same solicited-node multicast address.) Note that we - * won't generally have duplicates because we only issue a - * DL_ENABMULTI_REQ on one interface in a group; the exception - * is when PHYI_MULTI_BCAST is set. - */ - if (IS_UNDER_IPMP(ill) && !ill->ill_nom_cast && - !IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { - goto drop_pkt; - } - - /* - * XXX TODO Give to mrouted to for multicast forwarding. 
- */ - if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, - ALL_ZONES) == NULL) { - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_rput_data_v6: got mcast packet" - " which is not for us: %s\n", AF_INET6, - &ip6h->ip6_dst); - } -drop_pkt: BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_rput_data_v6: multicast for us: %s\n", - AF_INET6, &ip6h->ip6_dst); - } - zoneid = GLOBAL_ZONEID; - goto ipv6forus; - } - - /* - * Find an ire that matches destination. For link-local addresses - * we have to match the ill. - * TBD for site local addresses. - */ - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { - ire = ire_ctable_lookup_v6(&ip6h->ip6_dst, NULL, - IRE_CACHE|IRE_LOCAL, ill->ill_ipif, ALL_ZONES, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - } else { - if (ilb_has_rules(ilbs) && ILB_SUPP_L4(nexthdr)) { - /* For convenience, we just pull up the mblk. */ - if (mp->b_cont != NULL) { - if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - hdr_len = pkt_len - remlen; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, nexthdr, - whereptr, &lb_dst); - if (lb_ret == ILB_DROPPED) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - } - - ire = ire_cache_lookup_v6((lb_ret == ILB_BALANCED) ? &lb_dst : - &ip6h->ip6_dst, ALL_ZONES, msg_getlabel(mp), ipst); - - if (ire != NULL && ire->ire_stq != NULL && - ire->ire_zoneid != GLOBAL_ZONEID && - ire->ire_zoneid != ALL_ZONES) { - /* - * Should only use IREs that are visible from the - * global zone for forwarding. - */ - ire_refrele(ire); - ire = ire_cache_lookup_v6(&ip6h->ip6_dst, - GLOBAL_ZONEID, msg_getlabel(mp), ipst); - } - } - - if (ire == NULL) { - /* - * No matching IRE found. 
Mark this packet as having - * originated externally. - */ - if (!(ill->ill_flags & ILLF_ROUTER) || ll_multicast) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (!(ill->ill_flags & ILLF_ROUTER)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - } - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ip6h->ip6_hops <= 1) { - if (hada_mp != NULL) - goto hada_drop; - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded_v6(WR(q), first_mp, - ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE, - GLOBAL_ZONEID, ipst); - return; - } - /* - * Per RFC 3513 section 2.5.2, we must not forward packets with - * an unspecified source address. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - mp->b_prev = (mblk_t *)(uintptr_t) - ill->ill_phyint->phyint_ifindex; - ip_newroute_v6(q, mp, (lb_ret == ILB_BALANCED) ? &lb_dst : - &ip6h->ip6_dst, &ip6h->ip6_src, - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ? ill : NULL, - GLOBAL_ZONEID, ipst); - return; + ip_proto_not_sup(mp, ira); + return (NULL); } - /* we have a matching IRE */ - if (ire->ire_stq != NULL) { - /* - * To be quicker, we may wish not to chase pointers - * (ire->ire_ipif->ipif_ill...) and instead store the - * forwarding policy in the ire. An unfortunate side- - * effect of this would be requiring an ire flush whenever - * the ILLF_ROUTER flag changes. For now, chase pointers - * once and store in the boolean no_forward. - * - * This appears twice to keep it out of the non-forwarding, - * yes-it's-for-us-on-the-right-interface case. - */ - no_forward = ((ill->ill_flags & - ire->ire_ipif->ipif_ill->ill_flags & ILLF_ROUTER) == 0); - ASSERT(first_mp == mp); - /* - * This ire has a send-to queue - forward the packet. 
- */ - if (no_forward || ll_multicast || (hada_mp != NULL)) { - freemsg(hada_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - if (no_forward) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - } - freemsg(mp); - ire_refrele(ire); - return; - } - /* - * ipIfStatsHCInForwDatagrams should only be increment if there - * will be an attempt to forward the packet, which is why we - * increment after the above condition has been checked. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); - if (ip6h->ip6_hops <= 1) { - ip1dbg(("ip_rput_data_v6: hop limit expired.\n")); - /* Sent by forwarding path, and router is global zone */ - icmp_time_exceeded_v6(WR(q), mp, - ICMP6_TIME_EXCEED_TRANSIT, ll_multicast, B_FALSE, - GLOBAL_ZONEID, ipst); - ire_refrele(ire); - return; - } - /* - * Per RFC 3513 section 2.5.2, we must not forward packets with - * an unspecified source address. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(mp); - ire_refrele(ire); - return; - } - - if (is_system_labeled()) { - mblk_t *mp1; - - if ((mp1 = tsol_ip_forward(ire, mp)) == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(mp); - ire_refrele(ire); - return; - } - /* Size may have changed */ - mp = mp1; - ip6h = (ip6_t *)mp->b_rptr; - pkt_len = msgdsize(mp); - } - - if (pkt_len > ire->ire_max_frag) { - int max_frag = ire->ire_max_frag; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTooBigErrors); - /* - * Handle labeled packet resizing. 
- */ - if (is_system_labeled()) { - max_frag = tsol_pmtu_adjust(mp, max_frag, - pkt_len - old_pkt_len, AF_INET6); - } - - /* Sent by forwarding path, and router is global zone */ - icmp_pkt2big_v6(WR(q), mp, max_frag, - ll_multicast, B_TRUE, GLOBAL_ZONEID, ipst); - ire_refrele(ire); - return; - } + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return (NULL); + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira); + if (mp == NULL) { /* - * Check to see if we're forwarding the packet to a - * different link from which it came. If so, check the - * source and destination addresses since routers must not - * forward any packets with link-local source or - * destination addresses to other links. Otherwise (if - * we're forwarding onto the same link), conditionally send - * a redirect message. + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. */ - if (ire->ire_rfq != q && - !IS_IN_SAME_ILLGRP(ill, (ill_t *)ire->ire_rfq->q_ptr)) { - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - freemsg(mp); - ire_refrele(ire); - return; - } - /* TBD add site-local check at site boundary? */ - } else if (ipst->ips_ipv6_send_redirects) { - in6_addr_t *v6targ; - in6_addr_t gw_addr_v6; - ire_t *src_ire_v6 = NULL; - - /* - * Don't send a redirect when forwarding a source - * routed packet. - */ - if (ip_source_routed_v6(ip6h, mp, ipst)) - goto forward; - - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - if (!IN6_IS_ADDR_UNSPECIFIED(&gw_addr_v6)) { - v6targ = &gw_addr_v6; - /* - * We won't send redirects to a router - * that doesn't have a link local - * address, but will forward. 
- */ - if (!IN6_IS_ADDR_LINKLOCAL(v6targ)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInAddrErrors); - goto forward; - } - } else { - v6targ = &ip6h->ip6_dst; - } - - src_ire_v6 = ire_ftable_lookup_v6(&ip6h->ip6_src, - NULL, NULL, IRE_INTERFACE, ire->ire_ipif, NULL, - GLOBAL_ZONEID, 0, NULL, - MATCH_IRE_IPIF | MATCH_IRE_TYPE, - ipst); - - if (src_ire_v6 != NULL) { - /* - * The source is directly connected. - */ - mp1 = copymsg(mp); - if (mp1 != NULL) { - icmp_send_redirect_v6(WR(q), - mp1, v6targ, &ip6h->ip6_dst, - ill, B_FALSE); - } - ire_refrele(src_ire_v6); - } - } - -forward: - /* Hoplimit verified above */ - ip6h->ip6_hops--; - - outill = ire->ire_ipif->ipif_ill; - - DTRACE_PROBE4(ip6__forwarding__start, - ill_t *, inill, ill_t *, outill, - ip6_t *, ip6h, mblk_t *, mp); - - FW_HOOKS6(ipst->ips_ip6_forwarding_event, - ipst->ips_ipv6firewall_forwarding, - inill, outill, ip6h, mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp); - - if (mp != NULL) { - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); - ip_xmit_v6(mp, ire, 0, NULL, B_FALSE, NULL); - } - IRE_REFRELE(ire); - return; - } - - /* - * Need to put on correct queue for reassembly to find it. - * No need to use put() since reassembly has its own locks. - * Note: multicast packets and packets destined to addresses - * assigned to loopback (ire_rfq is NULL) will be reassembled on - * the arriving ill. Unlike the IPv4 case, enabling strict - * destination multihoming will prevent accepting packets - * addressed to an IRE_LOCAL on lo0. 
- */ - if (ire->ire_rfq != q) { - if ((ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill)) - == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(first_mp); - return; - } - if (ire->ire_rfq != NULL) { - q = ire->ire_rfq; - ill = (ill_t *)q->q_ptr; - ASSERT(ill != NULL); - } - } - - zoneid = ire->ire_zoneid; - UPDATE_IB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - /* Don't use the ire after this point, we'll NULL it out to be sure. */ - ire_refrele(ire); - ire = NULL; -ipv6forus: - /* - * Looks like this packet is for us one way or another. - * This is where we'll process destination headers etc. - */ - for (; ; ) { - switch (nexthdr) { - case IPPROTO_TCP: { - uint16_t *up; - uint32_t sum; - int offset; - - hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("tcp hada drop\n")); - goto hada_drop; - } - - - /* TCP needs all of the TCP header */ - if (remlen < TCP_MIN_HEADER_LENGTH) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + TCP_MIN_HEADER_LENGTH > mp->b_wptr) { - if (!pullupmsg(mp, - hdr_len + TCP_MIN_HEADER_LENGTH)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - /* - * Extract the offset field from the TCP header. - */ - offset = ((uchar_t *)ip6h)[hdr_len + 12] >> 4; - if (offset != 5) { - if (offset < 5) { - ip1dbg(("ip_rput_data_v6: short " - "TCP data offset")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - /* - * There must be TCP options. - * Make sure we can grab them. 
- */ - offset <<= 2; - if (remlen < offset) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + offset > mp->b_wptr) { - if (!pullupmsg(mp, - hdr_len + offset)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - } - - up = (uint16_t *)&ip6h->ip6_src; - /* - * TCP checksum calculation. First sum up the - * pseudo-header fields: - * - Source IPv6 address - * - Destination IPv6 address - * - TCP payload length - * - TCP protocol ID - */ - sum = htons(IPPROTO_TCP + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - mp1 = mp->b_cont; - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP6_STAT(ipst, ip6_in_sw_cksum); - - IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) - ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - mp, mp1, cksum_err); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs); - - if (hck_flags & HCK_FULLCKSUM) { - IP6_STAT(ipst, - ip6_tcp_in_full_hw_cksum_err); - } else if (hck_flags & HCK_PARTIALCKSUM) { - IP6_STAT(ipst, - ip6_tcp_in_part_hw_cksum_err); - } else { - IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err); - } - freemsg(first_mp); - return; - } -tcp_fanout: - ip_fanout_tcp_v6(q, first_mp, ip6h, ill, inill, - (flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE| - IP_FF_IPINFO), hdr_len, mctl_present, zoneid); - return; - } - case IPPROTO_SCTP: - { - sctp_hdr_t *sctph; - uint32_t calcsum, pktsum; - uint_t hdr_len = pkt_len - remlen; - sctp_stack_t *sctps; - - sctps = inill->ill_ipst->ips_netstack->netstack_sctp; - - /* SCTP needs all of the SCTP header */ - if (remlen < sizeof (*sctph)) { - goto pkt_too_short; - } - if (whereptr + sizeof (*sctph) > mp->b_wptr) { - ASSERT(mp->b_cont != NULL); - if 
(!pullupmsg(mp, hdr_len + sizeof (*sctph))) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - - sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_len); - /* checksum */ - pktsum = sctph->sh_chksum; - sctph->sh_chksum = 0; - calcsum = sctp_cksum(mp, hdr_len); - if (calcsum != pktsum) { - BUMP_MIB(&sctps->sctps_mib, sctpChecksumError); - freemsg(mp); - return; - } - sctph->sh_chksum = pktsum; - ports = *(uint32_t *)(mp->b_rptr + hdr_len); - if ((connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst, - ports, zoneid, mp, sctps)) == NULL) { - ip_fanout_sctp_raw(first_mp, ill, - (ipha_t *)ip6h, B_FALSE, ports, - mctl_present, - (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO), - B_TRUE, zoneid); - return; - } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - sctp_input(connp, (ipha_t *)ip6h, mp, first_mp, ill, - B_FALSE, mctl_present); - return; - } - case IPPROTO_UDP: { - uint16_t *up; - uint32_t sum; - - hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("udp hada drop\n")); - goto hada_drop; - } - - /* Verify that at least the ports are present */ - if (remlen < UDPH_SIZE) - goto pkt_too_short; - if (mp->b_cont != NULL && - whereptr + UDPH_SIZE > mp->b_wptr) { - if (!pullupmsg(mp, hdr_len + UDPH_SIZE)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + hdr_len; - } - - /* - * Before going through the regular checksum - * calculation, make sure the received checksum - * is non-zero. RFC 2460 says, a 0x0000 checksum - * in a UDP packet (within IPv6 packet) is invalid - * and should be replaced by 0xffff. This makes - * sense as regular checksum calculation will - * pass for both the cases i.e. 0x0000 and 0xffff. - * Removing one of the case makes error detection - * stronger. 
- */ - - if (((udpha_t *)whereptr)->uha_checksum == 0) { - /* 0x0000 checksum is invalid */ - ip1dbg(("ip_rput_data_v6: Invalid UDP " - "checksum value 0x0000\n")); - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsInCksumErrs); - freemsg(first_mp); - return; - } - - up = (uint16_t *)&ip6h->ip6_src; - - /* - * UDP checksum calculation. First sum up the - * pseudo-header fields: - * - Source IPv6 address - * - Destination IPv6 address - * - UDP payload length - * - UDP protocol ID - */ - - sum = htons(IPPROTO_UDP + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - if (reass_hck_flags != 0) { - hck_flags = reass_hck_flags; - - IP_CKSUM_RECV_REASS(hck_flags, - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - sum, reass_sum, cksum_err); - } else { - mp1 = mp->b_cont; - - IP_CKSUM_RECV(hck_flags, sum, (uchar_t *) - ((uchar_t *)mp->b_rptr + DB_CKSUMSTART(mp)), - (int32_t)(whereptr - (uchar_t *)mp->b_rptr), - mp, mp1, cksum_err); - } - - if ((hck_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) == 0) - IP6_STAT(ipst, ip6_in_sw_cksum); - - if (cksum_err) { - BUMP_MIB(ill->ill_ip_mib, - udpIfStatsInCksumErrs); - - if (hck_flags & HCK_FULLCKSUM) - IP6_STAT(ipst, - ip6_udp_in_full_hw_cksum_err); - else if (hck_flags & HCK_PARTIALCKSUM) - IP6_STAT(ipst, - ip6_udp_in_part_hw_cksum_err); - else - IP6_STAT(ipst, ip6_udp_in_sw_cksum_err); - - freemsg(first_mp); - return; - } - goto udp_fanout; - } - case IPPROTO_ICMPV6: { - uint16_t *up; - uint32_t sum; - uint_t hdr_len = pkt_len - remlen; - - if (hada_mp != NULL) { - ip0dbg(("icmp hada drop\n")); - goto hada_drop; - } - - up = (uint16_t *)&ip6h->ip6_src; - sum = htons(IPPROTO_ICMPV6 + remlen) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - sum = 
IP_CSUM(mp, hdr_len, sum); - if (sum != 0) { - /* IPv6 ICMP checksum failed */ - ip1dbg(("ip_rput_data_v6: ICMPv6 checksum " - "failed %x\n", - sum)); - BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); - BUMP_MIB(ill->ill_icmp6_mib, - ipv6IfIcmpInErrors); - freemsg(first_mp); - return; - } - - icmp_fanout: - /* Check variable for testing applications */ - if (ipst->ips_ipv6_drop_inbound_icmpv6) { - freemsg(first_mp); - return; - } - /* - * Assume that there is always at least one conn for - * ICMPv6 (in.ndpd) i.e. don't optimize the case - * where there is no conn. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ilm_t *ilm; - ilm_walker_t ilw; - - ASSERT(!IS_LOOPBACK(ill)); - /* - * In the multicast case, applications may have - * joined the group from different zones, so we - * need to deliver the packet to each of them. - * Loop through the multicast memberships - * structures (ilm) on the receive ill and send - * a copy of the packet up each matching one. - */ - ilm = ilm_walker_start(&ilw, inill); - for (; ilm != NULL; - ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL( - &ilm->ilm_v6addr, &ip6h->ip6_dst)) - continue; - if (!ipif_lookup_zoneid( - ilw.ilw_walk_ill, ilm->ilm_zoneid, - IPIF_UP, NULL)) - continue; - - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 == NULL) - continue; - icmp_inbound_v6(q, first_mp1, - ilw.ilw_walk_ill, inill, - hdr_len, mctl_present, 0, - ilm->ilm_zoneid, dl_mp); - } - ilm_walker_finish(&ilw); - } else { - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, - inill, hdr_len, mctl_present, 0, - zoneid, dl_mp); - } - goto proto_fanout; - } - case IPPROTO_ENCAP: - case IPPROTO_IPV6: - if (ip_iptun_input_v6(mctl_present ? 
first_mp : NULL, - mp, pkt_len - remlen, nexthdr, zoneid, ill, ipst)) { - return; - } - /* - * If there was no IP tunnel data-link bound to - * receive this packet, then we fall through to - * allow potential raw sockets bound to either of - * these protocols to pick it up. - */ - /* FALLTHRU */ -proto_fanout: - default: { - /* - * Handle protocols with which IPv6 is less intimate. - */ - uint_t proto_flags = IP_FF_RAWIP|IP_FF_IPINFO; - - if (hada_mp != NULL) { - ip0dbg(("default hada drop\n")); - goto hada_drop; - } - - /* - * Enable sending ICMP for "Unknown" nexthdr - * case. i.e. where we did not FALLTHRU from - * IPPROTO_ICMPV6 processing case above. - * If we did FALLTHRU, then the packet has already been - * processed for IPPF, don't process it again in - * ip_fanout_proto_v6; set IP6_NO_IPPOLICY in the - * flags - */ - if (nexthdr != IPPROTO_ICMPV6) - proto_flags |= IP_FF_SEND_ICMP; - else - proto_flags |= IP6_NO_IPPOLICY; - - ip_fanout_proto_v6(q, first_mp, ip6h, ill, inill, - nexthdr, prev_nexthdr_offset, (flags|proto_flags), - mctl_present, zoneid); - return; - } - - case IPPROTO_DSTOPTS: { - uint_t ehdrlen; - uint8_t *optptr; - ip6_dest_t *desthdr; - - /* If packet is too short, look no further */ - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - - /* Check if AH is present. */ - if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - inill, hada_mp, zoneid)) { - return; - } - - /* - * Reinitialize pointers, as ipsec_early_ah_v6() does - * complete pullups. We don't have to do more pullups - * as a result. 
- */ - whereptr = (uint8_t *)((uintptr_t)mp->b_rptr + - (uintptr_t)(whereptr - ((uint8_t *)ip6h))); - ip6h = (ip6_t *)mp->b_rptr; - - desthdr = (ip6_dest_t *)whereptr; - nexthdr = desthdr->ip6d_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - - (uint8_t *)ip6h); - ehdrlen = 8 * (desthdr->ip6d_len + 1); - if (remlen < ehdrlen) - goto pkt_too_short; - optptr = whereptr + 2; - /* - * Note: XXX This code does not seem to make - * distinction between Destination Options Header - * being before/after Routing Header which can - * happen if we are at the end of source route. - * This may become significant in future. - * (No real significant Destination Options are - * defined/implemented yet ). - */ - switch (ip_process_options_v6(q, first_mp, ip6h, optptr, - ehdrlen - 2, IPPROTO_DSTOPTS, ipst)) { - case -1: - /* - * Packet has been consumed and any needed - * ICMP errors sent. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - freemsg(hada_mp); - return; - case 0: - /* No action needed continue */ - break; - case 1: - /* - * Unnexpected return value - * (Router alert is a Hop-by-Hop option) - */ -#ifdef DEBUG - panic("ip_rput_data_v6: router " - "alert hbh opt indication in dest opt"); - /*NOTREACHED*/ -#else - freemsg(hada_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - return; -#endif - } - used = ehdrlen; - break; - } - case IPPROTO_FRAGMENT: { - ip6_frag_t *fraghdr; - size_t no_frag_hdr_len; - - if (hada_mp != NULL) { - ip0dbg(("frag hada drop\n")); - goto hada_drop; - } - - ASSERT(first_mp == mp); - if (remlen < sizeof (ip6_frag_t)) - goto pkt_too_short; - - if (mp->b_cont != NULL && - whereptr + sizeof (ip6_frag_t) > mp->b_wptr) { - if (!pullupmsg(mp, - pkt_len - remlen + sizeof (ip6_frag_t))) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - hck_flags = 0; - ip6h = (ip6_t *)mp->b_rptr; - whereptr = (uint8_t *)ip6h + pkt_len - remlen; - } - - fraghdr = (ip6_frag_t *)whereptr; - used = 
(uint_t)sizeof (ip6_frag_t); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); - - /* - * Invoke the CGTP (multirouting) filtering module to - * process the incoming packet. Packets identified as - * duplicates must be discarded. Filtering is active - * only if the the ip_cgtp_filter ndd variable is - * non-zero. - */ - if (ipst->ips_ip_cgtp_filter && - ipst->ips_ip_cgtp_filter_ops != NULL) { - int cgtp_flt_pkt; - netstackid_t stackid; - - stackid = ipst->ips_netstack->netstack_stackid; - - cgtp_flt_pkt = - ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6( - stackid, inill->ill_phyint->phyint_ifindex, - ip6h, fraghdr); - if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { - freemsg(mp); - return; - } - } - - /* Restore the flags */ - DB_CKSUMFLAGS(mp) = hck_flags; - - mp = ip_rput_frag_v6(ill, inill, mp, ip6h, fraghdr, - remlen - used, &prev_nexthdr_offset, - &reass_sum, &reass_hck_flags); - if (mp == NULL) { - /* Reassembly is still pending */ - return; - } - /* The first mblk are the headers before the frag hdr */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); - - first_mp = mp; /* mp has most likely changed! */ - no_frag_hdr_len = mp->b_wptr - mp->b_rptr; - ip6h = (ip6_t *)mp->b_rptr; - nexthdr = ((char *)ip6h)[prev_nexthdr_offset]; - whereptr = mp->b_rptr + no_frag_hdr_len; - remlen = ntohs(ip6h->ip6_plen) + - (uint16_t)(IPV6_HDR_LEN - no_frag_hdr_len); - pkt_len = msgdsize(mp); - used = 0; - break; - } - case IPPROTO_HOPOPTS: { - if (hada_mp != NULL) { - ip0dbg(("hop hada drop\n")); - goto hada_drop; - } - /* - * Illegal header sequence. - * (Hop-by-hop headers are processed above - * and required to immediately follow IPv6 header) - */ - icmp_param_problem_v6(WR(q), first_mp, - ICMP6_PARAMPROB_NEXTHEADER, - prev_nexthdr_offset, - B_FALSE, B_FALSE, zoneid, ipst); - return; - } - case IPPROTO_ROUTING: { - uint_t ehdrlen; - ip6_rthdr_t *rthdr; - - /* If packet is too short, look no further */ - if (remlen < MIN_EHDR_LEN) - goto pkt_too_short; - - /* Check if AH is present. 
*/ - if (ipsec_early_ah_v6(q, first_mp, mctl_present, ill, - inill, hada_mp, zoneid)) { - return; - } - - /* - * Reinitialize pointers, as ipsec_early_ah_v6() does - * complete pullups. We don't have to do more pullups - * as a result. - */ - whereptr = (uint8_t *)((uintptr_t)mp->b_rptr + - (uintptr_t)(whereptr - ((uint8_t *)ip6h))); - ip6h = (ip6_t *)mp->b_rptr; - - rthdr = (ip6_rthdr_t *)whereptr; - nexthdr = rthdr->ip6r_nxt; - prev_nexthdr_offset = (uint_t)(whereptr - - (uint8_t *)ip6h); - ehdrlen = 8 * (rthdr->ip6r_len + 1); - if (remlen < ehdrlen) - goto pkt_too_short; - if (rthdr->ip6r_segleft != 0) { - /* Not end of source route */ - if (ll_multicast) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsForwProhibits); - freemsg(hada_mp); - freemsg(mp); - return; - } - ip_process_rthdr(q, mp, ip6h, rthdr, ill, - hada_mp); - return; - } - used = ehdrlen; - break; - } - case IPPROTO_AH: - case IPPROTO_ESP: { - /* - * Fast path for AH/ESP. If this is the first time - * we are sending a datagram to AH/ESP, allocate - * a IPSEC_IN message and prepend it. Otherwise, - * just fanout. - */ - - ipsec_in_t *ii; - int ipsec_rc; - ipsec_stack_t *ipss; - - ipss = ipst->ips_netstack->netstack_ipsec; - if (!mctl_present) { - ASSERT(first_mp == mp); - first_mp = ipsec_in_alloc(B_FALSE, - ipst->ips_netstack); - if (first_mp == NULL) { - ip1dbg(("ip_rput_data_v6: IPSEC_IN " - "allocation failure.\n")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - return; - } - /* - * Store the ill_index so that when we come back - * from IPSEC we ride on the same queue. - */ - ii = (ipsec_in_t *)first_mp->b_rptr; - ii->ipsec_in_ill_index = - ill->ill_phyint->phyint_ifindex; - ii->ipsec_in_rill_index = - inill->ill_phyint->phyint_ifindex; - first_mp->b_cont = mp; - /* - * Cache hardware acceleration info. 
- */ - if (hada_mp != NULL) { - IPSECHW_DEBUG(IPSECHW_PKT, - ("ip_rput_data_v6: " - "caching data attr.\n")); - ii->ipsec_in_accelerated = B_TRUE; - ii->ipsec_in_da = hada_mp; - hada_mp = NULL; - } - } else { - ii = (ipsec_in_t *)first_mp->b_rptr; - } - - if (!ipsec_loaded(ipss)) { - ip_proto_not_sup(q, first_mp, IP_FF_SEND_ICMP, - zoneid, ipst); - return; - } - - /* select inbound SA and have IPsec process the pkt */ - if (nexthdr == IPPROTO_ESP) { - esph_t *esph = ipsec_inbound_esp_sa(first_mp, - ipst->ips_netstack); - if (esph == NULL) - return; - ASSERT(ii->ipsec_in_esp_sa != NULL); - ASSERT(ii->ipsec_in_esp_sa->ipsa_input_func != - NULL); - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func( - first_mp, esph); - } else { - ah_t *ah = ipsec_inbound_ah_sa(first_mp, - ipst->ips_netstack); - if (ah == NULL) - return; - ASSERT(ii->ipsec_in_ah_sa != NULL); - ASSERT(ii->ipsec_in_ah_sa->ipsa_input_func != - NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func( - first_mp, ah); - } - - switch (ipsec_rc) { - case IPSEC_STATUS_SUCCESS: - break; - case IPSEC_STATUS_FAILED: - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - /* FALLTHRU */ - case IPSEC_STATUS_PENDING: - return; - } - /* we're done with IPsec processing, send it up */ - ip_fanout_proto_again(first_mp, ill, inill, NULL); - return; - } - case IPPROTO_NONE: - /* All processing is done. Count as "delivered". 
*/ - freemsg(hada_mp); - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - return; - } - whereptr += used; - ASSERT(remlen >= used); - remlen -= used; - } - /* NOTREACHED */ - -pkt_too_short: - ip1dbg(("ip_rput_data_v6: packet too short %d %lu %d\n", - ip6_len, pkt_len, remlen)); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); - freemsg(hada_mp); - freemsg(first_mp); - return; -udp_fanout: - if (mctl_present || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - connp = NULL; - } else { - connp = ipcl_classify_v6(mp, IPPROTO_UDP, hdr_len, zoneid, - ipst); - if ((connp != NULL) && (connp->conn_upq == NULL)) { - CONN_DEC_REF(connp); - connp = NULL; - } - } - - if (connp == NULL) { - uint32_t ports; - - ports = *(uint32_t *)(mp->b_rptr + hdr_len + - UDP_PORTS_OFFSET); - IP6_STAT(ipst, ip6_udp_slow_path); - ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, inill, - (flags|IP_FF_SEND_ICMP|IP_FF_IPINFO), mctl_present, - zoneid); - return; - } - - if ((IPCL_IS_NONSTR(connp) && PROTO_FLOW_CNTRLD(connp)) || - (!IPCL_IS_NONSTR(connp) && CONN_UDP_FLOWCTLD(connp))) { - freemsg(first_mp); - BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); - CONN_DEC_REF(connp); - return; - } - - /* Initiate IPPF processing */ - if (IP6_IN_IPP(flags, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } - } - - if (connp->conn_ip_recvpktinfo || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) { - mp = ip_add_info_v6(mp, inill, &ip6h->ip6_dst); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - CONN_DEC_REF(connp); - return; - } + return (NULL); } - IP6_STAT(ipst, ip6_udp_fast_path); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); - - /* Send it upstream */ - (connp->conn_recv)(connp, mp, NULL); - - CONN_DEC_REF(connp); - freemsg(hada_mp); - return; - -hada_drop: - ip1dbg(("ip_rput_data_v6: malformed accelerated packet\n")); - /* 
IPsec kstats: bump counter here */ - freemsg(hada_mp); - freemsg(first_mp); + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return (NULL); } /* * Reassemble fragment. * When it returns a completed message the first mblk will only contain - * the headers prior to the fragment header. - * - * prev_nexthdr_offset is an offset indication of where the nexthdr field is - * of the preceding header. This is needed to patch the previous header's - * nexthdr field when reassembly completes. + * the headers prior to the fragment header, with the nexthdr value updated + * to be the header after the fragment header. */ -static mblk_t * -ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, - ip6_frag_t *fraghdr, uint_t remlen, uint_t *prev_nexthdr_offset, - uint32_t *cksum_val, uint16_t *cksum_flags) +mblk_t * +ip_input_fragment_v6(mblk_t *mp, ip6_t *ip6h, + ip6_frag_t *fraghdr, uint_t remlen, ip_recv_attr_t *ira) { uint32_t ident = ntohl(fraghdr->ip6f_ident); uint16_t offset; @@ -8225,12 +3304,12 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, boolean_t pruned = B_FALSE; uint32_t sum_val; uint16_t sum_flags; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; - - if (cksum_val != NULL) - *cksum_val = 0; - if (cksum_flags != NULL) - *cksum_flags = 0; + uint_t prev_nexthdr_offset; + uint8_t prev_nexthdr; + uint8_t *ptr; + uint32_t packet_size; /* * We utilize hardware computed checksum info only for UDP since @@ -8238,8 +3317,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * addition, checksum offload support for IP fragments carrying * UDP payload is commonly implemented across network adapters. 
*/ - ASSERT(inill != NULL); - if (nexthdr == IPPROTO_UDP && dohwcksum && ILL_HCKSUM_CAPABLE(inill) && + ASSERT(ira->ira_rill != NULL); + if (nexthdr == IPPROTO_UDP && dohwcksum && + ILL_HCKSUM_CAPABLE(ira->ira_rill) && (DB_CKSUMFLAGS(mp) & (HCK_FULLCKSUM | HCK_PARTIALCKSUM))) { mblk_t *mp1 = mp->b_cont; int32_t len; @@ -8253,8 +3333,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if ((sum_flags & HCK_PARTIALCKSUM) && (mp1 == NULL || mp1->b_cont == NULL) && - offset >= (uint16_t)DB_CKSUMSTART(mp) && - ((len = offset - (uint16_t)DB_CKSUMSTART(mp)) & 1) == 0) { + offset >= DB_CKSUMSTART(mp) && + ((len = offset - DB_CKSUMSTART(mp)) & 1) == 0) { uint32_t adj; /* * Partial checksum has been calculated by hardware @@ -8281,6 +3361,59 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, DB_CKSUMFLAGS(mp) = 0; /* + * Determine the offset (from the begining of the IP header) + * of the nexthdr value which has IPPROTO_FRAGMENT. We use + * this when removing the fragment header from the packet. + * This packet consists of the IPv6 header, a potential + * hop-by-hop options header, a potential pre-routing-header + * destination options header, and a potential routing header. 
+ */ + prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; + prev_nexthdr = ip6h->ip6_nxt; + ptr = (uint8_t *)&ip6h[1]; + + if (prev_nexthdr == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbh_hdr; + uint_t hdr_len; + + hbh_hdr = (ip6_hbh_t *)ptr; + hdr_len = 8 * (hbh_hdr->ip6h_len + 1); + prev_nexthdr = hbh_hdr->ip6h_nxt; + prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } + if (prev_nexthdr == IPPROTO_DSTOPTS) { + ip6_dest_t *dest_hdr; + uint_t hdr_len; + + dest_hdr = (ip6_dest_t *)ptr; + hdr_len = 8 * (dest_hdr->ip6d_len + 1); + prev_nexthdr = dest_hdr->ip6d_nxt; + prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } + if (prev_nexthdr == IPPROTO_ROUTING) { + ip6_rthdr_t *rthdr; + uint_t hdr_len; + + rthdr = (ip6_rthdr_t *)ptr; + prev_nexthdr = rthdr->ip6r_nxt; + prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt + - (uint8_t *)ip6h; + hdr_len = 8 * (rthdr->ip6r_len + 1); + ptr += hdr_len; + } + if (prev_nexthdr != IPPROTO_FRAGMENT) { + /* Can't handle other headers before the fragment header */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + return (NULL); + } + + /* * Note: Fragment offset in header is in 8-octet units. * Clearing least significant 3 bits not only extracts * it but also gets it in units of octets. @@ -8293,17 +3426,10 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * of eight? 
*/ if (more_frags && (ntohs(ip6h->ip6_plen) & 7)) { - zoneid_t zoneid; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - return (NULL); - } - icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&ip6h->ip6_plen - - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); + (char *)ip6h), B_FALSE, ira); return (NULL); } @@ -8319,17 +3445,11 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * greater than IP_MAXPACKET - the max payload size? */ if (end > IP_MAXPACKET) { - zoneid_t zoneid; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - return (NULL); - } - icmp_param_problem_v6(ill->ill_wq, mp, ICMP6_PARAMPROB_HEADER, + ip_drop_input("Reassembled packet too large", mp, ill); + icmp_param_problem_v6(mp, ICMP6_PARAMPROB_HEADER, (uint32_t)((char *)&fraghdr->ip6f_offlg - - (char *)ip6h), B_FALSE, B_FALSE, zoneid, ipst); + (char *)ip6h), B_FALSE, ira); return (NULL); } @@ -8368,11 +3488,17 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * there is anything on the reassembly queue, the timer will * be running. 
*/ - msg_len = MBLKSIZE(mp); + /* Handle vnic loopback of fragments */ + if (mp->b_datap->db_ref > 2) + msg_len = 0; + else + msg_len = MBLKSIZE(mp); + tail_mp = mp; while (tail_mp->b_cont != NULL) { tail_mp = tail_mp->b_cont; - msg_len += MBLKSIZE(tail_mp); + if (tail_mp->b_datap->db_ref <= 2) + msg_len += MBLKSIZE(tail_mp); } /* * If the reassembly list for this ILL will get too big @@ -8381,6 +3507,9 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if ((msg_len + sizeof (*ipf) + ill->ill_frag_count) >= ipst->ips_ip_reass_queue_bytes) { + DTRACE_PROBE3(ip_reass_queue_bytes, uint_t, msg_len, + uint_t, ill->ill_frag_count, + uint_t, ipst->ips_ip_reass_queue_bytes); ill_frag_prune(ill, (ipst->ips_ip_reass_queue_bytes < msg_len) ? 0 : (ipst->ips_ip_reass_queue_bytes - msg_len)); @@ -8443,6 +3572,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, mp1 = allocb(sizeof (*ipf), BPRI_MED); if (!mp1) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); partial_reass_done: mutex_exit(&ipfb->ipfb_lock); @@ -8512,7 +3642,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, */ ipf->ipf_end = end; ipf->ipf_nf_hdr_len = hdr_length; - ipf->ipf_prev_nexthdr_offset = *prev_nexthdr_offset; + ipf->ipf_prev_nexthdr_offset = prev_nexthdr_offset; } else { /* Hard case, hole at the beginning. 
*/ ipf->ipf_tail_mp = NULL; @@ -8603,7 +3733,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, if (ipf->ipf_prev_nexthdr_offset == 0) { ipf->ipf_nf_hdr_len = hdr_length; ipf->ipf_prev_nexthdr_offset = - *prev_nexthdr_offset; + prev_nexthdr_offset; } } /* Save current byte count */ @@ -8654,7 +3784,7 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, * header */ nexthdr = ipf->ipf_protocol; - *prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset; + prev_nexthdr_offset = ipf->ipf_prev_nexthdr_offset; ipfp = ipf->ipf_ptphn; /* We need to supply these to caller */ @@ -8685,7 +3815,8 @@ ip_rput_frag_v6(ill_t *ill, ill_t *inill, mblk_t *mp, ip6_t *ip6h, reass_done: if (hdr_length < sizeof (ip6_frag_t)) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); - ip1dbg(("ip_rput_frag_v6: bad packet\n")); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + ip1dbg(("ip_input_fragment_v6: bad packet\n")); freemsg(mp); return (NULL); } @@ -8708,8 +3839,9 @@ reass_done: mblk_t *nmp; if (!(nmp = dupb(mp))) { + ip1dbg(("ip_input_fragment_v6: dupb failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); - ip1dbg(("ip_rput_frag_v6: dupb failed\n")); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); return (NULL); } @@ -8720,19 +3852,24 @@ reass_done: mp->b_wptr = mp->b_rptr + hdr_length - sizeof (ip6_frag_t); ip6h = (ip6_t *)mp->b_rptr; - ((char *)ip6h)[*prev_nexthdr_offset] = nexthdr; + ((char *)ip6h)[prev_nexthdr_offset] = nexthdr; /* Restore original IP length in header. */ - ip6h->ip6_plen = htons((uint16_t)(msgdsize(mp) - IPV6_HDR_LEN)); + packet_size = msgdsize(mp); + ip6h->ip6_plen = htons((uint16_t)(packet_size - IPV6_HDR_LEN)); /* Record the ECN info. 
*/ ip6h->ip6_vcf &= htonl(0xFFCFFFFF); ip6h->ip6_vcf |= htonl(ecn_info << 20); - /* Reassembly is successful; return checksum information if needed */ - if (cksum_val != NULL) - *cksum_val = sum_val; - if (cksum_flags != NULL) - *cksum_flags = sum_flags; + /* Update the receive attributes */ + ira->ira_pktlen = packet_size; + ira->ira_ip_hdr_length = hdr_length - sizeof (ip6_frag_t); + ira->ira_protocol = nexthdr; + + /* Reassembly is successful; set checksum information in packet */ + DB_CKSUM16(mp) = (uint16_t)sum_val; + DB_CKSUMFLAGS(mp) = sum_flags; + DB_CKSUMSTART(mp) = ira->ira_ip_hdr_length; return (mp); } @@ -8742,7 +3879,7 @@ reass_done: * header. */ static in6_addr_t -pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) +pluck_out_dst(const mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) { ip6_rthdr0_t *rt0; int segleft, numaddr; @@ -8758,7 +3895,7 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) numaddr = rt0->ip6r0_len / 2; if ((rt0->ip6r0_len & 0x1) || - whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr || + (mp != NULL && whereptr + (rt0->ip6r0_len + 1) * 8 > mp->b_wptr) || (segleft > rt0->ip6r0_len / 2)) { /* * Corrupt packet. Either the routing header length is odd @@ -8784,11 +3921,13 @@ pluck_out_dst(mblk_t *mp, uint8_t *whereptr, in6_addr_t oldrv) * Walk through the options to see if there is a routing header. * If present get the destination which is the last address of * the option. + * mp needs to be provided in cases when the extension headers might span + * b_cont; mp is never modified by this function. 
*/ in6_addr_t -ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) +ip_get_dst_v6(ip6_t *ip6h, const mblk_t *mp, boolean_t *is_fragment) { - mblk_t *current_mp = mp; + const mblk_t *current_mp = mp; uint8_t nexthdr; uint8_t *whereptr; int ehdrlen; @@ -8798,7 +3937,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) ehdrlen = sizeof (ip6_t); /* We assume at least the IPv6 base header is within one mblk. */ - ASSERT(mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen); + ASSERT(mp == NULL || + (mp->b_rptr <= whereptr && mp->b_wptr >= whereptr + ehdrlen)); rv = ip6h->ip6_dst; nexthdr = ip6h->ip6_nxt; @@ -8819,7 +3959,8 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) * All IPv6 extension headers have the next-header in byte * 0, and the (length - 8) in 8-byte-words. */ - while (whereptr + ehdrlen >= current_mp->b_wptr) { + while (current_mp != NULL && + whereptr + ehdrlen >= current_mp->b_wptr) { ehdrlen -= (current_mp->b_wptr - whereptr); current_mp = current_mp->b_cont; if (current_mp == NULL) { @@ -8833,7 +3974,7 @@ ip_get_dst_v6(ip6_t *ip6h, mblk_t *mp, boolean_t *is_fragment) whereptr += ehdrlen; nexthdr = *whereptr; - ASSERT(whereptr + 1 < current_mp->b_wptr); + ASSERT(current_mp == NULL || whereptr + 1 < current_mp->b_wptr); ehdrlen = (*(whereptr + 1) + 1) * 8; } @@ -8845,7 +3986,7 @@ done: /* * ip_source_routed_v6: - * This function is called by redirect code in ip_rput_data_v6 to + * This function is called by redirect code (called from ip_input_v6) to * know whether this packet is source routed through this node i.e * whether this node (router) is part of the journey. 
This * function is called under two cases : @@ -8922,22 +4063,14 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) */ if (rthdr->ip6r0_segleft > 0 || rthdr->ip6r0_segleft == 0) { - ire_t *ire = NULL; - numaddr = rthdr->ip6r0_len / 2; addrptr = (in6_addr_t *)((char *)rthdr + sizeof (*rthdr)); addrptr += (numaddr - (rthdr->ip6r0_segleft + 1)); if (addrptr != NULL) { - ire = ire_ctable_lookup_v6(addrptr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, - ipst); - if (ire != NULL) { - ire_refrele(ire); + if (ip_type_v6(addrptr, ipst) == IRE_LOCAL) return (B_TRUE); - } - ip1dbg(("ip_source_routed_v6: No ire found\n")); + ip1dbg(("ip_source_routed_v6: Not local\n")); } } /* FALLTHRU */ @@ -8948,2387 +4081,19 @@ ip_source_routed_v6(ip6_t *ip6h, mblk_t *mp, ip_stack_t *ipst) } /* - * ip_wput_v6 -- Packets sent down from transport modules show up here. - * Assumes that the following set of headers appear in the first - * mblk: - * ip6i_t (if present) CAN also appear as a separate mblk. - * ip6_t - * Any extension headers - * TCP/UDP/SCTP header (if present) - * The routine can handle an ICMPv6 header that is not in the first mblk. - * - * The order to determine the outgoing interface is as follows: - * 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. - * 2. If q is an ill queue and (link local or multicast destination) then - * use that ill. - * 3. If IPV6_BOUND_IF has been set use that ill. - * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. Otherwise - * look for the best IRE match for the unspecified group to determine - * the ill. - * 5. For unicast: Just do an IRE lookup for the best match. - * - * arg2 is always a queue_t *. - * When that queue is an ill_t (i.e. q_next != NULL), then arg must be - * the zoneid. - * When that queue is not an ill_t, then arg must be a conn_t pointer. 
- */ -void -ip_output_v6(void *arg, mblk_t *mp, void *arg2, int caller) -{ - conn_t *connp = NULL; - queue_t *q = (queue_t *)arg2; - ire_t *ire = NULL; - ire_t *sctp_ire = NULL; - ip6_t *ip6h; - in6_addr_t *v6dstp; - ill_t *ill = NULL; - ipif_t *ipif; - ip6i_t *ip6i; - int cksum_request; /* -1 => normal. */ - /* 1 => Skip TCP/UDP/SCTP checksum */ - /* Otherwise contains insert offset for checksum */ - int unspec_src; - boolean_t do_outrequests; /* Increment OutRequests? */ - mib2_ipIfStatsEntry_t *mibptr; - int match_flags = MATCH_IRE_ILL; - mblk_t *first_mp; - boolean_t mctl_present; - ipsec_out_t *io; - boolean_t multirt_need_resolve = B_FALSE; - mblk_t *copy_mp = NULL; - int err = 0; - int ip6i_flags = 0; - zoneid_t zoneid; - ill_t *saved_ill = NULL; - boolean_t conn_lock_held; - boolean_t need_decref = B_FALSE; - ip_stack_t *ipst; - - if (q->q_next != NULL) { - ill = (ill_t *)q->q_ptr; - ipst = ill->ill_ipst; - } else { - connp = (conn_t *)arg; - ASSERT(connp != NULL); - ipst = connp->conn_netstack->netstack_ip; - } - - /* - * Highest bit in version field is Reachability Confirmation bit - * used by NUD in ip_xmit_v6(). - */ -#ifdef _BIG_ENDIAN -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) -#else -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) -#endif - - /* - * M_CTL comes from 5 places - * - * 1) TCP sends down IPSEC_OUT(M_CTL) for detached connections - * both V4 and V6 datagrams. - * - * 2) AH/ESP sends down M_CTL after doing their job with both - * V4 and V6 datagrams. - * - * 3) NDP callbacks when nce is resolved and IPSEC_OUT has been - * attached. - * - * 4) Notifications from an external resolver (for XRESOLV ifs) - * - * 5) AH/ESP send down IPSEC_CTL(M_CTL) to be relayed to hardware for - * IPsec hardware acceleration support. - * - * We need to handle (1)'s IPv6 case and (3) here. For the - * IPv4 case in (1), and (2), IPSEC processing has already - * started. 
The code in ip_wput() already knows how to handle - * continuing IPSEC processing (for IPv4 and IPv6). All other - * M_CTLs (including case (4)) are passed on to ip_wput_nondata() - * for handling. - */ - first_mp = mp; - mctl_present = B_FALSE; - io = NULL; - - /* Multidata transmit? */ - if (DB_TYPE(mp) == M_MULTIDATA) { - /* - * We should never get here, since all Multidata messages - * originating from tcp should have been directed over to - * tcp_multisend() in the first place. - */ - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } else if (DB_TYPE(mp) == M_CTL) { - uint32_t mctltype = 0; - uint32_t mlen = MBLKL(first_mp); - - mp = mp->b_cont; - mctl_present = B_TRUE; - io = (ipsec_out_t *)first_mp->b_rptr; - - /* - * Validate this M_CTL message. The only three types of - * M_CTL messages we expect to see in this code path are - * ipsec_out_t or ipsec_in_t structures (allocated as - * ipsec_info_t unions), or ipsec_ctl_t structures. - * The ipsec_out_type and ipsec_in_type overlap in the two - * data structures, and they are either set to IPSEC_OUT - * or IPSEC_IN depending on which data structure it is. - * ipsec_ctl_t is an IPSEC_CTL. - * - * All other M_CTL messages are sent to ip_wput_nondata() - * for handling. - */ - if (mlen >= sizeof (io->ipsec_out_type)) - mctltype = io->ipsec_out_type; - - if ((mlen == sizeof (ipsec_ctl_t)) && - (mctltype == IPSEC_CTL)) { - ip_output(arg, first_mp, arg2, caller); - return; - } - - if ((mlen < sizeof (ipsec_info_t)) || - (mctltype != IPSEC_OUT && mctltype != IPSEC_IN) || - mp == NULL) { - ip_wput_nondata(NULL, q, first_mp, NULL); - return; - } - /* NDP callbacks have q_next non-NULL. That's case #3. */ - if (q->q_next == NULL) { - ip6h = (ip6_t *)mp->b_rptr; - /* - * For a freshly-generated TCP dgram that needs IPV6 - * processing, don't call ip_wput immediately. We can - * tell this by the ipsec_out_proc_begin. 
In-progress - * IPSEC_OUT messages have proc_begin set to TRUE, - * and we want to send all IPSEC_IN messages to - * ip_wput() for IPsec processing or finishing. - */ - if (mctltype == IPSEC_IN || - IPVER(ip6h) != IPV6_VERSION || - io->ipsec_out_proc_begin) { - mibptr = &ipst->ips_ip6_mib; - goto notv6; - } - } - } else if (DB_TYPE(mp) != M_DATA) { - ip_wput_nondata(NULL, q, mp, NULL); - return; - } - - ip6h = (ip6_t *)mp->b_rptr; - - if (IPVER(ip6h) != IPV6_VERSION) { - mibptr = &ipst->ips_ip6_mib; - goto notv6; - } - - if (is_system_labeled() && DB_TYPE(mp) == M_DATA && - (connp == NULL || !connp->conn_ulp_labeled)) { - cred_t *cr; - pid_t pid; - - if (connp != NULL) { - ASSERT(CONN_CRED(connp) != NULL); - cr = BEST_CRED(mp, connp, &pid); - err = tsol_check_label_v6(cr, &mp, - connp->conn_mac_mode, ipst, pid); - } else if ((cr = msg_getcred(mp, &pid)) != NULL) { - err = tsol_check_label_v6(cr, &mp, CONN_MAC_DEFAULT, - ipst, pid); - } - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - if (err != 0) { - DTRACE_PROBE3( - tsol_ip_log_drop_checklabel_ip6, char *, - "conn(1), failed to check/update mp(2)", - conn_t, connp, mblk_t, mp); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - if (q->q_next != NULL) { - /* - * We don't know if this ill will be used for IPv6 - * until the ILLF_IPV6 flag is set via SIOCSLIFNAME. - * ipif_set_values() sets the ill_isv6 flag to true if - * ILLF_IPV6 is set. If the ill_isv6 flag isn't true, - * just drop the packet. 
- */ - if (!ill->ill_isv6) { - ip1dbg(("ip_wput_v6: Received an IPv6 packet before " - "ILLF_IPV6 was set\n")); - freemsg(first_mp); - return; - } - /* For uniformity do a refhold */ - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - freemsg(first_mp); - return; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - mibptr = ill->ill_ip_mib; - - ASSERT(mibptr != NULL); - unspec_src = 0; - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - zoneid = (zoneid_t)(uintptr_t)arg; - } else { - ASSERT(connp != NULL); - zoneid = connp->conn_zoneid; - - /* is queue flow controlled? */ - if ((q->q_first || connp->conn_draining) && - (caller == IP_WPUT)) { - /* - * 1) TCP sends down M_CTL for detached connections. - * 2) AH/ESP sends down M_CTL. - * - * We don't flow control either of the above. Only - * UDP and others are flow controlled for which we - * can't have a M_CTL. - */ - ASSERT(first_mp == mp); - (void) putq(q, mp); - return; - } - mibptr = &ipst->ips_ip6_mib; - unspec_src = connp->conn_unspec_src; - do_outrequests = B_TRUE; - if (mp->b_flag & MSGHASREF) { - mp->b_flag &= ~MSGHASREF; - ASSERT(connp->conn_ulp == IPPROTO_SCTP); - SCTP_EXTRACT_IPINFO(mp, sctp_ire); - need_decref = B_TRUE; - } - - /* - * If there is a policy, try to attach an ipsec_out in - * the front. At the end, first_mp either points to a - * M_DATA message or IPSEC_OUT message linked to a - * M_DATA message. We have to do it now as we might - * lose the "conn" if we go through ip_newroute. - */ - if (!mctl_present && - (connp->conn_out_enforce_policy || - connp->conn_latch != NULL)) { - ASSERT(first_mp == mp); - /* XXX Any better way to get the protocol fast ? 
*/ - if (((mp = ipsec_attach_ipsec_out(&mp, connp, NULL, - connp->conn_ulp, ipst->ips_netstack)) == NULL)) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } else { - ASSERT(mp->b_datap->db_type == M_CTL); - first_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; - io = (ipsec_out_t *)first_mp->b_rptr; - } - } - } - - /* check for alignment and full IPv6 header */ - if (!OK_32PTR((uchar_t *)ip6h) || - (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) { - ip0dbg(("ip_wput_v6: bad alignment or length\n")); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - v6dstp = &ip6h->ip6_dst; - cksum_request = -1; - ip6i = NULL; - - /* - * Once neighbor discovery has completed, ndp_process() will provide - * locally generated packets for which processing can be reattempted. - * In these cases, connp is NULL and the original zone is part of a - * prepended ipsec_out_t. - */ - if (io != NULL) { - /* - * When coming from icmp_input_v6, the zoneid might not match - * for the loopback case, because inside icmp_input_v6 the - * queue_t is a conn queue from the sending side. - */ - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - } - - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This is an ip6i_t header followed by an ip6_hdr. - * Check which fields are set. - * - * When the packet comes from a transport we should have - * all needed headers in the first mblk. However, when - * going through ip_newroute*_v6 the ip6i might be in - * a separate mblk when we return here. In that case - * we pullup everything to ensure that extension and transport - * headers "stay" in the first mblk. 
- */ - ip6i = (ip6i_t *)ip6h; - ip6i_flags = ip6i->ip6i_flags; - - ASSERT((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t) || - ((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN)); - - if ((mp->b_wptr - (uchar_t *)ip6i) == sizeof (ip6i_t)) { - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg failed\n")); - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - v6dstp = &ip6h->ip6_dst; - ip6i = (ip6i_t *)ip6h; - } - ip6h = (ip6_t *)&ip6i[1]; - - /* - * Advance rptr past the ip6i_t to get ready for - * transmitting the packet. However, if the packet gets - * passed to ip_newroute*_v6 then rptr is moved back so - * that the ip6i_t header can be inspected when the - * packet comes back here after passing through - * ire_add_then_send. - */ - mp->b_rptr = (uchar_t *)ip6h; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - if (ill != NULL) - ill_refrele(ill); - ill = ill_lookup_on_ifindex(ip6i->ip6i_ifindex, 1, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_v6: bad ifindex %d\n", - ip6i->ip6i_ifindex)); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - mibptr = ill->ill_ip_mib; - /* - * Preserve the index so that when we return from - * IPSEC processing, we know where to send the packet. 
- */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_ill_index = ip6i->ip6i_ifindex; - } - } - if (ip6i->ip6i_flags & IP6I_VERIFY_SRC) { - cred_t *cr = msg_getcred(mp, NULL); - - /* rpcmod doesn't send down db_credp for UDP packets */ - if (cr == NULL) { - if (connp != NULL) - cr = connp->conn_cred; - else - cr = ill->ill_credp; - } - - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)); - if (secpolicy_net_rawaccess(cr) != 0) { - /* - * Use IPCL_ZONEID to honor SO_ALLZONES. - */ - ire = ire_route_lookup_v6(&ip6h->ip6_src, - 0, 0, (IRE_LOCAL|IRE_LOOPBACK), NULL, - NULL, connp != NULL ? - IPCL_ZONEID(connp) : zoneid, NULL, - MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); - if (ire == NULL) { - if (do_outrequests) - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_v6: bad source " - "addr\n")); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ire_refrele(ire); - } - /* No need to verify again when using ip_newroute */ - ip6i->ip6i_flags &= ~IP6I_VERIFY_SRC; - } - if (!(ip6i->ip6i_flags & IP6I_NEXTHOP)) { - /* - * Make sure they match since ip_newroute*_v6 etc might - * (unknown to them) inspect ip6i_nexthop when - * they think they access ip6_dst. - */ - ip6i->ip6i_nexthop = ip6h->ip6_dst; - } - if (ip6i->ip6i_flags & IP6I_NO_ULP_CKSUM) - cksum_request = 1; - if (ip6i->ip6i_flags & IP6I_RAW_CHECKSUM) - cksum_request = ip6i->ip6i_checksum_off; - if (ip6i->ip6i_flags & IP6I_UNSPEC_SRC) - unspec_src = 1; - - if (do_outrequests && ill != NULL) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - /* - * Store ip6i_t info that we need after we come back - * from IPSEC processing. - */ - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (connp != NULL && connp->conn_dontroute) - ip6h->ip6_hops = 1; - - if (IN6_IS_ADDR_MULTICAST(v6dstp)) - goto ipv6multicast; - - /* 1. 
If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - ASSERT(ill != NULL); - goto send_from_ill; - } - - /* - * 2. If q is an ill queue and there's a link-local destination - * then use that ill. - */ - if (ill != NULL && IN6_IS_ADDR_LINKLOCAL(v6dstp)) - goto send_from_ill; - - /* 3. If IPV6_BOUND_IF has been set use that ill. */ - if (connp != NULL && connp->conn_outgoing_ill != NULL) { - ill_t *conn_outgoing_ill; - - conn_outgoing_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED) { - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - freemsg(first_mp); - return; - } - if (ill != NULL) - ill_refrele(ill); - ill = conn_outgoing_ill; - mibptr = ill->ill_ip_mib; - goto send_from_ill; - } - - /* - * 4. For unicast: Just do an IRE lookup for the best match. - * If we get here for a link-local address it is rather random - * what interface we pick on a multihomed host. - * *If* there is an IRE_CACHE (and the link-local address - * isn't duplicated on multi links) this will find the IRE_CACHE. - * Otherwise it will use one of the matching IRE_INTERFACE routes - * for the link-local prefix. Hence, applications - * *should* be encouraged to specify an outgoing interface when sending - * to a link local address. - */ - if (connp == NULL || (IP_FLOW_CONTROLLED_ULP(connp->conn_ulp) && - !connp->conn_fully_bound)) { - /* - * We cache IRE_CACHEs to avoid lookups. We don't do - * this for the tcp global queue and listen end point - * as it does not really have a real destination to - * talk to. - */ - ire = ire_cache_lookup_v6(v6dstp, zoneid, msg_getlabel(mp), - ipst); - } else { - /* - * IRE_MARK_CONDEMNED is marked in ire_delete. We don't - * grab a lock here to check for CONDEMNED as it is okay - * to send a packet or two with the IRE_CACHE that is going - * away. - */ - mutex_enter(&connp->conn_lock); - ire = sctp_ire != NULL ? 
sctp_ire : connp->conn_ire_cache; - if (ire != NULL && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - - } else { - boolean_t cached = B_FALSE; - - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL && sctp_ire == NULL) - IRE_REFRELE_NOTR(ire); - - ire = ire_cache_lookup_v6(v6dstp, zoneid, - msg_getlabel(mp), ipst); - if (ire != NULL) { - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && - (connp->conn_ire_cache == NULL)) { - rw_enter(&ire->ire_bucket->irb_lock, - RW_READER); - if (!(ire->ire_marks & - IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it - * was not cached, we should drop the extra - * reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); - } - } - } - - if (ire != NULL) { - if (do_outrequests) { - /* Handle IRE_LOCAL's that might appear here */ - if (ire->ire_type == IRE_CACHE) { - mibptr = ((ill_t *)ire->ire_stq->q_ptr)-> - ill_ip_mib; - } else { - mibptr = ire->ire_ipif->ipif_ill->ill_ip_mib; - } - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - } - - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force hop limit of multirouted packets if required. - * The hop limit of such packets is bounded by the - * ip_multirt_ttl ndd variable. - * NDP packets must have a hop limit of 255; don't - * change the hop limit in that case. 
- */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) && - (ip6h->ip6_hops != IPV6_MAX_HOPS)) { - if (ip_debug > 3) { - ip2dbg(("ip_wput_v6: forcing multirt " - "hop limit to %d (was %d) ", - ipst->ips_ip_multirt_ttl, - ip6h->ip6_hops)); - pr_addr_dbg("v6dst %s\n", AF_INET6, - &ire->ire_addr_v6); - } - ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; - } - - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_need_resolve_v6() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we do a copy - * of the current message. It will be used - * to initiate additional route resolutions. - */ - multirt_need_resolve = - ire_multirt_need_resolve_v6(&ire->ire_addr_v6, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput_v6: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, ip6i_flags, zoneid); - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - IRE_REFRELE(ire); - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve_v6() deemed it necessary. - * copy_mp will be consumed (sent or freed) by - * ip_newroute_v6(). - */ - if (copy_mp != NULL) { - if (mctl_present) { - ip6h = (ip6_t *)copy_mp->b_cont->b_rptr; - } else { - ip6h = (ip6_t *)copy_mp->b_rptr; - } - ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, - &ip6h->ip6_src, NULL, zoneid, ipst); - } - if (ill != NULL) - ill_refrele(ill); - return; - } - - /* - * No full IRE for this destination. Send it to - * ip_newroute_v6 to see if anything else matches. - * Mark this packet as having originated on this - * machine. 
- * Update rptr if there was an ip6i_t header. - */ - mp->b_prev = NULL; - mp->b_next = NULL; - if (ip6i != NULL) - mp->b_rptr -= sizeof (ip6i_t); - - if (unspec_src) { - if (ip6i == NULL) { - /* - * Add ip6i_t header to carry unspec_src - * until the packet comes back in ip_wput_v6. - */ - mp = ip_add_info_v6(mp, NULL, v6dstp); - if (mp == NULL) { - if (do_outrequests) - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (mctl_present) - freeb(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - - if (mctl_present) { - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - - if ((mp->b_wptr - (uchar_t *)ip6i) == - sizeof (ip6i_t)) { - /* - * ndp_resolver called from ip_newroute_v6 - * expects pulled up message. - */ - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg" - " failed\n")); - if (do_outrequests) { - BUMP_MIB(mibptr, - ipIfStatsHCOutRequests); - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - } - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - } - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - if (need_decref) - CONN_DEC_REF(connp); - ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, NULL, zoneid, ipst); - if (ill != NULL) - ill_refrele(ill); - return; - - - /* - * Handle multicast packets with or without an conn. - * Assumes that the transports set ip6_hops taking - * IPV6_MULTICAST_HOPS (and the other ways to set the hoplimit) - * into account. 
- */ -ipv6multicast: - ip2dbg(("ip_wput_v6: multicast\n")); - - /* - * Hold the conn_lock till we refhold the ill of interest that is - * pointed to from the conn. Since we cannot do an ill/ipif_refrele - * while holding any locks, postpone the refrele until after the - * conn_lock is dropped. - */ - if (connp != NULL) { - mutex_enter(&connp->conn_lock); - conn_lock_held = B_TRUE; - } else { - conn_lock_held = B_FALSE; - } - if (ip6i != NULL && (ip6i->ip6i_flags & IP6I_IFINDEX)) { - /* 1. If an ip6i_t with IP6I_IFINDEX set then use that ill. */ - ASSERT(ill != NULL); - } else if (ill != NULL) { - /* - * 2. If q is an ill queue and (link local or multicast - * destination) then use that ill. - * We don't need the ipif initialization here. - * This useless assert below is just to prevent lint from - * reporting a null body if statement. - */ - ASSERT(ill != NULL); - } else if (connp != NULL) { - /* - * 3. If IPV6_BOUND_IF has been set use that ill. - * - * 4. For multicast: if IPV6_MULTICAST_IF has been set use it. - * Otherwise look for the best IRE match for the unspecified - * group to determine the ill. - * - * conn_multicast_ill is used for only IPv6 packets. - * conn_multicast_ipif is used for only IPv4 packets. - * Thus a PF_INET6 socket send both IPv4 and IPv6 - * multicast packets using different IP*_MULTICAST_IF - * interfaces. 
- */ - if (connp->conn_outgoing_ill != NULL) { - err = ill_check_and_refhold(connp->conn_outgoing_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_outgoing_ill no ipif\n")); -multicast_discard: - ASSERT(saved_ill == NULL); - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - if (ill != NULL) - ill_refrele(ill); - freemsg(first_mp); - if (do_outrequests) - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ill = connp->conn_outgoing_ill; - } else if (connp->conn_multicast_ill != NULL) { - err = ill_check_and_refhold(connp->conn_multicast_ill); - if (err == ILL_LOOKUP_FAILED) { - ip1dbg(("ip_output_v6: multicast" - " conn_multicast_ill no ipif\n")); - goto multicast_discard; - } - ill = connp->conn_multicast_ill; - } else { - mutex_exit(&connp->conn_lock); - conn_lock_held = B_FALSE; - ipif = ipif_lookup_group_v6(v6dstp, zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_output_v6: multicast no ipif\n")); - goto multicast_discard; - } - /* - * We have a ref to this ipif, so we can safely - * access ipif_ill. - */ - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if (!ILL_CAN_LOOKUP(ill)) { - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - ill = NULL; - ip1dbg(("ip_output_v6: multicast no ipif\n")); - goto multicast_discard; - } - ill_refhold_locked(ill); - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - /* - * Save binding until IPV6_MULTICAST_IF - * changes it - */ - mutex_enter(&connp->conn_lock); - connp->conn_multicast_ill = ill; - mutex_exit(&connp->conn_lock); - } - } - if (conn_lock_held) - mutex_exit(&connp->conn_lock); - - if (saved_ill != NULL) - ill_refrele(saved_ill); - - ASSERT(ill != NULL); - /* - * For multicast loopback interfaces replace the multicast address - * with a unicast address for the ire lookup. 
- */ - if (IS_LOOPBACK(ill)) - v6dstp = &ill->ill_ipif->ipif_v6lcl_addr; - - mibptr = ill->ill_ip_mib; - if (do_outrequests) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - - /* - * As we may lose the conn by the time we reach ip_wput_ire_v6 - * we copy conn_multicast_loop and conn_dontroute on to an - * ipsec_out. In case if this datagram goes out secure, - * we need the ill_index also. Copy that also into the - * ipsec_out. - */ - if (mctl_present) { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(first_mp->b_datap->db_type == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - } else { - ASSERT(mp == first_mp); - if ((first_mp = ipsec_alloc_ipsec_out(ipst->ips_netstack)) == - NULL) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(mp); - if (ill != NULL) - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - io = (ipsec_out_t *)first_mp->b_rptr; - /* This is not a secure packet */ - io->ipsec_out_secure = B_FALSE; - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_zoneid = - (zoneid != ALL_ZONES ? zoneid : GLOBAL_ZONEID); - first_mp->b_cont = mp; - mctl_present = B_TRUE; - } - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - io->ipsec_out_unspec_src = unspec_src; - if (connp != NULL) - io->ipsec_out_dontroute = connp->conn_dontroute; - -send_from_ill: - ASSERT(ill != NULL); - ASSERT(mibptr == ill->ill_ip_mib); - - if (do_outrequests) { - BUMP_MIB(mibptr, ipIfStatsHCOutRequests); - do_outrequests = B_FALSE; - } - - /* - * Because nce_xmit() calls ip_output_v6() and NCEs are always tied to - * an underlying interface, IS_UNDER_IPMP() may be true even when - * building IREs that will be used for data traffic. As such, use the - * packet's source address to determine whether the traffic is test - * traffic, and set MATCH_IRE_MARK_TESTHIDDEN if so. 
- * - * Separately, we also need to mark probe packets so that ND can - * process them specially; see the comments in nce_queue_mp_common(). - */ - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && - ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) { - if (ip6i == NULL) { - if ((mp = ip_add_info_v6(mp, NULL, v6dstp)) == NULL) { - if (mctl_present) - freeb(first_mp); - goto discard; - } - - if (mctl_present) - first_mp->b_cont = mp; - else - first_mp = mp; - - /* ndp_resolver() expects a pulled-up message */ - if (MBLKL(mp) == sizeof (ip6i_t) && - pullupmsg(mp, -1) == 0) { - ip1dbg(("ip_output_v6: pullupmsg failed\n")); -discard: BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ill_refrele(ill); - if (need_decref) - CONN_DEC_REF(connp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - mp->b_rptr = (uchar_t *)ip6h; /* rewound below */ - } - ip6i->ip6i_flags |= IP6I_IPMP_PROBE; - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - } - - if (io != NULL) - io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; - - /* - * When a specific ill is specified (using IPV6_PKTINFO, - * IPV6_MULTICAST_IF, or IPV6_BOUND_IF) we will only match - * on routing entries (ftable and ctable) that have a matching - * ire->ire_ipif->ipif_ill. Thus this can only be used - * for destinations that are on-link for the specific ill - * and that can appear on multiple links. Thus it is useful - * for multicast destinations, link-local destinations, and - * at some point perhaps for site-local destinations (if the - * node sits at a site boundary). - * We create the cache entries in the regular ctable since - * it can not "confuse" things for other destinations. - * table. - * - * NOTE : conn_ire_cache is not used for caching ire_ctable_lookups. - * It is used only when ire_cache_lookup is used above. 
- */ - ire = ire_ctable_lookup_v6(v6dstp, 0, 0, ill->ill_ipif, - zoneid, msg_getlabel(mp), match_flags, ipst); - if (ire != NULL) { - /* - * Check if the ire has the RTF_MULTIRT flag, inherited - * from an IRE_OFFSUBNET ire entry in ip_newroute(). - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Force hop limit of multirouted packets if required. - * The hop limit of such packets is bounded by the - * ip_multirt_ttl ndd variable. - * NDP packets must have a hop limit of 255; don't - * change the hop limit in that case. - */ - if ((ipst->ips_ip_multirt_ttl > 0) && - (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl) && - (ip6h->ip6_hops != IPV6_MAX_HOPS)) { - if (ip_debug > 3) { - ip2dbg(("ip_wput_v6: forcing multirt " - "hop limit to %d (was %d) ", - ipst->ips_ip_multirt_ttl, - ip6h->ip6_hops)); - pr_addr_dbg("v6dst %s\n", AF_INET6, - &ire->ire_addr_v6); - } - ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; - } - - /* - * We look at this point if there are pending - * unresolved routes. ire_multirt_need_resolve_v6() - * checks in O(n) that all IRE_OFFSUBNET ire - * entries for the packet's destination and - * flagged RTF_MULTIRT are currently resolved. - * If some remain unresolved, we make a copy - * of the current message. It will be used - * to initiate additional route resolutions. 
- */ - multirt_need_resolve = - ire_multirt_need_resolve_v6(&ire->ire_addr_v6, - msg_getlabel(first_mp), ipst); - ip2dbg(("ip_wput_v6[send_from_ill]: ire %p, " - "multirt_need_resolve %d, first_mp %p\n", - (void *)ire, multirt_need_resolve, - (void *)first_mp)); - if (multirt_need_resolve) { - copy_mp = copymsg(first_mp); - if (copy_mp != NULL) { - MULTIRT_DEBUG_TAG(copy_mp); - } - } - } - - ip1dbg(("ip_wput_v6: send on %s, ire = %p, ill index = %d\n", - ill->ill_name, (void *)ire, - ill->ill_phyint->phyint_ifindex)); - ip_wput_ire_v6(q, first_mp, ire, unspec_src, cksum_request, - connp, caller, ip6i_flags, zoneid); - ire_refrele(ire); - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - - /* - * Try to resolve another multiroute if - * ire_multirt_need_resolve_v6() deemed it necessary. - * copy_mp will be consumed (sent or freed) by - * ip_newroute_[ipif_]v6(). - */ - if (copy_mp != NULL) { - if (mctl_present) { - ip6h = (ip6_t *)copy_mp->b_cont->b_rptr; - } else { - ip6h = (ip6_t *)copy_mp->b_rptr; - } - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - ipif = ipif_lookup_group_v6(&ip6h->ip6_dst, - zoneid, ipst); - if (ipif == NULL) { - ip1dbg(("ip_wput_v6: No ipif for " - "multicast\n")); - MULTIRT_DEBUG_UNTAG(copy_mp); - freemsg(copy_mp); - return; - } - ip_newroute_ipif_v6(q, copy_mp, ipif, - &ip6h->ip6_dst, &ip6h->ip6_src, unspec_src, - zoneid); - ipif_refrele(ipif); - } else { - ip_newroute_v6(q, copy_mp, &ip6h->ip6_dst, - &ip6h->ip6_src, ill, zoneid, ipst); - } - } - ill_refrele(ill); - return; - } - if (need_decref) { - CONN_DEC_REF(connp); - connp = NULL; - } - - /* Update rptr if there was an ip6i_t header. */ - if (ip6i != NULL) - mp->b_rptr -= sizeof (ip6i_t); - if (unspec_src) { - if (ip6i == NULL) { - /* - * Add ip6i_t header to carry unspec_src - * until the packet comes back in ip_wput_v6. 
- */ - if (mctl_present) { - first_mp->b_cont = - ip_add_info_v6(mp, NULL, v6dstp); - mp = first_mp->b_cont; - if (mp == NULL) - freeb(first_mp); - } else { - first_mp = mp = ip_add_info_v6(mp, NULL, - v6dstp); - } - if (mp == NULL) { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ill_refrele(ill); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - if ((mp->b_wptr - (uchar_t *)ip6i) == - sizeof (ip6i_t)) { - /* - * ndp_resolver called from ip_newroute_v6 - * expects a pulled up message. - */ - if (!pullupmsg(mp, -1)) { - ip1dbg(("ip_wput_v6: pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6i = (ip6i_t *)mp->b_rptr; - } - ip6h = (ip6_t *)&ip6i[1]; - v6dstp = &ip6h->ip6_dst; - } - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; - if (mctl_present) { - ASSERT(io != NULL); - io->ipsec_out_unspec_src = unspec_src; - } - } - if (IN6_IS_ADDR_MULTICAST(v6dstp)) { - ip_newroute_ipif_v6(q, first_mp, ill->ill_ipif, v6dstp, - &ip6h->ip6_src, unspec_src, zoneid); - } else { - ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, - zoneid, ipst); - } - ill_refrele(ill); - return; - -notv6: - /* FIXME?: assume the caller calls the right version of ip_output? */ - if (q->q_next == NULL) { - connp = Q_TO_CONN(q); - - /* - * We can change conn_send for all types of conn, even - * though only TCP uses it right now. - * FIXME: sctp could use conn_send but doesn't currently. - */ - ip_setpktversion(connp, B_FALSE, B_TRUE, ipst); - } - BUMP_MIB(mibptr, ipIfStatsOutWrongIPVersion); - (void) ip_output(arg, first_mp, arg2, caller); - if (ill != NULL) - ill_refrele(ill); -} - -/* - * If this is a conn_t queue, then we pass in the conn. This includes the - * zoneid. - * Otherwise, this is a message for an ill_t queue, - * in which case we use the global zoneid since those are all part of - * the global zone. 
- */ -void -ip_wput_v6(queue_t *q, mblk_t *mp) -{ - if (CONN_Q(q)) - ip_output_v6(Q_TO_CONN(q), mp, q, IP_WPUT); - else - ip_output_v6(GLOBAL_ZONEID, mp, q, IP_WPUT); -} - -/* - * NULL send-to queue - packet is to be delivered locally. - */ -void -ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, - ire_t *ire, int fanout_flags, zoneid_t zoneid) -{ - uint32_t ports; - mblk_t *mp = first_mp, *first_mp1; - boolean_t mctl_present; - uint8_t nexthdr; - uint16_t hdr_length; - ipsec_out_t *io; - mib2_ipIfStatsEntry_t *mibptr; - ilm_t *ilm; - uint_t nexthdr_offset; - ip_stack_t *ipst = ill->ill_ipst; - - if (DB_TYPE(mp) == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - if (!io->ipsec_out_secure) { - mp = mp->b_cont; - freeb(first_mp); - first_mp = mp; - mctl_present = B_FALSE; - } else { - mctl_present = B_TRUE; - mp = first_mp->b_cont; - ipsec_out_to_in(first_mp); - } - } else { - mctl_present = B_FALSE; - } - - /* - * Remove reachability confirmation bit from version field - * before passing the packet on to any firewall hooks or - * looping back the packet. - */ - if (ip6h->ip6_vcf & IP_FORWARD_PROG) - ip6h->ip6_vcf &= ~IP_FORWARD_PROG; - - DTRACE_PROBE4(ip6__loopback__in__start, - ill_t *, ill, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, first_mp); - - FW_HOOKS6(ipst->ips_ip6_loopback_in_event, - ipst->ips_ipv6firewall_loopback_in, - ill, NULL, ip6h, first_mp, mp, 0, ipst); - - DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, first_mp); - - if (first_mp == NULL) - return; - - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t szone, dzone, lookup_zoneid = ALL_ZONES; - zoneid_t stackzoneid = netstackid_to_zoneid( - ipst->ips_netstack->netstack_stackid); - - szone = (stackzoneid == GLOBAL_ZONEID) ? zoneid : stackzoneid; - /* - * ::1 is special, as we cannot lookup its zoneid by - * address. For this case, restrict the lookup to the - * source zone. 
- */ - if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) - lookup_zoneid = zoneid; - dzone = ip_get_zoneid_v6(&ip6h->ip6_dst, mp, ill, ipst, - lookup_zoneid); - ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); - } - - DTRACE_IP7(receive, mblk_t *, first_mp, conn_t *, NULL, void_ip_t *, - ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 1); - - nexthdr = ip6h->ip6_nxt; - mibptr = ill->ill_ip_mib; - - /* Fastpath */ - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ICMPV6: - case IPPROTO_SCTP: - hdr_length = IPV6_HDR_LEN; - nexthdr_offset = (uint_t)((uchar_t *)&ip6h->ip6_nxt - - (uchar_t *)ip6h); - break; - default: { - uint8_t *nexthdrp; - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_length, &nexthdrp)) { - /* Malformed packet */ - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - nexthdr = *nexthdrp; - nexthdr_offset = nexthdrp - (uint8_t *)ip6h; - break; - } - } - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - switch (nexthdr) { - case IPPROTO_TCP: - if (DB_TYPE(mp) == M_DATA) { - /* - * M_DATA mblk, so init mblk (chain) for - * no struio(). 
- */ - mblk_t *mp1 = mp; - - do { - mp1->b_datap->db_struioflag = 0; - } while ((mp1 = mp1->b_cont) != NULL); - } - ports = *(uint32_t *)(mp->b_rptr + hdr_length + - TCP_PORTS_OFFSET); - ip_fanout_tcp_v6(q, first_mp, ip6h, ill, ill, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_SYN_ADDIRE| - IP_FF_IPINFO|IP6_NO_IPPOLICY|IP_FF_LOOPBACK, - hdr_length, mctl_present, ire->ire_zoneid); - return; - - case IPPROTO_UDP: - ports = *(uint32_t *)(mp->b_rptr + hdr_length + - UDP_PORTS_OFFSET); - ip_fanout_udp_v6(q, first_mp, ip6h, ports, ill, ill, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO| - IP6_NO_IPPOLICY, mctl_present, ire->ire_zoneid); - return; - - case IPPROTO_SCTP: - { - ports = *(uint32_t *)(mp->b_rptr + hdr_length); - ip_fanout_sctp(first_mp, ill, (ipha_t *)ip6h, ports, - fanout_flags|IP_FF_SEND_ICMP|IP_FF_IPINFO, - mctl_present, IP6_NO_IPPOLICY, ire->ire_zoneid); - return; - } - case IPPROTO_ICMPV6: { - icmp6_t *icmp6; - - /* check for full IPv6+ICMPv6 header */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length); - - /* Update output mib stats */ - icmp_update_out_mib_v6(ill, icmp6); - - /* Check variable for testing applications */ - if (ipst->ips_ipv6_drop_inbound_icmpv6) { - freemsg(first_mp); - return; - } - /* - * Assume that there is always at least one conn for - * ICMPv6 (in.ndpd) i.e. don't optimize the case - * where there is no conn. - */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && - !IS_LOOPBACK(ill)) { - ilm_walker_t ilw; - - /* - * In the multicast case, applications may have - * joined the group from different zones, so we - * need to deliver the packet to each of them. 
- * Loop through the multicast memberships - * structures (ilm) on the receive ill and send - * a copy of the packet up each matching one. - * However, we don't do this for multicasts sent - * on the loopback interface (PHYI_LOOPBACK flag - * set) as they must stay in the sender's zone. - */ - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; - ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL( - &ilm->ilm_v6addr, &ip6h->ip6_dst)) - continue; - if ((fanout_flags & - IP_FF_NO_MCAST_LOOP) && - ilm->ilm_zoneid == ire->ire_zoneid) - continue; - if (!ipif_lookup_zoneid( - ilw.ilw_walk_ill, ilm->ilm_zoneid, - IPIF_UP, NULL)) - continue; - - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 == NULL) - continue; - icmp_inbound_v6(q, first_mp1, - ilw.ilw_walk_ill, ill, hdr_length, - mctl_present, IP6_NO_IPPOLICY, - ilm->ilm_zoneid, NULL); - } - ilm_walker_finish(&ilw); - } else { - first_mp1 = ip_copymsg(first_mp); - if (first_mp1 != NULL) - icmp_inbound_v6(q, first_mp1, ill, ill, - hdr_length, mctl_present, - IP6_NO_IPPOLICY, ire->ire_zoneid, - NULL); - } - } - /* FALLTHRU */ - default: { - /* - * Handle protocols with which IPv6 is less intimate. - */ - fanout_flags |= IP_FF_RAWIP|IP_FF_IPINFO; - - /* - * Enable sending ICMP for "Unknown" nexthdr - * case. i.e. where we did not FALLTHRU from - * IPPROTO_ICMPV6 processing case above. - */ - if (nexthdr != IPPROTO_ICMPV6) - fanout_flags |= IP_FF_SEND_ICMP; - /* - * Note: There can be more than one stream bound - * to a particular protocol. When this is the case, - * each one gets a copy of any incoming packets. - */ - ip_fanout_proto_v6(q, first_mp, ip6h, ill, ill, nexthdr, - nexthdr_offset, fanout_flags|IP6_NO_IPPOLICY, - mctl_present, ire->ire_zoneid); - return; - } - } -} - -/* - * Send packet using IRE. - * Checksumming is controlled by cksum_request: - * -1 => normal i.e. TCP/UDP/SCTP/ICMPv6 are checksummed and nothing else. 
- * 1 => Skip TCP/UDP/SCTP checksum - * Otherwise => checksum_request contains insert offset for checksum - * - * Assumes that the following set of headers appear in the first - * mblk: - * ip6_t - * Any extension headers - * TCP/UDP/SCTP header (if present) - * The routine can handle an ICMPv6 header that is not in the first mblk. - * - * NOTE : This function does not ire_refrele the ire passed in as the - * argument unlike ip_wput_ire where the REFRELE is done. - * Refer to ip_wput_ire for more on this. - */ -static void -ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, - int cksum_request, conn_t *connp, int caller, int flags, zoneid_t zoneid) -{ - ip6_t *ip6h; - uint8_t nexthdr; - uint16_t hdr_length; - uint_t reachable = 0x0; - ill_t *ill; - mib2_ipIfStatsEntry_t *mibptr; - mblk_t *first_mp; - boolean_t mctl_present; - ipsec_out_t *io; - boolean_t conn_dontroute; /* conn value for multicast */ - boolean_t conn_multicast_loop; /* conn value for multicast */ - boolean_t multicast_forward; /* Should we forward ? */ - int max_frag; - ip_stack_t *ipst = ire->ire_ipst; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - ill = ire_to_ill(ire); - first_mp = mp; - multicast_forward = B_FALSE; - - if (mp->b_datap->db_type != M_CTL) { - ip6h = (ip6_t *)first_mp->b_rptr; - } else { - io = (ipsec_out_t *)first_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - /* - * Grab the zone id now because the M_CTL can be discarded by - * ip_wput_ire_parse_ipsec_out() below. - */ - ASSERT(zoneid == io->ipsec_out_zoneid); - ASSERT(zoneid != ALL_ZONES); - ip6h = (ip6_t *)first_mp->b_cont->b_rptr; - /* - * For the multicast case, ipsec_out carries conn_dontroute and - * conn_multicast_loop as conn may not be available here. We - * need this for multicast loopback and forwarding which is done - * later in the code. 
- */ - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - conn_dontroute = io->ipsec_out_dontroute; - conn_multicast_loop = io->ipsec_out_multicast_loop; - /* - * If conn_dontroute is not set or conn_multicast_loop - * is set, we need to do forwarding/loopback. For - * datagrams from ip_wput_multicast, conn_dontroute is - * set to B_TRUE and conn_multicast_loop is set to - * B_FALSE so that we neither do forwarding nor - * loopback. - */ - if (!conn_dontroute || conn_multicast_loop) - multicast_forward = B_TRUE; - } - } - - /* - * If the sender didn't supply the hop limit and there is a default - * unicast hop limit associated with the output interface, we use - * that if the packet is unicast. Interface specific unicast hop - * limits as set via the SIOCSLIFLNKINFO ioctl. - */ - if (ill->ill_max_hops != 0 && !(flags & IP6I_HOPLIMIT) && - !(IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst))) { - ip6h->ip6_hops = ill->ill_max_hops; - } - - if (ire->ire_type == IRE_LOCAL && ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) { - /* - * When a zone sends a packet to another zone, we try to deliver - * the packet under the same conditions as if the destination - * was a real node on the network. To do so, we look for a - * matching route in the forwarding table. - * RTF_REJECT and RTF_BLACKHOLE are handled just like - * ip_newroute_v6() does. - * Note that IRE_LOCAL are special, since they are used - * when the zoneid doesn't match in some cases. This means that - * we need to handle ipha_src differently since ire_src_addr - * belongs to the receiving zone instead of the sending zone. - * When ip_restrict_interzone_loopback is set, then - * ire_cache_lookup_v6() ensures that IRE_LOCAL are only used - * for loopback between zones when the logical "Ethernet" would - * have looped them back. 
- */ - ire_t *src_ire; - - src_ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, - NULL, NULL, zoneid, 0, NULL, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE), ipst); - if (src_ire != NULL && - !(src_ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - (!ipst->ips_ip_restrict_interzone_loopback || - ire_local_same_lan(ire, src_ire))) { - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && - !unspec_src) { - ip6h->ip6_src = src_ire->ire_src_addr_v6; - } - ire_refrele(src_ire); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutNoRoutes); - if (src_ire != NULL) { - if (src_ire->ire_flags & RTF_BLACKHOLE) { - ire_refrele(src_ire); - freemsg(first_mp); - return; - } - ire_refrele(src_ire); - } - if (ip_hdr_complete_v6(ip6h, zoneid, ipst)) { - /* Failed */ - freemsg(first_mp); - return; - } - icmp_unreachable_v6(q, first_mp, - ICMP6_DST_UNREACH_NOROUTE, B_FALSE, B_FALSE, - zoneid, ipst); - return; - } - } - - if (mp->b_datap->db_type == M_CTL || - ipss->ipsec_outbound_v6_policy_present) { - mp = ip_wput_ire_parse_ipsec_out(first_mp, NULL, ip6h, ire, - connp, unspec_src, zoneid); - if (mp == NULL) { - return; - } - } - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - mp = mp->b_cont; - mctl_present = B_TRUE; - } else { - mctl_present = B_FALSE; - } - - ip6h = (ip6_t *)mp->b_rptr; - nexthdr = ip6h->ip6_nxt; - mibptr = ill->ill_ip_mib; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && !unspec_src) { - ipif_t *ipif; - - /* - * Select the source address using ipif_select_source_v6. 
- */ - ipif = ipif_select_source_v6(ill, &ip6h->ip6_dst, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, zoneid); - if (ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_wput_ire_v6: no src for " - "dst %s\n", AF_INET6, &ip6h->ip6_dst); - printf("through interface %s\n", ill->ill_name); - } - freemsg(first_mp); - return; - } - ip6h->ip6_src = ipif->ipif_v6src_addr; - ipif_refrele(ipif); - } - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - if ((connp != NULL && connp->conn_multicast_loop) || - !IS_LOOPBACK(ill)) { - if (ilm_lookup_ill_v6(ill, &ip6h->ip6_dst, B_FALSE, - ALL_ZONES) != NULL) { - mblk_t *nmp; - int fanout_flags = 0; - - if (connp != NULL && - !connp->conn_multicast_loop) { - fanout_flags |= IP_FF_NO_MCAST_LOOP; - } - ip1dbg(("ip_wput_ire_v6: " - "Loopback multicast\n")); - nmp = ip_copymsg(first_mp); - if (nmp != NULL) { - ip6_t *nip6h; - mblk_t *mp_ip6h; - - if (mctl_present) { - nip6h = (ip6_t *) - nmp->b_cont->b_rptr; - mp_ip6h = nmp->b_cont; - } else { - nip6h = (ip6_t *)nmp->b_rptr; - mp_ip6h = nmp; - } - - DTRACE_PROBE4( - ip6__loopback__out__start, - ill_t *, NULL, - ill_t *, ill, - ip6_t *, nip6h, - mblk_t *, nmp); - - FW_HOOKS6( - ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, ill, nip6h, nmp, mp_ip6h, - 0, ipst); - - DTRACE_PROBE1( - ip6__loopback__out__end, - mblk_t *, nmp); - - /* - * DTrace this as ip:::send. A blocked - * packet will fire the send probe, but - * not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, nmp, - conn_t *, NULL, void_ip_t *, nip6h, - __dtrace_ipsr_ill_t *, ill, - ipha_t *, NULL, ip6_t *, nip6h, - int, 1); - - if (nmp != NULL) { - /* - * Deliver locally and to - * every local zone, except - * the sending zone when - * IPV6_MULTICAST_LOOP is - * disabled. 
- */ - ip_wput_local_v6(RD(q), ill, - nip6h, nmp, ire, - fanout_flags, zoneid); - } - } else { - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - ip1dbg(("ip_wput_ire_v6: " - "copymsg failed\n")); - } - } - } - if (ip6h->ip6_hops == 0 || - IN6_IS_ADDR_MC_NODELOCAL(&ip6h->ip6_dst) || - IS_LOOPBACK(ill)) { - /* - * Local multicast or just loopback on loopback - * interface. - */ - BUMP_MIB(mibptr, ipIfStatsHCOutMcastPkts); - UPDATE_MIB(mibptr, ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - ip1dbg(("ip_wput_ire_v6: local multicast only\n")); - freemsg(first_mp); - return; - } - } - - if (ire->ire_stq != NULL) { - uint32_t sum; - uint_t ill_index = ((ill_t *)ire->ire_stq->q_ptr)-> - ill_phyint->phyint_ifindex; - queue_t *dev_q = ire->ire_stq->q_next; - - /* - * non-NULL send-to queue - packet is to be sent - * out an interface. - */ - - /* Driver is flow-controlling? */ - if (!IP_FLOW_CONTROLLED_ULP(nexthdr) && - DEV_Q_FLOW_BLOCKED(dev_q)) { - /* - * Queue packet if we have an conn to give back - * pressure. We can't queue packets intended for - * hardware acceleration since we've tossed that - * state already. If the packet is being fed back - * from ire_send_v6, we don't know the position in - * the queue to enqueue the packet and we discard - * the packet. - */ - if (ipst->ips_ip_output_queue && connp != NULL && - !mctl_present && caller != IRE_SEND) { - if (caller == IP_WSRV) { - idl_tx_list_t *idl_txl; - - idl_txl = &ipst->ips_idl_tx_list[0]; - connp->conn_did_putbq = 1; - (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp, idl_txl); - /* - * caller == IP_WSRV implies we are - * the service thread, and the - * queue is already noenabled. - * The check for canput and - * the putbq is not atomic. - * So we need to check again. 
- */ - if (canput(dev_q)) - connp->conn_did_putbq = 0; - } else { - (void) putq(connp->conn_wq, mp); - } - return; - } - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - - /* - * Look for reachability confirmations from the transport. - */ - if (ip6h->ip6_vcf & IP_FORWARD_PROG) { - reachable |= IPV6_REACHABILITY_CONFIRMATION; - ip6h->ip6_vcf &= ~IP_FORWARD_PROG; - if (mctl_present) - io->ipsec_out_reachable = B_TRUE; - } - /* Fastpath */ - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_ICMPV6: - case IPPROTO_SCTP: - hdr_length = IPV6_HDR_LEN; - break; - default: { - uint8_t *nexthdrp; - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_length, &nexthdrp)) { - /* Malformed packet */ - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - nexthdr = *nexthdrp; - break; - } - } - - if (cksum_request != -1 && nexthdr != IPPROTO_ICMPV6) { - uint16_t *up; - uint16_t *insp; - - /* - * The packet header is processed once for all, even - * in the multirouting case. We disable hardware - * checksum if the packet is multirouted, as it will be - * replicated via several interfaces, and not all of - * them may have this capability. - */ - if (cksum_request == 1 && - !(ire->ire_flags & RTF_MULTIRT)) { - /* Skip the transport checksum */ - goto cksum_done; - } - /* - * Do user-configured raw checksum. 
- * Compute checksum and insert at offset "cksum_request" - */ - - /* check for enough headers for checksum */ - cksum_request += hdr_length; /* offset from rptr */ - if ((mp->b_wptr - mp->b_rptr) < - (cksum_request + sizeof (int16_t))) { - if (!pullupmsg(mp, - cksum_request + sizeof (int16_t))) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - insp = (uint16_t *)((uchar_t *)ip6h + cksum_request); - ASSERT(((uintptr_t)insp & 0x1) == 0); - up = (uint16_t *)&ip6h->ip6_src; - /* - * icmp has placed length and routing - * header adjustment in *insp. - */ - sum = htons(nexthdr) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - *insp = IP_CSUM(mp, hdr_length, sum); - } else if (nexthdr == IPPROTO_TCP) { - uint16_t *up; - - /* - * Check for full IPv6 header + enough TCP header - * to get at the checksum field. - */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + TCP_CHECKSUM_OFFSET + - TCP_CHECKSUM_SIZE)) { - if (!pullupmsg(mp, hdr_length + - TCP_CHECKSUM_OFFSET + TCP_CHECKSUM_SIZE)) { - ip1dbg(("ip_wput_v6: TCP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - - up = (uint16_t *)&ip6h->ip6_src; - /* - * Note: The TCP module has stored the length value - * into the tcp checksum field, so we don't - * need to explicitly sum it in here. 
- */ - sum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + TCP_CHECKSUM_OFFSET); - - IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_TCP, - hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - ire->ire_max_frag, mctl_present, sum); - - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - IP6_STAT(ipst, ip6_out_sw_cksum); - IP6_STAT_UPDATE(ipst, - ip6_tcp_out_sw_cksum_bytes, - (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - - hdr_length); - } - } else if (nexthdr == IPPROTO_UDP) { - uint16_t *up; - - /* - * check for full IPv6 header + enough UDP header - * to get at the UDP checksum field - */ - if ((mp->b_wptr - mp->b_rptr) < (hdr_length + - UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { - if (!pullupmsg(mp, hdr_length + - UDP_CHECKSUM_OFFSET + UDP_CHECKSUM_SIZE)) { - ip1dbg(("ip_wput_v6: UDP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - up = (uint16_t *)&ip6h->ip6_src; - /* - * Note: The UDP module has stored the length value - * into the udp checksum field, so we don't - * need to explicitly sum it in here. - */ - sum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - sum = (sum & 0xffff) + (sum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - hdr_length + UDP_CHECKSUM_OFFSET); - - IP_CKSUM_XMIT(ill, ire, mp, ip6h, up, IPPROTO_UDP, - hdr_length, ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - ire->ire_max_frag, mctl_present, sum); - - /* Software checksum? 
*/ - if (DB_CKSUMFLAGS(mp) == 0) { - IP6_STAT(ipst, ip6_out_sw_cksum); - IP6_STAT_UPDATE(ipst, - ip6_udp_out_sw_cksum_bytes, - (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN) - - hdr_length); - } - } else if (nexthdr == IPPROTO_ICMPV6) { - uint16_t *up; - icmp6_t *icmp6; - - /* check for full IPv6+ICMPv6 header */ - if ((mp->b_wptr - mp->b_rptr) < - (hdr_length + ICMP6_MINLEN)) { - if (!pullupmsg(mp, hdr_length + ICMP6_MINLEN)) { - ip1dbg(("ip_wput_v6: ICMP hdr pullupmsg" - " failed\n")); - BUMP_MIB(mibptr, ipIfStatsOutDiscards); - freemsg(first_mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - icmp6 = (icmp6_t *)((uchar_t *)ip6h + hdr_length); - up = (uint16_t *)&ip6h->ip6_src; - /* - * icmp has placed length and routing - * header adjustment in icmp6_cksum. - */ - sum = htons(IPPROTO_ICMPV6) + - up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - sum = (sum & 0xffff) + (sum >> 16); - icmp6->icmp6_cksum = IP_CSUM(mp, hdr_length, sum); - - /* Update output mib stats */ - icmp_update_out_mib_v6(ill, icmp6); - } else if (nexthdr == IPPROTO_SCTP) { - sctp_hdr_t *sctph; - - if (MBLKL(mp) < (hdr_length + sizeof (*sctph))) { - if (!pullupmsg(mp, hdr_length + - sizeof (*sctph))) { - ip1dbg(("ip_wput_v6: SCTP hdr pullupmsg" - " failed\n")); - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - sctph = (sctp_hdr_t *)(mp->b_rptr + hdr_length); - sctph->sh_chksum = 0; - sctph->sh_chksum = sctp_cksum(mp, hdr_length); - } - - cksum_done: - /* - * We force the insertion of a fragment header using the - * IPH_FRAG_HDR flag in two cases: - * - after reception of an ICMPv6 "packet too big" message - * with a MTU < 1280 (cf. RFC 2460 section 5) - * - for multirouted IPv6 packets, so that the receiver can - * discard duplicates according to their fragment identifier - * - * Two flags modifed from the API can modify this behavior. 
- * The first is IPV6_USE_MIN_MTU. With this API the user - * can specify how to manage PMTUD for unicast and multicast. - * - * IPV6_DONTFRAG disallows fragmentation. - */ - max_frag = ire->ire_max_frag; - switch (IP6I_USE_MIN_MTU_API(flags)) { - case IPV6_USE_MIN_MTU_DEFAULT: - case IPV6_USE_MIN_MTU_UNICAST: - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - max_frag = IPV6_MIN_MTU; - } - break; - - case IPV6_USE_MIN_MTU_NEVER: - max_frag = IPV6_MIN_MTU; - break; - } - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN > max_frag || - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - if (connp != NULL && (flags & IP6I_DONTFRAG)) { - icmp_pkt2big_v6(ire->ire_stq, first_mp, - max_frag, B_FALSE, B_TRUE, zoneid, ipst); - return; - } - - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN != - (mp->b_cont ? msgdsize(mp) : - mp->b_wptr - (uchar_t *)ip6h)) { - ip0dbg(("Packet length mismatch: %d, %ld\n", - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN, - msgdsize(mp))); - freemsg(first_mp); - return; - } - /* Do IPSEC processing first */ - if (mctl_present) { - ipsec_out_process(q, first_mp, ire, ill_index); - return; - } - ASSERT(mp->b_prev == NULL); - ip2dbg(("Fragmenting Size = %d, mtu = %d\n", - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN, max_frag)); - ASSERT(mp == first_mp); - /* Initiate IPPF processing */ - if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, ill_index); - if (mp == NULL) { - return; - } - } - ip_wput_frag_v6(mp, ire, reachable, connp, - caller, max_frag); - return; - } - /* Do IPSEC processing first */ - if (mctl_present) { - int extra_len = ipsec_out_extra_length(first_mp); - - if (ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN + extra_len > - max_frag && connp != NULL && - (flags & IP6I_DONTFRAG)) { - /* - * IPsec headers will push the packet over the - * MTU limit. Issue an ICMPv6 Packet Too Big - * message for this packet if the upper-layer - * that issued this packet will be able to - * react to the icmp_pkt2big_v6() that we'll - * generate. 
- */ - icmp_pkt2big_v6(ire->ire_stq, first_mp, - max_frag, B_FALSE, B_TRUE, zoneid, ipst); - return; - } - ipsec_out_process(q, first_mp, ire, ill_index); - return; - } - /* - * XXX multicast: add ip_mforward_v6() here. - * Check conn_dontroute - */ -#ifdef lint - /* - * XXX The only purpose of this statement is to avoid lint - * errors. See the above "XXX multicast". When that gets - * fixed, remove this whole #ifdef lint section. - */ - ip3dbg(("multicast forward is %s.\n", - (multicast_forward ? "TRUE" : "FALSE"))); -#endif - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ASSERT(mp == first_mp); - ip_xmit_v6(mp, ire, reachable, connp, caller, NULL); - } else { - /* - * DTrace this as ip:::send. A blocked packet will fire the - * send probe, but not the receive probe. - */ - DTRACE_IP7(send, mblk_t *, first_mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, - NULL, ip6_t *, ip6h, int, 1); - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, ill, - ip6_t *, ip6h, mblk_t *, first_mp); - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, ill, ip6h, first_mp, mp, 0, ipst); - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, first_mp); - if (first_mp != NULL) { - ip_wput_local_v6(RD(q), ill, ip6h, first_mp, ire, 0, - zoneid); - } - } -} - -/* - * Outbound IPv6 fragmentation routine using MDT. 
- */ -static void -ip_wput_frag_mdt_v6(mblk_t *mp, ire_t *ire, size_t max_chunk, - size_t unfragmentable_len, uint8_t nexthdr, uint_t prev_nexthdr_offset) -{ - ip6_t *ip6h = (ip6_t *)mp->b_rptr; - uint_t pkts, wroff, hdr_chunk_len, pbuf_idx; - mblk_t *hdr_mp, *md_mp = NULL; - int i1; - multidata_t *mmd; - unsigned char *hdr_ptr, *pld_ptr; - ip_pdescinfo_t pdi; - uint32_t ident; - size_t len; - uint16_t offset; - queue_t *stq = ire->ire_stq; - ill_t *ill = (ill_t *)stq->q_ptr; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(MBLKL(mp) > unfragmentable_len); - - /* - * Move read ptr past unfragmentable portion, we don't want this part - * of the data in our fragments. - */ - mp->b_rptr += unfragmentable_len; - - /* Calculate how many packets we will send out */ - i1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgsize(mp); - pkts = (i1 + max_chunk - 1) / max_chunk; - ASSERT(pkts > 1); - - /* Allocate a message block which will hold all the IP Headers. */ - wroff = ipst->ips_ip_wroff_extra; - hdr_chunk_len = wroff + unfragmentable_len + sizeof (ip6_frag_t); - - i1 = pkts * hdr_chunk_len; - /* - * Create the header buffer, Multidata and destination address - * and SAP attribute that should be associated with it. - */ - if ((hdr_mp = allocb(i1, BPRI_HI)) == NULL || - ((hdr_mp->b_wptr += i1), - (mmd = mmd_alloc(hdr_mp, &md_mp, KM_NOSLEEP)) == NULL) || - !ip_md_addr_attr(mmd, NULL, ire->ire_nce->nce_res_mp)) { - freemsg(mp); - if (md_mp == NULL) { - freemsg(hdr_mp); - } else { -free_mmd: IP6_STAT(ipst, ip6_frag_mdt_discarded); - freemsg(md_mp); - } - IP6_STAT(ipst, ip6_frag_mdt_allocfail); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; - } - IP6_STAT(ipst, ip6_frag_mdt_allocd); - - /* - * Add a payload buffer to the Multidata; this operation must not - * fail, or otherwise our logic in this routine is broken. 
There - * is no memory allocation done by the routine, so any returned - * failure simply tells us that we've done something wrong. - * - * A failure tells us that either we're adding the same payload - * buffer more than once, or we're trying to add more buffers than - * allowed. None of the above cases should happen, and we panic - * because either there's horrible heap corruption, and/or - * programming mistake. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) { - goto pbuf_panic; - } - - hdr_ptr = hdr_mp->b_rptr; - pld_ptr = mp->b_rptr; - - pdi.flags = PDESC_HBUF_REF | PDESC_PBUF_REF; - - ident = htonl(atomic_add_32_nv(&ire->ire_ident, 1)); - - /* - * len is the total length of the fragmentable data in this - * datagram. For each fragment sent, we will decrement len - * by the amount of fragmentable data sent in that fragment - * until len reaches zero. - */ - len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); - - offset = 0; - prev_nexthdr_offset += wroff; - - while (len != 0) { - size_t mlen; - ip6_t *fip6h; - ip6_frag_t *fraghdr; - int error; - - ASSERT((hdr_ptr + hdr_chunk_len) <= hdr_mp->b_wptr); - mlen = MIN(len, max_chunk); - len -= mlen; - - fip6h = (ip6_t *)(hdr_ptr + wroff); - ASSERT(OK_32PTR(fip6h)); - bcopy(ip6h, fip6h, unfragmentable_len); - hdr_ptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - - fip6h->ip6_plen = htons((uint16_t)(mlen + - unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t))); - - fraghdr = (ip6_frag_t *)((unsigned char *)fip6h + - unfragmentable_len); - fraghdr->ip6f_nxt = nexthdr; - fraghdr->ip6f_reserved = 0; - fraghdr->ip6f_offlg = htons(offset) | - ((len != 0) ? IP6F_MORE_FRAG : 0); - fraghdr->ip6f_ident = ident; - - /* - * Record offset and size of header and data of the next packet - * in the multidata message. 
- */ - PDESC_HDR_ADD(&pdi, hdr_ptr, wroff, - unfragmentable_len + sizeof (ip6_frag_t), 0); - PDESC_PLD_INIT(&pdi); - i1 = MIN(mp->b_wptr - pld_ptr, mlen); - ASSERT(i1 > 0); - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, pld_ptr, i1); - if (i1 == mlen) { - pld_ptr += mlen; - } else { - i1 = mlen - i1; - mp = mp->b_cont; - ASSERT(mp != NULL); - ASSERT(MBLKL(mp) >= i1); - /* - * Attach the next payload message block to the - * multidata message. - */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - PDESC_PLD_SPAN_ADD(&pdi, pbuf_idx, mp->b_rptr, i1); - pld_ptr = mp->b_rptr + i1; - } - - if ((mmd_addpdesc(mmd, (pdescinfo_t *)&pdi, &error, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates that we - * have passed in invalid pdesc info or parameters - * to mmd_addpdesc, which must not happen. - * - * EINVAL is a result of failure on boundary checks - * against the pdesc info contents. It should not - * happen, and we panic because either there's - * horrible heap corruption, and/or programming - * mistake. - */ - if (error != ENOMEM) { - cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: " - "pdesc logic error detected for " - "mmd %p pinfo %p (%d)\n", - (void *)mmd, (void *)&pdi, error); - /* NOTREACHED */ - } - IP6_STAT(ipst, ip6_frag_mdt_addpdescfail); - /* Free unattached payload message blocks as well */ - md_mp->b_cont = mp->b_cont; - goto free_mmd; - } - - /* Advance fragment offset. */ - offset += mlen; - - /* Advance to location for next header in the buffer. */ - hdr_ptr += hdr_chunk_len; - - /* Did we reach the next payload message block? */ - if (pld_ptr == mp->b_wptr && mp->b_cont != NULL) { - mp = mp->b_cont; - /* - * Attach the next message block with payload - * data to the multidata message. 
- */ - if ((pbuf_idx = mmd_addpldbuf(mmd, mp)) < 0) - goto pbuf_panic; - pld_ptr = mp->b_rptr; - } - } - - ASSERT(hdr_mp->b_wptr == hdr_ptr); - ASSERT(mp->b_wptr == pld_ptr); - - /* Update IP statistics */ - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates, pkts); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, pkts); - /* - * The ipv6 header len is accounted for in unfragmentable_len so - * when calculating the fragmentation overhead just add the frag - * header len. - */ - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - (ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN)) + - pkts * (unfragmentable_len + sizeof (ip6_frag_t))); - IP6_STAT_UPDATE(ipst, ip6_frag_mdt_pkt_out, pkts); - - ire->ire_ob_pkt_count += pkts; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, pkts); - - ire->ire_last_used_time = lbolt; - /* Send it down */ - putnext(stq, md_mp); - return; - -pbuf_panic: - cmn_err(CE_PANIC, "ip_wput_frag_mdt_v6: payload buffer logic " - "error for mmd %p pbuf %p (%d)", (void *)mmd, (void *)mp, - pbuf_idx); - /* NOTREACHED */ -} - -/* * IPv6 fragmentation. Essentially the same as IPv4 fragmentation. * We have not optimized this in terms of number of mblks * allocated. For instance, for each fragment sent we always allocate a * mblk to hold the IPv6 header and fragment header. * - * Assumes that all the extension headers are contained in the first mblk. - * - * The fragment header is inserted after an hop-by-hop options header - * and after [an optional destinations header followed by] a routing header. - * - * NOTE : This function does not ire_refrele the ire passed in as - * the argument. + * Assumes that all the extension headers are contained in the first mblk + * and that the fragment header has has already been added by calling + * ip_fraghdr_add_v6. 
*/ -void -ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, - int caller, int max_frag) +int +ip_fragment_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, uint_t pkt_len, + uint32_t max_frag, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + pfirepostfrag_t postfragfn, uintptr_t *ixa_cookie) { ip6_t *ip6h = (ip6_t *)mp->b_rptr; ip6_t *fip6h; @@ -11337,27 +4102,31 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, mblk_t *dmp; ip6_frag_t *fraghdr; size_t unfragmentable_len; - size_t len; size_t mlen; size_t max_chunk; - uint32_t ident; uint16_t off_flags; uint16_t offset = 0; - ill_t *ill; + ill_t *ill = nce->nce_ill; uint8_t nexthdr; - uint_t prev_nexthdr_offset; uint8_t *ptr; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_type == IRE_CACHE); - ill = (ill_t *)ire->ire_stq->q_ptr; + ip_stack_t *ipst = ill->ill_ipst; + uint_t priority = mp->b_band; + int error = 0; - if (max_frag <= 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); + if (max_frag == 0) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: zero max_frag", mp, ill); freemsg(mp); - return; + return (EINVAL); } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragReqds); + + /* + * Caller should have added fraghdr_t to pkt_len, and also + * updated ip6_plen. + */ + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == pkt_len); + ASSERT(msgdsize(mp) == pkt_len); /* * Determine the length of the unfragmentable portion of this @@ -11366,7 +4135,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, * destination options header, and a potential routing header. 
*/ nexthdr = ip6h->ip6_nxt; - prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; ptr = (uint8_t *)&ip6h[1]; if (nexthdr == IPPROTO_HOPOPTS) { @@ -11376,8 +4144,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, hbh_hdr = (ip6_hbh_t *)ptr; hdr_len = 8 * (hbh_hdr->ip6h_len + 1); nexthdr = hbh_hdr->ip6h_nxt; - prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt - - (uint8_t *)ip6h; ptr += hdr_len; } if (nexthdr == IPPROTO_DSTOPTS) { @@ -11388,8 +4154,6 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) { hdr_len = 8 * (dest_hdr->ip6d_len + 1); nexthdr = dest_hdr->ip6d_nxt; - prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt - - (uint8_t *)ip6h; ptr += hdr_len; } } @@ -11399,82 +4163,73 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, rthdr = (ip6_rthdr_t *)ptr; nexthdr = rthdr->ip6r_nxt; - prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt - - (uint8_t *)ip6h; hdr_len = 8 * (rthdr->ip6r_len + 1); ptr += hdr_len; } + if (nexthdr != IPPROTO_FRAGMENT) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: bad nexthdr", mp, ill); + freemsg(mp); + return (EINVAL); + } unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); + unfragmentable_len += sizeof (ip6_frag_t); - max_chunk = (min(max_frag, ire->ire_max_frag) - unfragmentable_len - - sizeof (ip6_frag_t)) & ~7; - - /* Check if we can use MDT to send out the frags. 
*/ - ASSERT(!IRE_IS_LOCAL(ire)); - if (ipst->ips_ip_multidata_outbound && reachable == 0 && - !(ire->ire_flags & RTF_MULTIRT) && ILL_MDT_CAPABLE(ill) && - IP_CAN_FRAG_MDT(mp, unfragmentable_len, max_chunk)) { - ip_wput_frag_mdt_v6(mp, ire, max_chunk, unfragmentable_len, - nexthdr, prev_nexthdr_offset); - return; - } + max_chunk = (max_frag - unfragmentable_len) & ~7; /* * Allocate an mblk with enough room for the link-layer - * header, the unfragmentable part of the datagram, and the - * fragment header. This (or a copy) will be used as the + * header and the unfragmentable part of the datagram, which includes + * the fragment header. This (or a copy) will be used as the * first mblk for each fragment we send. */ - hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) + - ipst->ips_ip_wroff_extra, mp); + hmp = allocb_tmpl(unfragmentable_len + ipst->ips_ip_wroff_extra, mp); if (hmp == NULL) { BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: no hmp", mp, ill); freemsg(mp); - return; + return (ENOBUFS); } hmp->b_rptr += ipst->ips_ip_wroff_extra; - hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t); + hmp->b_wptr = hmp->b_rptr + unfragmentable_len; fip6h = (ip6_t *)hmp->b_rptr; - fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len); - bcopy(ip6h, fip6h, unfragmentable_len); - hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - - ident = atomic_add_32_nv(&ire->ire_ident, 1); - - fraghdr->ip6f_nxt = nexthdr; - fraghdr->ip6f_reserved = 0; - fraghdr->ip6f_offlg = 0; - fraghdr->ip6f_ident = htonl(ident); /* - * len is the total length of the fragmentable data in this - * datagram. For each fragment sent, we will decrement len + * pkt_len is set to the total length of the fragmentable data in this + * datagram. For each fragment sent, we will decrement pkt_len * by the amount of fragmentable data sent in that fragment * until len reaches zero. 
*/ - len = ntohs(ip6h->ip6_plen) - (unfragmentable_len - IPV6_HDR_LEN); + pkt_len -= unfragmentable_len; /* * Move read ptr past unfragmentable portion, we don't want this part * of the data in our fragments. */ mp->b_rptr += unfragmentable_len; + if (mp->b_rptr == mp->b_wptr) { + mblk_t *mp1 = mp->b_cont; + freeb(mp); + mp = mp1; + } - while (len != 0) { - mlen = MIN(len, max_chunk); - len -= mlen; - if (len != 0) { + while (pkt_len != 0) { + mlen = MIN(pkt_len, max_chunk); + pkt_len -= mlen; + if (pkt_len != 0) { /* Not last */ hmp0 = copyb(hmp); if (hmp0 == NULL) { - freeb(hmp); - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - ip1dbg(("ip_wput_frag_v6: copyb failed\n")); - return; + ip_drop_output("FragFails: copyb failed", + mp, ill); + freeb(hmp); + freemsg(mp); + ip1dbg(("ip_fragment_v6: copyb failed\n")); + return (ENOBUFS); } off_flags = IP6F_MORE_FRAG; } else { @@ -11484,10 +4239,11 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, off_flags = 0; } fip6h = (ip6_t *)(hmp0->b_rptr); - fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len); + fraghdr = (ip6_frag_t *)(hmp0->b_rptr + unfragmentable_len - + sizeof (ip6_frag_t)); fip6h->ip6_plen = htons((uint16_t)(mlen + - unfragmentable_len - IPV6_HDR_LEN + sizeof (ip6_frag_t))); + unfragmentable_len - IPV6_HDR_LEN)); /* * Note: Optimization alert. 
* In IPv6 (and IPv4) protocol header, Fragment Offset @@ -11504,654 +4260,197 @@ ip_wput_frag_v6(mblk_t *mp, ire_t *ire, uint_t reachable, conn_t *connp, if (!(dmp = ip_carve_mp(&mp, mlen))) { /* mp has already been freed by ip_carve_mp() */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: could not carve mp", + hmp0, ill); if (hmp != NULL) freeb(hmp); freeb(hmp0); ip1dbg(("ip_carve_mp: failed\n")); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); - return; + return (ENOBUFS); } hmp0->b_cont = dmp; /* Get the priority marking, if any */ - hmp0->b_band = dmp->b_band; - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - ip_xmit_v6(hmp0, ire, reachable | IP6_NO_IPPOLICY, connp, - caller, NULL); - reachable = 0; /* No need to redo state machine in loop */ + hmp0->b_band = priority; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragCreates); + + error = postfragfn(hmp0, nce, ixaflags, + mlen + unfragmentable_len, xmit_hint, szone, nolzid, + ixa_cookie); + if (error != 0 && error != EWOULDBLOCK && hmp != NULL) { + /* No point in sending the other fragments */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("FragFails: postfragfn failed", + hmp, ill); + freeb(hmp); + freemsg(mp); + return (error); + } + /* No need to redo state machine in loop */ + ixaflags &= ~IXAF_REACH_CONF; + offset += mlen; } BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutFragOKs); + return (error); } /* - * Determine if the ill and multicast aspects of that packets - * "matches" the conn. + * Add a fragment header to an IPv6 packet. + * Assumes that all the extension headers are contained in the first mblk. + * + * The fragment header is inserted after an hop-by-hop options header + * and after [an optional destinations header followed by] a routing header. 
*/ -boolean_t -conn_wantpacket_v6(conn_t *connp, ill_t *ill, ip6_t *ip6h, int fanout_flags, - zoneid_t zoneid) +mblk_t * +ip_fraghdr_add_v6(mblk_t *mp, uint32_t ident, ip_xmit_attr_t *ixa) { - ill_t *bound_ill; - boolean_t wantpacket; - in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; - in6_addr_t *v6src_ptr = &ip6h->ip6_src; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ip6_t *fip6h; + mblk_t *hmp; + ip6_frag_t *fraghdr; + size_t unfragmentable_len; + uint8_t nexthdr; + uint_t prev_nexthdr_offset; + uint8_t *ptr; + uint_t priority = mp->b_band; + ip_stack_t *ipst = ixa->ixa_ipst; /* - * conn_incoming_ill is set by IPV6_BOUND_IF which limits - * unicast and multicast reception to conn_incoming_ill. - * conn_wantpacket_v6 is called both for unicast and - * multicast. + * Determine the length of the unfragmentable portion of this + * datagram. This consists of the IPv6 header, a potential + * hop-by-hop options header, a potential pre-routing-header + * destination options header, and a potential routing header. */ - bound_ill = connp->conn_incoming_ill; - if (bound_ill != NULL) { - if (IS_IPMP(bound_ill)) { - if (bound_ill->ill_grp != ill->ill_grp) - return (B_FALSE); - } else { - if (bound_ill != ill) - return (B_FALSE); - } - } + nexthdr = ip6h->ip6_nxt; + prev_nexthdr_offset = (uint8_t *)&ip6h->ip6_nxt - (uint8_t *)ip6h; + ptr = (uint8_t *)&ip6h[1]; - if (connp->conn_multi_router) - return (B_TRUE); + if (nexthdr == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbh_hdr; + uint_t hdr_len; - if (!IN6_IS_ADDR_MULTICAST(v6dst_ptr) && - !IN6_IS_ADDR_V4MAPPED_CLASSD(v6dst_ptr)) { - /* - * Unicast case: we match the conn only if it's in the specified - * zone. 
- */ - return (IPCL_ZONE_MATCH(connp, zoneid)); + hbh_hdr = (ip6_hbh_t *)ptr; + hdr_len = 8 * (hbh_hdr->ip6h_len + 1); + nexthdr = hbh_hdr->ip6h_nxt; + prev_nexthdr_offset = (uint8_t *)&hbh_hdr->ip6h_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; } + if (nexthdr == IPPROTO_DSTOPTS) { + ip6_dest_t *dest_hdr; + uint_t hdr_len; - if ((fanout_flags & IP_FF_NO_MCAST_LOOP) && - (connp->conn_zoneid == zoneid || zoneid == ALL_ZONES)) { - /* - * Loopback case: the sending endpoint has IP_MULTICAST_LOOP - * disabled, therefore we don't dispatch the multicast packet to - * the sending zone. - */ - return (B_FALSE); + dest_hdr = (ip6_dest_t *)ptr; + if (dest_hdr->ip6d_nxt == IPPROTO_ROUTING) { + hdr_len = 8 * (dest_hdr->ip6d_len + 1); + nexthdr = dest_hdr->ip6d_nxt; + prev_nexthdr_offset = (uint8_t *)&dest_hdr->ip6d_nxt + - (uint8_t *)ip6h; + ptr += hdr_len; + } } + if (nexthdr == IPPROTO_ROUTING) { + ip6_rthdr_t *rthdr; + uint_t hdr_len; - if (IS_LOOPBACK(ill) && connp->conn_zoneid != zoneid && - zoneid != ALL_ZONES) { - /* - * Multicast packet on the loopback interface: we only match - * conns who joined the group in the specified zone. - */ - return (B_FALSE); + rthdr = (ip6_rthdr_t *)ptr; + nexthdr = rthdr->ip6r_nxt; + prev_nexthdr_offset = (uint8_t *)&rthdr->ip6r_nxt + - (uint8_t *)ip6h; + hdr_len = 8 * (rthdr->ip6r_len + 1); + ptr += hdr_len; } + unfragmentable_len = (uint_t)(ptr - (uint8_t *)ip6h); - mutex_enter(&connp->conn_lock); - wantpacket = - ilg_lookup_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, ill) != NULL; - mutex_exit(&connp->conn_lock); - - return (wantpacket); -} - - -/* - * Transmit a packet and update any NUD state based on the flags - * XXX need to "recover" any ip6i_t when doing putq! - * - * NOTE : This function does not ire_refrele the ire passed in as the - * argument. 
- */ -void -ip_xmit_v6(mblk_t *mp, ire_t *ire, uint_t flags, conn_t *connp, - int caller, ipsec_out_t *io) -{ - mblk_t *mp1; - nce_t *nce = ire->ire_nce; - ill_t *ill; - ill_t *out_ill; - uint64_t delta; - ip6_t *ip6h; - queue_t *stq = ire->ire_stq; - ire_t *ire1 = NULL; - ire_t *save_ire = ire; - boolean_t multirt_send = B_FALSE; - mblk_t *next_mp = NULL; - ip_stack_t *ipst = ire->ire_ipst; - boolean_t fp_prepend = B_FALSE; - uint32_t hlen; + /* + * Allocate an mblk with enough room for the link-layer + * header, the unfragmentable part of the datagram, and the + * fragment header. + */ + hmp = allocb_tmpl(unfragmentable_len + sizeof (ip6_frag_t) + + ipst->ips_ip_wroff_extra, mp); + if (hmp == NULL) { + ill_t *ill = ixa->ixa_nce->nce_ill; - ip6h = (ip6_t *)mp->b_rptr; - ASSERT(!IN6_IS_ADDR_V4MAPPED(&ire->ire_addr_v6)); - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(nce != NULL); - ASSERT(mp->b_datap->db_type == M_DATA); - ASSERT(stq != NULL); - - ill = ire_to_ill(ire); - if (!ill) { - ip0dbg(("ip_xmit_v6: ire_to_ill failed\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards: allocb failure", mp, ill); freemsg(mp); - return; + return (NULL); } + hmp->b_rptr += ipst->ips_ip_wroff_extra; + hmp->b_wptr = hmp->b_rptr + unfragmentable_len + sizeof (ip6_frag_t); - /* Flow-control check has been done in ip_wput_ire_v6 */ - if (IP_FLOW_CONTROLLED_ULP(ip6h->ip6_nxt) || caller == IP_WPUT || - caller == IP_WSRV || canput(stq->q_next)) { - uint32_t ill_index; - - /* - * In most cases, the emission loop below is entered only - * once. Only in the case where the ire holds the - * RTF_MULTIRT flag, do we loop to process all RTF_MULTIRT - * flagged ires in the bucket, and send the packet - * through all crossed RTF_MULTIRT routes. - */ - if (ire->ire_flags & RTF_MULTIRT) { - /* - * Multirouting case. The bucket where ire is stored - * probably holds other RTF_MULTIRT flagged ires - * to the destination. 
In this call to ip_xmit_v6, - * we attempt to send the packet through all - * those ires. Thus, we first ensure that ire is the - * first RTF_MULTIRT ire in the bucket, - * before walking the ire list. - */ - ire_t *first_ire; - irb_t *irb = ire->ire_bucket; - ASSERT(irb != NULL); - multirt_send = B_TRUE; - - /* Make sure we do not omit any multiroute ire. */ - IRB_REFHOLD(irb); - for (first_ire = irb->irb_ire; - first_ire != NULL; - first_ire = first_ire->ire_next) { - if ((first_ire->ire_flags & RTF_MULTIRT) && - (IN6_ARE_ADDR_EQUAL(&first_ire->ire_addr_v6, - &ire->ire_addr_v6)) && - !(first_ire->ire_marks & - (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN))) - break; - } - - if ((first_ire != NULL) && (first_ire != ire)) { - IRE_REFHOLD(first_ire); - /* ire will be released by the caller */ - ire = first_ire; - nce = ire->ire_nce; - stq = ire->ire_stq; - ill = ire_to_ill(ire); - } - IRB_REFRELE(irb); - } else if (connp != NULL && IPCL_IS_TCP(connp) && - connp->conn_mdt_ok && !connp->conn_tcp->tcp_mdt && - ILL_MDT_USABLE(ill)) { - /* - * This tcp connection was marked as MDT-capable, but - * it has been turned off due changes in the interface. - * Now that the interface support is back, turn it on - * by notifying tcp. We don't directly modify tcp_mdt, - * since we leave all the details to the tcp code that - * knows better. - */ - mblk_t *mdimp = ip_mdinfo_alloc(ill->ill_mdt_capab); - - if (mdimp == NULL) { - ip0dbg(("ip_xmit_v6: can't re-enable MDT for " - "connp %p (ENOMEM)\n", (void *)connp)); - } else { - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mdimp, - tcp_input, connp, SQ_FILL, - SQTAG_TCP_INPUT_MCTL); - } - } - - do { - mblk_t *mp_ip6h; - - if (multirt_send) { - irb_t *irb; - /* - * We are in a multiple send case, need to get - * the next ire and make a duplicate of the - * packet. ire1 holds here the next ire to - * process in the bucket. 
If multirouting is - * expected, any non-RTF_MULTIRT ire that has - * the right destination address is ignored. - */ - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - IRB_REFHOLD(irb); - for (ire1 = ire->ire_next; - ire1 != NULL; - ire1 = ire1->ire_next) { - if (!(ire1->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &ire1->ire_addr_v6, - &ire->ire_addr_v6)) - continue; - if (ire1->ire_marks & - IRE_MARK_CONDEMNED) - continue; - - /* Got one */ - if (ire1 != save_ire) { - IRE_REFHOLD(ire1); - } - break; - } - IRB_REFRELE(irb); - - if (ire1 != NULL) { - next_mp = copyb(mp); - if ((next_mp == NULL) || - ((mp->b_cont != NULL) && - ((next_mp->b_cont = - dupmsg(mp->b_cont)) == NULL))) { - freemsg(next_mp); - next_mp = NULL; - ire_refrele(ire1); - ire1 = NULL; - } - } - - /* Last multiroute ire; don't loop anymore. */ - if (ire1 == NULL) { - multirt_send = B_FALSE; - } - } - - ill_index = - ((ill_t *)stq->q_ptr)->ill_phyint->phyint_ifindex; - - /* Initiate IPPF processing */ - if (IP6_OUT_IPP(flags, ipst)) { - ip_process(IPP_LOCAL_OUT, &mp, ill_index); - if (mp == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - ip6h = (ip6_t *)mp->b_rptr; - } - mp_ip6h = mp; - - /* - * Check for fastpath, we need to hold nce_lock to - * prevent fastpath update from chaining nce_fp_mp. 
- */ - - ASSERT(nce->nce_ipversion != IPV4_VERSION); - mutex_enter(&nce->nce_lock); - if ((mp1 = nce->nce_fp_mp) != NULL) { - uchar_t *rptr; - - hlen = MBLKL(mp1); - rptr = mp->b_rptr - hlen; - /* - * make sure there is room for the fastpath - * datalink header - */ - if (rptr < mp->b_datap->db_base) { - mp1 = copyb(mp1); - mutex_exit(&nce->nce_lock); - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - mp1->b_cont = mp; - - /* Get the priority marking, if any */ - mp1->b_band = mp->b_band; - mp = mp1; - } else { - mp->b_rptr = rptr; - /* - * fastpath - pre-pend datalink - * header - */ - bcopy(mp1->b_rptr, rptr, hlen); - mutex_exit(&nce->nce_lock); - fp_prepend = B_TRUE; - } - } else { - /* - * Get the DL_UNITDATA_REQ. - */ - mp1 = nce->nce_res_mp; - if (mp1 == NULL) { - mutex_exit(&nce->nce_lock); - ip1dbg(("ip_xmit_v6: No resolution " - "block ire = %p\n", (void *)ire)); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - /* - * Prepend the DL_UNITDATA_REQ. 
- */ - mp1 = copyb(mp1); - mutex_exit(&nce->nce_lock); - if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsOutDiscards); - freemsg(mp); - if (next_mp != NULL) - freemsg(next_mp); - if (ire != save_ire) { - ire_refrele(ire); - } - return; - } - mp1->b_cont = mp; - - /* Get the priority marking, if any */ - mp1->b_band = mp->b_band; - mp = mp1; - } - - out_ill = (ill_t *)stq->q_ptr; - - DTRACE_PROBE4(ip6__physical__out__start, - ill_t *, NULL, ill_t *, out_ill, - ip6_t *, ip6h, mblk_t *, mp); + fip6h = (ip6_t *)hmp->b_rptr; + fraghdr = (ip6_frag_t *)(hmp->b_rptr + unfragmentable_len); - FW_HOOKS6(ipst->ips_ip6_physical_out_event, - ipst->ips_ipv6firewall_physical_out, - NULL, out_ill, ip6h, mp, mp_ip6h, 0, ipst); + bcopy(ip6h, fip6h, unfragmentable_len); + fip6h->ip6_plen = htons(ntohs(fip6h->ip6_plen) + sizeof (ip6_frag_t)); + hmp->b_rptr[prev_nexthdr_offset] = IPPROTO_FRAGMENT; - DTRACE_PROBE1(ip6__physical__out__end, mblk_t *, mp); + fraghdr->ip6f_nxt = nexthdr; + fraghdr->ip6f_reserved = 0; + fraghdr->ip6f_offlg = 0; + fraghdr->ip6f_ident = htonl(ident); - if (mp == NULL) { - if (multirt_send) { - ASSERT(ire1 != NULL); - if (ire != save_ire) { - ire_refrele(ire); - } - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } else { - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - break; - } - } + /* Get the priority marking, if any */ + hmp->b_band = priority; - if (ipst->ips_ip6_observe.he_interested) { - zoneid_t szone; + /* + * Move read ptr past unfragmentable portion, we don't want this part + * of the data in our fragments. + */ + mp->b_rptr += unfragmentable_len; + hmp->b_cont = mp; + return (hmp); +} - /* - * Both of these functions expect b_rptr to - * be where the IPv6 header starts, so advance - * past the link layer header. 
- */ - if (fp_prepend) - mp_ip6h->b_rptr += hlen; - szone = ip_get_zoneid_v6(&ip6h->ip6_src, - mp_ip6h, out_ill, ipst, ALL_ZONES); - ipobs_hook(mp_ip6h, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, out_ill, ipst); - if (fp_prepend) - mp_ip6h->b_rptr -= hlen; - } +/* + * Determine if the ill and multicast aspects of that packets + * "matches" the conn. + */ +boolean_t +conn_wantpacket_v6(conn_t *connp, ip_recv_attr_t *ira, ip6_t *ip6h) +{ + ill_t *ill = ira->ira_rill; + zoneid_t zoneid = ira->ira_zoneid; + uint_t in_ifindex; + in6_addr_t *v6dst_ptr = &ip6h->ip6_dst; + in6_addr_t *v6src_ptr = &ip6h->ip6_src; - /* - * Update ire and MIB counters; for save_ire, this has - * been done by the caller. - */ - if (ire != save_ire) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; + /* + * conn_incoming_ifindex is set by IPV6_BOUND_IF and as link-local + * scopeid. This is used to limit + * unicast and multicast reception to conn_incoming_ifindex. + * conn_wantpacket_v6 is called both for unicast and + * multicast packets. + */ + in_ifindex = connp->conn_incoming_ifindex; - if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutMcastPkts); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutMcastOctets, - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN); - } - } + /* mpathd can bind to the under IPMP interface, which we allow */ + if (in_ifindex != 0 && in_ifindex != ill->ill_phyint->phyint_ifindex) { + if (!IS_UNDER_IPMP(ill)) + return (B_FALSE); - /* - * Send it down. XXX Do we want to flow control AH/ESP - * packets that carry TCP payloads? We don't flow - * control TCP packets, but we should also not - * flow-control TCP packets that have been protected. - * We don't have an easy way to find out if an AH/ESP - * packet was originally TCP or not currently. 
- */ - if (io == NULL) { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutOctets, - ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, - out_ill, ipha_t *, NULL, ip6_t *, ip6h, - int, 0); - - putnext(stq, mp); - } else { - /* - * Safety Pup says: make sure this is - * going to the right interface! - */ - if (io->ipsec_out_capab_ill_index != - ill_index) { - /* IPsec kstats: bump lose counter */ - freemsg(mp1); - } else { - BUMP_MIB(ill->ill_ip_mib, - ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, - ipIfStatsHCOutOctets, - ntohs(ip6h->ip6_plen) + - IPV6_HDR_LEN); - DTRACE_IP7(send, mblk_t *, mp, - conn_t *, NULL, void_ip_t *, ip6h, - __dtrace_ipsr_ill_t *, out_ill, - ipha_t *, NULL, ip6_t *, ip6h, int, - 0); - ipsec_hw_putnext(stq, mp); - } - } + if (in_ifindex != ipmp_ill_get_ipmp_ifindex(ill)) + return (B_FALSE); + } - if (nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT)) { - if (ire != save_ire) { - ire_refrele(ire); - } - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - return; - } + if (!IPCL_ZONE_MATCH(connp, zoneid)) + return (B_FALSE); - ASSERT(nce->nce_state != ND_INCOMPLETE); + if (!(ira->ira_flags & IRAF_MULTICAST)) + return (B_TRUE); - /* - * Check for upper layer advice - */ - if (flags & IPV6_REACHABILITY_CONFIRMATION) { - /* - * It should be o.k. to check the state without - * a lock here, at most we lose an advice. 
- */ - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state != ND_REACHABLE) { - - mutex_enter(&nce->nce_lock); - nce->nce_state = ND_REACHABLE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - (void) untimeout(nce->nce_timeout_id); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_xmit_v6: state" - " for %s changed to" - " REACHABLE\n", AF_INET6, - &ire->ire_addr_v6); - } - } - if (ire != save_ire) { - ire_refrele(ire); - } - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT - * ire, also set up the send-to queue - * accordingly. - */ - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - continue; - } - ASSERT(next_mp == NULL); - ASSERT(ire1 == NULL); - return; - } + if (connp->conn_multi_router) + return (B_TRUE); - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - ip1dbg(("ip_xmit_v6: delta = %" PRId64 - " ill_reachable_time = %d \n", delta, - ill->ill_reachable_time)); - if (delta > (uint64_t)ill->ill_reachable_time) { - nce = ire->ire_nce; - mutex_enter(&nce->nce_lock); - switch (nce->nce_state) { - case ND_REACHABLE: - case ND_STALE: - /* - * ND_REACHABLE is identical to - * ND_STALE in this specific case. If - * reachable time has expired for this - * neighbor (delta is greater than - * reachable time), conceptually, the - * neighbor cache is no longer in - * REACHABLE state, but already in - * STALE state. So the correct - * transition here is to ND_DELAY. 
- */ - nce->nce_state = ND_DELAY; - mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, - ipst->ips_delay_first_probe_time); - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("ip_xmit_v6: state" - " for %s changed to" - " DELAY\n", AF_INET6, - &ire->ire_addr_v6); - } - break; - case ND_DELAY: - case ND_PROBE: - mutex_exit(&nce->nce_lock); - /* Timers have already started */ - break; - case ND_UNREACHABLE: - /* - * ndp timer has detected that this nce - * is unreachable and initiated deleting - * this nce and all its associated IREs. - * This is a race where we found the - * ire before it was deleted and have - * just sent out a packet using this - * unreachable nce. - */ - mutex_exit(&nce->nce_lock); - break; - default: - ASSERT(0); - } - } + if (ira->ira_protocol == IPPROTO_RSVP) + return (B_TRUE); - if (multirt_send) { - ASSERT(ire1 != NULL); - /* - * Proceed with the next RTF_MULTIRT ire, - * Also set up the send-to queue accordingly. - */ - if (ire != save_ire) { - ire_refrele(ire); - } - ire = ire1; - ire1 = NULL; - stq = ire->ire_stq; - nce = ire->ire_nce; - ill = ire_to_ill(ire); - mp = next_mp; - next_mp = NULL; - } - } while (multirt_send); - /* - * In the multirouting case, release the last ire used for - * emission. save_ire will be released by the caller. - */ - if (ire != save_ire) { - ire_refrele(ire); - } - } else { - /* - * Can't apply backpressure, just discard the packet. - */ - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - freemsg(mp); - return; - } + return (conn_hasmembers_ill_withsrc_v6(connp, v6dst_ptr, v6src_ptr, + ira->ira_ill)); } /* @@ -12189,37 +4488,52 @@ pr_addr_dbg(char *fmt1, int af, const void *addr) /* - * Return the length in bytes of the IPv6 headers (base header, ip6i_t - * if needed and extension headers) that will be needed based on the - * ip6_pkt_t structure passed by the caller. 
+ * Return the length in bytes of the IPv6 headers (base header + * extension headers) that will be needed based on the + * ip_pkt_t structure passed by the caller. * * The returned length does not include the length of the upper level * protocol (ULP) header. */ int -ip_total_hdrs_len_v6(ip6_pkt_t *ipp) +ip_total_hdrs_len_v6(const ip_pkt_t *ipp) { int len; len = IPV6_HDR_LEN; - if (ipp->ipp_fields & IPPF_HAS_IP6I) - len += sizeof (ip6i_t); - if (ipp->ipp_fields & IPPF_HOPOPTS) { + + /* + * If there's a security label here, then we ignore any hop-by-hop + * options the user may try to set. + */ + if (ipp->ipp_fields & IPPF_LABEL_V6) { + uint_t hopoptslen; + /* + * Note that ipp_label_len_v6 is just the option - not + * the hopopts extension header. It also needs to be padded + * to a multiple of 8 bytes. + */ + ASSERT(ipp->ipp_label_len_v6 != 0); + hopoptslen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t); + hopoptslen = (hopoptslen + 7)/8 * 8; + len += hopoptslen; + } else if (ipp->ipp_fields & IPPF_HOPOPTS) { ASSERT(ipp->ipp_hopoptslen != 0); len += ipp->ipp_hopoptslen; } - if (ipp->ipp_fields & IPPF_RTHDR) { - ASSERT(ipp->ipp_rthdrlen != 0); - len += ipp->ipp_rthdrlen; - } + /* * En-route destination options * Only do them if there's a routing header as well */ - if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) == - (IPPF_RTDSTOPTS|IPPF_RTHDR)) { - ASSERT(ipp->ipp_rtdstoptslen != 0); - len += ipp->ipp_rtdstoptslen; + if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) == + (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) { + ASSERT(ipp->ipp_rthdrdstoptslen != 0); + len += ipp->ipp_rthdrdstoptslen; + } + if (ipp->ipp_fields & IPPF_RTHDR) { + ASSERT(ipp->ipp_rthdrlen != 0); + len += ipp->ipp_rthdrlen; } if (ipp->ipp_fields & IPPF_DSTOPTS) { ASSERT(ipp->ipp_dstoptslen != 0); @@ -12230,80 +4544,40 @@ ip_total_hdrs_len_v6(ip6_pkt_t *ipp) /* * All-purpose routine to build a header chain of an IPv6 header - * followed by any required extension headers and a proto header, - * 
preceeded (where necessary) by an ip6i_t private header. + * followed by any required extension headers and a proto header. * - * The fields of the IPv6 header that are derived from the ip6_pkt_t - * will be filled in appropriately. - * Thus the caller must fill in the rest of the IPv6 header, such as - * traffic class/flowid, source address (if not set here), hoplimit (if not - * set here) and destination address. + * The caller has to set the source and destination address as well as + * ip6_plen. The caller has to massage any routing header and compensate + * for the ULP pseudo-header checksum due to the source route. * - * The extension headers and ip6i_t header will all be fully filled in. + * The extension headers will all be fully filled in. */ void -ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, - ip6_pkt_t *ipp, uint8_t protocol) +ip_build_hdrs_v6(uchar_t *buf, uint_t buf_len, const ip_pkt_t *ipp, + uint8_t protocol, uint32_t flowinfo) { uint8_t *nxthdr_ptr; uint8_t *cp; - ip6i_t *ip6i; - ip6_t *ip6h = (ip6_t *)ext_hdrs; + ip6_t *ip6h = (ip6_t *)buf; - /* - * If sending private ip6i_t header down (checksum info, nexthop, - * or ifindex), adjust ip header pointer and set ip6i_t header pointer, - * then fill it in. (The checksum info will be filled in by icmp). - */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)ip6h; - ip6h = (ip6_t *)&ip6i[1]; - - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - if (ipp->ipp_fields & IPPF_IFINDEX || - ipp->ipp_fields & IPPF_SCOPE_ID) { - ASSERT(ipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = ipp->ipp_ifindex; - } - if (ipp->ipp_fields & IPPF_ADDR) { - /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. 
- */ - ASSERT(!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)); - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } - if (ipp->ipp_fields & IPPF_UNICAST_HOPS) { - ip6h->ip6_hops = ipp->ipp_unicast_hops; - /* - * We need to set this flag so that IP doesn't - * rewrite the IPv6 header's hoplimit with the - * current default value. - */ - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } - if (ipp->ipp_fields & IPPF_NEXTHOP) { - ASSERT(!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = ipp->ipp_nexthop; - } - /* - * tell IP this is an ip6i_t private header - */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { - ip6h->ip6_vcf = (ip6h->ip6_vcf & ~IPV6_FLOWINFO_TCLASS) | - (ipp->ipp_tclass << 20); + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); } - if (ipp->ipp_fields & IPPF_ADDR) + + if (ipp->ipp_fields & IPPF_HOPLIMIT) + ip6h->ip6_hops = ipp->ipp_hoplimit; + else + ip6h->ip6_hops = ipp->ipp_unicast_hops; + + if ((ipp->ipp_fields & IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) ip6h->ip6_src = ipp->ipp_addr; nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; @@ -12313,7 +4587,47 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * any extension headers in the right order: * Hop-by-hop, destination, routing, and final destination opts. */ - if (ipp->ipp_fields & IPPF_HOPOPTS) { + /* + * If there's a security label here, then we ignore any hop-by-hop + * options the user may try to set. + */ + if (ipp->ipp_fields & IPPF_LABEL_V6) { + /* + * Hop-by-hop options with the label. + * Note that ipp_label_v6 is just the option - not + * the hopopts extension header. It also needs to be padded + * to a multiple of 8 bytes. 
+ */ + ip6_hbh_t *hbh = (ip6_hbh_t *)cp; + uint_t hopoptslen; + uint_t padlen; + + padlen = ipp->ipp_label_len_v6 + sizeof (ip6_hbh_t); + hopoptslen = (padlen + 7)/8 * 8; + padlen = hopoptslen - padlen; + + *nxthdr_ptr = IPPROTO_HOPOPTS; + nxthdr_ptr = &hbh->ip6h_nxt; + hbh->ip6h_len = hopoptslen/8 - 1; + cp += sizeof (ip6_hbh_t); + bcopy(ipp->ipp_label_v6, cp, ipp->ipp_label_len_v6); + cp += ipp->ipp_label_len_v6; + + ASSERT(padlen <= 7); + switch (padlen) { + case 0: + break; + case 1: + cp[0] = IP6OPT_PAD1; + break; + default: + cp[0] = IP6OPT_PADN; + cp[1] = padlen - 2; + bzero(&cp[2], padlen - 2); + break; + } + cp += padlen; + } else if (ipp->ipp_fields & IPPF_HOPOPTS) { /* Hop-by-hop options */ ip6_hbh_t *hbh = (ip6_hbh_t *)cp; @@ -12327,15 +4641,15 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * En-route destination options * Only do them if there's a routing header as well */ - if ((ipp->ipp_fields & (IPPF_RTDSTOPTS|IPPF_RTHDR)) == - (IPPF_RTDSTOPTS|IPPF_RTHDR)) { + if ((ipp->ipp_fields & (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) == + (IPPF_RTHDRDSTOPTS|IPPF_RTHDR)) { ip6_dest_t *dst = (ip6_dest_t *)cp; *nxthdr_ptr = IPPROTO_DSTOPTS; nxthdr_ptr = &dst->ip6d_nxt; - bcopy(ipp->ipp_rtdstopts, cp, ipp->ipp_rtdstoptslen); - cp += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, cp, ipp->ipp_rthdrdstoptslen); + cp += ipp->ipp_rthdrdstoptslen; } /* * Routing header next @@ -12365,7 +4679,7 @@ ip_build_hdrs_v6(uchar_t *ext_hdrs, uint_t ext_hdrs_len, * Now set the last header pointer to the proto passed in */ *nxthdr_ptr = protocol; - ASSERT((int)(cp - ext_hdrs) == ext_hdrs_len); + ASSERT((int)(cp - buf) == buf_len); } /* @@ -12509,108 +4823,28 @@ ip_massage_options_v6(ip6_t *ip6h, ip6_rthdr_t *rth, netstack_t *ns) return (cksm); } -/* - * Propagate a multicast group membership operation (join/leave) (*fn) on - * all interfaces crossed by the related multirt routes. 
- * The call is considered successful if the operation succeeds - * on at least one interface. - * The function is called if the destination address in the packet to send - * is multirouted. - */ -int -ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *), - ire_t *ire, conn_t *connp, boolean_t checkonly, const in6_addr_t *v6grp, - mcast_record_t fmode, const in6_addr_t *v6src, mblk_t *first_mp) -{ - ire_t *ire_gw; - irb_t *irb; - int index, error = 0; - opt_restart_t *or; - ip_stack_t *ipst = ire->ire_ipst; - - irb = ire->ire_bucket; - ASSERT(irb != NULL); - - ASSERT(DB_TYPE(first_mp) == M_CTL); - or = (opt_restart_t *)first_mp->b_rptr; - - IRB_REFHOLD(irb); - for (; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_flags & RTF_MULTIRT) == 0) - continue; - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6grp)) - continue; - - ire_gw = ire_ftable_lookup_v6(&ire->ire_gateway_addr_v6, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE, ipst); - /* No resolver exists for the gateway; skip this ire. */ - if (ire_gw == NULL) - continue; - index = ire_gw->ire_ipif->ipif_ill->ill_phyint->phyint_ifindex; - /* - * A resolver exists: we can get the interface on which we have - * to apply the operation. - */ - error = fn(connp, checkonly, v6grp, index, fmode, v6src, - first_mp); - if (error == 0) - or->or_private = CGTP_MCAST_SUCCESS; - - if (ip_debug > 0) { - ulong_t off; - char *ksym; - - ksym = kobj_getsymname((uintptr_t)fn, &off); - ip2dbg(("ip_multirt_apply_membership_v6: " - "called %s, multirt group 0x%08x via itf 0x%08x, " - "error %d [success %u]\n", - ksym ? 
ksym : "?", - ntohl(V4_PART_OF_V6((*v6grp))), - ntohl(V4_PART_OF_V6(ire_gw->ire_src_addr_v6)), - error, or->or_private)); - } - - ire_refrele(ire_gw); - if (error == EINPROGRESS) { - IRB_REFRELE(irb); - return (error); - } - } - IRB_REFRELE(irb); - /* - * Consider the call as successful if we succeeded on at least - * one interface. Otherwise, return the last encountered error. - */ - return (or->or_private == CGTP_MCAST_SUCCESS ? 0 : error); -} - void *ip6_kstat_init(netstackid_t stackid, ip6_stat_t *ip6_statisticsp) { kstat_t *ksp; ip6_stat_t template = { - { "ip6_udp_fast_path", KSTAT_DATA_UINT64 }, - { "ip6_udp_slow_path", KSTAT_DATA_UINT64 }, { "ip6_udp_fannorm", KSTAT_DATA_UINT64 }, { "ip6_udp_fanmb", KSTAT_DATA_UINT64 }, + { "ip6_recv_pullup", KSTAT_DATA_UINT64 }, + { "ip6_db_ref", KSTAT_DATA_UINT64 }, + { "ip6_notaligned", KSTAT_DATA_UINT64 }, + { "ip6_multimblk", KSTAT_DATA_UINT64 }, + { "ipsec_proto_ahesp", KSTAT_DATA_UINT64 }, { "ip6_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "ip6_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip6_in_sw_cksum", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_tcp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip6_tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "ip6_udp_in_full_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_part_hw_cksum_err", KSTAT_DATA_UINT64 }, { "ip6_udp_in_sw_cksum_err", KSTAT_DATA_UINT64 }, - { "ip6_udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_discarded", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "ip6_frag_mdt_allocd", KSTAT_DATA_UINT64 }, }; ksp = kstat_create_netstack("ip", 0, "ip6stat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), @@ -12641,7 +4875,7 @@ ip6_kstat_fini(netstackid_t stackid, kstat_t *ksp) * IPV6_SRC_PREFERENCES socket 
option. */ int -ip6_set_src_preferences(conn_t *connp, uint32_t prefs) +ip6_set_src_preferences(ip_xmit_attr_t *ixa, uint32_t prefs) { /* * We only support preferences that are covered by @@ -12675,47 +4909,15 @@ ip6_set_src_preferences(conn_t *connp, uint32_t prefs) return (EINVAL); } - connp->conn_src_preferences = prefs; + ixa->ixa_src_preferences = prefs; return (0); } size_t -ip6_get_src_preferences(conn_t *connp, uint32_t *val) +ip6_get_src_preferences(ip_xmit_attr_t *ixa, uint32_t *val) { - *val = connp->conn_src_preferences; - return (sizeof (connp->conn_src_preferences)); -} - -int -ip6_set_pktinfo(cred_t *cr, conn_t *connp, struct in6_pktinfo *pkti) -{ - ire_t *ire; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - /* - * Verify the source address and ifindex. Privileged users can use - * any source address. For ancillary data the source address is - * checked in ip_wput_v6. - */ - if (pkti->ipi6_ifindex != 0) { - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - if (!phyint_exists(pkti->ipi6_ifindex, ipst)) { - rw_exit(&ipst->ips_ill_g_lock); - return (ENXIO); - } - rw_exit(&ipst->ips_ill_g_lock); - } - if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr) && - secpolicy_net_rawaccess(cr) != 0) { - ire = ire_route_lookup_v6(&pkti->ipi6_addr, 0, 0, - (IRE_LOCAL|IRE_LOOPBACK), NULL, NULL, - connp->conn_zoneid, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) - ire_refrele(ire); - else - return (ENXIO); - } - return (0); + *val = ixa->ixa_src_preferences; + return (sizeof (ixa->ixa_src_preferences)); } /* @@ -12743,7 +4945,7 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) whereptr = (uint8_t *)&ip6h[1]; for (;;) { /* Assume IP has already stripped it */ - ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW); + ASSERT(nexthdr != IPPROTO_FRAGMENT); switch (nexthdr) { case IPPROTO_HOPOPTS: hbhhdr = (ip6_hbh_t *)whereptr; @@ -12815,11 +5017,12 @@ ipsec_ah_get_hdr_size_v6(mblk_t *mp, boolean_t till_ah) * inside the IPSQ (ill_g_lock is not 
held), `ill' may be removed from the * group during or after this lookup. */ -static boolean_t +boolean_t ipif_lookup_testaddr_v6(ill_t *ill, const in6_addr_t *v6srcp, ipif_t **ipifp) { ipif_t *ipif; + ipif = ipif_lookup_addr_exact_v6(v6srcp, ill, ill->ill_ipst); if (ipif != NULL) { if (ipifp != NULL) diff --git a/usr/src/uts/common/inet/ip/ip6_asp.c b/usr/src/uts/common/inet/ip/ip6_asp.c index d54e821359..5c499e6526 100644 --- a/usr/src/uts/common/inet/ip/ip6_asp.c +++ b/usr/src/uts/common/inet/ip/ip6_asp.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/socket.h> #include <sys/ksynch.h> @@ -41,6 +39,7 @@ #include <inet/ip6.h> #include <inet/ip6_asp.h> #include <inet/ip_ire.h> +#include <inet/ip_if.h> #include <inet/ipclassifier.h> #define IN6ADDR_MASK128_INIT \ @@ -415,18 +414,13 @@ ip6_asp_replace(mblk_t *mp, ip6_asp_t *new_table, size_t new_size, ipst->ips_ip6_asp_table = tmp_table; ipst->ips_ip6_asp_table_count = count; - /* - * The user has changed the address selection policy table. IPv6 - * source address selection for existing IRE_CACHE and - * RTF_DYNAMIC entries used the old table, so we need to - * clear the cache. 
- */ - ire_walk_v6(ire_delete_cache_v6, NULL, ALL_ZONES, ipst); - unlock_end: ipst->ips_ip6_asp_uip = B_FALSE; mutex_exit(&ipst->ips_ip6_asp_lock); + /* Let conn_ixa caching know that source address selection changed */ + ip_update_source_selection(ipst); + replace_end: /* Reply to the ioctl */ q = (queue_t *)mp->b_prev; diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index a986a755ac..364a44b9d4 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -76,12 +76,13 @@ static in6_addr_t ipv6_ll_template = static ipif_t * ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst); + ip_stack_t *ipst); + +static int ipif_add_ires_v6(ipif_t *, boolean_t); /* - * These two functions, ipif_lookup_group_v6() and ill_lookup_group_v6(), - * are called when an application does not specify an interface to be - * used for multicast traffic. It calls ire_lookup_multi_v6() to look + * This function is called when an application does not specify an interface + * to be used for multicast traffic. It calls ire_lookup_multi_v6() to look * for an interface route for the specified multicast group. Doing * this allows the administrator to add prefix routes for multicast to * indicate which interface to be used for multicast traffic in the above @@ -89,47 +90,21 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, * multicast group (a /128 route) or anything in between. If there is no * such multicast route, we just find any multicast capable interface and * return it. + * + * We support MULTIRT and RTF_SETSRC on the multicast routes added to the + * unicast table. This is used by CGTP. 
*/ -ipif_t * -ipif_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - ipif_t *ipif; - - ire = ire_lookup_multi_v6(group, zoneid, ipst); - if (ire != NULL) { - ipif = ire->ire_ipif; - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } - - return (ipif_lookup_multicast(ipst, zoneid, B_TRUE)); -} - ill_t * -ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) +ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, in6_addr_t *setsrcp) { - ire_t *ire; ill_t *ill; - ipif_t *ipif; - ire = ire_lookup_multi_v6(group, zoneid, ipst); - if (ire != NULL) { - ill = ire->ire_ipif->ipif_ill; - ill_refhold(ill); - ire_refrele(ire); + ill = ire_lookup_multi_ill_v6(group, zoneid, ipst, multirtp, setsrcp); + if (ill != NULL) return (ill); - } - - ipif = ipif_lookup_multicast(ipst, zoneid, B_TRUE); - if (ipif == NULL) - return (NULL); - ill = ipif->ipif_ill; - ill_refhold(ill); - ipif_refrele(ipif); - return (ill); + return (ill_lookup_multicast(ipst, zoneid, B_TRUE)); } /* @@ -138,16 +113,12 @@ ill_lookup_group_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) */ static ipif_t * ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; - ipsq_t *ipsq; ill_walk_context_t ctx; - if (error != NULL) - *error = 0; - /* * First match all the point-to-point interfaces * before looking at non-point-to-point interfaces. 
@@ -157,7 +128,6 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V6(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -167,36 +137,19 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, if_addr)) && (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, dst))) { - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } rw_exit(&ipst->ips_ill_g_lock); /* lookup the ipif based on interface address */ - ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, q, mp, func, - error, ipst); + ipif = ipif_lookup_addr_v6(if_addr, NULL, ALL_ZONES, ipst); ASSERT(ipif == NULL || ipif->ipif_isv6); return (ipif); } @@ -206,17 +159,14 @@ ipif_lookup_interface_v6(const in6_addr_t *if_addr, const in6_addr_t *dst, */ static ipif_t * ipif_lookup_addr_common_v6(const in6_addr_t *addr, ill_t *match_ill, - boolean_t match_illgrp, zoneid_t zoneid, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) + uint32_t match_flags, zoneid_t zoneid, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; boolean_t ptp = B_FALSE; - ipsq_t *ipsq; ill_walk_context_t ctx; - - if (error != NULL) - *error = 0; + boolean_t match_illgrp = (match_flags & 
IPIF_MATCH_ILLGRP); + boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* @@ -230,7 +180,6 @@ repeat: (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -238,6 +187,12 @@ repeat: ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + + if (no_duplicate && + !(ipif->ipif_flags & IPIF_UP)) { + continue; + } + /* Allow the ipif to be down */ if ((!ptp && (IN6_ARE_ADDR_EQUAL( &ipif->ipif_v6lcl_addr, addr) && @@ -245,82 +200,26 @@ repeat: (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, addr))) { - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } /* If we already did the ptp case, then we are done */ if (ptp) { rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENXIO; return (NULL); } ptp = B_TRUE; goto repeat; } -boolean_t -ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, - ip_stack_t *ipst) -{ - ipif_t *ipif; - ill_t *ill; - ill_walk_context_t ctx; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - - ill = ILL_START_WALK_V6(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = 
ipif->ipif_next) { - if (zoneid != ALL_ZONES && - ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* Allow the ipif to be down */ - if (((IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - addr) && - (ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || - ((ipif->ipif_flags & IPIF_POINTOPOINT) && - IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6pp_dst_addr, - addr))) { - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (B_TRUE); - } - } - mutex_exit(&ill->ill_lock); - } - - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); -} - /* * Lookup an ipif with the specified address. For point-to-point links we * look for matches on either the destination address or the local address, @@ -330,10 +229,24 @@ ip_addr_exists_v6(const in6_addr_t *addr, zoneid_t zoneid, */ ipif_t * ipif_lookup_addr_v6(const in6_addr_t *addr, ill_t *match_ill, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + ip_stack_t *ipst) { - return (ipif_lookup_addr_common_v6(addr, match_ill, B_TRUE, zoneid, q, - mp, func, error, ipst)); + return (ipif_lookup_addr_common_v6(addr, match_ill, IPIF_MATCH_ILLGRP, + zoneid, ipst)); +} + +/* + * Lookup an ipif with the specified address. 
Similar to ipif_lookup_addr, + * except that we will only return an address if it is not marked as + * IPIF_DUPLICATE + */ +ipif_t * +ipif_lookup_addr_nondup_v6(const in6_addr_t *addr, ill_t *match_ill, + zoneid_t zoneid, ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common_v6(addr, match_ill, + (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), zoneid, + ipst)); } /* @@ -346,8 +259,8 @@ ipif_lookup_addr_exact_v6(const in6_addr_t *addr, ill_t *match_ill, ip_stack_t *ipst) { ASSERT(match_ill != NULL); - return (ipif_lookup_addr_common_v6(addr, match_ill, B_FALSE, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst)); + return (ipif_lookup_addr_common_v6(addr, match_ill, 0, ALL_ZONES, + ipst)); } /* @@ -473,23 +386,22 @@ ip_remote_addr_ok_v6(const in6_addr_t *addr, const in6_addr_t *subnet_mask) /* * ip_rt_add_v6 is called to add an IPv6 route to the forwarding table. - * ipif_arg is passed in to associate it with the correct interface + * ill is passed in to associate it with the correct interface * (for link-local destinations and gateways). + * If ire_arg is set, then we return the held IRE in that location. 
*/ /* ARGSUSED1 */ int ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, const in6_addr_t *gw_addr, const in6_addr_t *src_addr, int flags, - ipif_t *ipif_arg, ire_t **ire_arg, queue_t *q, mblk_t *mp, ipsq_func_t func, - struct rtsa_s *sp, ip_stack_t *ipst) + ill_t *ill, ire_t **ire_arg, struct rtsa_s *sp, ip_stack_t *ipst, + zoneid_t zoneid) { - ire_t *ire; + ire_t *ire, *nire; ire_t *gw_ire = NULL; ipif_t *ipif; - boolean_t ipif_refheld = B_FALSE; uint_t type; int match_flags = MATCH_IRE_TYPE; - int error; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; @@ -514,14 +426,19 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* * Get the ipif, if any, corresponding to the gw_addr + * If -ifp was specified we restrict ourselves to the ill, otherwise + * we match on the gatway and destination to handle unnumbered pt-pt + * interfaces. */ - ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, - &error, ipst); - if (ipif != NULL) - ipif_refheld = B_TRUE; - else if (error == EINPROGRESS) { - ip1dbg(("ip_rt_add_v6: null and EINPROGRESS")); - return (error); + if (ill != NULL) + ipif = ipif_lookup_addr_v6(gw_addr, ill, ALL_ZONES, ipst); + else + ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst); + if (ipif != NULL) { + if (IS_VNI(ipif->ipif_ill)) { + ipif_refrele(ipif); + return (EINVAL); + } } /* @@ -535,57 +452,74 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (IN6_ARE_ADDR_EQUAL(gw_addr, &ipv6_loopback) && IN6_ARE_ADDR_EQUAL(dst_addr, &ipv6_loopback) && IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) { - ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK, + NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - ip1dbg(("ipif_up_done: 0x%p 
creating IRE 0x%x" + ip1dbg(("ip_rt_add_v6: 0x%p creating IRE 0x%x" "for 0x%x\n", (void *)ipif, ipif->ipif_ire_type, ntohl(ipif->ipif_lcl_addr))); ire = ire_create_v6( dst_addr, mask, - &ipif->ipif_v6src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - NULL, - ipif->ipif_net_type, - ipif, - NULL, - 0, - 0, - flags, - &ire_uinfo_null, NULL, + ipif->ipif_ire_type, /* LOOPBACK */ + ipif->ipif_ill, + zoneid, + (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, NULL, ipst); + if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); + return (ENOMEM); + } + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && + !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); return (ENOMEM); } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); + return (EEXIST); + } + ire = nire; + goto save_ire; } } /* + * The routes for multicast with CGTP are quite special in that + * the gateway is the local interface address, yet RTF_GATEWAY + * is set. We turn off RTF_GATEWAY to provide compatibility with + * this undocumented and unusual use of multicast routes. 
+ */ + if ((flags & RTF_MULTIRT) && ipif != NULL) + flags &= ~RTF_GATEWAY; + + /* * Traditionally, interface routes are ones where RTF_GATEWAY isn't set * and the gateway address provided is one of the system's interface * addresses. By using the routing socket interface and supplying an @@ -619,8 +553,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * logical interfaces * * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 - * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 - * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 + * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 + * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 * * the ipif's corresponding to each of these interface routes can be * uniquely identified by the "gateway" (actually interface address). @@ -635,90 +569,68 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* RTF_GATEWAY not set */ if (!(flags & RTF_GATEWAY)) { - queue_t *stq; - if (sp != NULL) { ip2dbg(("ip_rt_add_v6: gateway security attributes " "cannot be set with interface route\n")); - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (EINVAL); } /* - * As the interface index specified with the RTA_IFP sockaddr is - * the same for all ipif's off of an ill, the matching logic - * below uses MATCH_IRE_ILL if such an index was specified. - * This means that routes sharing the same prefix when added - * using a RTA_IFP sockaddr must have distinct interface - * indices (namely, they must be on distinct ill's). - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix - * can be created if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * Whether or not ill (RTA_IFP) is set, we require that + * the gateway is one of our local addresses. 
*/ - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - /* - * Check the ipif corresponding to the gw_addr - */ - if (ipif == NULL) - return (ENETUNREACH); - match_flags |= MATCH_IRE_IPIF; + if (ipif == NULL) + return (ENETUNREACH); + + /* + * We use MATCH_IRE_ILL here. If the caller specified an + * interface (from the RTA_IFP sockaddr) we use it, otherwise + * we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. + * We don't allow RTA_IFP to specify a different ill than the + * one matching the ipif to make sure we can delete the route. + */ + match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; + if (ill == NULL) { + ill = ipif->ipif_ill; + } else if (ill != ipif->ipif_ill) { + ipif_refrele(ipif); + return (EINVAL); } - ASSERT(ipif != NULL); /* * We check for an existing entry at this point. */ match_flags |= MATCH_IRE_MASK; - ire = ire_ftable_lookup_v6(dst_addr, mask, 0, IRE_INTERFACE, - ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - /* * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or - * IRE_IF_RESOLVER with the modified address and netmask. + * IRE_IF_RESOLVER with the modified address, netmask, and + * gateway. 
*/ ire = ire_create_v6( dst_addr, mask, - &ipif->ipif_v6src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - stq, - ipif->ipif_net_type, - ipif, - NULL, - 0, - 0, + gw_addr, + ill->ill_net_type, + ill, + zoneid, flags, - &ire_uinfo_null, - NULL, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } @@ -731,32 +643,44 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * RTF_BLACKHOLE flag as these interface routes, by * definition, can only be that. * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. * * Needless to say, the real IRE_LOOPBACK is NOT created by this * routine, but rather using ire_create_v6() directly. */ - if (ipif->ipif_net_type == IRE_LOOPBACK) { + if (ill->ill_net_type == IRE_LOOPBACK) { ire->ire_type = IRE_IF_NORESOLVER; ire->ire_flags |= RTF_BLACKHOLE; } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. 
This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); - } - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; + return (EEXIST); + } + ire = nire; + goto save_ire; } /* @@ -764,14 +688,23 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the * gateway, it is currently unreachable and we fail the request * accordingly. + * If RTA_IFP was specified we look on that particular ill. */ - ipif = ipif_arg; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; - gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (gw_ire == NULL) + + /* Check whether the gateway is reachable. */ + type = IRE_INTERFACE; + if (flags & RTF_INDIRECT) + type |= IRE_OFFLINK; + + gw_ire = ire_ftable_lookup_v6(gw_addr, 0, 0, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); + if (gw_ire == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); return (ENETUNREACH); + } /* * We create one of three types of IREs as a result of this request @@ -789,10 +722,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, type = IRE_PREFIX; /* check for a duplicate entry */ - ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, - match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, + match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, 0, ipst, NULL); if (ire != NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_refrele(ire); return (EEXIST); @@ -809,6 +744,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, /* we hold reference to it upon success */ gcgrp = gcgrp_lookup(&ga, 
B_TRUE); if (gcgrp == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -824,6 +761,8 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (gc == NULL) { /* release reference held by gcgrp_lookup */ GCGRP_REFRELE(gcgrp); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -833,23 +772,12 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, ire = ire_create_v6( dst_addr, /* dest address */ mask, /* mask */ - /* src address assigned by the caller? */ - (((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) ? - src_addr : NULL), gw_addr, /* gateway address */ - &gw_ire->ire_max_frag, - NULL, /* no src nce */ - NULL, /* no recv-from queue */ - NULL, /* no send-to queue */ (ushort_t)type, /* IRE type */ - ipif_arg, - NULL, - 0, - 0, + ill, + zoneid, flags, - &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ gc, /* security attribute */ - NULL, ipst); /* @@ -862,26 +790,48 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (ire == NULL) { if (gc != NULL) GC_REFRELE(gc); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } + /* src address assigned by the caller? */ + if ((flags & RTF_SETSRC) && !IN6_IS_ADDR_UNSPECIFIED(src_addr)) + ire->ire_setsrc_addr_v6 = *src_addr; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow ::0 ? */ /* Add the new IRE. */ - error = ire_add(&ire, q, mp, func, B_FALSE); + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + if (ipif != NULL) + ipif_refrele(ipif); + ire_refrele(gw_ire); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. 
+ * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (error != 0) { + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); - return (error); + return (EEXIST); } + ire = nire; if (flags & RTF_MULTIRT) { /* @@ -896,70 +846,51 @@ ip_rt_add_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, if (ipst->ips_ip_cgtp_filter_ops != NULL && !IN6_IS_ADDR_MULTICAST(&(ire->ire_addr_v6))) { int res; - - res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v6( - ipst->ips_netstack->netstack_stackid, - &ire->ire_addr_v6, - &ire->ire_gateway_addr_v6, - &ire->ire_src_addr_v6, - &gw_ire->ire_src_addr_v6); + ipif_t *src_ipif; + + /* Find the source address corresponding to gw_ire */ + src_ipif = ipif_lookup_addr_v6( + &gw_ire->ire_gateway_addr_v6, NULL, zoneid, ipst); + if (src_ipif != NULL) { + res = ipst->ips_ip_cgtp_filter_ops-> + cfo_add_dest_v6( + ipst->ips_netstack->netstack_stackid, + &ire->ire_addr_v6, + &ire->ire_gateway_addr_v6, + &ire->ire_setsrc_addr_v6, + &src_ipif->ipif_v6lcl_addr); + ipif_refrele(src_ipif); + } else { + res = EADDRNOTAVAIL; + } if (res != 0) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_delete(ire); + ire_refrele(ire); /* Held in ire_add */ return (res); } } } - /* - * Now that the prefix IRE entry has been created, delete any - * existing gateway IRE cache entries as well as any IRE caches - * using the gateway, and force them to be created through - * ip_newroute_v6. - */ - if (gc != NULL) { - ASSERT(gcgrp != NULL); - ire_clookup_delete_cache_gw_v6(gw_addr, ALL_ZONES, ipst); - } - save_ire: if (gw_ire != NULL) { ire_refrele(gw_ire); + gw_ire = NULL; } - if (ipif != NULL) { - mblk_t *save_mp; - + if (ire->ire_ill != NULL) { /* * Save enough information so that we can recreate the IRE if - * the interface goes down and then up. 
The metrics associated + * the ILL goes down and then up. The metrics associated * with the route will be saved as well when rts_setmetrics() is * called after the IRE has been created. In the case where * memory cannot be allocated, none of this information will be * saved. */ - save_mp = allocb(sizeof (ifrt_t), BPRI_MED); - if (save_mp != NULL) { - ifrt_t *ifrt; - - save_mp->b_wptr += sizeof (ifrt_t); - ifrt = (ifrt_t *)save_mp->b_rptr; - bzero(ifrt, sizeof (ifrt_t)); - ifrt->ifrt_type = ire->ire_type; - ifrt->ifrt_v6addr = ire->ire_addr_v6; - mutex_enter(&ire->ire_lock); - ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; - ifrt->ifrt_v6src_addr = ire->ire_src_addr_v6; - mutex_exit(&ire->ire_lock); - ifrt->ifrt_v6mask = ire->ire_mask_v6; - ifrt->ifrt_flags = ire->ire_flags; - ifrt->ifrt_max_frag = ire->ire_max_frag; - mutex_enter(&ipif->ipif_saved_ire_lock); - save_mp->b_cont = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = save_mp; - ipif->ipif_saved_ire_cnt++; - mutex_exit(&ipif->ipif_saved_ire_lock); - } + ill_save_ire(ire->ire_ill, ire); } + if (ire_arg != NULL) { /* * Store the ire that was successfully added into where ire_arg @@ -971,28 +902,27 @@ save_ire: } else { ire_refrele(ire); /* Held in ire_add */ } - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (0); } /* * ip_rt_delete_v6 is called to delete an IPv6 route. - * ipif_arg is passed in to associate it with the correct interface + * ill is passed in to associate it with the correct interface. * (for link-local destinations and gateways). 
*/ /* ARGSUSED4 */ int ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, - const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ipif_t *ipif_arg, - queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) + const in6_addr_t *gw_addr, uint_t rtm_addrs, int flags, ill_t *ill, + ip_stack_t *ipst, zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *ipif; uint_t type; uint_t match_flags = MATCH_IRE_TYPE; int err = 0; - boolean_t ipif_refheld = B_FALSE; /* * If this is the case of RTF_HOST being set, then we set the netmask @@ -1012,49 +942,49 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * * This makes it possible to delete an original * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. + * However, we have RTF_KERNEL set on the ones created by ipif_up + * and those can not be deleted here. * - * As the interface index specified with the RTA_IFP sockaddr is the - * same for all ipif's off of an ill, the matching logic below uses - * MATCH_IRE_ILL if such an index was specified. This means a route - * sharing the same prefix and interface index as the the route - * intended to be deleted might be deleted instead if a RTA_IFP sockaddr - * is specified in the request. - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix can be - * uniquely identified if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * We use MATCH_IRE_ILL if we know the interface. If the caller + * specified an interface (from the RTA_IFP sockaddr) we use it, + * otherwise we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. 
* * For more detail on specifying routes by gateway address and by * interface index, see the comments in ip_rt_add_v6(). */ - ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, q, mp, func, &err, - ipst); + ipif = ipif_lookup_interface_v6(gw_addr, dst_addr, ipst); if (ipif != NULL) { - ipif_refheld = B_TRUE; - if (ipif_arg != NULL) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - match_flags |= MATCH_IRE_IPIF; + ill_t *ill_match; + + if (ill != NULL) + ill_match = ill; + else + ill_match = ipif->ipif_ill; + + match_flags |= MATCH_IRE_ILL; + if (ipif->ipif_ire_type == IRE_LOOPBACK) { + ire = ire_ftable_lookup_v6(dst_addr, 0, 0, IRE_LOOPBACK, + ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); + } + if (ire == NULL) { + match_flags |= MATCH_IRE_GW; + ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill_match, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); + } + /* Avoid deleting routes created by kernel from an ipif */ + if (ire != NULL && (ire->ire_flags & RTF_KERNEL)) { + ire_refrele(ire); + ire = NULL; } - if (ipif->ipif_ire_type == IRE_LOOPBACK) - ire = ire_ctable_lookup_v6(dst_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, match_flags, ipst); - if (ire == NULL) - ire = ire_ftable_lookup_v6(dst_addr, mask, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - match_flags, ipst); - } else if (err == EINPROGRESS) { - return (err); - } else { - err = 0; + /* Restore in case we didn't find a match */ + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); } + if (ire == NULL) { /* * At this point, the gateway address is not one of our own @@ -1062,15 +992,11 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, * set the IRE type to lookup based on whether * this is a host route, a default route or just a prefix. 
* - * If an ipif_arg was passed in, then the lookup is based on an + * If an ill was passed in, then the lookup is based on an * interface index so MATCH_IRE_ILL is added to match_flags. - * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is - * set as the route being looked up is not a traditional - * interface route. */ - match_flags &= ~MATCH_IRE_IPIF; match_flags |= MATCH_IRE_GW; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; if (IN6_ARE_ADDR_EQUAL(mask, &ipv6_all_ones)) type = IRE_HOST; @@ -1079,12 +1005,12 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, else type = IRE_PREFIX; ire = ire_ftable_lookup_v6(dst_addr, mask, gw_addr, type, - ipif_arg, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ill, ALL_ZONES, NULL, match_flags, 0, ipst, NULL); } - if (ipif_refheld) { + if (ipif != NULL) { ipif_refrele(ipif); - ipif_refheld = B_FALSE; + ipif = NULL; } if (ire == NULL) return (ESRCH); @@ -1103,42 +1029,9 @@ ip_rt_delete_v6(const in6_addr_t *dst_addr, const in6_addr_t *mask, } } - ipif = ire->ire_ipif; - if (ipif != NULL) { - mblk_t **mpp; - mblk_t *mp; - ifrt_t *ifrt; - in6_addr_t gw_addr_v6; - - /* Remove from ipif_saved_ire_mp list if it is there */ - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; - mpp = &(*mpp)->b_cont) { - /* - * On a given ipif, the triple of address, gateway and - * mask is unique for each saved IRE (in the case of - * ordinary interface routes, the gateway address is - * all-zeroes). 
- */ - mp = *mpp; - ifrt = (ifrt_t *)mp->b_rptr; - if (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, - &ire->ire_addr_v6) && - IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, - &gw_addr_v6) && - IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, - &ire->ire_mask_v6)) { - *mpp = mp->b_cont; - ipif->ipif_saved_ire_cnt--; - freeb(mp); - break; - } - } - mutex_exit(&ipif->ipif_saved_ire_lock); - } + ill = ire->ire_ill; + if (ill != NULL) + ill_remove_saved_ire(ill, ire); ire_delete(ire); ire_refrele(ire); return (err); @@ -1197,7 +1090,6 @@ ipif_set6to4addr(ipif_t *ipif) (void) ip_plen_to_mask_v6(16, &ipif->ipif_v6net_mask); bcopy(ill->ill_phys_addr, &v4phys, sizeof (struct in_addr)); IN6_V4ADDR_TO_6TO4(&v4phys, &ipif->ipif_v6lcl_addr); - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); } @@ -1260,11 +1152,6 @@ ipif_setlinklocal(ipif_t *ipif) ipif->ipif_v6subnet); } - if (ipif->ipif_flags & IPIF_NOLOCAL) { - ipif->ipif_v6src_addr = ipv6_all_zeros; - } else { - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; - } } /* @@ -1280,123 +1167,15 @@ ipif_setdestlinklocal(ipif_t *ipif) ASSERT(IAM_WRITER_ILL(ill)); if (IN6_IS_ADDR_UNSPECIFIED(&ill->ill_dest_token)) return; + /* Skip if we've already set the pp_dst_addr */ + if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6pp_dst_addr)) + return; + ipif_get_linklocal(&ipif->ipif_v6pp_dst_addr, &ill->ill_dest_token); ipif->ipif_v6subnet = ipif->ipif_v6pp_dst_addr; } /* - * This function sets up the multicast mappings in NDP. - * Unlike ARP, there are no mapping_mps here. We delete the - * mapping nces and add a new one. - * - * Returns non-zero on error and 0 on success. 
- */ -int -ipif_ndp_setup_multicast(ipif_t *ipif, nce_t **ret_nce) -{ - ill_t *ill = ipif->ipif_ill; - in6_addr_t v6_mcast_addr = {(uint32_t)V6_MCAST, 0, 0, 0}; - in6_addr_t v6_mcast_mask = {(uint32_t)V6_MCAST, 0, 0, 0}; - in6_addr_t v6_extract_mask; - uchar_t *phys_addr, *bphys_addr, *alloc_phys; - nce_t *mnce = NULL; - int err = 0; - phyint_t *phyi = ill->ill_phyint; - uint32_t hw_extract_start; - dl_unitdata_req_t *dlur; - ip_stack_t *ipst = ill->ill_ipst; - - if (ret_nce != NULL) - *ret_nce = NULL; - - if (ipif->ipif_flags & IPIF_POINTOPOINT) - return (0); - - /* - * IPMP meta-interfaces don't have any inherent multicast mappings, - * and instead use the ones on the underlying interfaces. - */ - if (IS_IPMP(ill)) - return (0); - - /* - * Delete the mapping nce. Normally these should not exist - * as a previous ipif_down -> ipif_ndp_down should have deleted - * all the nces. But they can exist if ip_rput_dlpi_writer - * calls this when PHYI_MULTI_BCAST is set. Mappings are always - * tied to the underlying ill, so don't match across the illgrp. - */ - mnce = ndp_lookup_v6(ill, B_FALSE, &v6_mcast_addr, B_FALSE); - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - mnce = NULL; - } - - /* - * Get media specific v6 mapping information. Note that - * nd_lla_len can be 0 for tunnels. - */ - alloc_phys = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); - if ((alloc_phys == NULL) && (ill->ill_nd_lla_len != 0)) - return (ENOMEM); - /* - * Determine the broadcast address. - */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + ill->ill_sap_length; - - /* - * Check PHYI_MULTI_BCAST and possible length of physical - * address to determine if we use the mapping or the - * broadcast address. 
- */ - if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || - (!MEDIA_V6MINFO(ill->ill_media, ill->ill_nd_lla_len, - bphys_addr, alloc_phys, &hw_extract_start, - &v6_extract_mask))) { - if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) { - kmem_free(alloc_phys, ill->ill_nd_lla_len); - return (E2BIG); - } - /* Use the link-layer broadcast address for MULTI_BCAST */ - phys_addr = bphys_addr; - bzero(&v6_extract_mask, sizeof (v6_extract_mask)); - hw_extract_start = ill->ill_nd_lla_len; - } else { - phys_addr = alloc_phys; - } - if ((ipif->ipif_flags & IPIF_BROADCAST) || - (ill->ill_flags & ILLF_MULTICAST) || - (phyi->phyint_flags & PHYI_MULTI_BCAST)) { - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - err = ndp_add_v6(ill, - phys_addr, - &v6_mcast_addr, /* v6 address */ - &v6_mcast_mask, /* v6 mask */ - &v6_extract_mask, - hw_extract_start, - NCE_F_MAPPING | NCE_F_PERMANENT | NCE_F_NONUD, - ND_REACHABLE, - &mnce); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - if (err == 0) { - if (ret_nce != NULL) { - *ret_nce = mnce; - } else { - NCE_REFRELE(mnce); - } - } - } - kmem_free(alloc_phys, ill->ill_nd_lla_len); - return (err); -} - -/* * Get the resolver set up for a new ipif. (Always called as writer.) */ int @@ -1405,50 +1184,28 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) ill_t *ill = ipif->ipif_ill; int err = 0; nce_t *nce = NULL; - nce_t *mnce = NULL; boolean_t added_ipif = B_FALSE; - ASSERT(IAM_WRITER_ILL(ill)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_ndp_up", + ill_t *, ill, ipif_t *, ipif); ip1dbg(("ipif_ndp_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); - /* - * ND not supported on XRESOLV interfaces. If ND support (multicast) - * added later, take out this check. 
- */ - if ((ill->ill_flags & ILLF_XRESOLV) || - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) || + if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) || (!(ill->ill_net_type & IRE_INTERFACE))) { ipif->ipif_addr_ready = 1; return (0); } - /* - * Need to setup multicast mapping only when the first - * interface is coming UP. - */ - if (ill->ill_ipif_up_count == 0 && - (ill->ill_flags & ILLF_MULTICAST)) { - /* - * We set the multicast before setting up the mapping for - * local address because ipif_ndp_setup_multicast does - * ndp_walk to delete nces which will delete the mapping - * for local address also if we added the mapping for - * local address first. - */ - err = ipif_ndp_setup_multicast(ipif, &mnce); - if (err != 0) - return (err); - } - if ((ipif->ipif_flags & (IPIF_UNNUMBERED|IPIF_NOLOCAL)) == 0) { uint16_t flags; uint16_t state; - uchar_t *hw_addr = NULL; + uchar_t *hw_addr; ill_t *bound_ill; ipmp_illgrp_t *illg = ill->ill_grp; + uint_t hw_addr_len; - /* Permanent entries don't need NUD */ - flags = NCE_F_PERMANENT | NCE_F_NONUD; + flags = NCE_F_MYADDR | NCE_F_NONUD | NCE_F_PUBLISH | + NCE_F_AUTHORITY; if (ill->ill_flags & ILLF_ROUTER) flags |= NCE_F_ISROUTER; @@ -1483,10 +1240,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) added_ipif = B_TRUE; } hw_addr = bound_ill->ill_nd_lla; + hw_addr_len = bound_ill->ill_phys_addr_length; } else { bound_ill = ill; - if (ill->ill_net_type == IRE_IF_RESOLVER) + if (ill->ill_net_type == IRE_IF_RESOLVER) { hw_addr = ill->ill_nd_lla; + hw_addr_len = ill->ill_phys_addr_length; + } else { + hw_addr = NULL; + hw_addr_len = 0; + } } /* @@ -1496,28 +1259,16 @@ ipif_ndp_up(ipif_t *ipif, boolean_t initial) * unsolicited advertisements to inform others. */ if (initial || !ipif->ipif_addr_ready) { + /* Causes Duplicate Address Detection to run */ state = ND_PROBE; } else { state = ND_REACHABLE; flags |= NCE_F_UNSOL_ADV; } + retry: - /* - * Create an nce for the local address. 
We pass a match_illgrp - * of B_TRUE because the local address must be unique across - * the illgrp, and the existence of an nce with nce_ill set - * to any ill in the group is indicative of a duplicate address - */ - err = ndp_lookup_then_add_v6(bound_ill, - B_TRUE, - hw_addr, - &ipif->ipif_v6lcl_addr, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - flags, - state, - &nce); + err = nce_lookup_then_add_v6(ill, hw_addr, hw_addr_len, + &ipif->ipif_v6lcl_addr, flags, state, &nce); switch (err) { case 0: ip1dbg(("ipif_ndp_up: NCE created for %s\n", @@ -1535,14 +1286,21 @@ retry: case EEXIST: ip1dbg(("ipif_ndp_up: NCE already exists for %s\n", ill->ill_name)); - if (!(nce->nce_flags & NCE_F_PERMANENT)) { - ndp_delete(nce); - NCE_REFRELE(nce); + if (!NCE_MYADDR(nce->nce_common)) { + /* + * A leftover nce from before this address + * existed + */ + ncec_delete(nce->nce_common); + nce_refrele(nce); nce = NULL; goto retry; } if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { - NCE_REFRELE(nce); + nce_refrele(nce); + nce = NULL; + ip1dbg(("ipif_ndp_up: NCE already exists " + "for %s\n", ill->ill_name)); goto fail; } /* @@ -1557,6 +1315,7 @@ retry: ipif->ipif_addr_ready = 1; ipif->ipif_added_nce = 1; nce->nce_ipif_cnt++; + err = 0; break; default: ip1dbg(("ipif_ndp_up: NCE creation failed for %s\n", @@ -1568,15 +1327,9 @@ retry: ipif->ipif_addr_ready = 1; } if (nce != NULL) - NCE_REFRELE(nce); - if (mnce != NULL) - NCE_REFRELE(mnce); + nce_refrele(nce); return (0); fail: - if (mnce != NULL) { - ndp_delete(mnce); - NCE_REFRELE(mnce); - } if (added_ipif) ipmp_illgrp_del_ipif(ill->ill_grp, ipif); @@ -1587,181 +1340,7 @@ fail: void ipif_ndp_down(ipif_t *ipif) { - nce_t *nce; - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ipif->ipif_isv6) { - if (ipif->ipif_added_nce) { - /* - * For IPMP, `ill' can be the IPMP ill but the NCE will - * always be tied to an underlying IP interface, so we - * match across the illgrp. 
This is safe since we - * ensure uniqueness across the group in ipif_ndp_up(). - */ - nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce != NULL) { - if (--nce->nce_ipif_cnt == 0) - ndp_delete(nce); /* last ipif for nce */ - NCE_REFRELE(nce); - } - ipif->ipif_added_nce = 0; - } - - /* - * Make IPMP aware of the deleted data address. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); - } - - /* - * Remove mapping and all other nces dependent on this ill - * when the last ipif is going away. - */ - if (ill->ill_ipif_up_count == 0) - ndp_walk(ill, (pfi_t)ndp_delete_per_ill, ill, ill->ill_ipst); -} - -/* - * Used when an interface comes up to recreate any extra routes on this - * interface. - */ -static ire_t ** -ipif_recover_ire_v6(ipif_t *ipif) -{ - mblk_t *mp; - ire_t **ipif_saved_irep; - ire_t **irep; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ip1dbg(("ipif_recover_ire_v6(%s:%u)", ipif->ipif_ill->ill_name, - ipif->ipif_id)); - - ASSERT(ipif->ipif_isv6); - - mutex_enter(&ipif->ipif_saved_ire_lock); - ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * - ipif->ipif_saved_ire_cnt, KM_NOSLEEP); - if (ipif_saved_irep == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - return (NULL); - } - - irep = ipif_saved_irep; - - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { - ire_t *ire; - queue_t *rfq; - queue_t *stq; - ifrt_t *ifrt; - in6_addr_t *src_addr; - in6_addr_t *gateway_addr; - char buf[INET6_ADDRSTRLEN]; - ushort_t type; - - /* - * When the ire was initially created and then added in - * ip_rt_add_v6(), it was created either using - * ipif->ipif_net_type in the case of a traditional interface - * route, or as one of the IRE_OFFSUBNET types (with the - * exception of IRE_HOST type redirect ire which is created by - * icmp_redirect_v6() and which we don't need to save or - * recover). 
In the case where ipif->ipif_net_type was - * IRE_LOOPBACK, ip_rt_add_v6() will update the ire_type to - * IRE_IF_NORESOLVER before calling ire_add_v6() to satisfy - * software like GateD and Sun Cluster which creates routes - * using the the loopback interface's address as a gateway. - * - * As ifrt->ifrt_type reflects the already updated ire_type, - * ire_create_v6() will be called in the same way here as in - * ip_rt_add_v6(), namely using ipif->ipif_net_type when the - * route looks like a traditional interface route (where - * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise - * using the saved ifrt->ifrt_type. This means that in - * the case where ipif->ipif_net_type is IRE_LOOPBACK, - * the ire created by ire_create_v6() will be an IRE_LOOPBACK, - * it will then be turned into an IRE_IF_NORESOLVER and then - * added by ire_add_v6(). - */ - ifrt = (ifrt_t *)mp->b_rptr; - if (ifrt->ifrt_type & IRE_INTERFACE) { - rfq = NULL; - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? &ifrt->ifrt_v6src_addr - : &ipif->ipif_v6src_addr; - gateway_addr = NULL; - type = ipif->ipif_net_type; - } else { - rfq = NULL; - stq = NULL; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? &ifrt->ifrt_v6src_addr : NULL; - gateway_addr = &ifrt->ifrt_v6gateway_addr; - type = ifrt->ifrt_type; - } - - /* - * Create a copy of the IRE with the saved address and netmask. 
- */ - ip1dbg(("ipif_recover_ire_v6: creating IRE %s (%d) for %s/%d\n", - ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, - inet_ntop(AF_INET6, &ifrt->ifrt_v6addr, buf, sizeof (buf)), - ip_mask_to_plen_v6(&ifrt->ifrt_v6mask))); - ire = ire_create_v6( - &ifrt->ifrt_v6addr, - &ifrt->ifrt_v6mask, - src_addr, - gateway_addr, - &ifrt->ifrt_max_frag, - NULL, - rfq, - stq, - type, - ipif, - NULL, - 0, - 0, - ifrt->ifrt_flags, - &ifrt->ifrt_iulp_info, - NULL, - NULL, - ipst); - if (ire == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - kmem_free(ipif_saved_irep, - ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); - return (NULL); - } - - /* - * Some software (for example, GateD and Sun Cluster) attempts - * to create (what amount to) IRE_PREFIX routes with the - * loopback address as the gateway. This is primarily done to - * set up prefixes with the RTF_REJECT flag set (for example, - * when generating aggregate routes.) - * - * If the IRE type (as defined by ipif->ipif_net_type) is - * IRE_LOOPBACK, then we map the request into a - * IRE_IF_NORESOLVER. 
- */ - if (ipif->ipif_net_type == IRE_LOOPBACK) - ire->ire_type = IRE_IF_NORESOLVER; - /* - * ire held by ire_add, will be refreled' in ipif_up_done - * towards the end - */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - *irep = ire; - irep++; - ip1dbg(("ipif_recover_ire_v6: added ire %p\n", (void *)ire)); - } - mutex_exit(&ipif->ipif_saved_ire_lock); - return (ipif_saved_irep); + ipif_nce_down(ipif); } /* @@ -1826,8 +1405,7 @@ ip_common_prefix_v6(const in6_addr_t *a1, const in6_addr_t *a2) #define IPIF_VALID_IPV6_SOURCE(ipif) \ (((ipif)->ipif_flags & IPIF_UP) && \ - !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) && \ - (ipif)->ipif_addr_ready) + !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))) /* source address candidate */ typedef struct candidate { @@ -2195,13 +1773,6 @@ rule_addr_type(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, static rule_res_t rule_prefix(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, ip_stack_t *ipst) { - /* - * For IPMP, we always want to choose a random source address from - * among any equally usable addresses, so always report a tie. - */ - if (IS_IPMP(dstinfo->dst_ill)) - return (CAND_TIE); - if (!bc->cand_common_pref_set) { bc->cand_common_pref = ip_common_prefix_v6(&bc->cand_srcaddr, dstinfo->dst_addr); @@ -2252,14 +1823,15 @@ rule_must_be_last(cand_t *bc, cand_t *cc, const dstinfo_t *dstinfo, * * src_prefs is the caller's set of source address preferences. If source * address selection is being called to determine the source address of a - * connected socket (from ip_bind_connected_v6()), then the preferences are - * taken from conn_src_preferences. These preferences can be set on a + * connected socket (from ip_set_destination_v6()), then the preferences are + * taken from conn_ixa->ixa_src_preferences. These preferences can be set on a * per-socket basis using the IPV6_SRC_PREFERENCES socket option. The only * preference currently implemented is for rfc3041 temporary addresses. 
*/ ipif_t * ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, - boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid) + boolean_t restrict_ill, uint32_t src_prefs, zoneid_t zoneid, + boolean_t allow_usesrc, boolean_t *notreadyp) { dstinfo_t dstinfo; char dstr[INET6_ADDRSTRLEN]; @@ -2306,10 +1878,10 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, * usesrc ifindex. This has higher precedence since it is * finer grained (i.e per interface) v/s being system wide. */ - if (dstill->ill_usesrc_ifindex != 0) { + if (dstill->ill_usesrc_ifindex != 0 && allow_usesrc) { if ((usesrc_ill = ill_lookup_on_ifindex(dstill->ill_usesrc_ifindex, B_TRUE, - NULL, NULL, NULL, NULL, ipst)) != NULL) { + ipst)) != NULL) { dstinfo.dst_ill = usesrc_ill; } else { return (NULL); @@ -2412,6 +1984,12 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, if (!IPIF_VALID_IPV6_SOURCE(ipif)) continue; + if (!ipif->ipif_addr_ready) { + if (notreadyp != NULL) + *notreadyp = B_TRUE; + continue; + } + if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) @@ -2505,7 +2083,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, if (IS_IPMP(ill) && ipif != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); next_ipif = ipif->ipif_next; - if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) ill->ill_src_ipif = next_ipif; else ill->ill_src_ipif = NULL; @@ -2541,7 +2119,7 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } mutex_enter(&ipif->ipif_ill->ill_lock); - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ipif->ipif_ill->ill_lock); rw_exit(&ipst->ips_ill_g_lock); @@ -2556,187 +2134,72 @@ ipif_select_source_v6(ill_t *dstill, const in6_addr_t *dst, } /* - * If old_ipif is not NULL, see if ipif was derived from old - * ipif and if so, recreate the interface route by re-doing - * source address selection. 
This happens when ipif_down -> - * ipif_update_other_ipifs calls us. + * Pick a source address based on the destination ill and an optional setsrc + * address. + * The result is stored in srcp. If generation is set, then put the source + * generation number there before we look for the source address (to avoid + * missing changes in the set of source addresses. + * If flagsp is set, then us it to pass back ipif_flags. + * + * If the caller wants to cache the returned source address and detect when + * that might be stale, the caller should pass in a generation argument, + * which the caller can later compare against ips_src_generation + * + * The precedence order for selecting an IPv6 source address is: + * - RTF_SETSRC on the first ire in the recursive lookup always wins. + * - If usrsrc is set, swap the ill to be the usesrc one. + * - If IPMP is used on the ill, select a random address from the most + * preferred ones below: + * That is followed by the long list of IPv6 source address selection rules + * starting with rule_isdst(), rule_scope(), etc. * - * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when ipif_up_done_v6 calls us. + * We have lower preference for ALL_ZONES IP addresses, + * as they pose problems with unlabeled destinations. + * + * Note that when multiple IP addresses match e.g., with rule_scope() we pick + * the first one if IPMP is not in use. With IPMP we randomize. 
*/ -void -ipif_recreate_interface_routes_v6(ipif_t *old_ipif, ipif_t *ipif) +int +ip_select_source_v6(ill_t *ill, const in6_addr_t *setsrc, const in6_addr_t *dst, + zoneid_t zoneid, ip_stack_t *ipst, uint_t restrict_ill, uint32_t src_prefs, + in6_addr_t *srcp, uint32_t *generation, uint64_t *flagsp) { - ire_t *ire; - ire_t *ipif_ire; - queue_t *stq; - ill_t *ill; - ipif_t *nipif = NULL; - boolean_t nipif_refheld = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ill = ipif->ipif_ill; - - if (!(ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - /* - * Can't possibly have borrowed the source - * from old_ipif. - */ - return; - } + ipif_t *ipif; + boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ - /* - * Is there any work to be done? No work if the address - * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( - * ipif_select_source_v6() does not borrow addresses from - * NOLOCAL and ANYCAST interfaces). - */ - if ((old_ipif != NULL) && - ((IN6_IS_ADDR_UNSPECIFIED(&old_ipif->ipif_v6lcl_addr)) || - (old_ipif->ipif_ill->ill_wq == NULL) || - (old_ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST)))) { - return; - } + if (flagsp != NULL) + *flagsp = 0; /* - * Perform the same checks as when creating the - * IRE_INTERFACE in ipif_up_done_v6. + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. */ - if (!(ipif->ipif_flags & IPIF_UP)) - return; - - if ((ipif->ipif_flags & IPIF_NOXMIT)) - return; - - if (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask)) - return; - - /* - * We know that ipif uses some other source for its - * IRE_INTERFACE. 
Is it using the source of this - * old_ipif? - */ - ipif_ire = ipif_to_ire_v6(ipif); - if (ipif_ire == NULL) - return; - - if (old_ipif != NULL && - !IN6_ARE_ADDR_EQUAL(&old_ipif->ipif_v6lcl_addr, - &ipif_ire->ire_src_addr_v6)) { - ire_refrele(ipif_ire); - return; - } - - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ipif_recreate_interface_routes_v6: deleting IRE" - " for src %s\n", AF_INET6, &ipif_ire->ire_src_addr_v6); - } - - stq = ipif_ire->ire_stq; - - /* - * Can't use our source address. Select a different source address - * for the IRE_INTERFACE. We restrict interface route source - * address selection to ipif's assigned to the same link as the - * interface. - */ - if (ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - nipif = ipif_select_source_v6(ill, &ipif->ipif_v6subnet, - B_TRUE, IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); - } - if (nipif == NULL) { - /* Last resort - all ipif's have IPIF_NOLOCAL */ - nipif = ipif; - } else { - nipif_refheld = B_TRUE; + if (generation != NULL) { + *generation = ipst->ips_src_generation; } - ire = ire_create_v6( - &ipif->ipif_v6subnet, /* dest pref */ - &ipif->ipif_v6net_mask, /* mask */ - &nipif->ipif_v6src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ - ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - NULL, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire != NULL) { - ire_t *ret_ire; - int error; - - /* - * We don't need ipif_ire anymore. We need to delete - * before we add so that ire_add does not detect - * duplicates. - */ - ire_delete(ipif_ire); - ret_ire = ire; - error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(ret_ire == ire); - if (ret_ire != NULL) { - /* Held in ire_add */ - ire_refrele(ret_ire); - } + /* Was RTF_SETSRC set on the first IRE in the recursive lookup? 
*/ + if (setsrc != NULL && !IN6_IS_ADDR_UNSPECIFIED(setsrc)) { + *srcp = *setsrc; + return (0); } - /* - * Either we are falling through from above or could not - * allocate a replacement. - */ - ire_refrele(ipif_ire); - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - if (nipif_refheld) - ipif_refrele(nipif); -} - -/* - * This old_ipif is going away. - * - * Determine if any other ipif's are using our address as - * ipif_v6lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or - * IPIF_DEPRECATED). - * Find the IRE_INTERFACE for such ipif's and recreate them - * to use an different source address following the rules in - * ipif_up_done_v6. - */ -void -ipif_update_other_ipifs_v6(ipif_t *old_ipif) -{ - ipif_t *ipif; - ill_t *ill; - char buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_IPIF(old_ipif)); - - ill = old_ipif->ipif_ill; - - ip1dbg(("ipif_update_other_ipifs_v6(%s, %s)\n", - ill->ill_name, - inet_ntop(AF_INET6, &old_ipif->ipif_v6lcl_addr, - buf, sizeof (buf)))); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif != old_ipif) - ipif_recreate_interface_routes_v6(old_ipif, ipif); + ipif = ipif_select_source_v6(ill, dst, restrict_ill, src_prefs, zoneid, + B_TRUE, ¬ready); + if (ipif == NULL) { + if (notready) + return (ENETDOWN); + else + return (EADDRNOTAVAIL); } + *srcp = ipif->ipif_v6lcl_addr; + if (flagsp != NULL) + *flagsp = ipif->ipif_flags; + ipif_refrele(ipif); + return (0); } /* @@ -2744,11 +2207,10 @@ ipif_update_other_ipifs_v6(ipif_t *old_ipif) * the physical device. * q and mp represents an ioctl which will be queued waiting for * completion of the DLPI message exchange. - * MUST be called on an ill queue. Can not set conn_pending_ill for that - * reason thus the DL_PHYS_ADDR_ACK code does not assume ill_pending_q. + * MUST be called on an ill queue. * - * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. 
+ * Returns EINPROGRESS when mp has been consumed by queueing it. + * The ioctl will complete in ip_rput. */ int ill_dl_phys(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) @@ -2888,6 +2350,7 @@ bad: return (ENOMEM); } +/* Add room for tcp+ip headers */ uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20; /* @@ -2899,28 +2362,14 @@ uint_t ip_loopback_mtu_v6plus = IP_LOOPBACK_MTU + IPV6_HDR_LEN + 20; int ipif_up_done_v6(ipif_t *ipif) { - ire_t *ire_array[20]; - ire_t **irep = ire_array; - ire_t **irep1; ill_t *ill = ipif->ipif_ill; - queue_t *stq; - in6_addr_t v6addr; - in6_addr_t route_mask; - ipif_t *src_ipif = NULL; - ipif_t *tmp_ipif; - boolean_t flush_ire_cache = B_TRUE; int err; - char buf[INET6_ADDRSTRLEN]; - ire_t **ipif_saved_irep = NULL; - int ipif_saved_ire_cnt; - int cnt; - boolean_t src_ipif_held = B_FALSE; boolean_t loopback = B_FALSE; - boolean_t ip6_asp_table_held = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; ip1dbg(("ipif_up_done_v6(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done_v6", + ill_t *, ill, ipif_t *, ipif); /* Check if this is a loopback interface */ if (ipif->ipif_ill->ill_wq == NULL) @@ -2929,46 +2378,10 @@ ipif_up_done_v6(ipif_t *ipif) ASSERT(ipif->ipif_isv6); ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* - * If all other interfaces for this ill are down or DEPRECATED, - * or otherwise unsuitable for source address selection, remove - * any IRE_CACHE entries for this ill to make sure source - * address selection gets to take this new ipif into account. 
- * No need to hold ill_lock while traversing the ipif list since - * we are writer - */ - for (tmp_ipif = ill->ill_ipif; tmp_ipif; - tmp_ipif = tmp_ipif->ipif_next) { - if (((tmp_ipif->ipif_flags & - (IPIF_NOXMIT|IPIF_ANYCAST|IPIF_NOLOCAL|IPIF_DEPRECATED)) || - !(tmp_ipif->ipif_flags & IPIF_UP)) || - (tmp_ipif == ipif)) - continue; - /* first useable pre-existing interface */ - flush_ire_cache = B_FALSE; - break; - } - if (flush_ire_cache) - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, ill, ill); + if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { + nce_t *loop_nce = NULL; + uint16_t flags = (NCE_F_MYADDR | NCE_F_NONUD | NCE_F_AUTHORITY); - /* - * Figure out which way the send-to queue should go. Only - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER should show up here. - */ - switch (ill->ill_net_type) { - case IRE_IF_RESOLVER: - stq = ill->ill_rq; - break; - case IRE_IF_NORESOLVER: - case IRE_LOOPBACK: - stq = ill->ill_wq; - break; - default: - return (EINVAL); - } - - if (IS_LOOPBACK(ill)) { /* * lo0:1 and subsequent ipifs were marked IRE_LOCAL in * ipif_lookup_on_name(), but in the case of zones we can have @@ -2979,29 +2392,99 @@ ipif_up_done_v6(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOOPBACK; else ipif->ipif_ire_type = IRE_LOCAL; + if (ill->ill_net_type != IRE_LOOPBACK) + flags |= NCE_F_PUBLISH; + err = nce_lookup_then_add_v6(ill, NULL, + ill->ill_phys_addr_length, + &ipif->ipif_v6lcl_addr, flags, ND_REACHABLE, &loop_nce); + + /* A shared-IP zone sees EEXIST for lo0:N */ + if (err == 0 || err == EEXIST) { + ipif->ipif_added_nce = 1; + loop_nce->nce_ipif_cnt++; + nce_refrele(loop_nce); + err = 0; + } else { + ASSERT(loop_nce == NULL); + return (err); + } } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || - ((ipif->ipif_flags & IPIF_DEPRECATED) && - !(ipif->ipif_flags & IPIF_NOFAILOVER))) { + err = ipif_add_ires_v6(ipif, loopback); + if (err != 0) { /* - * Can't use our source address. 
Select a different - * source address for the IRE_INTERFACE and IRE_LOCAL + * See comments about return value from + * ipif_addr_availability_check() in ipif_add_ires_v6(). */ - if (ip6_asp_can_lookup(ipst)) { - ip6_asp_table_held = B_TRUE; - src_ipif = ipif_select_source_v6(ipif->ipif_ill, - &ipif->ipif_v6subnet, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, ipif->ipif_zoneid); + if (err != EADDRINUSE) { + ipif_ndp_down(ipif); + } else { + /* + * Make IPMP aware of the deleted ipif so that + * the needed ipmp cleanup (e.g., of ipif_bound_ill) + * can be completed. Note that we do not want to + * destroy the nce that was created on the ipmp_ill + * for the active copy of the duplicate address in + * use. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + err = EADDRNOTAVAIL; } - if (src_ipif == NULL) - src_ipif = ipif; /* Last resort */ - else - src_ipif_held = B_TRUE; - } else { - src_ipif = ipif; + return (err); + } + + if (ill->ill_ipif_up_count == 1 && !loopback) { + /* Recover any additional IREs entries for this ill */ + (void) ill_recover_saved_ire(ill); } + if (ill->ill_need_recover_multicast) { + /* + * Need to recover all multicast memberships in the driver. + * This had to be deferred until we had attached. + */ + ill_recover_multicast(ill); + } + + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + } + + /* Join the allhosts multicast address and the solicited node MC */ + ipif_multicast_up(ipif); + + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); + + if (ipif->ipif_addr_ready) + ipif_up_notify(ipif); + + return (0); +} + +/* + * Add the IREs associated with the ipif. + * Those MUST be explicitly removed in ipif_delete_ires_v6. 
+ */ +static int +ipif_add_ires_v6(ipif_t *ipif, boolean_t loopback) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *ire_array[20]; + ire_t **irep = ire_array; + ire_t **irep1; + in6_addr_t v6addr; + in6_addr_t route_mask; + int err; + char buf[INET6_ADDRSTRLEN]; + ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ + if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) && !(ipif->ipif_flags & IPIF_NOLOCAL)) { @@ -3024,45 +2507,38 @@ ipif_up_done_v6(ipif_t *ipif) err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); if (err != 0) { - ip0dbg(("ipif_up_done_v6: srcid_insert %d\n", err)); - if (src_ipif_held) - ipif_refrele(src_ipif); - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); + ip0dbg(("ipif_add_ires_v6: srcid_insert %d\n", err)); return (err); } /* * If the interface address is set, create the LOCAL * or LOOPBACK IRE. */ - ip1dbg(("ipif_up_done_v6: creating IRE %d for %s\n", + ip1dbg(("ipif_add_ires_v6: creating IRE %d for %s\n", ipif->ipif_ire_type, inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, buf, sizeof (buf)))); - *irep++ = ire_create_v6( + ire_local = ire_create_v6( &ipif->ipif_v6lcl_addr, /* dest address */ &ipv6_all_ones, /* mask */ - &src_ipif->ipif_v6src_addr, /* source address */ NULL, /* no gateway */ - &ip_loopback_mtu_v6plus, /* max frag size */ - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ - ipif, /* interface */ - NULL, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, /* interface */ + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); + if (ire_local == NULL) { + ip1dbg(("ipif_up_done_v6: NULL ire_local\n")); + err = ENOMEM; + goto bad; + } } /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. 
*/ - if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) { /* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */ @@ -3074,27 +2550,19 @@ ipif_up_done_v6(ipif_t *ipif) route_mask = ipif->ipif_v6net_mask; } - ip1dbg(("ipif_up_done_v6: creating if IRE %d for %s\n", + ip1dbg(("ipif_add_ires_v6: creating if IRE %d for %s\n", ill->ill_net_type, inet_ntop(AF_INET6, &v6addr, buf, sizeof (buf)))); *irep++ = ire_create_v6( &v6addr, /* dest pref */ &route_mask, /* mask */ - &src_ipif->ipif_v6src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ + &ipif->ipif_v6lcl_addr, /* gateway */ ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - NULL, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); } @@ -3103,15 +2571,13 @@ ipif_up_done_v6(ipif_t *ipif) for (irep1 = irep; irep1 > ire_array; ) { irep1--; if (*irep1 == NULL) { - ip1dbg(("ipif_up_done_v6: NULL ire found in" + ip1dbg(("ipif_add_ires_v6: NULL ire found in" " ire_array\n")); err = ENOMEM; goto bad; } } - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* * Need to atomically check for IP address availability under * ip_addr_avail_lock. ill_g_lock is held as reader to ensure no new @@ -3132,20 +2598,12 @@ ipif_up_done_v6(ipif_t *ipif) * the other ipif. So we don't want to delete it (otherwise the * other ipif would be unable to send packets). * ip_addr_availability_check() identifies this case for us and - * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL + * returns EADDRINUSE; Caller must turn it into EADDRNOTAVAIL * which is the expected error code. 
* - * Note that, for the non-XRESOLV case, ipif_ndp_down() will - * only delete the nce in the case when the nce_ipif_cnt drops - * to 0. + * Note that ipif_ndp_down() will only delete the nce in the + * case when the nce_ipif_cnt drops to 0. */ - if (err == EADDRINUSE) { - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) { - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - } - err = EADDRNOTAVAIL; - } ill->ill_ipif_up_count--; ipif->ipif_flags &= ~IPIF_UP; goto bad; @@ -3153,91 +2611,42 @@ ipif_up_done_v6(ipif_t *ipif) /* * Add in all newly created IREs. - * - * NOTE : We refrele the ire though we may branch to "bad" - * later on where we do ire_delete. This is okay - * because nobody can delete it as we are running - * exclusively. */ + if (ire_local != NULL) { + ire_local = ire_add(ire_local); +#ifdef DEBUG + if (ire_local != NULL) { + ire_refhold_notr(ire_local); + ire_refrele(ire_local); + } +#endif + } + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ire_local != NULL) + ipif->ipif_ire_local = ire_local; + rw_exit(&ipst->ips_ill_g_lock); + ire_local = NULL; + for (irep1 = irep; irep1 > ire_array; ) { irep1--; /* Shouldn't be adding any bcast ire's */ ASSERT((*irep1)->ire_type != IRE_BROADCAST); ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* - * refheld by ire_add. refele towards the end of the func - */ - (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); - } - if (ip6_asp_table_held) { - ip6_asp_table_refrele(ipst); - ip6_asp_table_held = B_FALSE; - } - - /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ - ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; - ipif_saved_irep = ipif_recover_ire_v6(ipif); - - if (ill->ill_need_recover_multicast) { - /* - * Need to recover all multicast memberships in the driver. - * This had to be deferred until we had attached. - */ - ill_recover_multicast(ill); - } - - if (ill->ill_ipif_up_count == 1) { - /* - * Since the interface is now up, it may now be active. 
- */ - if (IS_UNDER_IPMP(ill)) - ipmp_ill_refresh_active(ill); - } - - /* Join the allhosts multicast address and the solicited node MC */ - ipif_multicast_up(ipif); - - /* - * See if anybody else would benefit from our new ipif. - */ - if (!loopback && - !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - ill_update_source_selection(ill); - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; + /* refheld by ire_add */ + *irep1 = ire_add(*irep1); if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - cnt = ipif_saved_ire_cnt; - for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { - if (*irep1 != NULL) { - /* was held in ire_add */ ire_refrele(*irep1); + *irep1 = NULL; } } if (ipif->ipif_addr_ready) ipif_up_notify(ipif); - - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - - if (src_ipif_held) - ipif_refrele(src_ipif); - return (0); bad: - if (ip6_asp_table_held) - ip6_asp_table_refrele(ipst); - + if (ire_local != NULL) + ire_delete(ire_local); while (irep > ire_array) { irep--; if (*irep != NULL) @@ -3245,21 +2654,85 @@ bad: } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); + return (err); +} + +/* Remove all the IREs created by ipif_add_ires_v6 */ +void +ipif_delete_ires_v6(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t v6addr; + in6_addr_t route_mask; + ire_t *ire; + int match_args; + boolean_t loopback; + + /* Check if this is a loopback interface */ + loopback = (ipif->ipif_ill->ill_wq == NULL); + + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK | + MATCH_IRE_ZONEONLY; + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if ((ire = ipif->ipif_ire_local) != NULL) { + ipif->ipif_ire_local = NULL; + rw_exit(&ipst->ips_ill_g_lock); + /* + * Move count to ipif so 
we don't loose the count due to + * a down/up dance. + */ + atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); + + ire_delete(ire); + ire_refrele_notr(ire); + } else { + rw_exit(&ipst->ips_ill_g_lock); } - if (src_ipif_held) - ipif_refrele(src_ipif); - ipif_ndp_down(ipif); - ipif_resolver_down(ipif); + match_args |= MATCH_IRE_GW; - return (err); + /* + * Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. + * Note that atun interfaces have an all-zero ipif_v6subnet. + * Thus we allow a zero subnet as long as the mask is non-zero. + */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; + + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && + !(IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6subnet) && + IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6net_mask))) { + /* ipif_v6subnet is ipif_v6pp_dst_addr for pt-pt */ + v6addr = ipif->ipif_v6subnet; + + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + route_mask = ipv6_all_ones; + } else { + route_mask = ipif->ipif_v6net_mask; + } + + ire = ire_ftable_lookup_v6( + &v6addr, /* dest pref */ + &route_mask, /* mask */ + &ipif->ipif_v6lcl_addr, /* gateway */ + ill->ill_net_type, /* IF_[NO]RESOLVER */ + ipif->ipif_ill, + ipif->ipif_zoneid, + NULL, + match_args, + 0, + ipst, + NULL); + ASSERT(ire != NULL); + ire_delete(ire); + ire_refrele(ire); + } } /* - * Delete an ND entry and the corresponding IRE_CACHE entry if it exists. + * Delete an ND entry if it exists. 
*/ /* ARGSUSED */ int @@ -3267,11 +2740,10 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { sin6_t *sin6; - nce_t *nce; struct lifreq *lifr; lif_nd_req_t *lnr; ill_t *ill = ipif->ipif_ill; - ire_t *ire; + nce_t *nce; lifr = (struct lifreq *)mp->b_cont->b_cont->b_rptr; lnr = &lifr->lifr_nd; @@ -3289,29 +2761,27 @@ ip_siocdelndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, /* * Since ND mappings must be consistent across an IPMP group, prohibit - * deleting ND mappings on underlying interfaces. Also, since ND - * mappings for IPMP data addresses are owned by IP itself, prohibit - * deleting them. + * deleting ND mappings on underlying interfaces. + * Don't allow deletion of mappings for local addresses. */ if (IS_UNDER_IPMP(ill)) return (EPERM); - if (IS_IPMP(ill)) { - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, - ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, - ill->ill_ipst); - if (ire != NULL) { - ire_refrele(ire); - return (EPERM); - } - } - - /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ - nce = ndp_lookup_v6(ill, IS_IPMP(ill), &sin6->sin6_addr, B_FALSE); + nce = nce_lookup_v6(ill, &sin6->sin6_addr); if (nce == NULL) return (ESRCH); - ndp_delete(nce); - NCE_REFRELE(nce); + + if (NCE_MYADDR(nce->nce_common)) { + nce_refrele(nce); + return (EPERM); + } + + /* + * delete the nce_common which will also delete the nces on any + * under_ill in the case of ipmp. 
+ */ + ncec_delete(nce->nce_common); + nce_refrele(nce); return (0); } @@ -3383,9 +2853,9 @@ ip_siocsetndp_v6(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, return (EPERM); if (IS_IPMP(ill)) { - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, IRE_LOCAL, - ipif, ALL_ZONES, NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, - ill->ill_ipst); + ire = ire_ftable_lookup_v6(&sin6->sin6_addr, NULL, NULL, + IRE_LOCAL, ill, ALL_ZONES, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ILL, 0, ill->ill_ipst, NULL); if (ire != NULL) { ire_refrele(ire); return (EPERM); diff --git a/usr/src/uts/common/inet/ip/ip6_input.c b/usr/src/uts/common/inet/ip/ip6_input.c new file mode 100644 index 0000000000..cee5344bf6 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip6_input.c @@ -0,0 +1,2749 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/stropts.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strlog.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/modctl.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/priv.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/vtrace.h> +#include <sys/isa_defs.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/optcom.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <inet/ilb_ip.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <sys/ethernet.h> +#include <net/if_types.h> +#include <sys/cpuvar.h> + +#include <ipp/ipp.h> +#include <ipp/ipp_impl.h> +#include <ipp/ipgpc/ipgpc.h> + 
+#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#include <rpc/pmap_prot.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static void ip_input_local_v6(ire_t *, mblk_t *, ip6_t *, ip_recv_attr_t *); + +static void ip_input_multicast_v6(ire_t *, mblk_t *, ip6_t *, + ip_recv_attr_t *); + +#pragma inline(ip_input_common_v6, ip_input_local_v6, ip_forward_xmit_v6) + +/* + * Direct read side procedure capable of dealing with chains. GLDv3 based + * drivers call this function directly with mblk chains while STREAMS + * read side procedure ip_rput() calls this for single packet with ip_ring + * set to NULL to process one packet at a time. + * + * The ill will always be valid if this function is called directly from + * the driver. + * + * If ip_input_v6() is called from GLDv3: + * + * - This must be a non-VLAN IP stream. + * - 'mp' is either an untagged or a special priority-tagged packet. + * - Any VLAN tag that was in the MAC header has been stripped. + * + * If the IP header in packet is not 32-bit aligned, every message in the + * chain will be aligned before further operations. This is required on SPARC + * platform. + */ +void +ip_input_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip) +{ + (void) ip_input_common_v6(ill, ip_ring, mp_chain, mhip, NULL, NULL, + NULL); +} + +/* + * ip_accept_tcp_v6() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. 
+ * + * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v6 + * will pass back any TCP packets matching the target sqp to + * ip_input_common_v6 using ira_target_sqp_mp. Other packets are handled by + * ip_input_v6 and ip_fanout_v6 as normal. + * The TCP packets that match the target squeue are returned to the caller + * as a b_next chain after each packet has been prepend with an mblk + * from ip_recv_attr_to_mblk. + */ +mblk_t * +ip_accept_tcp_v6(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + return (ip_input_common_v6(ill, ip_ring, mp_chain, NULL, target_sqp, + last, cnt)); +} + +/* + * Used by ip_input_v6 and ip_accept_tcp_v6 + * The last three arguments are only used by ip_accept_tcp_v6, and mhip is + * only used by ip_input_v6. + */ +mblk_t * +ip_input_common_v6(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip, squeue_t *target_sqp, + mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ip6_t *ip6h; + ip_recv_attr_t iras; /* Receive attributes */ + rtc_t rtc; + iaflags_t chain_flags = 0; /* Fixed for chain */ + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + + ASSERT(mp_chain != NULL); + ASSERT(ill != NULL); + + /* These ones do not change as we loop over packets */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_sqp = NULL; + iras.ira_ring = ip_ring; + /* For ECMP and outbound transmit ring selection */ + iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); + + iras.ira_target_sqp = target_sqp; + iras.ira_target_sqp_mp = NULL; + if (target_sqp != NULL) + chain_flags |= IRAF_TARGET_SQP; + + /* + * We try to have a mhip pointer when possible, but + * it might be NULL in some cases. In those cases we + * have to assume unicast. 
+ */ + iras.ira_mhip = mhip; + iras.ira_flags = 0; + if (mhip != NULL) { + switch (mhip->mhi_dsttype) { + case MAC_ADDRTYPE_MULTICAST : + chain_flags |= IRAF_L2DST_MULTICAST; + break; + case MAC_ADDRTYPE_BROADCAST : + chain_flags |= IRAF_L2DST_BROADCAST; + break; + } + } + + /* + * Initialize the one-element route cache. + * + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_route_recursive for each of the succeeding + * packets in a packet chain. + */ + rtc.rtc_ire = NULL; + rtc.rtc_ip6addr = ipv6_all_zeros; + + /* Loop over b_next */ + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + mp_chain = mp->b_next; + mp->b_next = NULL; + + /* + * if db_ref > 1 then copymsg and free original. Packet + * may be changed and we do not want the other entity + * who has a reference to this message to trip over the + * changes. This is a blind change because trying to + * catch all places that might change the packet is too + * difficult. + * + * This corresponds to the fast path case, where we have + * a chain of M_DATA mblks. We check the db_ref count + * of only the 1st data block in the mblk chain. There + * doesn't seem to be a reason why a device driver would + * send up data with varying db_ref counts in the mblk + * chain. In any case the Fast path is a private + * interface, and our drivers don't do such a thing. + * Given the above assumption, there is no need to walk + * down the entire mblk chain (which could have a + * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. 
So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). + */ + if (DB_REF(mp) > 1) { + if ((mp = ip_fix_dbref(mp, &iras)) == NULL) + continue; + } + + /* + * IP header ptr not aligned? + * OR IP header not complete in first mblk + */ + ip6h = (ip6_t *)mp->b_rptr; + if (!OK_32PTR(ip6h) || MBLKL(mp) < IPV6_HDR_LEN) { + mp = ip_check_and_align_header(mp, IPV6_HDR_LEN, &iras); + if (mp == NULL) + continue; + ip6h = (ip6_t *)mp->b_rptr; + } + + /* Protect against a mix of Ethertypes and IP versions */ + if (IPH_HDR_VERSION(ip6h) != IPV6_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Check for Martian addrs; we have to explicitly + * test for for zero dst since this is also used as + * an indication that the rtc is not used. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_dst)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + /* + * Keep L2SRC from a previous packet in chain since mhip + * might point into an earlier packet in the chain. + */ + chain_flags |= (iras.ira_flags & IRAF_L2SRC_SET); + + iras.ira_flags = IRAF_VERIFY_ULP_CKSUM | chain_flags; + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ + + /* + * We must count all incoming packets, even if they end + * up being dropped later on. Defer counting bytes until + * we have the whole IP header in first mblk. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + iras.ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + iras.ira_pktlen); + + /* + * Call one of: + * ill_input_full_v6 + * ill_input_short_v6 + * The former is used in the case of TX. See ill_set_inputfn(). + */ + (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); + + /* Any references to clean up? No hold on ira_ill */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); + + if (iras.ira_target_sqp_mp != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + + /* Found one packet to accept */ + mp = iras.ira_target_sqp_mp; + iras.ira_target_sqp_mp = NULL; + ASSERT(ip_recv_attr_is_mblk(mp)); + + if (atail != NULL) + atail->b_next = mp; + else + ahead = mp; + atail = mp; + acnt++; + mp = NULL; + } + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + } + /* Any remaining references to the route cache? */ + if (rtc.rtc_ire != NULL) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); + ire_refrele(rtc.rtc_ire); + } + + if (ahead != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); +} + +/* + * This input function is used when + * - is_system_labeled() + * + * Note that for IPv6 CGTP filtering is handled only when receiving fragment + * headers, and RSVP uses router alert options, thus we don't need anything + * extra for them. 
+ */ +void +ill_input_full_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + in6_addr_t *nexthop = (in6_addr_t *)nexthop_arg; + ill_t *ill = ira->ira_ill; + + ASSERT(ira->ira_tsl == NULL); + + /* + * Attach any necessary label information to + * this packet + */ + if (is_system_labeled()) { + ira->ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. + */ + if (!tsol_get_pkt_label(mp, IPV6_VERSION, ira)) { + if (ip6opt_ls != 0) + ip0dbg(("tsol_get_pkt_label v6 failed\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* Note that ira_tsl can be NULL here. */ + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ip6h = (ip6_t *)mp->b_rptr; + } + ill_input_short_v6(mp, ip6h, nexthop, ira, rtc); +} + +/* + * Check for IPv6 addresses that should not appear on the wire + * as either source or destination. + * If we ever implement Stateless IPv6 Translators (SIIT) we'd have + * to revisit the IPv4-mapped part. + */ +static boolean_t +ip6_bad_address(in6_addr_t *addr, boolean_t is_src) +{ + if (IN6_IS_ADDR_V4MAPPED(addr)) { + ip1dbg(("ip_input_v6: pkt with IPv4-mapped addr")); + return (B_TRUE); + } + if (IN6_IS_ADDR_LOOPBACK(addr)) { + ip1dbg(("ip_input_v6: pkt with loopback addr")); + return (B_TRUE); + } + if (!is_src && IN6_IS_ADDR_UNSPECIFIED(addr)) { + /* + * having :: in the src is ok: it's used for DAD. + */ + ip1dbg(("ip_input_v6: pkt with unspecified addr")); + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Routing lookup for IPv6 link-locals. + * First we look on the inbound interface, then we check for IPMP and + * look on the upper interface. + * We update ira_ruifindex if we find the IRE on the upper interface. 
+ */ +static ire_t * +ire_linklocal(const in6_addr_t *nexthop, ill_t *ill, ip_recv_attr_t *ira, + boolean_t allocate, ip_stack_t *ipst) +{ + int match_flags = MATCH_IRE_SECATTR | MATCH_IRE_ILL; + ire_t *ire; + + ASSERT(IN6_IS_ADDR_LINKLOCAL(nexthop)); + ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl, + match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + if (!(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !IS_UNDER_IPMP(ill)) + return (ire); + + /* + * When we are using IMP we need to look for an IRE on both the + * under and upper interfaces since there are different + * link-local addresses for the under and upper. + */ + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ire); + + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + + ire_refrele(ire); + ire = ire_route_recursive_v6(nexthop, 0, ill, ALL_ZONES, ira->ira_tsl, + match_flags, allocate, ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + ill_refrele(ill); + return (ire); +} + +/* + * This is the tail-end of the full receive side packet handling. + * It can be used directly when the configuration is simple. + */ +void +ill_input_short_v6(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ire_t *ire; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t pkt_len; + ssize_t len; + ip6_t *ip6h = (ip6_t *)iph_arg; + in6_addr_t nexthop = *(in6_addr_t *)nexthop_arg; + ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; +#define rptr ((uchar_t *)ip6h) + + ASSERT(DB_TYPE(mp) == M_DATA); + + /* + * Check for source/dest being a bad address: loopback, any, or + * v4mapped. All of them start with a 64 bits of zero. 
+ */ + if (ip6h->ip6_src.s6_addr32[0] == 0 && + ip6h->ip6_src.s6_addr32[1] == 0) { + if (ip6_bad_address(&ip6h->ip6_src, B_TRUE)) { + ip1dbg(("ip_input_v6: pkt with bad src addr\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + if (ip6h->ip6_dst.s6_addr32[0] == 0 && + ip6h->ip6_dst.s6_addr32[1] == 0) { + if (ip6_bad_address(&ip6h->ip6_dst, B_FALSE)) { + ip1dbg(("ip_input_v6: pkt with bad dst addr\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + + len = mp->b_wptr - rptr; + pkt_len = ira->ira_pktlen; + + /* multiple mblk or too short */ + len -= pkt_len; + if (len != 0) { + mp = ip_check_length(mp, rptr, len, pkt_len, IPV6_HDR_LEN, ira); + if (mp == NULL) + return; + ip6h = (ip6_t *)mp->b_rptr; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 0); + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. 
+ */ + DTRACE_PROBE4(ip6__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_PHYSICAL_IN(ipst)) { + int ll_multicast = 0; + int error; + in6_addr_t orig_dst = ip6h->ip6_dst; + + if (ira->ira_flags & IRAF_L2DST_MULTICAST) + ll_multicast = HPE_MULTICAST; + else if (ira->ira_flags & IRAF_L2DST_BROADCAST) + ll_multicast = HPE_BROADCAST; + + FW_HOOKS6(ipst->ips_ip6_physical_in_event, + ipst->ips_ipv6firewall_physical_in, + ill, NULL, ip6h, mp, mp, ll_multicast, ipst, error); + + DTRACE_PROBE1(ip6__physical__in__end, mblk_t *, mp); + + if (mp == NULL) + return; + + /* The length could have changed */ + ip6h = (ip6_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + pkt_len = ira->ira_pktlen; + + /* + * In case the destination changed we override any previous + * change to nexthop. + */ + if (!IN6_ARE_ADDR_EQUAL(&orig_dst, &ip6h->ip6_dst)) + nexthop = ip6h->ip6_dst; + + if (IN6_IS_ADDR_UNSPECIFIED(&nexthop)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + + } + + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t dzone; + + /* + * On the inbound path the src zone will be unknown as + * this packet has come from the wire. + */ + dzone = ip_get_zoneid_v6(&nexthop, mp, ill, ira, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); + } + + if ((ip6h->ip6_vcf & IPV6_VERS_AND_FLOW_MASK) != + IPV6_DEFAULT_VERS_AND_FLOW) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInWrongIPVersion); + ip_drop_input("ipIfStatsInWrongIPVersion", mp, ill); + freemsg(mp); + return; + } + + /* + * For IPv6 we update ira_ip_hdr_length and ira_protocol as + * we parse the headers, starting with the hop-by-hop options header. 
+ */ + ira->ira_ip_hdr_length = IPV6_HDR_LEN; + if ((ira->ira_protocol = ip6h->ip6_nxt) == IPPROTO_HOPOPTS) { + ip6_hbh_t *hbhhdr; + uint_t ehdrlen; + uint8_t *optptr; + + if (pkt_len < IPV6_HDR_LEN + MIN_EHDR_LEN) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + MIN_EHDR_LEN, ira); + if (ip6h == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + hbhhdr = (ip6_hbh_t *)&ip6h[1]; + ehdrlen = 8 * (hbhhdr->ip6h_len + 1); + + if (pkt_len < IPV6_HDR_LEN + ehdrlen) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + hbhhdr = (ip6_hbh_t *)&ip6h[1]; + } + + /* + * Update ira_ip_hdr_length to skip the hop-by-hop header + * once we get to ip_fanout_v6 + */ + ira->ira_ip_hdr_length += ehdrlen; + ira->ira_protocol = hbhhdr->ip6h_nxt; + + optptr = (uint8_t *)&hbhhdr[1]; + switch (ip_process_options_v6(mp, ip6h, optptr, + ehdrlen - 2, IPPROTO_HOPOPTS, ira)) { + case -1: + /* + * Packet has been consumed and any + * needed ICMP messages sent. + */ + return; + case 0: + /* no action needed */ + break; + case 1: + /* + * Known router alert. Make use handle it as local + * by setting the nexthop to be the all-host multicast + * address, and skip multicast membership filter by + * marking as a router alert. 
+ */ + ira->ira_flags |= IRAF_ROUTER_ALERT; + nexthop = ipv6_all_hosts_mcast; + break; + } + } + + /* + * Here we check to see if we machine is setup as + * L3 loadbalancer and if the incoming packet is for a VIP + * + * Check the following: + * - there is at least a rule + * - protocol of the packet is supported + * + * We don't load balance IPv6 link-locals. + */ + if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ira->ira_protocol) && + !IN6_IS_ADDR_LINKLOCAL(&nexthop)) { + in6_addr_t lb_dst; + int lb_ret; + + /* For convenience, we just pull up the mblk. */ + if (mp->b_cont != NULL) { + if (pullupmsg(mp, -1) == 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", + mp, ill); + freemsg(mp); + return; + } + ip6h = (ip6_t *)mp->b_rptr; + } + lb_ret = ilb_check_v6(ilbs, ill, mp, ip6h, ira->ira_protocol, + (uint8_t *)ip6h + ira->ira_ip_hdr_length, &lb_dst); + if (lb_ret == ILB_DROPPED) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB_DROPPED", mp, ill); + freemsg(mp); + return; + } + if (lb_ret == ILB_BALANCED) { + /* Set the dst to that of the chosen server */ + nexthop = lb_dst; + DB_CKSUMFLAGS(mp) = 0; + } + } + + /* Can not use route cache with TX since the labels can differ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (IN6_IS_ADDR_MULTICAST(&nexthop)) { + ire = ire_multicast(ill); + } else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) { + ire = ire_linklocal(&nexthop, ill, ira, + (ill->ill_flags & ILLF_ROUTER), ipst); + } else { + /* Match destination and label */ + ire = ire_route_recursive_v6(&nexthop, 0, NULL, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, + ipst, NULL, NULL, NULL); + } + /* Update the route cache so we do the ire_refrele */ + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ip6addr = nexthop; + } else if (IN6_ARE_ADDR_EQUAL(&nexthop, &rtc->rtc_ip6addr)) { + /* 
Use the route cache */
		ASSERT(rtc->rtc_ire != NULL);
		ire = rtc->rtc_ire;
	} else {
		/* Update the route cache */
		if (IN6_IS_ADDR_MULTICAST(&nexthop)) {
			ire = ire_multicast(ill);
		} else if (IN6_IS_ADDR_LINKLOCAL(&nexthop)) {
			ire = ire_linklocal(&nexthop, ill, ira,
			    (ill->ill_flags & ILLF_ROUTER), ipst);
		} else {
			ire = ire_route_recursive_dstonly_v6(&nexthop,
			    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint,
			    ipst);
		}
		ASSERT(ire != NULL);
		if (rtc->rtc_ire != NULL)
			ire_refrele(rtc->rtc_ire);
		rtc->rtc_ire = ire;
		rtc->rtc_ip6addr = nexthop;
	}

	ire->ire_ib_pkt_count++;

	/*
	 * Based on ire_type and ire_flags call one of:
	 *	ire_recv_local_v6 - for IRE_LOCAL
	 *	ire_recv_loopback_v6 - for IRE_LOOPBACK
	 *	ire_recv_multirt_v6 - if RTF_MULTIRT
	 *	ire_recv_noroute_v6 - if RTF_REJECT or RTF_BLACKHOLE
	 *	ire_recv_multicast_v6 - for IRE_MULTICAST
	 *	ire_recv_noaccept_v6 - for ire_noaccept ones
	 *	ire_recv_forward_v6 - for the rest.
	 */

	(*ire->ire_recvfn)(ire, mp, ip6h, ira);
}
#undef rptr

/*
 * ire_recvfn for IREs that need forwarding i.e., the packet is not
 * destined to this node. Resolves the outgoing nce and hands the packet
 * to ip_forward_xmit_v6(). Consumes mp on all paths.
 */
void
ire_recv_forward_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;
	iaflags_t	iraflags = ira->ira_flags;
	ill_t		*dst_ill;
	nce_t		*nce;
	uint32_t	added_tx_len;
	uint32_t	mtu, iremtu;

	/* Never forward a packet that arrived as a link-layer multicast */
	if (iraflags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("l2 multicast not forwarded", mp, ill);
		freemsg(mp);
		return;
	}

	if (!(ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		return;
	}

	/*
	 * Either ire_nce_capable or ire_dep_parent would be set for the IRE
	 * when it is found by ire_route_recursive, but some other thread
	 * could have changed the routes with the effect of clearing
	 * ire_dep_parent. In that case we'd end up dropping the packet, or
	 * finding a new nce below.
	 * Get, allocate, or update the nce.
	 * We get a refhold on ire_nce_cache as a result of this to avoid races
	 * where ire_nce_cache is deleted.
	 *
	 * This ensures that we don't forward if the interface is down since
	 * ipif_down removes all the nces.
	 */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce == NULL) {
		/* Not yet set up - try to set one up */
		mutex_exit(&ire->ire_lock);
		(void) ire_revalidate_nce(ire);
		mutex_enter(&ire->ire_lock);
		nce = ire->ire_nce_cache;
		if (nce == NULL) {
			mutex_exit(&ire->ire_lock);
			/* The ire_dep_parent chain went bad, or no memory */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("No ire_dep_parent", mp, ill);
			freemsg(mp);
			return;
		}
	}
	nce_refhold(nce);
	mutex_exit(&ire->ire_lock);

	if (nce->nce_is_condemned) {
		nce_t *nce1;

		nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_FALSE);
		nce_refrele(nce);
		if (nce1 == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("No nce", mp, ill);
			freemsg(mp);
			return;
		}
		nce = nce1;
	}
	dst_ill = nce->nce_ill;

	/*
	 * Unless we are forwarding, drop the packet.
	 * Unlike IPv4 we don't allow source routed packets out the same
	 * interface when we are not a router.
	 * Note that ill_forward_set() will set the ILLF_ROUTER on
	 * all the group members when it gets an ipmp-ill or under-ill.
	 */
	if (!(dst_ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		nce_refrele(nce);
		return;
	}

	if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) {
		ire->ire_ib_pkt_count--;
		/*
		 * Should only use IREs that are visible from the
		 * global zone for forwarding.
		 * For IPv6 any source route would have already been
		 * advanced in ip_fanout_v6
		 */
		ire = ire_route_recursive_v6(&ip6h->ip6_dst, 0, NULL,
		    GLOBAL_ZONEID, ira->ira_tsl, MATCH_IRE_SECATTR,
		    (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst,
		    NULL, NULL, NULL);
		ire->ire_ib_pkt_count++;
		(*ire->ire_recvfn)(ire, mp, ip6h, ira);
		ire_refrele(ire);
		nce_refrele(nce);
		return;
	}
	/*
	 * ipIfStatsHCInForwDatagrams should only be incremented if there
	 * will be an attempt to forward the packet, which is why we
	 * increment after the above condition has been checked.
	 */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

	/* Initiate Read side IPPF processing */
	if (IPP_ENABLED(IPP_FWD_IN, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_FWD_IN, mp, ill, ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred "
			    "during IPPF processing\n"));
			nce_refrele(nce);
			return;
		}
	}

	DTRACE_PROBE4(ip6__forwarding__start,
	    ill_t *, ill, ill_t *, dst_ill, ip6_t *, ip6h, mblk_t *, mp);

	if (HOOKS6_INTERESTED_FORWARDING(ipst)) {
		int error;

		FW_HOOKS(ipst->ips_ip6_forwarding_event,
		    ipst->ips_ipv6firewall_forwarding,
		    ill, dst_ill, ip6h, mp, mp, 0, ipst, error);

		DTRACE_PROBE1(ip6__forwarding__end, mblk_t *, mp);

		if (mp == NULL) {
			nce_refrele(nce);
			return;
		}
		/*
		 * Even if the destination was changed by the filter we use the
		 * forwarding decision that was made based on the address
		 * in ip_input.
		 */

		/* Might have changed */
		ip6h = (ip6_t *)mp->b_rptr;
		ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	}

	/* Packet is being forwarded. Turning off hwcksum flag. */
	DB_CKSUMFLAGS(mp) = 0;

	/*
	 * Per RFC 3513 section 2.5.2, we must not forward packets with
	 * an unspecified source address.
	 * The loopback address check for both src and dst has already
	 * been checked in ip_input_v6
	 * In the future one can envision adding RPF checks using number 3.
	 */
	switch (ipst->ips_src_check) {
	case 0:
		break;
	case 1:
	case 2:
		if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) ||
		    IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
			nce_refrele(nce);
			freemsg(mp);
			return;
		}
		break;
	}

	/*
	 * Check to see if we're forwarding the packet to a
	 * different link from which it came.  If so, check the
	 * source and destination addresses since routers must not
	 * forward any packets with link-local source or
	 * destination addresses to other links.  Otherwise (if
	 * we're forwarding onto the same link), conditionally send
	 * a redirect message.
	 */
	if (!IS_ON_SAME_LAN(dst_ill, ill)) {
		if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst) ||
		    IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors);
			ip_drop_input("ipIfStatsInAddrErrors", mp, ill);
			freemsg(mp);
			nce_refrele(nce);
			return;
		}
		/* TBD add site-local check at site boundary? */
	} else if (ipst->ips_ipv6_send_redirects) {
		ip_send_potential_redirect_v6(mp, ip6h, ire, ira);
	}

	added_tx_len = 0;
	if (iraflags & IRAF_SYSTEM_LABELED) {
		mblk_t		*mp1;
		uint32_t	old_pkt_len = ira->ira_pktlen;

		/*
		 * Check if it can be forwarded and add/remove
		 * CIPSO options as needed.
		 */
		if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			ip_drop_input("tsol_ip_forward", mp, ill);
			freemsg(mp);
			nce_refrele(nce);
			return;
		}
		/*
		 * Size may have changed. Remember amount added in case
		 * ip_fragment needs to send an ICMP too big.
		 */
		mp = mp1;
		ip6h = (ip6_t *)mp->b_rptr;
		ira->ira_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
		ira->ira_ip_hdr_length = IPV6_HDR_LEN;
		if (ira->ira_pktlen > old_pkt_len)
			added_tx_len = ira->ira_pktlen - old_pkt_len;
	}

	/* The effective MTU is the min of the interface and route MTU */
	mtu = dst_ill->ill_mtu;
	if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu)
		mtu = iremtu;
	ip_forward_xmit_v6(nce, mp, ip6h, ira, mtu, added_tx_len);
	nce_refrele(nce);
	return;

}

/*
 * Used for sending out unicast and multicast packets that are
 * forwarded. Handles hop limit decrement, outbound IPPF, and the
 * path-MTU check before transmitting via the nce. Consumes mp.
 */
void
ip_forward_xmit_v6(nce_t *nce, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira,
    uint32_t mtu, uint32_t added_tx_len)
{
	ill_t		*dst_ill = nce->nce_ill;
	uint32_t	pkt_len;
	iaflags_t	iraflags = ira->ira_flags;
	ip_stack_t	*ipst = dst_ill->ill_ipst;

	/* Decrement and check the hop limit; expired packets elicit ICMPv6 */
	if (ip6h->ip6_hops-- <= 1) {
		BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ICMP6_TIME_EXCEED_TRANSIT", mp, ira->ira_ill);
		icmp_time_exceeded_v6(mp, ICMP6_TIME_EXCEED_TRANSIT, B_FALSE,
		    ira);
		return;
	}

	/* Initiate Write side IPPF processing before any fragmentation */
	if (IPP_ENABLED(IPP_FWD_OUT, ipst)) {
		/* ip_process translates an IS_UNDER_IPMP */
		mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill);
		if (mp == NULL) {
			/* ip_drop_packet and MIB done */
			ip2dbg(("ire_recv_forward_v6: pkt dropped/deferred" \
			    " during IPPF processing\n"));
			return;
		}
	}

	pkt_len = ira->ira_pktlen;

	BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams);

	if (pkt_len > mtu) {
		BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails);
		ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill);
		if (iraflags & IRAF_SYSTEM_LABELED) {
			/*
			 * Remove any CIPSO option added by
			 * tsol_ip_forward, and make sure we report
			 * a path MTU so that there
			 * is room to add such a CIPSO option for future
			 * packets.
			 */
			mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len,
			    AF_INET6);
		}
		icmp_pkt2big_v6(mp, mtu, B_TRUE, ira);
		return;
	}

	ASSERT(pkt_len ==
	    ntohs(((ip6_t *)mp->b_rptr)->ip6_plen) + IPV6_HDR_LEN);

	if (iraflags & IRAF_LOOPBACK_COPY) {
		/*
		 * IXAF_NO_LOOP_ZONEID is not set hence 6th arg
		 * is don't care
		 */
		(void) ip_postfrag_loopcheck(mp, nce,
		    (IXAF_LOOPBACK_COPY | IXAF_NO_DEV_FLOW_CTL),
		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
	} else {
		(void) ip_xmit(mp, nce, IXAF_NO_DEV_FLOW_CTL,
		    pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL);
	}
}

/*
 * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE,
 * which is what ire_route_recursive returns when there is no matching ire.
 * Send ICMP unreachable unless blackhole. Consumes mp.
 */
void
ire_recv_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ip_stack_t	*ipst = ill->ill_ipst;

	/* Would we have forwarded this packet if we had a route? */
	if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("l2 multicast not forwarded", mp, ill);
		freemsg(mp);
		return;
	}

	if (!(ill->ill_flags & ILLF_ROUTER)) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
		ip_drop_input("ipIfStatsForwProhibits", mp, ill);
		freemsg(mp);
		return;
	}
	/*
	 * If we had a route this could have been forwarded. Count as such.
	 *
	 * ipIfStatsHCInForwDatagrams should only be incremented if there
	 * will be an attempt to forward the packet, which is why we
	 * increment after the above condition has been checked.
	 */
	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams);

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);

	/* Let routing sockets / listeners know about the routing miss */
	ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, RTA_DST,
	    ipst);

	if (ire->ire_flags & RTF_BLACKHOLE) {
		/* Blackhole routes drop silently - no ICMP error */
		ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill);
		freemsg(mp);
	} else {
		ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill);

		icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE,
		    ira);
	}
}

/*
 * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for
 * VRRP when in noaccept mode.
 * We silently drop packets except for Neighbor Solicitations and
 * Neighbor Advertisements.
 */
void
ire_recv_noaccept_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	icmp6_t		*icmp6;
	int		ip_hdr_length;

	/* Anything other than ICMPv6 is dropped outright */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
		freemsg(mp);
		return;
	}
	ip_hdr_length = ira->ira_ip_hdr_length;
	if ((mp->b_wptr - mp->b_rptr) < (ip_hdr_length + ICMP6_MINLEN)) {
		if (ira->ira_pktlen < (ip_hdr_length + ICMP6_MINLEN)) {
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts);
			ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill);
			freemsg(mp);
			return;
		}
		/* Make the ICMPv6 header contiguous before inspecting it */
		ip6h = ip_pullup(mp, ip_hdr_length + ICMP6_MINLEN, ira);
		if (ip6h == NULL) {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
			freemsg(mp);
			return;
		}
	}
	icmp6 = (icmp6_t *)(&mp->b_rptr[ip_hdr_length]);

	if (icmp6->icmp6_type != ND_NEIGHBOR_SOLICIT &&
	    icmp6->icmp6_type != ND_NEIGHBOR_ADVERT) {
		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
		ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill);
		freemsg(mp);
		return;
	}
	ire_recv_local_v6(ire, mp, ip6h, ira);
}

/*
 * ire_recvfn for IRE_MULTICAST.
 */
void
ire_recv_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg,
    ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;

	ASSERT(ire->ire_ill == ira->ira_ill);

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts);
	UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen);

	/* Tag for higher-level protocols */
	ira->ira_flags |= IRAF_MULTICAST;

	/*
	 * So that we don't end up with dups, only one ill in an IPMP group is
	 * nominated to receive multicast traffic.
	 * If we have no cast_ill we are liberal and accept everything.
	 */
	if (IS_UNDER_IPMP(ill)) {
		ip_stack_t	*ipst = ill->ill_ipst;

		/* For an under ill_grp can change under lock */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
		    ill->ill_grp->ig_cast_ill != NULL) {
			rw_exit(&ipst->ips_ill_g_lock);
			ip_drop_input("not on cast ill", mp, ill);
			freemsg(mp);
			return;
		}
		rw_exit(&ipst->ips_ill_g_lock);
		/*
		 * We switch to the upper ill so that mrouter and hasmembers
		 * can operate on upper here and in ip_input_multicast.
		 */
		ill = ipmp_ill_hold_ipmp_ill(ill);
		if (ill != NULL) {
			ASSERT(ill != ira->ira_ill);
			ASSERT(ire->ire_ill == ira->ira_ill);
			ira->ira_ill = ill;
			ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;
		} else {
			ill = ira->ira_ill;
		}
	}

#ifdef notdef
	/*
	 * Check if we are a multicast router - send ip_mforward a copy of
	 * the packet.
	 * Due to mroute_decap tunnels we consider forwarding packets even if
	 * mrouted has not joined the allmulti group on this interface.
	 */
	if (ipst->ips_ip_g_mrouter) {
		int retval;

		/*
		 * Clear the indication that this may have hardware
		 * checksum as we are not using it for forwarding.
		 */
		DB_CKSUMFLAGS(mp) = 0;

		/*
		 * ip_mforward helps us make these distinctions: If received
		 * on tunnel and not IGMP, then drop.
		 * If IGMP packet, then don't check membership
		 * If received on a phyint and IGMP or PIM, then
		 * don't check membership
		 */
		retval = ip_mforward_v6(mp, ira);
		/* ip_mforward updates mib variables if needed */

		switch (retval) {
		case 0:
			/*
			 * pkt is okay and arrived on phyint.
			 */
			break;
		case -1:
			/* pkt is mal-formed, toss it */
			freemsg(mp);
			goto done;
		case 1:
			/*
			 * pkt is okay and arrived on a tunnel
			 *
			 * If we are running a multicast router
			 * we need to see all mld packets, which
			 * are marked with router alerts.
			 */
			if (ira->ira_flags & IRAF_ROUTER_ALERT)
				goto forus;
			ip_drop_input("Multicast on tunnel ignored", mp, ill);
			freemsg(mp);
			goto done;
		}
	}
#endif /* notdef */

	/*
	 * If this was a router alert we skip the group membership check.
	 */
	if (ira->ira_flags & IRAF_ROUTER_ALERT)
		goto forus;

	/*
	 * Check if we have members on this ill. This is not necessary for
	 * correctness because even if the NIC/GLD had a leaky filter, we
	 * filter before passing to each conn_t.
	 */
	if (!ill_hasmembers_v6(ill, &ip6h->ip6_dst)) {
		/*
		 * Nobody interested
		 *
		 * This might just be caused by the fact that
		 * multiple IP Multicast addresses map to the same
		 * link layer multicast - no need to increment counter!
		 */
		ip_drop_input("Multicast with no members", mp, ill);
		freemsg(mp);
		goto done;
	}
forus:
	ip2dbg(("ire_recv_multicast_v6: multicast for us\n"));

	/*
	 * After reassembly and IPsec we will need to duplicate the
	 * multicast packet for all matching zones on the ill.
	 */
	ira->ira_zoneid = ALL_ZONES;

	/* Reassemble on the ill on which the packet arrived */
	ip_input_local_v6(ire, mp, ip6h, ira);
done:
	/* Undo the switch to the upper IPMP ill, if we made one above */
	if (ill != ire->ire_ill) {
		ill_refrele(ill);
		ira->ira_ill = ire->ire_ill;
		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
	}
}

/*
 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
 * Drop packets since we don't forward out multirt routes.
 */
/* ARGSUSED */
void
ire_recv_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
	ip_drop_input("Not forwarding out MULTIRT", mp, ill);
	freemsg(mp);
}

/*
 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
 * has rewritten the packet to have a loopback destination address (We
 * filter out packets with a loopback destination from arriving over the wire).
 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
 */
void
ire_recv_loopback_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ill_t		*ire_ill = ire->ire_ill;

	ira->ira_zoneid = GLOBAL_ZONEID;

	/* Switch to the lo0 ill for further processing  */
	if (ire_ill != ill) {
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted.
		 * No need to hold the ill since we have a hold on the ire
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = ire_ill;

		ip_input_local_v6(ire, mp, ip6h, ira);

		/* Restore */
		ASSERT(ira->ira_ill == ire_ill);
		ira->ira_ill = ill;
		return;

	}
	ip_input_local_v6(ire, mp, ip6h, ira);
}

/*
 * ire_recvfn for IRE_LOCAL i.e., the packet is destined to an address
 * configured on this node.
 */
void
ire_recv_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ip6_t		*ip6h = (ip6_t *)iph_arg;
	ill_t		*ill = ira->ira_ill;
	ill_t		*ire_ill = ire->ire_ill;

	/* Make a note for DAD that this address is in use */
	ire->ire_last_used_time = lbolt;

	/* Only target the IRE_LOCAL with the right zoneid. */
	ira->ira_zoneid = ire->ire_zoneid;

	/*
	 * If the packet arrived on the wrong ill, we check that
	 * this is ok.
	 * If it is, then we ensure that we do the reassembly on
	 * the ill on which the address is hosted. We keep ira_rill as
	 * the one on which the packet arrived, so that IP_PKTINFO and
	 * friends can report this.
	 */
	if (ire_ill != ill) {
		ire_t *new_ire;

		new_ire = ip_check_multihome(&ip6h->ip6_dst, ire, ill);
		if (new_ire == NULL) {
			/* Drop packet */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits);
			ip_drop_input("ipIfStatsInForwProhibits", mp, ill);
			freemsg(mp);
			return;
		}
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted. No need to hold the ill since we have a
		 * hold on the ire. Note that we do the switch even if
		 * new_ire == ire (for IPMP, ire would be the one corresponding
		 * to the IPMP ill).
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = new_ire->ire_ill;

		/* ira_ruifindex tracks the upper for ira_rill */
		if (IS_UNDER_IPMP(ill))
			ira->ira_ruifindex = ill_get_upper_ifindex(ill);

		ip_input_local_v6(new_ire, mp, ip6h, ira);

		/* Restore */
		ASSERT(ira->ira_ill == new_ire->ire_ill);
		ira->ira_ill = ill;
		ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex;

		if (new_ire != ire)
			ire_refrele(new_ire);
		return;
	}

	ip_input_local_v6(ire, mp, ip6h, ira);
}

/*
 * Common function for packets arriving for the host. Handles
 * checksum verification, reassembly checks, etc.
 */
static void
ip_input_local_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
{
	iaflags_t	iraflags = ira->ira_flags;

	/*
	 * For multicast we need some extra work before
	 * we call ip_fanout_v6(), since in the case of shared-IP zones
	 * we need to pretend that a packet arrived for each zoneid.
	 */
	if (iraflags & IRAF_MULTICAST) {
		ip_input_multicast_v6(ire, mp, ip6h, ira);
		return;
	}
	ip_fanout_v6(mp, ip6h, ira);
}

/*
 * Handle multiple zones which want to receive the same multicast packets
 * on this ill by delivering a packet to each of them.
 *
 * Note that for packets delivered to transports we could instead do this
 * as part of the fanout code, but since we need to handle icmp_inbound
 * it is simpler to have multicast work the same as IPv4 broadcast.
 *
 * The ip_fanout matching for multicast matches based on ilm independent of
 * zoneid since the zoneid restriction is applied when joining a multicast
 * group.
 */
/* ARGSUSED */
static void
ip_input_multicast_v6(ire_t *ire, mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira)
{
	ill_t		*ill = ira->ira_ill;
	iaflags_t	iraflags = ira->ira_flags;
	ip_stack_t	*ipst = ill->ill_ipst;
	netstack_t	*ns = ipst->ips_netstack;
	zoneid_t	zoneid;
	mblk_t		*mp1;
	ip6_t		*ip6h1;

	/* ire_recv_multicast has switched to the upper ill for IPMP */
	ASSERT(!IS_UNDER_IPMP(ill));

	/*
	 * If we don't have more than one shared-IP zone, or if
	 * there are no members in anything but the global zone,
	 * then just set the zoneid and proceed.
	 */
	if (ns->netstack_numzones == 1 ||
	    !ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst,
	    GLOBAL_ZONEID)) {
		ira->ira_zoneid = GLOBAL_ZONEID;

		/* If sender didn't want this zone to receive it, drop */
		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
		    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
			ip_drop_input("Multicast but wrong zoneid", mp, ill);
			freemsg(mp);
			return;
		}
		ip_fanout_v6(mp, ip6h, ira);
		return;
	}

	/*
	 * Here we loop over all zoneids that have members in the group
	 * and deliver a packet to ip_fanout for each zoneid.
	 *
	 * First find any members in the lowest numeric zoneid by looking for
	 * first zoneid larger than -1 (ALL_ZONES).
	 * We terminate the loop when we receive -1 (ALL_ZONES).
	 */
	zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, ALL_ZONES);
	for (; zoneid != ALL_ZONES;
	    zoneid = ill_hasmembers_nextzone_v6(ill, &ip6h->ip6_dst, zoneid)) {
		/*
		 * Avoid an extra copymsg/freemsg by skipping global zone here
		 * and doing that at the end.
		 */
		if (zoneid == GLOBAL_ZONEID)
			continue;

		ira->ira_zoneid = zoneid;

		/* If sender didn't want this zone to receive it, skip */
		if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
		    ira->ira_no_loop_zoneid == ira->ira_zoneid)
			continue;

		/* Each non-global zone gets its own copy of the packet */
		mp1 = copymsg(mp);
		if (mp1 == NULL) {
			/* Failed to deliver to one zone */
			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
			ip_drop_input("ipIfStatsInDiscards", mp, ill);
			continue;
		}
		ip6h1 = (ip6_t *)mp1->b_rptr;
		ip_fanout_v6(mp1, ip6h1, ira);
	}

	/* Do the main ire */
	ira->ira_zoneid = GLOBAL_ZONEID;
	/* If sender didn't want this zone to receive it, drop */
	if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) &&
	    ira->ira_no_loop_zoneid == ira->ira_zoneid) {
		ip_drop_input("Multicast but wrong zoneid", mp, ill);
		freemsg(mp);
	} else {
		ip_fanout_v6(mp, ip6h, ira);
	}
}


/*
 * Determine the zoneid and IRAF_TX_MAC_EXEMPTABLE if trusted extensions
 * is in use. Updates ira_zoneid and ira_flags as a result.
 */
static void
ip_fanout_tx_v6(mblk_t *mp, ip6_t *ip6h, uint8_t protocol, uint_t ip_hdr_length,
    ip_recv_attr_t *ira)
{
	uint16_t	*up;
	uint16_t	lport;
	zoneid_t	zoneid;

	ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED);

	/*
	 * If the packet is unlabeled we might allow read-down
	 * for MAC_EXEMPT. Below we clear this if it is a multi-level
	 * port (MLP).
	 * Note that ira_tsl can be NULL here.
	 */
	if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED)
		ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE;

	if (ira->ira_zoneid != ALL_ZONES)
		return;

	ira->ira_flags |= IRAF_TX_SHARED_ADDR;

	up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length);
	switch (protocol) {
	case IPPROTO_TCP:
	case IPPROTO_SCTP:
	case IPPROTO_UDP:
		/* Caller ensures this */
		ASSERT(((uchar_t *)ip6h) + ip_hdr_length +4 <= mp->b_wptr);

		/*
		 * Only these transports support MLP.
		 * We know their destination port number is in
		 * the same place in the header.
		 */
		lport = up[1];

		/*
		 * No need to handle exclusive-stack zones
		 * since ALL_ZONES only applies to the shared IP instance.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES.  In that case, we assume it's SLP, and
		 * search for the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a
		 * connection in it.  Otherwise, we look for a
		 * MAC-exempt connection in any zone whose label
		 * dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_attr_to_zoneid(ira);
		else
			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
		break;
	default:
		/* Handle shared address for other protocols */
		zoneid = tsol_attr_to_zoneid(ira);
		break;
	}
	ira->ira_zoneid = zoneid;
}

/*
 * Increment checksum failure statistics for the given ULP protocol,
 * distinguishing full/partial hardware offload from software checksums.
 */
static void
ip_input_cksum_err_v6(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	switch (protocol) {
	case IPPROTO_TCP:
		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);

		if (hck_flags & HCK_FULLCKSUM)
			IP6_STAT(ipst, ip6_tcp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP6_STAT(ipst, ip6_tcp_in_part_hw_cksum_err);
		else
			IP6_STAT(ipst, ip6_tcp_in_sw_cksum_err);
		break;
	case IPPROTO_UDP:
		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
		if (hck_flags & HCK_FULLCKSUM)
			IP6_STAT(ipst, ip6_udp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP6_STAT(ipst, ip6_udp_in_part_hw_cksum_err);
		else
			IP6_STAT(ipst, ip6_udp_in_sw_cksum_err);
		break;
	case IPPROTO_ICMPV6:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs);
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		break;
	default:
		ASSERT(0);
		break;
	}
}

/* Calculate the IPv6 pseudo-header checksum for TCP, UDP, and ICMPV6 */
uint32_t
ip_input_cksum_pseudo_v6(ip6_t *ip6h, ip_recv_attr_t *ira)
{
	uint_t		ulp_len;
	uint32_t	cksum;
	uint8_t		protocol = ira->ira_protocol;
	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;

/* View the IPv6 header as an array of 16-bit words for summing */
#define	iphs	((uint16_t *)ip6h)

	switch (protocol) {
	case IPPROTO_TCP:
		ulp_len = ira->ira_pktlen - ip_hdr_length;

		/* Protocol and length */
		cksum = htons(ulp_len) + IP_TCP_CSUM_COMP;
		/* IP addresses (words 4-19 are src and dst) */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;

	case IPPROTO_UDP: {
		udpha_t	*udpha;

		udpha = (udpha_t *)((uchar_t *)ip6h + ip_hdr_length);

		/* Protocol and length (UDP carries its own length field) */
		cksum = udpha->uha_length + IP_UDP_CSUM_COMP;
		/* IP addresses */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;
	}
	case IPPROTO_ICMPV6:
		ulp_len = ira->ira_pktlen - ip_hdr_length;

		/* Protocol and length */
		cksum = htons(ulp_len) + IP_ICMPV6_CSUM_COMP;
		/* IP addresses */
		cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] +
		    iphs[8] + iphs[9] + iphs[10] + iphs[11] +
		    iphs[12] + iphs[13] + iphs[14] + iphs[15] +
		    iphs[16] + iphs[17] + iphs[18] + iphs[19];
		break;
	default:
		cksum = 0;
		break;
	}
#undef	iphs
	return (cksum);
}


/*
 * Software verification of the ULP checksums.
 * Returns B_TRUE if ok.
 * Increments statistics if failed.
+ */ +static boolean_t +ip_input_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) +{ + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + uint32_t cksum; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + IP6_STAT(ipst, ip6_in_sw_cksum); + + ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP || + protocol == IPPROTO_ICMPV6); + + cksum = ip_input_cksum_pseudo_v6(ip6h, ira); + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + if (cksum == 0) + return (B_TRUE); + + ip_input_cksum_err_v6(protocol, 0, ira->ira_ill); + return (B_FALSE); +} + +/* + * Verify the ULP checksums. + * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum + * algorithm. + * Increments statistics if failed. + */ +static boolean_t +ip_input_cksum_v6(iaflags_t iraflags, mblk_t *mp, ip6_t *ip6h, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_rill; + uint16_t hck_flags; + uint32_t cksum; + mblk_t *mp1; + uint_t len; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_ICMPV6: + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ip6h + ip_hdr_length); + /* + * Before going through the regular checksum + * calculation, make sure the received checksum + * is non-zero. RFC 2460 says, a 0x0000 checksum + * in a UDP packet (within IPv6 packet) is invalid + * and should be replaced by 0xffff. This makes + * sense as regular checksum calculation will + * pass for both the cases i.e. 0x0000 and 0xffff. + * Removing one of the case makes error detection + * stronger. 
+ */ + if (udpha->uha_checksum == 0) { + /* 0x0000 checksum is invalid */ + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs); + return (B_FALSE); + } + break; + } + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t pktsum; + + sctph = (sctp_hdr_t *)((uchar_t *)ip6h + ip_hdr_length); +#ifdef DEBUG + if (skip_sctp_cksum) + return (B_TRUE); +#endif + pktsum = sctph->sh_chksum; + sctph->sh_chksum = 0; + cksum = sctp_cksum(mp, ip_hdr_length); + sctph->sh_chksum = pktsum; + if (cksum == pktsum) + return (B_TRUE); + + /* + * Defer until later whether a bad checksum is ok + * in order to allow RAW sockets to use Adler checksum + * with SCTP. + */ + ira->ira_flags |= IRAF_SCTP_CSUM_ERR; + return (B_TRUE); + } + + default: + /* No ULP checksum to verify. */ + return (B_TRUE); + } + + /* + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload. + * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. + * Note: IRAF_NO_HW_CKSUM is not currently used. + */ + ASSERT(!IS_IPMP(ill)); + if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_input_sw_cksum_v6(mp, ip6h, ira)); + } + + /* + * We apply this for all ULP protocols. Does the HW know to + * not set the flags for SCTP and other protocols. + */ + + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HCK_FULLCKSUM) { + /* + * Full checksum has been computed by the hardware + * and has been attached. If the driver wants us to + * verify the correctness of the attached value, in + * order to protect against faulty hardware, compare + * it against -0 (0xFFFF) to see if it's valid. 
+ */ + if (hck_flags & HCK_FULLCKSUM_OK) + return (B_TRUE); + + cksum = DB_CKSUM16(mp); + if (cksum == 0xFFFF) + return (B_TRUE); + ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + + mp1 = mp->b_cont; + if ((hck_flags & HCK_PARTIALCKSUM) && + (mp1 == NULL || mp1->b_cont == NULL) && + ip_hdr_length >= DB_CKSUMSTART(mp) && + ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { + uint32_t adj; + uchar_t *cksum_start; + + cksum = ip_input_cksum_pseudo_v6(ip6h, ira); + + cksum_start = ((uchar_t *)ip6h + DB_CKSUMSTART(mp)); + + /* + * Partial checksum has been calculated by hardware + * and attached to the packet; in addition, any + * prepended extraneous data is even byte aligned, + * and there are at most two mblks associated with + * the packet. If any such data exists, we adjust + * the checksum; also take care any postpended data. + */ + IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); + /* + * One's complement subtract extraneous checksum + */ + cksum += DB_CKSUM16(mp); + if (adj >= cksum) + cksum = ~(adj - cksum) & 0xFFFF; + else + cksum -= adj; + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + if (!(~cksum & 0xFFFF)) + return (B_TRUE); + + ip_input_cksum_err_v6(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + return (ip_input_sw_cksum_v6(mp, ip6h, ira)); +} + + +/* + * Handle fanout of received packets. + * Unicast packets that are looped back (from ire_send_local_v6) and packets + * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. + * + * IPQoS Notes + * Before sending it to the client, invoke IPPF processing. Policy processing + * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. 
+ */ +void +ip_fanout_v6(mblk_t *mp, ip6_t *ip6h, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + uint8_t protocol; + conn_t *connp; +#define rptr ((uchar_t *)ip6h) + uint_t ip_hdr_length; + uint_t min_ulp_header_length; + int offset; + ssize_t len; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN); + + /* + * We repeat this as we parse over destination options header and + * fragment headers (earlier we've handled any hop-by-hop options + * header.) + * We update ira_protocol and ira_ip_hdr_length as we skip past + * the intermediate headers; they already point past any + * hop-by-hop header. + */ +repeat: + protocol = ira->ira_protocol; + ip_hdr_length = ira->ira_ip_hdr_length; + + /* + * Time for IPP once we've done reassembly and IPsec. + * We skip this for loopback packets since we don't do IPQoS + * on loopback. + */ + if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && + !(iraflags & IRAF_LOOPBACK) && + (protocol != IPPROTO_ESP || protocol != IPPROTO_AH || + protocol != IPPROTO_DSTOPTS || protocol != IPPROTO_ROUTING || + protocol != IPPROTO_FRAGMENT)) { + /* + * Use the interface on which the packet arrived - not where + * the IP address is hosted. + */ + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return; + } + } + + /* Determine the minimum required size of the upper-layer header */ + /* Need to do this for at least the set of ULPs that TX handles. 
*/ + switch (protocol) { + case IPPROTO_TCP: + min_ulp_header_length = TCP_MIN_HEADER_LENGTH; + break; + case IPPROTO_SCTP: + min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; + break; + case IPPROTO_UDP: + min_ulp_header_length = UDPH_SIZE; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + min_ulp_header_length = ICMPH_SIZE; + break; + case IPPROTO_FRAGMENT: + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + min_ulp_header_length = MIN_EHDR_LEN; + break; + default: + min_ulp_header_length = 0; + break; + } + /* Make sure we have the min ULP header length */ + len = mp->b_wptr - rptr; + if (len < ip_hdr_length + min_ulp_header_length) { + if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) + goto pkt_too_short; + + IP6_STAT(ipst, ip6_recv_pullup); + ip6h = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, + ira); + if (ip6h == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + + /* + * If trusted extensions then determine the zoneid and TX specific + * ira_flags. + */ + if (iraflags & IRAF_SYSTEM_LABELED) { + /* This can update ira->ira_flags and ira->ira_zoneid */ + ip_fanout_tx_v6(mp, ip6h, protocol, ip_hdr_length, ira); + iraflags = ira->ira_flags; + } + + + /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ + if (iraflags & IRAF_VERIFY_ULP_CKSUM) { + if (!ip_input_cksum_v6(iraflags, mp, ip6h, ira)) { + /* Bad checksum. Stats are already incremented */ + ip_drop_input("Bad ULP checksum", mp, ill); + freemsg(mp); + return; + } + /* IRAF_SCTP_CSUM_ERR could have been set */ + iraflags = ira->ira_flags; + } + switch (protocol) { + case IPPROTO_TCP: + /* For TCP, discard multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* First mblk contains IP+TCP headers per above check */ + ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); + + /* TCP options present? */ + offset = ((uchar_t *)ip6h)[ip_hdr_length + 12] >> 4; + if (offset != 5) { + if (offset < 5) + goto discard; + + /* + * There must be TCP options. 
+ * Make sure we can grab them. + */ + offset <<= 2; + offset += ip_hdr_length; + if (len < offset) { + if (ira->ira_pktlen < offset) + goto pkt_too_short; + + IP6_STAT(ipst, ip6_recv_pullup); + ip6h = ip_pullup(mp, offset, ira); + if (ip6h == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + } + + /* + * Pass up a squeue hint to tcp. + * If ira_sqp is already set (this is loopback) we leave it + * alone. + */ + if (ira->ira_sqp == NULL) { + ira->ira_sqp = ip_squeue_get(ira->ira_ring); + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v6(mp, IPPROTO_TCP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + if (!IPCL_IS_TCP(connp)) { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + + /* + * We do different processing whether called from + * ip_accept_tcp and we match the target, don't match + * the target, and when we are called by ip_input. 
+ */ + if (iraflags & IRAF_TARGET_SQP) { + if (ira->ira_target_sqp == connp->conn_sqp) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", + mp, ill); + freemsg(mp); + CONN_DEC_REF(connp); + } else { + SET_SQUEUE(attrmp, connp->conn_recv, + connp); + attrmp->b_cont = mp; + ASSERT(ira->ira_target_sqp_mp == NULL); + ira->ira_target_sqp_mp = attrmp; + /* + * Conn ref release when drained from + * the squeue. + */ + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, SQ_FILL, + SQTAG_IP6_TCP_INPUT); + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, + connp, ira, ip_squeue_flag, SQTAG_IP6_TCP_INPUT); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t ports; /* Source and destination ports */ + sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; + + /* For SCTP, discard multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* + * Since there is no SCTP h/w cksum support yet, just + * clear the flag. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* Length ensured above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); + sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); + + /* get the ports */ + ports = *(uint32_t *)&sctph->sh_sport; + + if (iraflags & IRAF_SCTP_CSUM_ERR) { + /* + * No potential sctp checksum errors go to the Sun + * sctp stack however they might be Adler-32 summed + * packets a userland stack bound to a raw IP socket + * could reasonably use. Note though that Adler-32 is + * a long deprecated algorithm and customer sctp + * networks should eventually migrate to CRC-32 at + * which time this facility should be removed. 
+ */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + connp = sctp_fanout(&ip6h->ip6_src, &ip6h->ip6_dst, ports, + ira, mp, sctps); + if (connp == NULL) { + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, NULL, ip6h, ports, ira); + return; + } + + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + sctp_input(connp, NULL, ip6h, mp, ira); + /* sctp_input does a rele of the sctp_t */ + return; + } + + case IPPROTO_UDP: + /* First mblk contains IP+UDP headers as checked above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); + + if (iraflags & IRAF_MULTIBROADCAST) { + uint16_t *up; /* Pointer to ports in ULP header */ + + up = (uint16_t *)((uchar_t *)ip6h + ip_hdr_length); + + ip_fanout_udp_multi_v6(mp, ip6h, up[1], up[0], ira); + return; + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v6(mp, IPPROTO_UDP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + no_udp_match: + if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_UDP]. + connf_head != NULL) { + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v6(mp, ip6h, ira); + } else { + ip_fanout_send_icmp_v6(mp, ICMP6_DST_UNREACH, + ICMP6_DST_UNREACH_NOPORT, ira); + } + return; + + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + goto no_udp_match; + } + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { + CONN_DEC_REF(connp); + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + ip_drop_input("udpIfStatsInOverflows", mp, ill); + freemsg(mp); + return; + } + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + NULL, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + + /* Found a client; up it goes */ + IP6_STAT(ipst, ip6_udp_fannorm); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + default: + break; + } + + /* + * Clear hardware checksumming flag as it is currently only + * used by TCP and UDP. + */ + DB_CKSUMFLAGS(mp) = 0; + + switch (protocol) { + case IPPROTO_ICMPV6: + BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInMsgs); + + /* Check variable for testing applications */ + if (ipst->ips_ipv6_drop_inbound_icmpv6) { + ip_drop_input("ipv6_drop_inbound_icmpv6", mp, ill); + freemsg(mp); + return; + } + /* + * We need to accomodate icmp messages coming in clear + * until we get everything secure from the wire. If + * icmp_accept_clear_messages is zero we check with + * the global policy and act accordingly. If it is + * non-zero, we accept the message without any checks. + * But *this does not mean* that this will be delivered + * to RAW socket clients. By accepting we might send + * replies back, change our MTU value etc., + * but delivery to the ULP/clients depends on their + * policy dispositions. 
+ */ + if (ipst->ips_icmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + NULL, ip6h, ira, ns); + if (mp == NULL) + return; + } + + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_icmp6_mib, + ipv6IfIcmpInErrors); + ip_drop_input("tsol_can_accept_raw", mp, ill); + freemsg(mp); + return; + } + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = icmp_inbound_v6(mp, ira); + if (mp == NULL) { + /* No need to pass to RAW sockets */ + return; + } + break; + + case IPPROTO_DSTOPTS: { + ip6_dest_t *desthdr; + uint_t ehdrlen; + uint8_t *optptr; + + /* We already check for MIN_EHDR_LEN above */ + + /* Check if AH is present and needs to be processed. */ + mp = ipsec_early_ah_v6(mp, ira); + if (mp == NULL) + return; + + /* + * Reinitialize pointers, as ipsec_early_ah_v6() does + * complete pullups. We don't have to do more pullups + * as a result. + */ + ip6h = (ip6_t *)mp->b_rptr; + + if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira); + if (ip6h == NULL) + goto discard; + } + desthdr = (ip6_dest_t *)(rptr + ip_hdr_length); + ehdrlen = 8 * (desthdr->ip6d_len + 1); + if (ira->ira_pktlen - ip_hdr_length < ehdrlen) + goto pkt_too_short; + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) + goto discard; + + desthdr = (ip6_dest_t *)(rptr + ip_hdr_length); + } + optptr = (uint8_t *)&desthdr[1]; + + /* + * Update ira_ip_hdr_length to skip the destination header + * when we repeat. 
+ */ + ira->ira_ip_hdr_length += ehdrlen; + + ira->ira_protocol = desthdr->ip6d_nxt; + + /* + * Note: XXX This code does not seem to make + * distinction between Destination Options Header + * being before/after Routing Header which can + * happen if we are at the end of source route. + * This may become significant in future. + * (No real significant Destination Options are + * defined/implemented yet ). + */ + switch (ip_process_options_v6(mp, ip6h, optptr, + ehdrlen - 2, IPPROTO_DSTOPTS, ira)) { + case -1: + /* + * Packet has been consumed and any needed + * ICMP errors sent. + */ + return; + case 0: + /* No action needed, continue */ + break; + case 1: + /* + * Unexpected return value + * (Router alert is a Hop-by-Hop option) + */ +#ifdef DEBUG + panic("ip_fanout_v6: router " + "alert hbh opt indication in dest opt"); + /*NOTREACHED*/ +#else + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; +#endif + } + goto repeat; + } + case IPPROTO_FRAGMENT: { + ip6_frag_t *fraghdr; + + if (ira->ira_pktlen - ip_hdr_length < sizeof (ip6_frag_t)) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + sizeof (ip6_frag_t) > mp->b_wptr) { + ip6h = ip_pullup(mp, + ip_hdr_length + sizeof (ip6_frag_t), ira); + if (ip6h == NULL) + goto discard; + } + + fraghdr = (ip6_frag_t *)(rptr + ip_hdr_length); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmReqds); + + /* + * Invoke the CGTP (multirouting) filtering module to + * process the incoming packet. Packets identified as + * duplicates must be discarded. Filtering is active + * only if the ip_cgtp_filter ndd variable is + * non-zero. + */ + if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) { + int cgtp_flt_pkt; + netstackid_t stackid; + + stackid = ipst->ips_netstack->netstack_stackid; + + /* + * CGTP and IPMP are mutually exclusive so + * phyint_ifindex is fine here. 
+ */ + cgtp_flt_pkt = + ipst->ips_ip_cgtp_filter_ops->cfo_filter_v6( + stackid, ill->ill_phyint->phyint_ifindex, + ip6h, fraghdr); + if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { + ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); + freemsg(mp); + return; + } + } + + /* + * Update ip_hdr_length to skip the frag header + * ip_input_fragment_v6 will determine the extension header + * prior to the fragment header and update its nexthdr value, + * and also set ira_protocol to the nexthdr that follows the + * completed fragment. + */ + ip_hdr_length += sizeof (ip6_frag_t); + + /* + * Make sure we have ira_l2src before we lose the original + * mblk + */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp = ip_input_fragment_v6(mp, ip6h, fraghdr, + ira->ira_pktlen - ip_hdr_length, ira); + if (mp == NULL) { + /* Reassembly is still pending */ + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmOKs); + + /* + * The mblk chain has the frag header removed and + * ira_protocol, ira_pktlen, ira_ip_hdr_length as well as the + * IP header has been updated to reflect the result. + */ + ip6h = (ip6_t *)mp->b_rptr; + ip_hdr_length = ira->ira_ip_hdr_length; + goto repeat; + } + case IPPROTO_HOPOPTS: + /* + * Illegal header sequence. + * (Hop-by-hop headers are processed above + * and required to immediately follow IPv6 header) + */ + ip_drop_input("ICMP_PARAM_PROBLEM", mp, ill); + icmp_param_problem_nexthdr_v6(mp, B_FALSE, ira); + return; + + case IPPROTO_ROUTING: { + uint_t ehdrlen; + ip6_rthdr_t *rthdr; + + /* Check if AH is present and needs to be processed. */ + mp = ipsec_early_ah_v6(mp, ira); + if (mp == NULL) + return; + + /* + * Reinitialize pointers, as ipsec_early_ah_v6() does + * complete pullups. We don't have to do more pullups + * as a result. 
+ */ + ip6h = (ip6_t *)mp->b_rptr; + + if (ira->ira_pktlen - ip_hdr_length < MIN_EHDR_LEN) + goto pkt_too_short; + + if (mp->b_cont != NULL && + rptr + ip_hdr_length + MIN_EHDR_LEN > mp->b_wptr) { + ip6h = ip_pullup(mp, ip_hdr_length + MIN_EHDR_LEN, ira); + if (ip6h == NULL) + goto discard; + } + rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length); + protocol = ira->ira_protocol = rthdr->ip6r_nxt; + ehdrlen = 8 * (rthdr->ip6r_len + 1); + if (ira->ira_pktlen - ip_hdr_length < ehdrlen) + goto pkt_too_short; + if (mp->b_cont != NULL && + rptr + IPV6_HDR_LEN + ehdrlen > mp->b_wptr) { + ip6h = ip_pullup(mp, IPV6_HDR_LEN + ehdrlen, ira); + if (ip6h == NULL) + goto discard; + rthdr = (ip6_rthdr_t *)(rptr + ip_hdr_length); + } + if (rthdr->ip6r_segleft != 0) { + /* Not end of source route */ + if (ira->ira_flags & + (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsInForwProhibits", + mp, ill); + freemsg(mp); + return; + } + ip_process_rthdr(mp, ip6h, rthdr, ira); + return; + } + ira->ira_ip_hdr_length += ehdrlen; + goto repeat; + } + + case IPPROTO_AH: + case IPPROTO_ESP: { + /* + * Fast path for AH/ESP. 
+ */ + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + IP_STAT(ipst, ipsec_proto_ahesp); + + if (!ipsec_loaded(ipss)) { + ip_proto_not_sup(mp, ira); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* select inbound SA and have IPsec process the pkt */ + if (protocol == IPPROTO_ESP) { + esph_t *esph; + + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); + + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, + ira); + } else { + ah_t *ah; + + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, + ira); + } + + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; + } + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return; + } + case IPPROTO_NONE: + /* All processing is done. Count as "delivered". */ + freemsg(mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + return; + + case IPPROTO_ENCAP: + case IPPROTO_IPV6: + /* iptun will verify trusted label */ + connp = ipcl_classify_v6(mp, protocol, ip_hdr_length, + ira, ipst); + if (connp != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + connp->conn_recv(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + /* FALLTHRU */ + default: + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. 
+ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + break; + } + + /* + * The above input functions may have returned the pulled up message. + * So ip6h needs to be reinitialized. + */ + ip6h = (ip6_t *)mp->b_rptr; + ira->ira_protocol = protocol; + if (ipst->ips_ipcl_proto_fanout_v6[protocol].connf_head == NULL) { + /* No user-level listener for these packets */ + ip_proto_not_sup(mp, ira); + return; + } + + /* + * Handle fanout to raw sockets. There + * can be more than one stream bound to a particular + * protocol. When this is the case, each one gets a copy + * of any incoming packets. + */ + ASSERT(ira->ira_protocol == protocol); + ip_fanout_proto_v6(mp, ip6h, ira); + return; + +pkt_too_short: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + +discard: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); +#undef rptr +} diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index c13a66fcc2..7697ca20c7 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -60,122 +60,122 @@ #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> +#define IS_DEFAULT_ROUTE_V6(ire) \ + (((ire)->ire_type & IRE_DEFAULT) || \ + (((ire)->ire_type & IRE_INTERFACE) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) + static ire_t ire_null; -static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire); -static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *gateway, int type, - const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, - const ts_label_t *tsl, int match_flags); -static ire_t *ire_init_v6(ire_t *, const in6_addr_t 
*, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *, - ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t, - const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); -static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); +static ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst); /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. + * Returns zero or errno. */ -static ire_t * +int ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle, - uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, + zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { + int error; /* - * Reject IRE security attribute creation/initialization + * Reject IRE security attribute creation/initialization * if system is not running in Trusted mode. 
*/ - if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) - return (NULL); - + if (gc != NULL && !is_system_labeled()) + return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); - ire->ire_addr_v6 = *v6addr; - - if (v6src_addr != NULL) - ire->ire_src_addr_v6 = *v6src_addr; - if (v6mask != NULL) { - ire->ire_mask_v6 = *v6mask; - ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); - } + if (v6addr != NULL) + ire->ire_addr_v6 = *v6addr; if (v6gateway != NULL) ire->ire_gateway_addr_v6 = *v6gateway; - if (type == IRE_CACHE && v6cmask != NULL) - ire->ire_cmask_v6 = *v6cmask; - - /* - * Multirouted packets need to have a fragment header added so that - * the receiver is able to discard duplicates according to their - * fragment identifier. - */ - if (type == IRE_CACHE && (flags & RTF_MULTIRT)) { - ire->ire_frag_flag = IPH_FRAG_HDR; + /* Make sure we don't have stray values in some fields */ + switch (type) { + case IRE_LOOPBACK: + ire->ire_gateway_addr_v6 = ire->ire_addr_v6; + /* FALLTHRU */ + case IRE_HOST: + case IRE_LOCAL: + case IRE_IF_CLONE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + break; + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + if (v6mask != NULL) { + ire->ire_mask_v6 = *v6mask; + ire->ire_masklen = + ip_mask_to_plen_v6(&ire->ire_mask_v6); + } + break; + case IRE_MULTICAST: + case IRE_NOROUTE: + ASSERT(v6mask == NULL); + break; + default: + ASSERT(0); + return (EINVAL); } - /* ire_init_common will free the mblks upon encountering any failure */ - if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif, - phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst)) - return (NULL); - - return (ire); -} - -/* - * Similar to ire_create_v6 except that it is called only when - * we want to allocate ire as an mblk e.g. we have a external - * resolver. Do we need this in IPv6 ? 
- * - * IPv6 initializes the ire_nce in ire_add_v6, which expects to - * find the ire_nce to be null when it is called. So, although - * we have a src_nce parameter (in the interest of matching up with - * the argument list of the v4 version), we ignore the src_nce - * argument here. - */ -/* ARGSUSED */ -ire_t * -ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) -{ - ire_t *ire; - ire_t *ret_ire; - mblk_t *mp; + error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, + gc, ipst); + if (error != NULL) + return (error); - ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); + /* Determine which function pointers to use */ + ire->ire_postfragfn = ip_xmit; /* Common case */ - /* Allocate the new IRE. */ - mp = allocb(sizeof (ire_t), BPRI_MED); - if (mp == NULL) { - ip1dbg(("ire_create_mp_v6: alloc failed\n")); - return (NULL); + switch (ire->ire_type) { + case IRE_LOCAL: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_local_v6; +#ifdef SO_VRRP + ASSERT(ire->ire_ill != NULL); + if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { + ire->ire_noaccept = B_TRUE; + ire->ire_recvfn = ire_recv_noaccept_v6; + } +#endif + break; + case IRE_LOOPBACK: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_loopback_v6; + break; + case IRE_MULTICAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_multicast_v6; + ire->ire_recvfn = ire_recv_multicast_v6; + break; + default: + /* + * For IRE_IF_ALL and IRE_OFFLINK we forward received + * packets by default. 
+ */ + ire->ire_sendfn = ire_send_wire_v6; + ire->ire_recvfn = ire_recv_forward_v6; + break; } - - ire = (ire_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&ire[1]; - - /* Start clean. */ - *ire = ire_null; - ire->ire_mp = mp; - mp->b_datap->db_type = IRE_DB_TYPE; - - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - NULL, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); - - if (ret_ire == NULL) { - freeb(ire->ire_mp); - return (NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire->ire_sendfn = ire_send_noroute_v6; + ire->ire_recvfn = ire_recv_noroute_v6; + } else if (ire->ire_flags & RTF_MULTIRT) { + ire->ire_postfragfn = ip_postfrag_multirt_v6; + ire->ire_sendfn = ire_send_multirt_v6; + ire->ire_recvfn = ire_recv_multirt_v6; } - return (ire); + ire->ire_nce_capable = ire_determine_nce_capable(ire); + return (0); } /* @@ -183,153 +183,76 @@ ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, * * NOTE : This is called as writer sometimes though not required * by this function. - * - * See comments above ire_create_mp_v6() for the rationale behind the - * unused src_nce argument. 
*/ /* ARGSUSED */ ire_t * ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, - ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, + uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { ire_t *ire; - ire_t *ret_ire; + int error; ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { - ip1dbg(("ire_create_v6: alloc failed\n")); + DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - max_fragp, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); + error = ire_init_v6(ire, v6addr, v6mask, v6gateway, + type, ill, zoneid, flags, gc, ipst); - if (ret_ire == NULL) { + if (error != 0) { + DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } - ASSERT(ret_ire == ire); return (ire); } /* - * Find an IRE_INTERFACE for the multicast group. + * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to FF::0/8 but could be more specific) * which point at different interfaces. This is used when IPV6_MULTICAST_IF * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't * specify the interface to join on. * - * Supports link-local addresses by following the ipif/ill when recursing. + * Supports link-local addresses by using ire_route_recursive which follows + * the ill when recursing. 
+ * + * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group + * and the MULTIRT property can be different for different groups, we + * extract RTF_MULTIRT from the special unicast route added for a group + * with CGTP and pass that back in the multirtp argument. + * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. + * We have a setsrcp argument for the same reason. */ -ire_t * -ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) { ire_t *ire; - ipif_t *ipif = NULL; - int match_flags = MATCH_IRE_TYPE; - in6_addr_t gw_addr_v6; - - ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst); + ill_t *ill; - /* We search a resolvable ire in case of multirouting. */ - if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { - ire_t *cire = NULL; - /* - * If the route is not resolvable, the looked up ire - * may be changed here. In that case, ire_multirt_lookup_v6() - * IRE_REFRELE the original ire and change it. - */ - (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW, - NULL, ipst); - if (cire != NULL) - ire_refrele(cire); - } - if (ire == NULL) - return (NULL); - /* - * Make sure we follow ire_ipif. - * - * We need to determine the interface route through - * which the gateway will be reached. 
- */ - if (ire->ire_ipif != NULL) { - ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL; - } + ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, + MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); + ASSERT(ire != NULL); - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - ire_refrele(ire); - ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0, - IRE_INTERFACE, ipif, NULL, zoneid, 0, - NULL, match_flags, ipst); - return (ire); - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - return (ire); - default: + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } -} -/* - * Return any local address. We use this to target ourselves - * when the src address was specified as 'default'. - * Preference for IRE_LOCAL entries. - */ -ire_t * -ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - irb_t *irb; - ire_t *maybe = NULL; - int i; + if (multirtp != NULL) + *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if (irb->irb_ire == NULL) - continue; - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire; ire = ire->ire_next) { - if ((ire->ire_marks & IRE_MARK_CONDEMNED) || - ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) - continue; - switch (ire->ire_type) { - case IRE_LOOPBACK: - if (maybe == NULL) { - IRE_REFHOLD(ire); - maybe = ire; - } - break; - case IRE_LOCAL: - if (maybe != NULL) { - ire_refrele(maybe); - } - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - return (ire); - } - } - rw_exit(&irb->irb_lock); - } - return (maybe); + ill = ire_nexthop_ill(ire); + ire_refrele(ire); + return (ill); } /* @@ -369,6 +292,8 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) if (plen < 0 || plen > IPV6_ABITS) return (NULL); *bitmask = ipv6_all_zeros; + if 
(plen == 0) + return (bitmask); ptr = (uint32_t *)bitmask; while (plen > 32) { @@ -380,196 +305,78 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) } /* - * Add a fully initialized IRE to an appropriate - * table based on ire_type. - * - * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and - * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. - * - * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK - * and IRE_CACHE. - * - * NOTE : This function is called as writer though not required - * by this function. + * Add a fully initialized IPv6 IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. */ -int -ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) +ire_t * +ire_add_v6(ire_t *ire) { ire_t *ire1; int mask_table_index; irb_t *irb_ptr; ire_t **irep; - int flags; - ire_t *pire = NULL; - ill_t *stq_ill; - boolean_t ndp_g_lock_held = B_FALSE; - ire_t *ire = *ire_p; + int match_flags; int error; ip_stack_t *ipst = ire->ire_ipst; - uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ - ASSERT(ire->ire_nce == NULL); - - /* - * IREs with source addresses hosted on interfaces that are under IPMP - * should be hidden so that applications don't accidentally end up - * sending packets with test addresses as their source addresses, or - * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. - * (We let IREs with unspecified source addresses slip through since - * ire_send_v6() will delete them automatically.) 
- */ - if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { - DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); - marks |= IRE_MARK_TESTHIDDEN; - } - - /* Find the appropriate list head. */ - switch (ire->ire_type) { - case IRE_HOST: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_CACHE: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - break; - case IRE_LOCAL: - case IRE_LOOPBACK: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - break; - case IRE_PREFIX: - case IRE_DEFAULT: - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: - ire->ire_marks |= marks; - break; - default: - printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", - (void *)ire, ire->ire_type); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } /* Make sure the address is properly masked. 
*/ V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); - if ((ire->ire_type & IRE_CACHETABLE) == 0) { - /* IRE goes into Forward Table */ - mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); - if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == - NULL) { - irb_t *ptr; - int i; - - ptr = (irb_t *)mi_zalloc(( - ipst->ips_ip6_ftable_hash_size * sizeof (irb_t))); - if (ptr == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (ENOMEM); - } - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { - rw_init(&ptr[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } - mutex_enter(&ipst->ips_ire_ft_init_lock); - if (ipst->ips_ip_forwarding_table_v6[ - mask_table_index] == NULL) { - ipst->ips_ip_forwarding_table_v6[ - mask_table_index] = ptr; - mutex_exit(&ipst->ips_ire_ft_init_lock); - } else { - /* - * Some other thread won the race in - * initializing the forwarding table at the - * same index. - */ - mutex_exit(&ipst->ips_ire_ft_init_lock); - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; - i++) { - rw_destroy(&ptr[i].irb_lock); - } - mi_free(ptr); - } - } - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ - IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, - ipst->ips_ip6_ftable_hash_size)]); - } else { - irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]); - } - /* - * For xresolv interfaces (v6 interfaces with an external - * address resolver), ip_newroute_v6/ip_newroute_ipif_v6 - * are unable to prevent the deletion of the interface route - * while adding an IRE_CACHE for an on-link destination - * in the IRE_IF_RESOLVER case, since the ire has to go to - * the external resolver and return. We can't do a REFHOLD on the - * associated interface ire for fear of the message being freed - * if the external resolver can't resolve the address. 
- * Here we look up the interface ire in the forwarding table - * and make sure that the interface route has not been deleted. - */ - if (ire->ire_type == IRE_CACHE && - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) { + mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); + if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { + irb_t *ptr; + int i; - pire = ire_ihandle_lookup_onlink_v6(ire); - if (pire == NULL) { + ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * + sizeof (irb_t))); + if (ptr == NULL) { ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + return (NULL); } - /* Prevent pire from getting deleted */ - IRB_REFHOLD(pire->ire_bucket); - /* Has it been removed already? */ - if (pire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); + } + mutex_enter(&ipst->ips_ire_ft_init_lock); + if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == + NULL) { + ipst->ips_ip_forwarding_table_v6[mask_table_index] = + ptr; + mutex_exit(&ipst->ips_ire_ft_init_lock); + } else { + /* + * Some other thread won the race in + * initializing the forwarding table at the + * same index. 
+ */ + mutex_exit(&ipst->ips_ire_ft_init_lock); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_destroy(&ptr[i].irb_lock); + } + mi_free(ptr); } } + irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ + IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, + ipst->ips_ip6_ftable_hash_size)]); - flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + if (ire->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; /* - * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check - * for duplicates because : - * - * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be - * pointing at different ills. A real duplicate is - * a match on both ire_ipif and ire_stq. - * - * 2) We could have multiple packets trying to create - * an IRE_CACHE for the same ill. - * - * Rather than looking at the packet, we depend on the above for - * MATCH_IRE_ILL here. - * - * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have - * multiple IRE_CACHES for an ill for the same destination - * with various scoped addresses i.e represented by ipifs. - * - * MATCH_IRE_ILL is done implicitly below for IRE_CACHES. + * Start the atomic add of the ire. Grab the bucket lock and the + * ill lock. Check for condemned. */ - if (ire->ire_ipif != NULL) - flags |= MATCH_IRE_IPIF; + error = ire_atomic_start(irb_ptr, ire); + if (error != 0) { + ire_delete(ire); + return (NULL); + } /* * If we are creating a hidden IRE, make sure we search for @@ -577,103 +384,36 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * Otherwise, we might find an IRE on some other interface * that's not marked hidden. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) - flags |= MATCH_IRE_MARK_TESTHIDDEN; - - /* - * Start the atomic add of the ire. Grab the ill locks, - * ill_g_usesrc_lock and the bucket lock. Check for condemned. - * To avoid lock order problems, get the ndp6.ndp_g_lock now itself. 
- */ - if (ire->ire_type == IRE_CACHE) { - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - ndp_g_lock_held = B_TRUE; - } - - /* - * If ipif or ill is changing ire_atomic_start() may queue the - * request and return EINPROGRESS. - */ - - error = ire_atomic_start(irb_ptr, ire, q, mp, func); - if (error != 0) { - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - /* - * We don't know whether it is a valid ipif or not. - * So, set it to NULL. This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - return (error); - } - /* - * To avoid creating ires having stale values for the ire_max_frag - * we get the latest value atomically here. For more details - * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE - * in ip_rput_dlpi_writer - */ - if (ire->ire_max_fragp == NULL) { - if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) - ire->ire_max_frag = ire->ire_ipif->ipif_mtu; - else - ire->ire_max_frag = pire->ire_max_frag; - } else { - uint_t max_frag; - - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - } + if (ire->ire_testhidden) + match_flags |= MATCH_IRE_TESTHIDDEN; /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { - if (ire1->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire1)) continue; - - if (ire->ire_type == IRE_CACHE) { - /* - * We do MATCH_IRE_ILL implicitly here for IRE_CACHES. - * As ire_ipif and ire_stq could point to two - * different ills, we can't pass just ire_ipif to - * ire_match_args and get a match on both ills. - * This is just needed for duplicate checks here and - * so we don't add an extra argument to - * ire_match_args for this. Do it locally. 
- * - * NOTE : Currently there is no part of the code - * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL - * match for IRE_CACHEs. Thus we don't want to - * extend the arguments to ire_match_args_v6. - */ - if (ire1->ire_stq != ire->ire_stq) - continue; - /* - * Multiroute IRE_CACHEs for a given destination can - * have the same ire_ipif, typically if their source - * address is forced using RTF_SETSRC, and the same - * send-to queue. We differentiate them using the parent - * handle. - */ - if ((ire1->ire_flags & RTF_MULTIRT) && - (ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_phandle != ire->ire_phandle)) - continue; - } + /* + * Here we need an exact match on zoneid, i.e., + * ire_match_args doesn't fit. + */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; + + if (ire1->ire_type != ire->ire_type) + continue; + + /* + * Note: We do not allow multiple routes that differ only + * in the gateway security attributes; such routes are + * considered duplicates. + * To change that we explicitly have to treat them as + * different here. + */ if (ire_match_args_v6(ire1, &ire->ire_addr_v6, &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, - ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL, - flags)) { + ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, + match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE @@ -683,141 +423,25 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ ip1dbg(("found dup ire existing %p new %p", (void *)ire1, (void *)ire)); - IRE_REFHOLD(ire1); - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + ire_refhold(ire1); + atomic_add_32(&ire1->ire_identical_ref, 1); ire_atomic_end(irb_ptr, ire); ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. 
- */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = ire1; - return (0); + return (ire1); } } - if (ire->ire_type == IRE_CACHE) { - const in6_addr_t *addr_v6; - ill_t *ill = ire_to_ill(ire); - char buf[INET6_ADDRSTRLEN]; - nce_t *nce; - /* - * All IRE_CACHE types must have a nce. If this is - * not the case the entry will not be added. We need - * to make sure that if somebody deletes the nce - * after we looked up, they will find this ire and - * delete the ire. To delete this ire one needs the - * bucket lock which we are still holding here. So, - * even if the nce gets deleted after we looked up, - * this ire will get deleted. - * - * NOTE : Don't need the ire_lock for accessing - * ire_gateway_addr_v6 as it is appearing first - * time on the list and rts_setgwr_v6 could not - * be changing this. - */ - addr_v6 = &ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) - addr_v6 = &ire->ire_addr_v6; - - /* nce fastpath is per-ill; don't match across illgrp */ - nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); - if (nce == NULL) - goto failed; - - /* Pair of refhold, refrele just to get the tracing right */ - NCE_REFHOLD_TO_REFHOLD_NOTR(nce); - /* - * Atomically make sure that new IREs don't point - * to an NCE that is logically deleted (CONDEMNED). - * ndp_delete() first marks the NCE CONDEMNED. - * This ensures that the nce_refcnt won't increase - * due to new nce_lookups or due to addition of new IREs - * pointing to this NCE. Then ndp_delete() cleans up - * existing references. If we don't do it atomically here, - * ndp_delete() -> nce_ire_delete() will not be able to - * clean up the IRE list completely, and the nce_refcnt - * won't go down to zero. - */ - mutex_enter(&nce->nce_lock); - if (ill->ill_flags & ILLF_XRESOLV) { - /* - * If we used an external resolver, we may not - * have gone through neighbor discovery to get here. - * Must update the nce_state before the next check. 
- */ - if (nce->nce_state == ND_INCOMPLETE) - nce->nce_state = ND_REACHABLE; - } - if (nce->nce_state == ND_INCOMPLETE || - (nce->nce_flags & NCE_F_CONDEMNED) || - (nce->nce_state == ND_UNREACHABLE)) { -failed: - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - if (nce != NULL) - mutex_exit(&nce->nce_lock); - ire_atomic_end(irb_ptr, ire); - ip1dbg(("ire_add_v6: No nce for dst %s \n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - buf, sizeof (buf)))); - ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - if (nce != NULL) - NCE_REFRELE_NOTR(nce); - *ire_p = NULL; - return (EINVAL); - } else { - ire->ire_nce = nce; - } - mutex_exit(&nce->nce_lock); - } /* - * Find the first entry that matches ire_addr - provides - * tail insertion. *irep will be null if no match. + * Normally we do head insertion since most things do not care about + * the order of the IREs in the bucket. + * However, due to shared-IP zones (and restrict_interzone_loopback) + * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same + * address. For that reason we do tail insertion for IRE_IF_CLONE. */ irep = (ire_t **)irb_ptr; - while ((ire1 = *irep) != NULL && - !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) - irep = &ire1->ire_next; - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - - if (*irep != NULL) { - /* - * Find the last ire which matches ire_addr_v6. - * Needed to do tail insertion among entries with the same - * ire_addr_v6. - */ - while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, - &ire1->ire_addr_v6)) { + if (ire->ire_type & IRE_IF_CLONE) { + while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; - ire1 = *irep; - if (ire1 == NULL) - break; - } - } - - if (ire->ire_type == IRE_DEFAULT) { - /* - * We keep a count of default gateways which is used when - * assigning them as routes. 
- */ - ipst->ips_ipv6_ire_default_count++; - ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */ } /* Insert at *irep */ ire1 = *irep; @@ -852,62 +476,22 @@ failed: * in the list for the first time and no one else can bump * up the reference count on this yet. */ - IRE_REFHOLD_LOCKED(ire); + ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); irb_ptr->irb_ire_cnt++; - if (ire->ire_marks & IRE_MARK_TEMPORARY) - irb_ptr->irb_tmp_ire_cnt++; - if (ire->ire_ipif != NULL) { - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, + if (ire->ire_ill != NULL) { + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, (char *), "ire", (void *), ire); - ire->ire_ipif->ipif_ire_cnt++; - if (ire->ire_stq != NULL) { - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt++; - } - } else { - ASSERT(ire->ire_stq == NULL); + ire->ire_ill->ill_ire_cnt++; + ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } - - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ire_atomic_end(irb_ptr, ire); - if (pire != NULL) { - /* Assert that it is not removed from the list yet */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - - if (ire->ire_type != IRE_CACHE) { - /* - * For ire's with with host mask see if there is an entry - * in the cache. If there is one flush the whole cache as - * there might be multiple entries due to RTF_MULTIRT (CGTP). - * If no entry is found than there is no need to flush the - * cache. 
- */ - - if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) { - ire_t *lire; - lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL, - IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (lire != NULL) { - ire_refrele(lire); - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } else { - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } + /* Make any caching of the IREs be notified or updated */ + ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - *ire_p = ire; - return (0); + return (ire); } /* @@ -931,7 +515,7 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) return; for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { irb = &irb_ptr[i]; - IRB_REFHOLD(irb); + irb_refhold(irb); for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { if (!(ire->ire_flags & RTF_DYNAMIC)) continue; @@ -941,50 +525,11 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) ire_delete(ire); } - IRB_REFRELE(irb); + irb_refrele(irb); } } /* - * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart - * of ip_ire_clookup_and_delete. The difference being this function does not - * return any value. IPv6 processing of a gratuitous ARP, as it stands, is - * different than IPv4 in that, regardless of the presence of a cache entry - * for this address, an ire_walk_v6 is done. Another difference is that unlike - * in the case of IPv4 this does not take an ipif_t argument, since it is only - * called by ip_arp_news and the match is always only on the address. 
- */ -void -ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *cire; - boolean_t found = B_FALSE; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) { - if (cire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) { - - /* This signifies start of a match */ - if (!found) - found = B_TRUE; - if (cire->ire_type == IRE_CACHE) { - if (cire->ire_nce != NULL) - ndp_delete(cire->ire_nce); - ire_delete_v6(cire); - } - /* End of the match */ - } else if (found) - break; - } - IRB_REFRELE(irb); -} - -/* * Delete the specified IRE. * All calls should use ire_delete(). * Sometimes called as writer though not required by this function. @@ -998,11 +543,20 @@ ire_delete_v6(ire_t *ire) in6_addr_t gw_addr_v6; ip_stack_t *ipst = ire->ire_ipst; + /* + * Make sure ire_generation increases from ire_flush_cache happen + * after any lookup/reader has read ire_generation. + * Since the rw_enter makes us wait until any lookup/reader has + * completed we can exit the lock immediately. + */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + rw_exit(&ipst->ips_ip6_ire_head_lock); + ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV6_VERSION); - if (ire->ire_type != IRE_CACHE) - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away @@ -1014,368 +568,284 @@ ire_delete_v6(ire_t *ire) mutex_exit(&ire->ire_lock); ire_delete_host_redirects_v6(&gw_addr_v6, ipst); } -} - -/* - * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect - * entries. 
- */ -/*ARGSUSED1*/ -void -ire_delete_cache_v6(ire_t *ire, char *arg) -{ - char addrstr1[INET6_ADDRSTRLEN]; - char addrstr2[INET6_ADDRSTRLEN]; - - if ((ire->ire_type & IRE_CACHE) || - (ire->ire_flags & RTF_DYNAMIC)) { - ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - addrstr1, sizeof (addrstr1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6, - addrstr2, sizeof (addrstr2)))); - ire_delete(ire); - } - -} -/* - * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries - * that have a given gateway address. - */ -void -ire_delete_cache_gw_v6(ire_t *ire, char *addr) -{ - in6_addr_t *gw_addr = (in6_addr_t *)addr; - char buf1[INET6_ADDRSTRLEN]; - char buf2[INET6_ADDRSTRLEN]; - in6_addr_t ire_gw_addr_v6; - - if (!(ire->ire_type & IRE_CACHE) && - !(ire->ire_flags & RTF_DYNAMIC)) - return; - - mutex_enter(&ire->ire_lock); - ire_gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); + /* + * If we are deleting an IRE_INTERFACE then we make sure we also + * delete any IRE_IF_CLONE that has been created from it. + * Those are always in ire_dep_children. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); - if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) { - ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n", - inet_ntop(AF_INET6, &ire->ire_src_addr_v6, - buf1, sizeof (buf1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire_gw_addr_v6, - buf2, sizeof (buf2)))); - ire_delete(ire); + /* Remove from parent dependencies and child */ + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + if (ire->ire_dep_parent != NULL) { + ire_dep_remove(ire); } + while (ire->ire_dep_children != NULL) + ire_dep_remove(ire->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * Remove all IRE_CACHE entries that match - * the ire specified. (Sometimes called - * as writer though not required by this function.) 
- * - * The flag argument indicates if the - * flush request is due to addition - * of new route (IRE_FLUSH_ADD) or deletion of old - * route (IRE_FLUSH_DELETE). + * When an IRE is added or deleted this routine is called to make sure + * any caching of IRE information is notified or updated. * - * This routine takes only the IREs from the forwarding - * table and flushes the corresponding entries from - * the cache table. - * - * When flushing due to the deletion of an old route, it - * just checks the cache handles (ire_phandle and ire_ihandle) and - * deletes the ones that match. - * - * When flushing due to the creation of a new route, it checks - * if a cache entry's address matches the one in the IRE and - * that the cache entry's parent has a less specific mask than the - * one in IRE. The destination of such a cache entry could be the - * gateway for other cache entries, so we need to flush those as - * well by looking for gateway addresses matching the IRE's address. + * The flag argument indicates if the flush request is due to addition + * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), + * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v6(ire_t *ire, int flag) { - int i; - ire_t *cire; - irb_t *irb; - ip_stack_t *ipst = ire->ire_ipst; + ip_stack_t *ipst = ire->ire_ipst; - if (ire->ire_type & IRE_CACHE) + /* + * IRE_IF_CLONE ire's don't provide any new information + * than the parent from which they are cloned, so don't + * perturb the generation numbers. + */ + if (ire->ire_type & IRE_IF_CLONE) return; /* - * If a default is just created, there is no point - * in going through the cache, as there will not be any - * cached ires. + * Ensure that an ire_add during a lookup serializes the updates of + * the generation numbers under ire_head_lock so that the lookup gets + * either the old ire and old generation number, or a new ire and new + * generation number. 
+ */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + + /* + * If a route was just added, we need to notify everybody that + * has cached an IRE_NOROUTE since there might now be a better + * route for them. */ - if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) - return; if (flag == IRE_FLUSH_ADD) { + ire_increment_generation(ipst->ips_ire_reject_v6); + ire_increment_generation(ipst->ips_ire_blackhole_v6); + } + + /* Adding a default can't otherwise provide a better route */ + if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return; + } + + switch (flag) { + case IRE_FLUSH_DELETE: + case IRE_FLUSH_GWCHANGE: /* - * This selective flush is - * due to the addition of - * new IRE. + * Update ire_generation for all ire_dep_children chains + * starting with this IRE */ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - /* - * If 'cire' belongs to the same subnet - * as the new ire being added, and 'cire' - * is derived from a prefix that is less - * specific than the new ire being added, - * we need to flush 'cire'; for instance, - * when a new interface comes up. - */ - if ((V6_MASK_EQ_2(cire->ire_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6) && - (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <= - ire->ire_masklen))) { - ire_delete(cire); - continue; - } - /* - * This is the case when the ire_gateway_addr - * of 'cire' belongs to the same subnet as - * the new ire being added. - * Flushing such ires is sometimes required to - * avoid misrouting: say we have a machine with - * two interfaces (I1 and I2), a default router - * R on the I1 subnet, and a host route to an - * off-link destination D with a gateway G on - * the I2 subnet. 
- * Under normal operation, we will have an - * on-link cache entry for G and an off-link - * cache entry for D with G as ire_gateway_addr, - * traffic to D will reach its destination - * through gateway G. - * If the administrator does 'ifconfig I2 down', - * the cache entries for D and G will be - * flushed. However, G will now be resolved as - * an off-link destination using R (the default - * router) as gateway. Then D will also be - * resolved as an off-link destination using G - * as gateway - this behavior is due to - * compatibility reasons, see comment in - * ire_ihandle_lookup_offlink(). Traffic to D - * will go to the router R and probably won't - * reach the destination. - * The administrator then does 'ifconfig I2 up'. - * Since G is on the I2 subnet, this routine - * will flush its cache entry. It must also - * flush the cache entry for D, otherwise - * traffic will stay misrouted until the IRE - * times out. - */ - if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6)) { - ire_delete(cire); - continue; - } - } - IRB_REFRELE(irb); - } - } else { + ire_dep_incr_generation(ire); + break; + case IRE_FLUSH_ADD: { + in6_addr_t addr; + in6_addr_t mask; + ip_stack_t *ipst = ire->ire_ipst; + uint_t masklen; + /* - * delete the cache entries based on - * handle in the IRE as this IRE is - * being deleted/changed. + * Find an IRE which is a shorter match than the ire to be added + * For any such IRE (which we repeat) we update the + * ire_generation the same way as in the delete case. 
*/ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - if ((cire->ire_phandle == 0 || - cire->ire_phandle != ire->ire_phandle) && - (cire->ire_ihandle == 0 || - cire->ire_ihandle != ire->ire_ihandle)) - continue; - ire_delete(cire); - } - IRB_REFRELE(irb); + addr = ire->ire_addr_v6; + mask = ire->ire_mask_v6; + masklen = ip_mask_to_plen_v6(&mask); + + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, + ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + while (ire != NULL) { + /* We need to handle all in the same bucket */ + irb_increment_generation(ire->ire_bucket); + + mask = ire->ire_mask_v6; + ASSERT(masklen > ip_mask_to_plen_v6(&mask)); + masklen = ip_mask_to_plen_v6(&mask); + ire_refrele(ire); + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + } } + break; } + rw_exit(&ipst->ips_ip6_ire_head_lock); } /* * Matches the arguments passed with the values in the ire. * - * Note: for match types that match using "ipif" passed in, ipif + * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. 
*/ -static boolean_t +boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid, - uint32_t ihandle, const ts_label_t *tsl, int match_flags) + const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags) { in6_addr_t masked_addr; in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; - ill_t *ipif_ill = NULL; - ipif_t *src_ipif; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); ASSERT((!(match_flags & MATCH_IRE_ILL)) || - (ipif != NULL && ipif->ipif_isv6)); + (ill != NULL && ill->ill_isv6)); /* - * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it - * is in fact hidden, to ensure the caller gets the right one. One - * exception: if the caller passed MATCH_IRE_IHANDLE, then they - * already know the identity of the given IRE_INTERFACE entry and - * there's no point trying to hide it from them. + * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { - if (match_flags & MATCH_IRE_IHANDLE) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + if (ire->ire_testhidden) { + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* - * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is - * valid and does not match that of ire_zoneid, a failure to + * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid + * does not match that of ire_zoneid, a failure to * match is reported at this point. 
Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * - * IRE_CACHE and IRE_LOOPBACK entries should - * never be matched in this situation. + * IRE_LOOPBACK + * entries should never be matched in this situation. + * Each zone has its own IRE_LOOPBACK. * - * IRE entries that have an interface associated with them - * should in general not match unless they are an IRE_LOCAL - * or in the case when MATCH_IRE_DEFAULT has been set in - * the caller. In the case of the former, checking of the - * other fields supplied should take place. + * IRE_LOCAL + * We allow them for any zoneid. ire_route_recursive + * does additional checks when + * ip_restrict_interzone_loopback is set. * - * In the case where MATCH_IRE_DEFAULT has been set, - * all of the ipif's associated with the IRE's ill are - * checked to see if there is a matching zoneid. If any - * one ipif has a matching zoneid, this IRE is a - * potential candidate so checking of the other fields - * takes place. + * If ill_usesrc_ifindex is set + * Then we check if the zone has a valid source address + * on the usesrc ill. * - * In the case where the IRE_INTERFACE has a usable source - * address (indicated by ill_usesrc_ifindex) in the - * correct zone then it's permitted to return this IRE + * If ire_ill is set, then check that the zone has an ipif + * on that ill. + * + * Outside of this function (in ire_round_robin) we check + * that any IRE_OFFLINK has a gateway that reachable from the + * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); - if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK)) + if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); + + if (ire->ire_type & IRE_LOCAL) + goto matchit; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. 
+ * The normal case of IRE_ONLINK has a matching zoneid. + * Here we handle the case when shared-IP zones have been + * configured with IP addresses on vniN. In that case it + * is ok for traffic from a zone to use IRE_ONLINK routes + * if the ill has a usesrc pointing at vniN + * Applies to IRE_INTERFACE. */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - dst_ill = (ill_t *)ire->ire_stq->q_ptr; + dst_ill = ire->ire_ill; + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + + /* + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. + */ + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); + /* * If there is a usable source address in the - * zone, then it's ok to return an - * IRE_INTERFACE + * zone, then it's ok to return this IRE_INTERFACE */ - if ((dst_ill->ill_usesrc_ifindex != 0) && - (src_ipif = ipif_select_source_v6(dst_ill, addr, - B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) - != NULL) { - ip3dbg(("ire_match_args: src_ipif %p" - " dst_ill %p", (void *)src_ipif, - (void *)dst_ill)); - ipif_refrele(src_ipif); - } else { - ip3dbg(("ire_match_args: src_ipif NULL" + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } - if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && - !(ire->ire_type & IRE_INTERFACE)) { + /* + * For exampe, with + * route add 11.0.0.0 gw1 -ifp bge0 + * route add 11.0.0.0 gw2 -ifp bge1 + * this code would differentiate based on + * where the sending zone has addresses. + * Only if the zone has an address on bge0 can it use the first + * route. It isn't clear if this behavior is documented + * anywhere. 
+ */ + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; - if ((match_flags & MATCH_IRE_DEFAULT) == 0) - return (B_FALSE); - mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); - for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { - if (IPIF_CAN_LOOKUP(tipif) && + if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } - mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); + mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) return (B_FALSE); } } +matchit: if (match_flags & MATCH_IRE_GW) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - - /* - * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to - * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means - * of getting a source address -- i.e., ire_src_addr_v6 == - * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. - * - * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. - * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for - * IPMP test traffic), then the ill must match exactly. - */ if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire_to_ill(ire); - ipif_ill = ipif->ipif_ill; - } + ire_ill = ire->ire_ill; + /* + * If asked to match an ill, we *must* match + * on the ire_ill for ipmp test addresses, or + * any of the ill in the group for data addresses. + * If we don't, we may as well fail. + * However, we need an exception for IRE_LOCALs to ensure + * we loopback packets even sent to test addresses on different + * interfaces in the group. 
+ */ + if ((match_flags & MATCH_IRE_TESTHIDDEN) && + !(ire->ire_type & IRE_LOCAL)) { + if (ire->ire_ill != ill) + return (B_FALSE); + } else { + match_flags &= ~MATCH_IRE_TESTHIDDEN; + /* + * We know that ill is not NULL, but ire_ill could be + * NULL + */ + if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) + return (B_FALSE); + } + } /* No ire_addr_v6 bits set past the mask */ ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6)); V6_MASK_COPY(*addr, *mask, masked_addr); - if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && ((!(match_flags & MATCH_IRE_GW)) || IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && - ((!(match_flags & MATCH_IRE_TYPE)) || - (ire->ire_type & type)) && - ((!(match_flags & MATCH_IRE_SRC)) || - IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6src_addr)) && - ((!(match_flags & MATCH_IRE_IPIF)) || - (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || - (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill || - (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && - ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && - ((!(match_flags & MATCH_IRE_IHANDLE)) || - (ire->ire_ihandle == ihandle)) && + ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && + ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && + ((!(match_flags & MATCH_IRE_MASK)) || + (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1386,41 +856,38 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, } /* - * Lookup for a route in all the tables + * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified + * gateway address. If ill is non-NULL we also match on it. + * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
*/ -ire_t * -ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) +boolean_t +ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, + const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { - ire_t *ire = NULL; + ire_t *ire; + uint_t match_flags; - /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. - */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (lock_held) + ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); + else + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); - /* - * might be asking for a cache lookup, - * This is not best way to lookup cache, - * user should call ire_cache_lookup directly. - * - * If MATCH_IRE_TYPE was set, first lookup in the cache table and then - * in the forwarding table, if the applicable type flags were set. - */ - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { - ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid, - tsl, flags, ipst); - if (ire != NULL) - return (ire); - } - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { - ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif, - pire, zoneid, 0, tsl, flags, ipst); + match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; + if (ill != NULL) + match_flags |= MATCH_IRE_ILL; + + ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, + &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, + ipst); + + if (!lock_held) + rw_exit(&ipst->ips_ip6_ire_head_lock); + if (ire != NULL) { + ire_refrele(ire); + return (B_TRUE); + } else { + return (B_FALSE); } - return (ire); } /* @@ -1429,63 +896,121 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * required parameters and indicating the * match required in flag field. 
* - * Looking for default route can be done in three ways - * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field - * along with other matches. - * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags - * field along with other matches. - * 3) if the destination and mask are passed as zeros. - * - * A request to return a default route if no route - * is found, can be specified by setting MATCH_IRE_DEFAULT - * in flags. - * - * It does not support recursion more than one level. It - * will do recursive lookup only when the lookup maps to - * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed. - * - * If the routing table is setup to allow more than one level - * of recursion, the cleaning up cache table will not work resulting - * in invalid routing. - * * Supports link-local addresses by following the ipif/ill when recursing. - * - * NOTE : When this function returns NULL, pire has already been released. - * pire is valid only when this function successfully returns an - * ire. */ ire_t * ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags, - ip_stack_t *ipst) + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { - irb_t *irb_ptr; - ire_t *rire; ire_t *ire = NULL; - ire_t *saved_ire; - nce_t *nce; - int i; - in6_addr_t gw_addr_v6; ASSERT(addr != NULL); ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT(ipif == NULL || ipif->ipif_isv6); + ASSERT(ill == NULL || ill->ill_isv6); + + ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); /* - * When we return NULL from this function, we should make - * sure that *pire is NULL so that the callers will not - * wrongly REFRELE the pire. 
+ * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL + * is set. */ - if (pire != NULL) - *pire = NULL; + if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) + return (NULL); + + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); + ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, + tsl, flags, ipst); + if (ire == NULL) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return (NULL); + } + /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. + * round-robin only if we have more than one route in the bucket. + * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never + * + * Note: if we found an IRE_IF_CLONE we won't look at the bucket with + * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match + * and the IRE_INTERFACESs are likely to be shorter matches. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE_V6(ire))) { + ire_t *next_ire; + ire_ftable_args_t margs; + + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_addr_v6 = *addr; + if (mask != NULL) + margs.ift_mask_v6 = *mask; + if (gateway != NULL) + margs.ift_gateway_v6 = *gateway; + margs.ift_type = type; + margs.ift_ill = ill; + margs.ift_zoneid = zoneid; + margs.ift_tsl = tsl; + margs.ift_flags = flags; + + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + goto done; + } + ire_refrele(ire); + ire = next_ire; + } + } + +done: + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; + + rw_exit(&ipst->ips_ip6_ire_head_lock); + + /* + * For shared-IP zones we need additional checks to what was + * done in ire_match_args to make 
sure IRE_LOCALs are handled. + * + * When ip_restrict_interzone_loopback is set, then + * we ensure that IRE_LOCAL are only used for loopback + * between zones when the logical "Ethernet" would + * have looped them back. That is, if in the absense of + * the IRE_LOCAL we would have sent to packet out the + * same ill. + */ + if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && + ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && + ipst->ips_ip_restrict_interzone_loopback) { + ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); + ASSERT(ire != NULL); + } + + return (ire); +} + +/* + * Look up a single ire. The caller holds either the read or write lock. + */ +ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst) +{ + irb_t *irb_ptr; + ire_t *ire = NULL; + int i; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); /* * If the mask is known, the lookup @@ -1496,28 +1021,41 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, uint_t masklen; masklen = ip_mask_to_plen_v6(mask); - if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) + if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { return (NULL); + } irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ IRE_ADDR_MASK_HASH_V6(*addr, *mask, ipst->ips_ip6_ftable_hash_size)]); rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, mask, gateway, type, - ipif, zoneid, ihandle, tsl, flags)) + ill, zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } else { + uint_t masklen; + /* * In this case we don't know the mask, we need to * search the table assuming different mask sizes. - * we start with 128 bit mask, we don't allow default here. 
*/ - for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) { + if (flags & MATCH_IRE_SHORTERMASK) { + masklen = ip_mask_to_plen_v6(mask); + if (masklen == 0) { + /* Nothing shorter than zero */ + return (NULL); + } + masklen--; + } else { + masklen = IP6_MASK_TABLE_SIZE - 1; + } + + for (i = masklen; i >= 0; i--) { in6_addr_t tmpmask; if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) @@ -1529,1334 +1067,415 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, - &ire->ire_mask_v6, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) + &ire->ire_mask_v6, gateway, type, ill, + zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } } - - /* - * We come here if no route has yet been found. - * - * Handle the case where default route is - * requested by specifying type as one of the possible - * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE). - * - * If MATCH_IRE_MASK is specified, then the appropriate default route - * would have been found above if it exists so it isn't looked up here. - * If MATCH_IRE_DEFAULT was also specified, then a default route will be - * searched for later. 
- */ - if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE && - (type & (IRE_DEFAULT | IRE_INTERFACE))) { - if (ipst->ips_ip_forwarding_table_v6[0] != NULL) { - /* addr & mask is zero for defaults */ - irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][ - IRE_ADDR_HASH_V6(ipv6_all_zeros, - ipst->ips_ip6_ftable_hash_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) - goto found_ire; - } - rw_exit(&irb_ptr->irb_lock); - } - } - /* - * We come here only if no route is found. - * see if the default route can be used which is allowed - * only if the default matching criteria is specified. - * The ipv6_ire_default_count tracks the number of IRE_DEFAULT - * entries. However, the ip_forwarding_table_v6[0] also contains - * interface routes thus the count can be zero. - */ - saved_ire = NULL; - if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) == - MATCH_IRE_DEFAULT) { - ire_t *ire_origin; - uint_t g_index; - uint_t index; - - if (ipst->ips_ip_forwarding_table_v6[0] == NULL) - return (NULL); - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0]; - - /* - * Keep a tab on the bucket while looking the IRE_DEFAULT - * entries. We need to keep track of a particular IRE - * (ire_origin) so this ensures that it will not be unlinked - * from the hash list during the recursive lookup below. - */ - IRB_REFHOLD(irb_ptr); - ire = irb_ptr->irb_ire; - if (ire == NULL) { - IRB_REFRELE(irb_ptr); - return (NULL); - } - - /* - * Get the index first, since it can be changed by other - * threads. Then get to the right default route skipping - * default interface routes if any. As we hold a reference on - * the IRE bucket, ipv6_ire_default_count can only increase so - * we can't reach the end of the hash list unexpectedly. 
- */ - if (ipst->ips_ipv6_ire_default_count != 0) { - g_index = ipst->ips_ipv6_ire_default_index++; - index = g_index % ipst->ips_ipv6_ire_default_count; - while (index != 0) { - if (!(ire->ire_type & IRE_INTERFACE)) - index--; - ire = ire->ire_next; - } - ASSERT(ire != NULL); - } else { - /* - * No default route, so we only have default interface - * routes: don't enter the first loop. - */ - ire = NULL; - } - - /* - * Round-robin the default routers list looking for a neighbor - * that matches the passed in parameters and is reachable. If - * none found, just return a route from the default router list - * if it exists. If we can't find a default route (IRE_DEFAULT), - * look for interface default routes. - * We start with the ire we found above and we walk the hash - * list until we're back where we started, see - * ire_get_next_default_ire(). It doesn't matter if default - * routes are added or deleted by other threads - we know this - * ire will stay in the list because we hold a reference on the - * ire bucket. - * NB: if we only have interface default routes, ire is NULL so - * we don't even enter this loop (see above). - */ - ire_origin = ire; - for (; ire != NULL; - ire = ire_get_next_default_ire(ire, ire_origin)) { - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - int match_flags; - - /* - * We have something to work with. - * If we can find a resolved/reachable - * entry, we will use this. Otherwise - * we'll try to find an entry that has - * a resolved cache entry. We will fallback - * on this if we don't find anything else. 
- */ - if (saved_ire == NULL) - saved_ire = ire; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, - 0, ire->ire_ipif, zoneid, tsl, match_flags, - ipst); - if (rire != NULL) { - nce = rire->ire_nce; - if (nce != NULL && - NCE_ISREACHABLE(nce) && - nce->nce_flags & NCE_F_ISROUTER) { - ire_refrele(rire); - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else if (nce != NULL && - !(nce->nce_flags & - NCE_F_ISROUTER)) { - /* - * Make sure we don't use - * this ire - */ - if (saved_ire == ire) - saved_ire = NULL; - } - ire_refrele(rire); - } else if (ipst-> - ips_ipv6_ire_default_count > 1 && - zoneid != GLOBAL_ZONEID) { - /* - * When we're in a local zone, we're - * only interested in default routers - * that are reachable through ipifs - * within our zone. - * The potentially expensive call to - * ire_route_lookup_v6() is avoided when - * we have only one default route. - */ - int ire_match_flags = MATCH_IRE_TYPE | - MATCH_IRE_SECATTR; - - if (ire->ire_ipif != NULL) { - ire_match_flags |= - MATCH_IRE_ILL; - } - rire = ire_route_lookup_v6(&gw_addr_v6, - NULL, NULL, IRE_INTERFACE, - ire->ire_ipif, NULL, - zoneid, tsl, ire_match_flags, ipst); - if (rire != NULL) { - ire_refrele(rire); - saved_ire = ire; - } else if (saved_ire == ire) { - /* - * Make sure we don't use - * this ire - */ - saved_ire = NULL; - } - } - } - } - if (saved_ire != NULL) { - ire = saved_ire; - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else { - /* - * Look for a interface default route matching the - * args passed in. No round robin here. Just pick - * the right one. 
- */ - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (!(ire->ire_type & IRE_INTERFACE)) - continue; - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } - } - IRB_REFRELE(irb_ptr); - } - } ASSERT(ire == NULL); ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); return (NULL); + found_ire: - ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0); - IRE_REFHOLD(ire); + ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); - -found_ire_held: - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); - } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * or IRE_CACHETABLE type. If this is a recursive lookup and an - * IRE_INTERFACE type was found, return that. If it was some other - * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it - * is necessary to fill in the parent IRE pointed to by pire, and - * then lookup the gateway address of the parent. For backwards - * compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - if (flags & MATCH_IRE_RECURSIVE) { - const ipif_t *gw_ipif; - int match_flags = MATCH_IRE_DSTONLY; - - if (ire->ire_type & IRE_INTERFACE) - return (ire); - if (pire != NULL) - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the saved_ire. 
- */ - saved_ire = ire; - - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0, - ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). - */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. - */ - if (pire == NULL) { - ire_refrele(saved_ire); - } - return (ire); - } - match_flags |= MATCH_IRE_TYPE; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, - (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid, - NULL, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). 
- */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } else if (pire == NULL) { - /* - * If the caller did not ask for pire, release - * it now. - */ - ire_refrele(saved_ire); - } - return (ire); - } - - ASSERT(pire == NULL || *pire == NULL); return (ire); } -/* - * Delete the IRE cache for the gateway and all IRE caches whose - * ire_gateway_addr_v6 points to this gateway, and allow them to - * be created on demand by ip_newroute_v6. - */ -void -ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid, - ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0, - IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) { - ire_delete(ire); - } - } - IRB_REFRELE(irb); - - ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst); -} - -/* - * Looks up cache table for a route. - * specific lookup can be indicated by - * passing the MATCH_* flags and the - * necessary parameters. 
- */ -ire_t * -ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, - int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl, - int flags, ip_stack_t *ipst) -{ - ire_ctable_args_t margs; - - margs.ict_addr = (void *)addr; - margs.ict_gateway = (void *)gateway; - margs.ict_type = type; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = tsl; - margs.ict_flags = flags; - margs.ict_ipst = ipst; - margs.ict_wq = NULL; - - return (ip6_ctable_lookup_impl(&margs)); -} /* - * Lookup cache. + * This function is called by + * ip_input/ire_route_recursive when doing a route lookup on only the + * destination address. * - * In general the zoneid has to match (where ALL_ZONES match all of them). - * But for IRE_LOCAL we also need to handle the case where L2 should - * conceptually loop back the packet. This is necessary since neither - * Ethernet drivers nor Ethernet hardware loops back packets sent to their - * own MAC address. This loopback is needed when the normal - * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill as the ill with which this IRE_LOCAL is associated. + * The optimizations of this function over ire_ftable_lookup are: + * o removing unnecessary flag matching + * o doing longest prefix match instead of overloading it further + * with the unnecessary "best_prefix_match" * - * Earlier versions of this code always matched an IRE_LOCAL independently of - * the zoneid. We preserve that earlier behavior when - * ip_restrict_interzone_loopback is turned off. + * If no route is found we return IRE_NOROUTE. 
*/ ire_t * -ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - irb_t *irb_ptr; - ire_t *ire; - - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { - /* - * Finally, check if the security policy has any - * restriction on using this route for the specified - * message. - */ - if (tsl != NULL && - ire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(ire, tsl) != 0) { - continue; - } - - if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || - ire->ire_zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - - if (ire->ire_type == IRE_LOCAL) { - if (ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, zoneid, - (void *)addr, tsl, ipst)) - continue; - - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - } - rw_exit(&irb_ptr->irb_lock); - return (NULL); -} - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an onlink destn. or - * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER - * case for xresolv interfaces, after the ire has come back from - * an external resolver. - */ -static ire_t * -ire_ihandle_lookup_onlink_v6(ire_t *cire) +ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, + ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; - int match_flags; - int i; - int j; - irb_t *irb_ptr; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. 
- * (When ip_newroute_v6() created 'cire' for an on-link destn. - * it set its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, - NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * In the resolver/noresolver case, ip_newroute_v6() thinks - * it is creating the cache ire for an onlink destination in 'cire'. - * But 'cire' is not actually onlink, because ire_ftable_lookup_v6() - * cheated it, by doing ire_route_lookup_v6() twice and returning an - * interface ire. - * - * Eg. default - gw1 (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * In the above example, ip_newroute_v6() tried to create the cache ire - * 'cire' for gw1, based on the interface route in line 3. The - * ire_ftable_lookup_v6() above fails, because there is - * no interface route to reach gw1. (it is gw2). We fall thru below. - * - * Do a brute force search based on the ihandle in a subset of the - * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise - * things become very complex, since we don't have 'pire' in this - * case. 
(Also note that this method is not possible in the offlink - * case because we don't know the mask) - */ - i = ip_mask_to_plen_v6(&cire->ire_cmask_v6); - if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) - return (NULL); - for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { - irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_ihandle == cire->ire_ihandle)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - rw_exit(&irb_ptr->irb_lock); + ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, + MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); + if (ire == NULL) { + ire = ire_reject(ipst, B_TRUE); + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; } - return (NULL); + /* ftable_lookup did round robin */ + return (ire); } - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an offlink destn based - * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire - * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in - * the IRE_CACHE case. - */ ire_t * -ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) +ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, + uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) { - ire_t *ire; - int match_flags; - in6_addr_t gw_addr; - ipif_t *gw_ipif; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL && pire != NULL); + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. 
- * (When ip_newroute_v6() created 'cire' for an on-link destn. it set - * its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0, - IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * Assume we are trying to ping some offlink destn, and we have the - * routing table below. - * - * Eg. default - gw1 <--- pire (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * If we already have a cache ire for gw1 in 'cire', the - * ire_ftable_lookup_v6 above would have failed, since there is no - * interface ire to reach gw1. We will fallthru below. - * - * Here we duplicate the steps that ire_ftable_lookup_v6() did in - * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. - * The differences are the following - * i. We want the interface ire only, so we call - * ire_ftable_lookup_v6() instead of ire_route_lookup_v6() - * ii. We look for only prefix routes in the 1st call below. - * ii. We want to match on the ihandle in the 2nd call below. - */ - match_flags = MATCH_IRE_TYPE; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&pire->ire_lock); - gw_addr = pire->ire_gateway_addr_v6; - mutex_exit(&pire->ire_lock); - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET, - pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (ire == NULL) - return (NULL); - /* - * At this point 'ire' corresponds to the entry shown in line 2. - * gw_addr is 'gw2' in the example above. 
- */ - mutex_enter(&ire->ire_lock); - gw_addr = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - - match_flags |= MATCH_IRE_IHANDLE; - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE, - gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - return (ire); + return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, + multirtp)); } /* - * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER - * ire associated with the specified ipif. + * Recursively look for a route to the destination. Can also match on + * the zoneid, ill, and label. Used for the data paths. See also + * ire_route_recursive_dstonly. * - * This might occasionally be called when IPIF_UP is not set since - * the IPV6_MULTICAST_IF as well as creating interface routes - * allows specifying a down ipif (ipif_lookup* match ipifs that are down). + * If ill is set this means we will match it by adding MATCH_IRE_ILL. * - * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on - * the ipif this routine might return NULL. - * (Sometimes called as writer though not required by this function.) + * If allocate is not set then we will only inspect the existing IREs; never + * create an IRE_IF_CLONE. This is used on the receive side when we are not + * forwarding. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
*/ ire_t * -ipif_to_ire_v6(const ipif_t *ipif) +ire_route_recursive_impl_v6(ire_t *ire, + const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + int i, j; + in6_addr_t v6nexthop = *nexthop; + ire_t *ires[MAX_IRE_RECURSION]; + uint_t generation; + uint_t generations[MAX_IRE_RECURSION]; + boolean_t need_refrele = B_FALSE; + boolean_t invalidate = B_FALSE; + int prefs[MAX_IRE_RECURSION]; + ill_t *ill = NULL; + + if (setsrcp != NULL) + ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); + if (gwattrp != NULL) + ASSERT(*gwattrp == NULL); + + if (ill_arg != NULL) + match_args |= MATCH_IRE_ILL; /* - * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN - * so that they aren't accidentally returned. However, if the - * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + * We iterate up to three times to resolve a route, even though + * we have four slots in the array. The extra slot is for an + * IRE_IF_CLONE we might need to create. */ - if (IS_UNDER_IPMP(ipif->ipif_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - ASSERT(ipif->ipif_isv6); - if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); - } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { - /* In this case we need to lookup destination address. 
*/ - ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, - &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } else { - ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, - &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } - return (ire); -} - -/* - * Return B_TRUE if a multirt route is resolvable - * (or if no route is resolved yet), B_FALSE otherwise. - * This only works in the global zone. - */ -boolean_t -ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, - ip_stack_t *ipst) -{ - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb; - irb_t *cirb; - int unres_cnt = 0; - boolean_t resolvable = B_FALSE; - - /* Retrieve the first IRE_HOST that matches the destination */ - first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST, - NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No route at all */ - if (first_fire == NULL) { - return (B_TRUE); - } - - firb = first_fire->ire_bucket; - ASSERT(firb); - - /* Retrieve the first IRE_CACHE ire for that destination. */ - first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst); - - /* No resolved route. */ - if (first_cire == NULL) { - ire_refrele(first_fire); - return (B_TRUE); - } - - /* At least one route is resolved. */ - - cirb = first_cire->ire_bucket; - ASSERT(cirb); - - /* Count the number of routes to that dest that are declared. 
*/ - IRB_REFHOLD(firb); - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp)) - continue; - unres_cnt++; - } - IRB_REFRELE(firb); - - - /* Then subtract the number of routes to that dst that are resolved */ - IRB_REFHOLD(cirb); - for (cire = first_cire; cire != NULL; cire = cire->ire_next) { - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) - continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - unres_cnt--; - } - IRB_REFRELE(cirb); - - /* At least one route is unresolved; search for a resolvable route. */ - if (unres_cnt > 0) - resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire, - MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst); - - if (first_fire) - ire_refrele(first_fire); - - if (first_cire) - ire_refrele(first_cire); - - return (resolvable); -} - - -/* - * Return B_TRUE and update *ire_arg and *fire_arg - * if at least one resolvable route is found. - * Return B_FALSE otherwise (all routes are resolved or - * the remaining unresolved routes are all unresolvable). - * This only works in the global zone. - */ -boolean_t -ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - clock_t delta; - ire_t *best_fire = NULL; - ire_t *best_cire = NULL; - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb = NULL; - irb_t *cirb = NULL; - ire_t *gw_ire; - boolean_t already_resolved; - boolean_t res; - in6_addr_t v6dst; - in6_addr_t v6gw; - - ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, " - "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags)); - - ASSERT(ire_arg); - ASSERT(fire_arg); - - /* Not an IRE_HOST ire; give up. 
*/ - if ((*fire_arg == NULL) || - ((*fire_arg)->ire_type != IRE_HOST)) { - return (B_FALSE); - } + i = 0; + while (i < MAX_IRE_RECURSION - 1) { + /* ire_ftable_lookup handles round-robin/ECMP */ + if (ire == NULL) { + ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, + (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + match_args, xmit_hint, ipst, &generation); + } else { + /* Caller passed it; extra hold since we will rele */ + ire_refhold(ire); + if (generationp != NULL) + generation = *generationp; + else + generation = IRE_GENERATION_VERIFY; + } - /* This is the first IRE_HOST ire for that destination. */ - first_fire = *fire_arg; - firb = first_fire->ire_bucket; - ASSERT(firb); + if (ire == NULL) + ire = ire_reject(ipst, B_TRUE); - mutex_enter(&first_fire->ire_lock); - v6dst = first_fire->ire_addr_v6; - mutex_exit(&first_fire->ire_lock); + /* Need to return the ire with RTF_REJECT|BLACKHOLE */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + goto error; - ip2dbg(("ire_multirt_lookup_v6: dst %08x\n", - ntohl(V4_PART_OF_V6(v6dst)))); + ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ - /* - * Retrieve the first IRE_CACHE ire for that destination; - * if we don't find one, no route for that dest is - * resolved yet. - */ - first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst); - if (first_cire) { - cirb = first_cire->ire_bucket; - } - - ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire)); + prefs[i] = ire_pref(ire); + if (i != 0) { + /* + * Don't allow anything unusual past the first + * iteration. 
+ */ + if ((ire->ire_type & + (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || + prefs[i] <= prefs[i-1]) { + ire_refrele(ire); + ire = ire_reject(ipst, B_TRUE); + goto error; + } + } + /* We have a usable IRE */ + ires[i] = ire; + generations[i] = generation; + i++; + + /* The first RTF_SETSRC address is passed back if setsrcp */ + if ((ire->ire_flags & RTF_SETSRC) && + setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *setsrcp = ire->ire_setsrc_addr_v6; + } - /* - * Search for a resolvable route, giving the top priority - * to routes that can be resolved without any call to the resolver. - */ - IRB_REFHOLD(firb); + /* The first ire_gw_secattr is passed back if gwattrp */ + if (ire->ire_gw_secattr != NULL && + gwattrp != NULL && *gwattrp == NULL) + *gwattrp = ire->ire_gw_secattr; - if (!IN6_IS_ADDR_MULTICAST(&v6dst)) { /* - * For all multiroute IRE_HOST ires for that destination, - * check if the route via the IRE_HOST's gateway is - * resolved yet. + * Check if we have a short-cut pointer to an IRE for this + * destination, and that the cached dependency isn't stale. + * In that case we've rejoined an existing tree towards a + * parent, thus we don't need to continue the loop to + * discover the rest of the tree. 
*/ - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - ip2dbg(("ire_multirt_lookup_v6: fire %p, " - "ire_addr %08x, ire_gateway_addr %08x\n", - (void *)fire, - ntohl(V4_PART_OF_V6(fire->ire_addr_v6)), - ntohl(V4_PART_OF_V6(v6gw)))); + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + ire = NULL; + goto done; + } + mutex_exit(&ire->ire_lock); - already_resolved = B_FALSE; + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) { + ire = NULL; + goto done; + } - if (first_cire) { - ASSERT(cirb); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + /* + * For an IRE_INTERFACE we create an IRE_IF_CLONE for this + * particular destination + */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *clone; - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Check if the IRE_CACHE's gateway - * matches the IRE_HOST's gateway. 
- */ - if (IN6_ARE_ADDR_EQUAL( - &cire->ire_gateway_addr_v6, - &v6gw)) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } + ASSERT(ire->ire_masklen != IPV6_ABITS); /* - * This route is already resolved; - * proceed with next one. + * In the case of ip_input and ILLF_FORWARDING not + * being set, and in the case of RTM_GET, + * there is no point in allocating + * an IRE_IF_CLONE. We return the IRE_INTERFACE. + * Note that !allocate can result in a ire_dep_parent + * which is IRE_IF_* without an IRE_IF_CLONE. + * We recover from that when we need to send packets + * by ensuring that the generations become + * IRE_GENERATION_VERIFY in this case. */ - if (already_resolved) { - ip2dbg(("ire_multirt_lookup_v6: found cire %p, " - "already resolved\n", (void *)cire)); - continue; + if (!allocate) { + invalidate = B_TRUE; + ire = NULL; + goto done; } - /* - * The route is unresolved; is it actually - * resolvable, i.e. is there a cache or a resolver - * for the gateway? - */ - gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL, - ALL_ZONES, tsl, MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR, ipst); - - ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n", - (void *)gw_ire)); - - /* - * This route can be resolved without any call to the - * resolver; if the MULTIRT_CACHEGW flag is set, - * give the top priority to this ire and exit the - * loop. - * This occurs when an resolver reply is processed - * through ip_wput_nondata() - */ - if ((flags & MULTIRT_CACHEGW) && - (gw_ire != NULL) && - (gw_ire->ire_type & IRE_CACHETABLE)) { + clone = ire_create_if_clone(ire, &v6nexthop, + &generation); + if (clone == NULL) { /* - * Release the resolver associated to the - * previous candidate best ire, if any. + * Temporary failure - no memory. + * Don't want caller to cache IRE_NOROUTE. 
*/ - if (best_cire) { - ire_refrele(best_cire); - ASSERT(best_fire); - } - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6: found top prio " - "best_fire %p, best_cire %p\n", - (void *)best_fire, (void *)best_cire)); - break; + invalidate = B_TRUE; + ire = ire_blackhole(ipst, B_TRUE); + goto error; } - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take that - * route into account only if this time interval - * exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. + * Make clone next to last entry and the + * IRE_INTERFACE the last in the dependency + * chain since the clone depends on the + * IRE_INTERFACE. */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); + ASSERT(i >= 1); + ASSERT(i < MAX_IRE_RECURSION); - ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, " - "res %d\n", - (void *)fire, delta, res)); - - if (res) { - /* - * A resolver exists for the gateway: save - * the current IRE_HOST ire as a candidate - * best ire. If we later discover that a - * top priority ire exists (i.e. no need to - * call the resolver), then this new ire - * will be preferred to the current one. - */ - if (gw_ire != NULL) { - if (best_fire == NULL) { - ASSERT(best_cire == NULL); - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6:" - "found candidate " - "best_fire %p, " - "best_cire %p\n", - (void *)best_fire, - (void *)best_cire)); - - /* - * If MULTIRT_CACHEGW is not - * set, we ignore the top - * priority ires that can - * be resolved without any - * call to the resolver; - * In that case, there is - * actually no need - * to continue the loop. 
- */ - if (!(flags & - MULTIRT_CACHEGW)) { - break; - } - continue; - } - } else { - /* - * No resolver for the gateway: the - * route is not resolvable. - * If the MULTIRT_SETSTAMP flag is - * set, we stamp the IRE_HOST ire, - * so we will not select it again - * during this resolution interval. - */ - if (flags & MULTIRT_SETSTAMP) - fire->ire_last_used_time = - lbolt; - } - } + ires[i] = ires[i-1]; + generations[i] = generations[i-1]; + ires[i-1] = clone; + generations[i-1] = generation; + i++; - if (gw_ire != NULL) - ire_refrele(gw_ire); + ire = NULL; + goto done; } - } else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */ - for (fire = first_fire; - fire != NULL; - fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - already_resolved = B_FALSE; - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No resolver for the gateway; we skip this ire. */ - if (gw_ire == NULL) { - continue; - } + /* + * We only match on the type and optionally ILL when + * recursing. The type match is used by some callers + * to exclude certain types (such as IRE_IF_CLONE or + * IRE_LOCAL|IRE_LOOPBACK). + */ + match_args &= MATCH_IRE_TYPE; + v6nexthop = ire->ire_gateway_addr_v6; + if (ill == NULL && ire->ire_ill != NULL) { + ill = ire->ire_ill; + need_refrele = B_TRUE; + ill_refhold(ill); + match_args |= MATCH_IRE_ILL; + } - if (first_cire) { + ire = NULL; + } + ASSERT(ire == NULL); + ire = ire_reject(ipst, B_TRUE); - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. 
- */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - IRE_MARK_CONDEMNED) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Cache entries are linked to the - * parent routes using the parent handle - * (ire_phandle). If no cache entry has - * the same handle as fire, fire is - * still unresolved. - */ - ASSERT(cire->ire_phandle != 0); - if (cire->ire_phandle == - fire->ire_phandle) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } +error: + ASSERT(ire != NULL); + if (need_refrele) + ill_refrele(ill); - /* - * This route is already resolved; proceed with - * next one. - */ - if (already_resolved) { - ire_refrele(gw_ire); - continue; - } + /* + * In the case of MULTIRT we want to try a different IRE the next + * time. We let the next packet retry in that case. + */ + if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) + (void) ire_no_good(ires[0]); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take - * that route into account only if this time - * interval exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. - */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, " - "flags %04x, res %d\n", - (void *)fire, delta, flags, res)); - - if (res) { - if (best_cire) { - /* - * Release the resolver associated - * to the preceding candidate best - * ire, if any. 
- */ - ire_refrele(best_cire); - ASSERT(best_fire); - } - best_fire = fire; - best_cire = gw_ire; - continue; - } +cleanup: + /* cleanup ires[i] */ + ire_dep_unbuild(ires, i); + for (j = 0; j < i; j++) + ire_refrele(ires[j]); - ire_refrele(gw_ire); - } - } + ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); + /* + * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the + * ip_select_route since the reject or lack of memory might be gone. + */ + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; + return (ire); - if (best_fire) { - IRE_REFHOLD(best_fire); +done: + ASSERT(ire == NULL); + if (need_refrele) + ill_refrele(ill); + + /* Build dependencies */ + if (!ire_dep_build(ires, generations, i)) { + /* Something in chain was condemned; tear it apart */ + ire = ire_blackhole(ipst, B_TRUE); + goto cleanup; } - IRB_REFRELE(firb); - /* Release the first IRE_CACHE we initially looked up, if any. */ - if (first_cire) - ire_refrele(first_cire); - - /* Found a resolvable route. */ - if (best_fire) { - ASSERT(best_cire); - - if (*fire_arg) - ire_refrele(*fire_arg); - if (*ire_arg) - ire_refrele(*ire_arg); + /* + * Release all refholds except the one for ires[0] that we + * will return to the caller. + */ + for (j = 1; j < i; j++) + ire_refrele(ires[j]); + if (invalidate) { /* - * Update the passed arguments with the - * resolvable multirt route we found + * Since we needed to allocate but couldn't we need to make + * sure that the dependency chain is rebuilt the next time. */ - *fire_arg = best_fire; - *ire_arg = best_cire; - - ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, " - "*fire_arg %p, *ire_arg %p\n", - (void *)best_fire, (void *)best_cire)); - - return (B_TRUE); + ire_dep_invalidate_generations(ires[0]); + generation = IRE_GENERATION_VERIFY; + } else { + /* + * IREs can have been added or deleted while we did the + * recursive lookup and we can't catch those until we've built + * the dependencies. 
We verify the stored + * ire_dep_parent_generation to catch any such changes and + * return IRE_GENERATION_VERIFY (which will cause + * ip_select_route to be called again so we can redo the + * recursive lookup next time we send a packet. + */ + generation = ire_dep_validate_generations(ires[0]); + if (generations[0] != ires[0]->ire_generation) { + /* Something changed at the top */ + generation = IRE_GENERATION_VERIFY; + } } + if (generationp != NULL) + *generationp = generation; - ASSERT(best_cire == NULL); - - ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, " - "*ire_arg %p\n", - (void *)*fire_arg, (void *)*ire_arg)); - - /* No resolvable route. */ - return (B_FALSE); + return (ires[0]); } - -/* - * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp' - * that goes through 'ipif'. As a fallback, a route that goes through - * ipif->ipif_ill can be returned. - */ ire_t * -ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp) +ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ire_t *save_ire = NULL; - ire_t *gw_ire; - irb_t *irb; - in6_addr_t v6gw; - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_DEFAULT, ipst); - - if (ire == NULL) - return (NULL); - - irb = ire->ire_bucket; - ASSERT(irb); - - IRB_REFHOLD(irb); - ire_refrele(ire); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) || - (ipif->ipif_zoneid != ire->ire_zoneid && - ire->ire_zoneid != ALL_ZONES)) { - continue; - } - - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - 
mutex_enter(&ire->ire_lock); - v6gw = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, match_flags, ipst); - - if (gw_ire != NULL) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - if (gw_ire->ire_ipif == ipif) { - ire_refrele(gw_ire); - - IRB_REFRELE(irb); - return (ire); - } - ire_refrele(gw_ire); - save_ire = ire; - } - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - if (ire->ire_ipif == ipif) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - - IRB_REFRELE(irb); - return (ire); - } - break; - } - } - IRB_REFRELE(irb); - - return (save_ire); + return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, + zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, + gwattrp, generationp)); } /* - * This is the implementation of the IPv6 IRE cache lookup procedure. - * Separating the interface from the implementation allows additional - * flexibility when specifying search criteria. + * Recursively look for a route to the destination. + * We only handle a destination match here, yet we have the same arguments + * as the full match to allow function pointers to select between the two. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
*/ -static ire_t * -ip6_ctable_lookup_impl(ire_ctable_args_t *margs) +ire_t * +ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate, + uint32_t xmit_hint, ip_stack_t *ipst) { - irb_t *irb_ptr; - ire_t *ire; - ip_stack_t *ipst = margs->ict_ipst; + ire_t *ire; + ire_t *ire1; + uint_t generation; - if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && - (margs->ict_ipif == NULL)) { - return (NULL); - } + /* ire_ftable_lookup handles round-robin/ECMP */ + ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, + &generation); + ASSERT(ire != NULL); - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - *((in6_addr_t *)(margs->ict_addr)), - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, (in6_addr_t *)margs->ict_addr, - &ire->ire_mask_v6, (in6_addr_t *)margs->ict_gateway, - margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, - margs->ict_tsl, margs->ict_flags)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) + return (ire); + + /* + * If the IRE has a current cached parent we know that the whole + * parent chain is current, hence we don't need to discover and + * build any dependencies by doing a recursive lookup. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + return (ire); } + mutex_exit(&ire->ire_lock); - rw_exit(&irb_ptr->irb_lock); - return (NULL); + /* + * Fallback to loop in the normal code starting with the ire + * we found. 
Normally this would return the same ire. + */ + ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, + &generation); + ire_refrele(ire); + return (ire1); } diff --git a/usr/src/uts/common/inet/ip/ip6_output.c b/usr/src/uts/common/inet/ip/ip6_output.c new file mode 100644 index 0000000000..3e06050781 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip6_output.c @@ -0,0 +1,1315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +int +ip_output_simple_v6(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ip6_t *ip6h; + in6_addr_t firsthop; /* In IP header */ + in6_addr_t dst; /* End of source route, or ip6_dst if none */ + ire_t *ire; + in6_addr_t setsrc; + int error; + ill_t *ill = NULL; + dce_t *dce = NULL; + nce_t *nce; + 
iaflags_t ixaflags = ixa->ixa_flags; + ip_stack_t *ipst = ixa->ixa_ipst; + uint8_t *nexthdrp; + boolean_t repeat = B_FALSE; + boolean_t multirt = B_FALSE; + uint_t ifindex; + + ip6h = (ip6_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + + ASSERT(ixa->ixa_nce == NULL); + + ixa->ixa_pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + ASSERT(ixa->ixa_pktlen == msgdsize(mp)); + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ixa->ixa_ip_hdr_length, + &nexthdrp)) { + /* Malformed packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); + return (EINVAL); + } + ixa->ixa_protocol = *nexthdrp; + + /* + * Assumes that source routed packets have already been massaged by + * the ULP (ip_massage_options_v6) and as a result ip6_dst is the next + * hop in the source route. The final destination is used for IPsec + * policy and DCE lookup. + */ + firsthop = ip6h->ip6_dst; + dst = ip_get_dst_v6(ip6h, mp, NULL); + +repeat_ire: + error = 0; + setsrc = ipv6_all_zeros; + ire = ip_select_route_v6(&firsthop, ixa, NULL, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); + goto done; + } + + if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { + /* ire_ill might be NULL hence need to skip some code */ + if (ixaflags & IXAF_SET_SOURCE) + ip6h->ip6_src = ipv6_loopback; + ixa->ixa_fragsize = IP_MAXPACKET; + ire->ire_ob_pkt_count++; + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + /* No dce yet; use default one */ + error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, + &ipst->ips_dce_default->dce_ident); + goto done; + } + + /* Note that ip6_dst is only used for IRE_MULTICAST */ + nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); + 
if (nce == NULL) { + /* Allocation failure? */ + ip_drop_output("ire_to_nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, NULL, ip6h, B_TRUE); + nce_refrele(nce); + if (nce1 == NULL) { + if (!repeat) { + /* Try finding a better IRE */ + repeat = B_TRUE; + ire_refrele(ire); + goto repeat_ire; + } + /* Tried twice - drop packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + nce = nce1; + } + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v6 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + ASSERT(ixa->ixa_nce == NULL); + ixa->ixa_nce = nce; + + /* + * Check for a dce_t with a path mtu. + */ + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&dst)) + ifindex = nce->nce_common->ncec_ill->ill_phyint->phyint_ifindex; + + dce = dce_lookup_v6(&dst, ifindex, ipst, NULL); + ASSERT(dce != NULL); + + if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { + ixa->ixa_fragsize = IPV6_MIN_MTU; + } else if (dce->dce_flags & DCEF_PMTU) { + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { + /* + * Older than 20 minutes. Drop the path MTU information. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else { + uint_t fragsize; + + fragsize = ip_get_base_mtu(nce->nce_ill, ire); + if (fragsize > dce->dce_pmtu) + fragsize = dce->dce_pmtu; + ixa->ixa_fragsize = fragsize; + } + } else { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } + + /* + * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp + * interface for source address selection. + */ + ill = ire_nexthop_ill(ire); + + if (ixaflags & IXAF_SET_SOURCE) { + in6_addr_t src; + + /* + * We use the final destination to get + * correct selection for source routed packets + */ + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src = ipv6_loopback; + error = 0; + } else { + error = ip_select_source_v6(ill, &setsrc, &dst, + ixa->ixa_zoneid, ipst, B_FALSE, + ixa->ixa_src_preferences, &src, NULL, NULL); + } + if (error != 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no source", + mp, ill); + freemsg(mp); + goto done; + } + ip6h->ip6_src = src; + } else if (ixaflags & IXAF_VERIFY_SOURCE) { + /* Check if the IP source is assigned to the host. */ + if (!ip_verify_src(mp, ixa, NULL)) { + /* Don't send a packet with a source that isn't ours */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - invalid source", + mp, ill); + freemsg(mp); + error = EADDRNOTAVAIL; + goto done; + } + } + + /* + * Check against global IPsec policy to set the AH/ESP attributes. + * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 
+ */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, NULL, ip6h, NULL, ixa); + if (mp == NULL) { + /* MIB and ip_drop_packet already done */ + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } + + if (ill != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + } + + /* + * We update the statistics on the most specific IRE i.e., the first + * one we found. + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. + */ + ire->ire_ob_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v6 - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v6 - if RTF_MULTIRT + * ire_send_noroute_v6 - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v6 - for IRE_MULTICAST + * ire_send_wire_v6 - for the rest. + */ + error = (ire->ire_sendfn)(ire, mp, ip6h, ixa, &dce->dce_ident); +done: + ire_refrele(ire); + if (dce != NULL) + dce_refrele(dce); + if (ill != NULL) + ill_refrele(ill); + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = NULL; + return (error); +} + +/* + * ire_sendfn() functions. + * These functions use the following xmit_attr: + * - ixa_fragsize - read to determine whether or not to fragment + * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec + * - ixa_ipsec_* are used inside IPsec + * - IXAF_LOOPBACK_COPY - for multicast + */ + + +/* + * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK + * + * The checks for restrict_interzone_loopback are done in ire_route_recursive. 
+ */ +/* ARGSUSED4 */ +int +ire_send_local_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + uint_t pktlen = ixa->ixa_pktlen; + + /* + * No fragmentation, no nce, and no application of IPsec. + * + * + * Note different order between IP provider and FW_HOOKS than in + * send_wire case. + */ + + /* + * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the + * send probe, but not the receive probe. + */ + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 1); + + DTRACE_PROBE4(ip6__loopback__out__start, + ill_t *, NULL, ill_t *, ill, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_LOOPBACK_OUT(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip6_loopback_out_event, + ipst->ips_ipv6firewall_loopback_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ip6h = (ip6_t *)mp->b_rptr; + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); + } + + /* Handle lo0 stats */ + ipst->ips_loopback_packets++; + + /* + * Update output mib stats. Note that we can't move into the icmp + * sender (icmp_output etc) since they don't know the ill and the + * stats are per ill. + */ + if (ixa->ixa_protocol == IPPROTO_ICMPV6) { + icmp6_t *icmp6; + + icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); + icmp_update_out_mib_v6(ill, icmp6); + } + + DTRACE_PROBE4(ip6__loopback__in__start, + ill_t *, ill, ill_t *, NULL, + ip6_t *, ip6h, mblk_t *, mp); + + if (HOOKS6_INTERESTED_LOOPBACK_IN(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip6_loopback_in_event, + ipst->ips_ipv6firewall_loopback_in, + ill, NULL, ip6h, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. 
+ */ + /* Length could be different */ + ip6h = (ip6_t *)mp->b_rptr; + pktlen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, ip6h, + int, 1); + + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + iras.ira_pktlen = pktlen; + + ire->ire_ib_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); + + /* Destined to ire_zoneid - use that for fanout */ + iras.ira_zoneid = ire->ire_zoneid; + + if (is_system_labeled()) { + iras.ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. We don't expect this to ever fail for + * loopback packets, so we silently drop the packet should it + * fail. + */ + if (!tsol_get_pkt_label(mp, IPV6_VERSION, &iras)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("tsol_get_pkt_label", mp, ill); + freemsg(mp); + return (0); + } + ASSERT(iras.ira_tsl != NULL); + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ip6h = (ip6_t *)mp->b_rptr; + } + + ip_fanout_v6(mp, ip6h, &iras); + + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (0); +} + +static void +multirt_check_v6(ire_t *ire, ip6_t *ip6h, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* Limit the TTL on multirt packets. Do this even if IPV6_HOPLIMIT */ + if (ire->ire_type & IRE_MULTICAST) { + if (ip6h->ip6_hops > 1) { + ip2dbg(("ire_send_multirt_v6: forcing multicast " + "multirt TTL to 1 (was %d)\n", ip6h->ip6_hops)); + ip6h->ip6_hops = 1; + } + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } else if ((ipst->ips_ip_multirt_ttl > 0) && + (ip6h->ip6_hops > ipst->ips_ip_multirt_ttl)) { + ip6h->ip6_hops = ipst->ips_ip_multirt_ttl; + /* + * Need to ensure we don't increase the ttl should we go through + * ire_send_multicast. 
+ */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } + + /* For IPv6 this also needs to insert a fragment header */ + ixa->ixa_flags |= IXAF_IPV6_ADD_FRAGHDR; +} + +/* + * ire_sendfn for IRE_MULTICAST + * + * Note that we do path MTU discovery by default for IPv6 multicast. But + * since unconnected UDP and RAW sockets don't set IXAF_PMTU_DISCOVERY + * only connected sockets get this by default. + */ +int +ire_send_multicast_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + iaflags_t ixaflags = ixa->ixa_flags; + + /* + * The IRE_MULTICAST is the same whether or not multirt is in use. + * Hence we need special-case code. + */ + if (ixaflags & IXAF_MULTIRT_MULTICAST) + multirt_check_v6(ire, ip6h, ixa); + + /* + * Check if anything in ip_input_v6 wants a copy of the transmitted + * packet (after IPsec and fragmentation) + * + * 1. Multicast routers always need a copy unless SO_DONTROUTE is set + * RSVP and the rsvp daemon is an example of a + * protocol and user level process that + * handles it's own routing. Hence, it uses the + * SO_DONTROUTE option to accomplish this. + * 2. If the sender has set IP_MULTICAST_LOOP, then we just + * check whether there are any receivers for the group on the ill + * (ignoring the zoneid). + * 3. If IP_MULTICAST_LOOP is not set, then we check if there are + * any members in other shared-IP zones. + * If such members exist, then we indicate that the sending zone + * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP + * behavior. + * + * When we loopback we skip hardware checksum to make sure loopback + * copy is checksumed. + * + * Note that ire_ill is the upper in the case of IPMP. 
+ */ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); + if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && + !(ixaflags & IXAF_DONTROUTE)) { + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ixaflags & IXAF_MULTICAST_LOOP) { + /* + * If this zone or any other zone has members then loopback + * a copy. + */ + if (ill_hasmembers_v6(ill, &ip6h->ip6_dst)) + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ipst->ips_netstack->netstack_numzones > 1) { + /* + * This zone should not have a copy. But there are some other + * zones which might have members. + */ + if (ill_hasmembers_otherzones_v6(ill, &ip6h->ip6_dst, + ixa->ixa_zoneid)) { + ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; + ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } + } + + /* + * Unless IPV6_HOPLIMIT or ire_send_multirt_v6 already set a ttl, + * force the ttl to the IP_MULTICAST_TTL value + */ + if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { + ip6h->ip6_hops = ixa->ixa_multicast_ttl; + } + + return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_MULTIRT + */ +int +ire_send_multirt_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + + multirt_check_v6(ire, ip6h, ixa); + + if (ire->ire_type & IRE_MULTICAST) + return (ire_send_multicast_v6(ire, mp, ip6h, ixa, identp)); + else + return (ire_send_wire_v6(ire, mp, ip6h, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE + */ +/* ARGSUSED4 */ +int +ire_send_noroute_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip6_t *ip6h = (ip6_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill; + ip_recv_attr_t iras; + boolean_t dummy; + + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); + + if (ire->ire_type & IRE_NOROUTE) { + /* A lack of a route as 
opposed to RTF_REJECT|BLACKHOLE */ + ip_rts_change_v6(RTM_MISS, &ip6h->ip6_dst, 0, 0, 0, 0, 0, 0, + RTA_DST, ipst); + } + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); + freemsg(mp); + /* No error even for local senders - silent blackhole */ + return (0); + } + ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); + + /* + * We need an ill_t for the ip_recv_attr_t even though this packet + * was never received and icmp_unreachable doesn't currently use + * ira_ill. + */ + ill = ill_lookup_on_name("lo0", B_FALSE, + !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); + if (ill == NULL) { + freemsg(mp); + return (EHOSTUNREACH); + } + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + icmp_unreachable_v6(mp, ICMP6_DST_UNREACH_NOROUTE, B_FALSE, &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + + ill_refrele(ill); + return (EHOSTUNREACH); +} + +/* + * Calculate a checksum ignoring any hardware capabilities + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. 
+ */ +static boolean_t +ip_output_sw_cksum_v6(mblk_t *mp, ip6_t *ip6h, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + +#define iphs ((uint16_t *)ip6h) + + /* Just in case it contained garbage */ + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + + /* + * Calculate ULP checksum + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + return (B_TRUE); + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* + * icmp has placed length and routing + * header adjustment in the checksum field. + */ + cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + + ixa->ixa_raw_cksum_offset); + cksum = htons(protocol); + } else if (protocol == IPPROTO_ICMPV6) { + cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + } else { + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. 
+ */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + + iphs[8] + iphs[9] + iphs[10] + iphs[11] + + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + + /* + * For UDP/IPv6 a zero UDP checksum is not allowed. + * Change to 0xffff + */ + if (protocol == IPPROTO_UDP && cksum == 0) + *cksump = ~cksum; + else + *cksump = cksum; + + IP6_STAT(ipst, ip6_out_sw_cksum); + IP6_STAT_UPDATE(ipst, ip6_out_sw_cksum_bytes, pktlen); + + /* No IP header checksum for IPv6 */ + + return (B_TRUE); +#undef iphs +} + +/* There are drivers that can't do partial checksum for ICMPv6 */ +int nxge_cksum_workaround = 1; + +/* + * Calculate the ULP checksum - try to use hardware. + * In the case of MULTIRT or multicast the + * IXAF_NO_HW_CKSUM is set in which case we use software. + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. + */ +static boolean_t +ip_output_cksum_v6(iaflags_t ixaflags, mblk_t *mp, ip6_t *ip6h, + ip_xmit_attr_t *ixa, ill_t *ill) +{ + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint16_t hck_flags; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + +#define iphs ((uint16_t *)ip6h) + + if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); + } + + /* + * Calculate ULP checksum. Note that we don't use cksump and cksum + * if the ill has FULL support. 
+ */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else if (ixa->ixa_flags & IXAF_SET_RAW_CKSUM) { + /* + * icmp has placed length and routing + * header adjustment in the checksum field. + */ + cksump = (uint16_t *)(((uint8_t *)ip6h) + ip_hdr_length + + ixa->ixa_raw_cksum_offset); + cksum = htons(protocol); + } else if (protocol == IPPROTO_ICMPV6) { + cksump = IPH_ICMPV6_CHECKSUMP(ip6h, ip_hdr_length); + cksum = IP_ICMPV6_CSUM_COMP; /* Pseudo-header cksum */ + } else { + ip_hdr_cksum: + /* No IP header checksum for IPv6 */ + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * Underlying interface supports hardware checksum offload for + * the payload; leave the payload checksum for the hardware to + * calculate. N.B: We only need to set up checksum info on the + * first mblk. + */ + hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; + + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + if (hck_flags & HCKSUM_INET_FULL_V6) { + /* + * Hardware calculates pseudo-header, header and the + * payload checksums, so clear the checksum field in + * the protocol header. 
+ */ + *cksump = 0; + DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; + return (B_TRUE); + } + if (((hck_flags) & HCKSUM_INET_PARTIAL) && + (protocol != IPPROTO_ICMPV6 || !nxge_cksum_workaround)) { + /* + * Partial checksum offload has been enabled. Fill + * the checksum field in the protocol header with the + * pseudo-header checksum value. + * + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += iphs[4] + iphs[5] + iphs[6] + iphs[7] + + iphs[8] + iphs[9] + iphs[10] + iphs[11] + + iphs[12] + iphs[13] + iphs[14] + iphs[15] + + iphs[16] + iphs[17] + iphs[18] + iphs[19]; + cksum += *(cksump); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Offsets are relative to beginning of IP header. + */ + DB_CKSUMSTART(mp) = ip_hdr_length; + DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ip6h; + DB_CKSUMEND(mp) = pktlen; + DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; + return (B_TRUE); + } + /* Hardware capabilities include neither full nor partial IPv6 */ + return (ip_output_sw_cksum_v6(mp, ip6h, ixa)); +#undef iphs +} + +/* + * ire_sendfn for offlink and onlink destinations. + * Also called from the multicast, and multirt send functions. + * + * Assumes that the caller has a hold on the ire. + * + * This function doesn't care if the IRE just became condemned since that + * can happen at any time. + */ +/* ARGSUSED */ +int +ire_send_wire_v6(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ip6_t *ip6h = (ip6_t *)iph_arg; + iaflags_t ixaflags = ixa->ixa_flags; + ill_t *ill; + uint32_t pktlen = ixa->ixa_pktlen; + + ASSERT(ixa->ixa_nce != NULL); + ill = ixa->ixa_nce->nce_ill; + + /* + * Update output mib stats. 
Note that we can't move into the icmp + * sender (icmp_output etc) since they don't know the ill and the + * stats are per ill. + * + * With IPMP we record the stats on the upper ill. + */ + if (ixa->ixa_protocol == IPPROTO_ICMPV6) { + icmp6_t *icmp6; + + icmp6 = (icmp6_t *)((uchar_t *)ip6h + ixa->ixa_ip_hdr_length); + icmp_update_out_mib_v6(ixa->ixa_nce->nce_common->ncec_ill, + icmp6); + } + + if (ixaflags & IXAF_DONTROUTE) + ip6h->ip6_hops = 1; + + /* + * This might set b_band, thus the IPsec and fragmentation + * code in IP ensures that b_band is updated in the first mblk. + */ + if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return (0); /* Might just be delayed */ + } + } + + /* + * To handle IPsec/iptun's labeling needs we need to tag packets + * while we still have ixa_tsl + */ + if (is_system_labeled() && ixa->ixa_tsl != NULL && + (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || + ill->ill_mactype == DL_IPV6)) { + cred_t *newcr; + + newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, + KM_NOSLEEP); + if (newcr == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - newcr", + mp, ill); + freemsg(mp); + return (ENOBUFS); + } + mblk_setcred(mp, newcr, NOPID); + crfree(newcr); /* mblk_setcred did its own crhold */ + } + + /* + * IXAF_IPV6_ADD_FRAGHDR is set for CGTP so that we will add a + * fragment header without fragmenting. CGTP on the receiver will + * filter duplicates on the ident field. 
+ */ + if (pktlen > ixa->ixa_fragsize || + (ixaflags & (IXAF_IPSEC_SECURE|IXAF_IPV6_ADD_FRAGHDR))) { + uint32_t ident; + + if (ixaflags & IXAF_IPSEC_SECURE) + pktlen += ipsec_out_extra_length(ixa); + + if (pktlen > IP_MAXPACKET) + return (EMSGSIZE); + + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* + * Compute ULP checksum using software + */ + if (!ip_output_sw_cksum_v6(mp, ip6h, ixa)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + /* Avoid checksum again below if we only add fraghdr */ + ixaflags &= ~IXAF_SET_ULP_CKSUM; + } + + /* + * If we need a fragment header, pick the ident and insert + * the header before IPsec to we have a place to store + * the ident value. + */ + if ((ixaflags & IXAF_IPV6_ADD_FRAGHDR) || + pktlen > ixa->ixa_fragsize) { + /* + * If this packet would generate a icmp_frag_needed + * message, we need to handle it before we do the IPsec + * processing. Otherwise, we need to strip the IPsec + * headers before we send up the message to the ULPs + * which becomes messy and difficult. + */ + if ((pktlen > ixa->ixa_fragsize) && + (ixaflags & IXAF_DONTFRAG)) { + /* Generate ICMP and return error */ + ip_recv_attr_t iras; + + DTRACE_PROBE4(ip6__fragsize__fail, + uint_t, pktlen, uint_t, ixa->ixa_fragsize, + uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + ip_drop_output("ICMP6_PKT_TOO_BIG", mp, ill); + icmp_pkt2big_v6(mp, ixa->ixa_fragsize, B_TRUE, + &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (EMSGSIZE); + } + DTRACE_PROBE4(ip6__fragsize__ok, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + /* + * Assign an ident value for this packet. 
There could + * be other threads targeting the same destination, so + * we have to arrange for a atomic increment. + * Normally ixa_extra_ident is 0, but in the case of + * LSO it will be the number of TCP segments that the + * driver/hardware will extraly construct. + * + * Note that cl_inet_ipident has only been used for + * IPv4. We don't use it here. + */ + ident = atomic_add_32_nv(identp, ixa->ixa_extra_ident + + 1); +#ifndef _BIG_ENDIAN + ident = htonl(ident); +#endif + ixa->ixa_ident = ident; /* In case we do IPsec */ + } + if (ixaflags & IXAF_IPSEC_SECURE) { + /* + * Pass in sufficient information so that + * IPsec can determine whether to fragment, and + * which function to call after fragmentation. + */ + return (ipsec_out_process(mp, ixa)); + } + + mp = ip_fraghdr_add_v6(mp, ident, ixa); + if (mp == NULL) { + /* MIB and ip_drop_output already done */ + return (ENOMEM); + } + ASSERT(pktlen == ixa->ixa_pktlen); + pktlen += sizeof (ip6_frag_t); + + if (pktlen > ixa->ixa_fragsize) { + return (ip_fragment_v6(mp, ixa->ixa_nce, ixaflags, + pktlen, ixa->ixa_fragsize, + ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, ixa->ixa_postfragfn, + &ixa->ixa_cookie)); + } + } + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* Compute ULP checksum and IP header checksum */ + /* An IS_UNDER_IPMP ill is ok here */ + if (!ip_output_cksum_v6(ixaflags, mp, ip6h, ixa, ill)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } + return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, + pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); +} + +/* + * Post fragmentation function for RTF_MULTIRT routes. + * Since IRE_MULTICASTs might have RTF_MULTIRT, this function + * checks IXAF_LOOPBACK_COPY. + * + * If no packet is sent due to failures then we return an errno, but if at + * least one succeeded we return zero. 
+ */ +int +ip_postfrag_multirt_v6(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + irb_t *irb; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ire_t *ire; + ire_t *ire1; + mblk_t *mp1; + nce_t *nce1; + ill_t *ill = nce->nce_ill; + ill_t *ill1; + ip_stack_t *ipst = ill->ill_ipst; + int error = 0; + int num_sent = 0; + int err; + uint_t ire_type; + in6_addr_t nexthop; + + ASSERT(!(ixaflags & IXAF_IS_IPV4)); + + /* Check for IXAF_LOOPBACK_COPY */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * Loop over RTF_MULTIRT for ip6_dst in the same bucket. Send + * a copy to each one. + * Use the nce (nexthop) and ip6_dst to find the ire. + * + * MULTIRT is not designed to work with shared-IP zones thus we don't + * need to pass a zoneid or a label to the IRE lookup. 
+ */ + if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, &ip6h->ip6_dst)) { + /* Broadcast and multicast case */ + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, 0, 0, NULL, + ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + /* Unicast case */ + ire = ire_ftable_lookup_v6(&ip6h->ip6_dst, 0, &nce->nce_addr, + 0, NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); + } + + if (ire == NULL || + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_flags & RTF_MULTIRT)) { + /* Drop */ + ip_drop_output("ip_postfrag_multirt didn't find route", + mp, nce->nce_ill); + if (ire != NULL) + ire_refrele(ire); + return (ENETUNREACH); + } + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + if (IRE_IS_CONDEMNED(ire1) || + !(ire1->ire_flags & RTF_MULTIRT)) + continue; + + /* Note: When IPv6 uses radix tree we don't need this check */ + if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) + continue; + + /* Do the ire argument one after the loop */ + if (ire1 == ire) + continue; + + ill1 = ire_nexthop_ill(ire1); + if (ill1 == NULL) { + /* + * This ire might not have been picked by + * ire_route_recursive, in which case ire_dep might + * not have been setup yet. + * We kick ire_route_recursive to try to resolve + * starting at ire1. 
+ */ + ire_t *ire2; + + ire2 = ire_route_recursive_impl_v6(ire1, + &ire1->ire_addr_v6, ire1->ire_type, ire1->ire_ill, + ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + B_TRUE, 0, ipst, NULL, NULL, NULL); + if (ire2 != NULL) + ire_refrele(ire2); + ill1 = ire_nexthop_ill(ire1); + } + if (ill1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no ill", + mp, ill); + error = ENETUNREACH; + continue; + } + /* Pick the addr and type to use for ndp_nce_init */ + if (nce->nce_common->ncec_flags & NCE_F_MCAST) { + ire_type = IRE_MULTICAST; + nexthop = ip6h->ip6_dst; + } else { + ire_type = ire1->ire_type; /* Doesn't matter */ + nexthop = ire1->ire_gateway_addr_v6; + } + + /* If IPMP meta or under, then we just drop */ + if (ill1->ill_grp != NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - IPMP", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + + nce1 = ndp_nce_init(ill1, &nexthop, ire_type); + if (nce1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, ill1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill1); + nce_refrele(nce1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + /* Preserve HW checksum for this copy */ + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + + ire1->ire_ob_pkt_count++; + err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, + 0, ixacookie); + if (err == 0) + num_sent++; + else + error = err; + nce_refrele(nce1); + ill_refrele(ill1); + } + irb_refrele(irb); + ire_refrele(ire); + /* Finally, the main one */ + err = ip_xmit(mp, nce, ixaflags, 
pkt_len, xmit_hint, szone, 0, + ixacookie); + if (err == 0) + num_sent++; + else + error = err; + if (num_sent > 0) + return (0); + else + return (error); +} diff --git a/usr/src/uts/common/inet/ip/ip6_rts.c b/usr/src/uts/common/inet/ip/ip6_rts.c index dcf429c8ba..38b43cdf60 100644 --- a/usr/src/uts/common/inet/ip/ip6_rts.c +++ b/usr/src/uts/common/inet/ip/ip6_rts.c @@ -80,8 +80,8 @@ void rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, const in6_addr_t *mask, const in6_addr_t *gateway, const in6_addr_t *src_addr, const in6_addr_t *brd_addr, - const in6_addr_t *author, const ipif_t *ipif, mblk_t *mp, - uint_t sacnt, const tsol_gc_t *gc) + const in6_addr_t *author, const in6_addr_t *ifaddr, const ill_t *ill, + mblk_t *mp, const tsol_gc_t *gc) { rt_msghdr_t *rtm; sin6_t *sin6; @@ -90,7 +90,6 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, int i; ASSERT(mp != NULL); - ASSERT(sacnt == 0 || gc != NULL); /* * First find the type of the message * and its length. @@ -100,7 +99,7 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, * Now find the size of the data * that follows the message header. */ - data_size = rts_data_msg_size(rtm_addrs, AF_INET6, sacnt); + data_size = rts_data_msg_size(rtm_addrs, AF_INET6, gc != NULL ? 
1 : 0); rtm = (rt_msghdr_t *)mp->b_rptr; mp->b_wptr = &mp->b_rptr[header_size]; @@ -125,13 +124,17 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, cp += sizeof (sin6_t); break; case RTA_IFA: + sin6->sin6_addr = *ifaddr; + sin6->sin6_family = AF_INET6; + cp += sizeof (sin6_t); + break; case RTA_SRC: sin6->sin6_addr = *src_addr; sin6->sin6_family = AF_INET6; cp += sizeof (sin6_t); break; case RTA_IFP: - cp += ill_dls_info((struct sockaddr_dl *)cp, ipif); + cp += ill_dls_info((struct sockaddr_dl *)cp, ill); break; case RTA_AUTHOR: sin6->sin6_addr = *author; @@ -154,24 +157,20 @@ rts_fill_msg_v6(int type, int rtm_addrs, const in6_addr_t *dst, rtm_ext_t *rtm_ext; struct rtsa_s *rp_dst; tsol_rtsecattr_t *rsap; - int i; ASSERT(gc->gc_grp != NULL); ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock)); - ASSERT(sacnt > 0); rtm_ext = (rtm_ext_t *)cp; rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR; - rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt); + rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1); rsap = (tsol_rtsecattr_t *)(rtm_ext + 1); - rsap->rtsa_cnt = sacnt; + rsap->rtsa_cnt = 1; rp_dst = rsap->rtsa_attr; - for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) { - ASSERT(gc->gc_db != NULL); - bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); - } + ASSERT(gc->gc_db != NULL); + bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); cp = (uchar_t *)rp_dst; } @@ -208,7 +207,7 @@ ip_rts_change_v6(int type, const in6_addr_t *dst_addr, if (mp == NULL) return; rts_fill_msg_v6(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, - &ipv6_all_zeros, author, NULL, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, author, NULL, mp, NULL); rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_flags = flags; rtm->rtm_errno = error; diff --git a/usr/src/uts/common/inet/ip/ip_arp.c b/usr/src/uts/common/inet/ip/ip_arp.c new file mode 100644 index 0000000000..489d59dbf6 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_arp.c @@ -0,0 +1,2468 @@ +/* + * CDDL HEADER START + * + * The 
contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/ip_arp.h> +#include <inet/ip_ndp.h> +#include <net/if_arp.h> +#include <netinet/if_ether.h> +#include <sys/strsubr.h> +#include <inet/ip6.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <sys/dlpi.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/sdt.h> +#include <inet/mi.h> +#include <inet/arp.h> +#include <inet/ipdrop.h> +#include <sys/sockio.h> +#include <inet/ip_impl.h> +#include <sys/policy.h> + +#define ARL_LL_ADDR_OFFSET(arl) (((arl)->arl_sap_length) < 0 ? \ + (sizeof (dl_unitdata_req_t)) : \ + ((sizeof (dl_unitdata_req_t)) + (ABS((arl)->arl_sap_length)))) + +/* + * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK + * doesn't quite do it for us. 
+ */ +typedef struct arp_m_s { + t_uscalar_t arp_mac_type; + uint32_t arp_mac_arp_hw_type; + t_scalar_t arp_mac_sap_length; + uint32_t arp_mac_hw_addr_length; +} arp_m_t; + +static int arp_close(queue_t *, int); +static void arp_rput(queue_t *, mblk_t *); +static void arp_wput(queue_t *, mblk_t *); +static arp_m_t *arp_m_lookup(t_uscalar_t mac_type); +static void arp_notify(ipaddr_t, mblk_t *, uint32_t, ip_recv_attr_t *, + ncec_t *); +static int arp_output(ill_t *, uint32_t, const uchar_t *, const uchar_t *, + const uchar_t *, const uchar_t *, uchar_t *); +static int arp_modclose(arl_t *); +static void arp_mod_close_tail(arl_t *); +static mblk_t *arl_unbind(arl_t *); +static void arp_process_packet(ill_t *, mblk_t *); +static void arp_excl(ipsq_t *, queue_t *, mblk_t *, void *); +static void arp_drop_packet(const char *str, mblk_t *, ill_t *); +static int arp_open(queue_t *, dev_t *, int, int, cred_t *); +static int ip_sioctl_ifunitsel_arp(queue_t *, int *); +static int ip_sioctl_slifname_arp(queue_t *, void *); +static void arp_dlpi_send(arl_t *, mblk_t *); +static void arl_defaults_common(arl_t *, mblk_t *); +static int arp_modopen(queue_t *, dev_t *, int, int, cred_t *); +static void arp_ifname_notify(arl_t *); +static void arp_rput_dlpi_writer(ipsq_t *, queue_t *, mblk_t *, void *); +static arl_t *ill_to_arl(ill_t *); + +#define DL_PRIM(mp) (((union DL_primitives *)(mp)->b_rptr)->dl_primitive) +#define IS_DLPI_DATA(mp) \ + ((DB_TYPE(mp) == M_PROTO) && \ + MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && \ + (DL_PRIM(mp) == DL_UNITDATA_IND)) + +#define AR_NOTFOUND 1 /* No matching ace found in cache */ +#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ +#define AR_LOOPBACK 3 /* Our own arp packet was received */ +#define AR_BOGON 4 /* Another host has our IP addr. 
*/ +#define AR_FAILED 5 /* Duplicate Address Detection has failed */ +#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ + +boolean_t arp_no_defense; + +struct module_info arp_mod_info = { + IP_MOD_ID, "arpip", 1, INFPSZ, 65536, 1024 +}; +static struct qinit rinit_arp = { + (pfi_t)arp_rput, NULL, arp_open, arp_close, NULL, &arp_mod_info +}; +static struct qinit winit_arp = { + (pfi_t)arp_wput, NULL, arp_open, arp_close, NULL, + &arp_mod_info +}; +struct streamtab arpinfo = { + &rinit_arp, &winit_arp +}; +#define ARH_FIXED_LEN 8 +#define AR_LL_HDR_SLACK 32 + +/* + * pfhooks for ARP. + */ +#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = _ilp; \ + info.hpe_ofp = 0; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = 0; \ + info.hpe_ofp = _olp; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +static arp_m_t arp_m_tbl[] = { + { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ + { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ + { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ + { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ + { DL_ETHER, ARPHRD_ETHER, -2, 6}, 
/* Ethernet */ + { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ + { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ + { DL_OTHER, ARPHRD_ETHER, -2, 6} /* unknown */ +}; + +static void +arl_refhold_locked(arl_t *arl) +{ + ASSERT(MUTEX_HELD(&arl->arl_lock)); + arl->arl_refcnt++; + ASSERT(arl->arl_refcnt != 0); +} + +static void +arl_refrele(arl_t *arl) +{ + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_refcnt != 0); + arl->arl_refcnt--; + if (arl->arl_refcnt > 1) { + mutex_exit(&arl->arl_lock); + return; + } + + /* ill_close or arp_unbind_complete may be waiting */ + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); +} + +/* + * wake up any pending ip ioctls. + */ +static void +arp_cmd_done(ill_t *ill, int err, t_uscalar_t lastprim) +{ + if (lastprim == DL_UNBIND_REQ && ill->ill_replumbing) + arp_replumb_done(ill, 0); + else + arp_bringup_done(ill, err); +} + +static int +ip_nce_resolve_all(ill_t *ill, uchar_t *src_haddr, uint32_t hlen, + const in_addr_t *src_paddr, ncec_t **sncec, int op) +{ + int retv; + ncec_t *ncec; + boolean_t ll_changed; + uchar_t *lladdr = NULL; + int new_state; + + ASSERT(ill != NULL); + + ncec = ncec_lookup_illgrp_v4(ill, src_paddr); + *sncec = ncec; + + if (ncec == NULL) { + retv = AR_NOTFOUND; + goto done; + } + + mutex_enter(&ncec->ncec_lock); + /* + * IP addr and hardware address match what we already + * have, then this is a broadcast packet emitted by one of our + * interfaces, reflected by the switch and received on another + * interface. We return AR_LOOPBACK. + */ + lladdr = ncec->ncec_lladdr; + if (NCE_MYADDR(ncec) && hlen == ncec->ncec_ill->ill_phys_addr_length && + bcmp(lladdr, src_haddr, hlen) == 0) { + mutex_exit(&ncec->ncec_lock); + retv = AR_LOOPBACK; + goto done; + } + /* + * If the entry is unverified, then we've just verified that + * someone else already owns this address, because this is a + * message with the same protocol address but different + * hardware address. 
+ */ + if (ncec->ncec_flags & NCE_F_UNVERIFIED) { + mutex_exit(&ncec->ncec_lock); + ncec_delete(ncec); + ncec_refrele(ncec); + *sncec = NULL; + retv = AR_FAILED; + goto done; + } + + /* + * If the IP address matches ours and we're authoritative for + * this entry, then some other node is using our IP addr, so + * return AR_BOGON. Also reset the transmit count to zero so + * that, if we're currently in initial announcement mode, we + * switch back to the lazier defense mode. Knowing that + * there's at least one duplicate out there, we ought not + * blindly announce. + * + * NCE_F_AUTHORITY is set in one of two ways: + * 1. /sbin/arp told us so, via the "permanent" flag. + * 2. This is one of my addresses. + */ + if (ncec->ncec_flags & NCE_F_AUTHORITY) { + ncec->ncec_unsolicit_count = 0; + mutex_exit(&ncec->ncec_lock); + retv = AR_BOGON; + goto done; + } + + /* + * No address conflict was detected, and we are getting + * ready to update the ncec's hwaddr. The nce MUST NOT be on an + * under interface, because all dynamic nce's are created on the + * native interface (in the non-IPMP case) or on the IPMP + * meta-interface (in the IPMP case) + */ + ASSERT(!IS_UNDER_IPMP(ncec->ncec_ill)); + + /* + * update ncec with src_haddr, hlen. + * + * We are trying to resolve this ncec_addr/src_paddr and we + * got a REQUEST/RESPONSE from the ncec_addr/src_paddr. + * So the new_state is at least "STALE". If, in addition, + * this a solicited, unicast ARP_RESPONSE, we can transition + * to REACHABLE. 
+ */ + new_state = ND_STALE; + ip1dbg(("got info for ncec %p from addr %x\n", + (void *)ncec, *src_paddr)); + retv = AR_MERGED; + if (ncec->ncec_state == ND_INCOMPLETE || + ncec->ncec_state == ND_INITIAL) { + ll_changed = B_TRUE; + } else { + ll_changed = nce_cmp_ll_addr(ncec, src_haddr, hlen); + if (!ll_changed) + new_state = ND_UNCHANGED; + else + retv = AR_CHANGED; + } + /* + * We don't have the equivalent of the IPv6 'S' flag indicating + * a solicited response, so we assume that if we are in + * INCOMPLETE, or got back an unchanged lladdr in PROBE state, + * and this is an ARP_RESPONSE, it must be a + * solicited response allowing us to transtion to REACHABLE. + */ + if (op == ARP_RESPONSE) { + switch (ncec->ncec_state) { + case ND_PROBE: + new_state = (ll_changed ? ND_STALE : ND_REACHABLE); + break; + case ND_INCOMPLETE: + new_state = ND_REACHABLE; + break; + } + } + /* + * Call nce_update() to refresh fastpath information on any + * dependent nce_t entries. + */ + nce_update(ncec, new_state, (ll_changed ? src_haddr : NULL)); + mutex_exit(&ncec->ncec_lock); + nce_resolv_ok(ncec); +done: + return (retv); +} + +/* Find an entry for a particular MAC type in the arp_m_tbl. */ +static arp_m_t * +arp_m_lookup(t_uscalar_t mac_type) +{ + arp_m_t *arm; + + for (arm = arp_m_tbl; arm < A_END(arp_m_tbl); arm++) { + if (arm->arp_mac_type == mac_type) + return (arm); + } + return (NULL); +} + +static uint32_t +arp_hw_type(t_uscalar_t mactype) +{ + arp_m_t *arm; + + if ((arm = arp_m_lookup(mactype)) == NULL) + arm = arp_m_lookup(DL_OTHER); + return (arm->arp_mac_arp_hw_type); +} + +/* + * Called when an DLPI control message has been acked; send down the next + * queued message (if any). + * The DLPI messages of interest being bind, attach and unbind since + * these are the only ones sent by ARP via arp_dlpi_send. 
+ */ +static void +arp_dlpi_done(arl_t *arl, ill_t *ill) +{ + mblk_t *mp; + int err; + t_uscalar_t prim; + + mutex_enter(&arl->arl_lock); + prim = arl->arl_dlpi_pending; + + if ((mp = arl->arl_dlpi_deferred) == NULL) { + arl->arl_dlpi_pending = DL_PRIM_INVAL; + if (arl->arl_state_flags & ARL_LL_DOWN) + err = ENETDOWN; + else + err = 0; + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + arp_cmd_done(ill, err, prim); + return; + } + + arl->arl_dlpi_deferred = mp->b_next; + mp->b_next = NULL; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + + arl->arl_dlpi_pending = DL_PRIM(mp); + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + + putnext(arl->arl_wq, mp); +} + +/* + * This routine is called during module initialization when the DL_INFO_ACK + * comes back from the device. We set up defaults for all the device dependent + * doo-dads we are going to need. This will leave us ready to roll if we are + * attempting auto-configuration. Alternatively, these defaults can be + * overridden by initialization procedures possessing higher intelligence. + * + * Caller will free the mp. + */ +static void +arp_ll_set_defaults(arl_t *arl, mblk_t *mp) +{ + arp_m_t *arm; + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + + if ((arm = arp_m_lookup(dlia->dl_mac_type)) == NULL) + arm = arp_m_lookup(DL_OTHER); + ASSERT(arm != NULL); + + /* + * We initialize based on parameters in the (currently) not too + * exhaustive arp_m_tbl. 
+ */ + if (dlia->dl_version == DL_VERSION_2) { + arl->arl_sap_length = dlia->dl_sap_length; + arl->arl_phys_addr_length = dlia->dl_brdcst_addr_length; + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + } else { + arl->arl_sap_length = arm->arp_mac_sap_length; + arl->arl_phys_addr_length = arm->arp_mac_hw_addr_length; + } + /* + * Note: the arp_hw_type in the arp header may be derived from + * the ill_mac_type and arp_m_lookup(). + */ + arl->arl_sap = ETHERTYPE_ARP; + arl_defaults_common(arl, mp); +} + +static void +arp_wput(queue_t *q, mblk_t *mp) +{ + int err = EINVAL; + struct iocblk *ioc; + mblk_t *mp1; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + ASSERT(q->q_next != NULL); + ioc = (struct iocblk *)mp->b_rptr; + if (ioc->ioc_cmd != SIOCSLIFNAME && + ioc->ioc_cmd != IF_UNITSEL) { + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput", + char *, "<some ioctl>", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } + if ((mp1 = mp->b_cont) == 0) + err = EINVAL; + else if (ioc->ioc_cmd == SIOCSLIFNAME) + err = ip_sioctl_slifname_arp(q, mp1->b_rptr); + else if (ioc->ioc_cmd == IF_UNITSEL) + err = ip_sioctl_ifunitsel_arp(q, (int *)mp1->b_rptr); + if (err == 0) + miocack(q, mp, 0, 0); + else + miocnak(q, mp, 0, err); + return; + default: + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput default", + char *, "default mblk", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } +} + +/* + * similar to ill_dlpi_pending(): verify that the received DLPI response + * matches the one that is pending for the arl. 
+ */ +static boolean_t +arl_dlpi_pending(arl_t *arl, t_uscalar_t prim) +{ + t_uscalar_t pending; + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending == prim) { + mutex_exit(&arl->arl_lock); + return (B_TRUE); + } + + if (arl->arl_state_flags & ARL_CONDEMNED) { + mutex_exit(&arl->arl_lock); + return (B_FALSE); + } + pending = arl->arl_dlpi_pending; + mutex_exit(&arl->arl_lock); + + if (pending == DL_PRIM_INVAL) { + ip0dbg(("arl_dlpi_pending unsolicited ack for %s on %s", + dl_primstr(prim), arl->arl_name)); + } else { + ip0dbg(("arl_dlpi_pending ack for %s on %s expect %s", + dl_primstr(prim), arl->arl_name, dl_primstr(pending))); + } + return (B_FALSE); +} + +/* DLPI messages, other than DL_UNITDATA_IND are handled here. */ +static void +arp_rput_dlpi(queue_t *q, mblk_t *mp) +{ + arl_t *arl = (arl_t *)q->q_ptr; + union DL_primitives *dlp; + t_uscalar_t prim; + t_uscalar_t reqprim = DL_PRIM_INVAL; + ill_t *ill; + + if ((mp->b_wptr - mp->b_rptr) < sizeof (dlp->dl_primitive)) { + putnext(q, mp); + return; + } + dlp = (union DL_primitives *)mp->b_rptr; + prim = dlp->dl_primitive; + + /* + * If we received an ACK but didn't send a request for it, then it + * can't be part of any pending operation; discard up-front. + */ + switch (prim) { + case DL_ERROR_ACK: + /* + * ce is confused about how DLPI works, so we have to interpret + * an "error" on DL_NOTIFY_ACK (which we never could have sent) + * as really meaning an error on DL_NOTIFY_REQ. + * + * Note that supporting DL_NOTIFY_REQ is optional, so printing + * out an error message on the console isn't warranted except + * for debug. 
+ */ + if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || + dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { + reqprim = DL_NOTIFY_REQ; + } else { + reqprim = dlp->error_ack.dl_error_primitive; + } + break; + case DL_INFO_ACK: + reqprim = DL_INFO_REQ; + break; + case DL_OK_ACK: + reqprim = dlp->ok_ack.dl_correct_primitive; + break; + case DL_BIND_ACK: + reqprim = DL_BIND_REQ; + break; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + putnext(q, mp); + return; + } + if (reqprim == DL_PRIM_INVAL || !arl_dlpi_pending(arl, reqprim)) { + freemsg(mp); + return; + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi received", + char *, dl_primstr(prim), char *, dl_primstr(reqprim), + arl_t *, arl); + + ASSERT(prim != DL_NOTIFY_IND); + + ill = arl_to_ill(arl); + + switch (reqprim) { + case DL_INFO_REQ: + /* + * ill has not been set up yet for this case. This is the + * DL_INFO_ACK for the first DL_INFO_REQ sent from + * arp_modopen(). There should be no other arl_dlpi_deferred + * messages pending. We initialize the arl here. + */ + ASSERT(!arl->arl_dlpi_style_set); + ASSERT(arl->arl_dlpi_pending == DL_INFO_REQ); + ASSERT(arl->arl_dlpi_deferred == NULL); + arl->arl_dlpi_pending = DL_PRIM_INVAL; + arp_ll_set_defaults(arl, mp); + freemsg(mp); + return; + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + /* + * This is not an error, so we don't set ARL_LL_DOWN + */ + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + if (arl->arl_state_flags & ARL_CONDEMNED) { + /* + * if this is part of the unplumb the arl may + * vaporize any moment after we cv_signal the + * arl_cv so we reset arl_dlpi_pending here. + * All other cases (including replumb) will + * have the arl_dlpi_pending reset in + * arp_dlpi_done. 
+ */ + arl->arl_dlpi_pending = DL_PRIM_INVAL; + } + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + } + if (ill != NULL) { + /* + * ill ref obtained by arl_to_ill() will be released + * by qwriter_ip() + */ + qwriter_ip(ill, ill->ill_wq, mp, arp_rput_dlpi_writer, + CUR_OP, B_TRUE); + return; + } + freemsg(mp); +} + +/* + * Handling of DLPI messages that require exclusive access to the ipsq. + */ +/* ARGSUSED */ +static void +arp_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +{ + union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; + ill_t *ill = (ill_t *)q->q_ptr; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) { + /* + * happens as a result arp_modclose triggering unbind. + * arp_rput_dlpi will cv_signal the arl_cv and the modclose + * will complete, but when it does ipsq_exit, the waiting + * qwriter_ip gets into the ipsq but will find the arl null. + * There should be no deferred messages in this case, so + * just complete and exit. + */ + arp_cmd_done(ill, 0, DL_UNBIND_REQ); + freemsg(mp); + return; + } + switch (dlp->dl_primitive) { + case DL_ERROR_ACK: + switch (dlp->error_ack.dl_error_primitive) { + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + arl->arl_state_flags |= ARL_LL_DOWN; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_DOWN; + arl->arl_state_flags |= ARL_LL_UNBOUND; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_ATTACH_REQ: + break; + default: + /* If it's anything else, we didn't send it. 
*/ + arl_refrele(arl); + putnext(q, mp); + return; + } + break; + case DL_OK_ACK: + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi_writer ok", + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + arl_t *, arl); + mutex_enter(&arl->arl_lock); + switch (dlp->ok_ack.dl_correct_primitive) { + case DL_UNBIND_REQ: + case DL_ATTACH_REQ: + break; + default: + ip0dbg(("Dropping unrecognized DL_OK_ACK for %s", + dl_primstr(dlp->ok_ack.dl_correct_primitive))); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); + freemsg(mp); + return; + } + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_ACK: + DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, + dl_bind_ack_t *, &dlp->bind_ack); + + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_state_flags & ARL_LL_BIND_PENDING); + arl->arl_state_flags &= + ~(ARL_LL_BIND_PENDING|ARL_LL_DOWN|ARL_LL_UNBOUND); + arl->arl_state_flags |= ARL_LL_UP; + mutex_exit(&arl->arl_lock); + break; + case DL_UDERROR_IND: + DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, + dl_uderror_ind_t *, &dlp->uderror_ind); + arl_refrele(arl); + putnext(q, mp); + return; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + arl_refrele(arl); + putnext(q, mp); + return; + } + arp_dlpi_done(arl, ill); + arl_refrele(arl); + freemsg(mp); +} + +void +arp_rput(queue_t *q, mblk_t *mp) +{ + arl_t *arl = q->q_ptr; + boolean_t need_refrele = B_FALSE; + + mutex_enter(&arl->arl_lock); + if (((arl->arl_state_flags & + (ARL_CONDEMNED | ARL_LL_REPLUMBING)) != 0)) { + /* + * Only allow high priority DLPI messages during unplumb or + * replumb, and we don't take an arl_refcnt for that case. 
+ */ + if (DB_TYPE(mp) != M_PCPROTO) { + mutex_exit(&arl->arl_lock); + freemsg(mp); + return; + } + } else { + arl_refhold_locked(arl); + need_refrele = B_TRUE; + } + mutex_exit(&arl->arl_lock); + + switch (DB_TYPE(mp)) { + case M_PCPROTO: + case M_PROTO: { + ill_t *ill; + + /* + * could be one of + * (i) real message from the wire, (DLPI_DATA) + * (ii) DLPI message + * Take a ref on the ill associated with this arl to + * prevent the ill from being unplumbed until this thread + * is done. + */ + if (IS_DLPI_DATA(mp)) { + ill = arl_to_ill(arl); + if (ill == NULL) { + arp_drop_packet("No ill", mp, ill); + break; + } + arp_process_packet(ill, mp); + ill_refrele(ill); + break; + } + /* Miscellaneous DLPI messages get shuffled off. */ + arp_rput_dlpi(q, mp); + break; + } + case M_ERROR: + case M_HANGUP: + if (mp->b_rptr < mp->b_wptr) + arl->arl_error = (int)(*mp->b_rptr & 0xFF); + if (arl->arl_error == 0) + arl->arl_error = ENXIO; + freemsg(mp); + break; + default: + ip1dbg(("arp_rput other db type %x\n", DB_TYPE(mp))); + putnext(q, mp); + break; + } + if (need_refrele) + arl_refrele(arl); +} + +static void +arp_process_packet(ill_t *ill, mblk_t *mp) +{ + mblk_t *mp1; + arh_t *arh; + in_addr_t src_paddr, dst_paddr; + uint32_t hlen, plen; + boolean_t is_probe; + int op; + ncec_t *dst_ncec, *src_ncec = NULL; + uchar_t *src_haddr, *arhp, *dst_haddr, *dp, *sp; + int err; + ip_stack_t *ipst; + boolean_t need_ill_refrele = B_FALSE; + nce_t *nce; + uchar_t *src_lladdr; + dl_unitdata_ind_t *dlui; + ip_recv_attr_t iras; + + ASSERT(ill != NULL); + if (ill->ill_flags & ILLF_NOARP) { + arp_drop_packet("Interface does not support ARP", mp, ill); + return; + } + ipst = ill->ill_ipst; + /* + * What we should have at this point is a DL_UNITDATA_IND message + * followed by an ARP packet. We do some initial checks and then + * get to work. + */ + dlui = (dl_unitdata_ind_t *)mp->b_rptr; + if (dlui->dl_group_address == 1) { + /* + * multicast or broadcast packet. 
Only accept on the ipmp + * nominated interface for multicasts ('cast_ill'). + * If we have no cast_ill we are liberal and accept everything. + */ + if (IS_UNDER_IPMP(ill)) { + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + arp_drop_packet("Interface is not nominated " + "for multicast sends and receives", + mp, ill); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + } + } + mp1 = mp->b_cont; + if (mp1 == NULL) { + arp_drop_packet("Missing ARP packet", mp, ill); + return; + } + if (mp1->b_cont != NULL) { + /* No fooling around with funny messages. */ + if (!pullupmsg(mp1, -1)) { + arp_drop_packet("Funny message: pullup failed", + mp, ill); + return; + } + } + arh = (arh_t *)mp1->b_rptr; + hlen = arh->arh_hlen; + plen = arh->arh_plen; + if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { + arp_drop_packet("mblk len too small", mp, ill); + return; + } + /* + * hlen 0 is used for RFC 1868 UnARP. + * + * Note that the rest of the code checks that hlen is what we expect + * for this hardware address type, so might as well discard packets + * here that don't match. + */ + if ((hlen > 0 && hlen != ill->ill_phys_addr_length) || plen == 0) { + DTRACE_PROBE2(rput_bogus, ill_t *, ill, mblk_t *, mp1); + arp_drop_packet("Bogus hlen or plen", mp, ill); + return; + } + /* + * Historically, Solaris has been lenient about hardware type numbers. + * We should check here, but don't. + */ + DTRACE_PROBE3(arp__physical__in__start, ill_t *, ill, arh_t *, arh, + mblk_t *, mp); + /* + * If ill is in an ipmp group, it will be the under ill. If we want + * to report the packet as coming up the IPMP interface, we should + * convert it to the ipmp ill. 
+ */ + ARP_HOOK_IN(ipst->ips_arp_physical_in_event, ipst->ips_arp_physical_in, + ill->ill_phyint->phyint_ifindex, arh, mp, mp1, ipst); + DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp); + if (mp == NULL) + return; + arhp = (uchar_t *)arh + ARH_FIXED_LEN; + src_haddr = arhp; /* ar$sha */ + arhp += hlen; + bcopy(arhp, &src_paddr, IP_ADDR_LEN); /* ar$spa */ + sp = arhp; + arhp += IP_ADDR_LEN; + dst_haddr = arhp; /* ar$dha */ + arhp += hlen; + bcopy(arhp, &dst_paddr, IP_ADDR_LEN); /* ar$tpa */ + dp = arhp; + op = BE16_TO_U16(arh->arh_operation); + + DTRACE_PROBE2(ip__arp__input, (in_addr_t), src_paddr, + (in_addr_t), dst_paddr); + + /* Determine if this is just a probe */ + is_probe = (src_paddr == INADDR_ANY); + + /* + * ira_ill is the only field used down the arp_notify path. + */ + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + /* + * RFC 826: first check if the <protocol, sender protocol address> is + * in the cache, if there is a sender protocol address. Note that this + * step also handles resolutions based on source. + */ + /* Note: after here we need to freeb(mp) and freemsg(mp1) separately */ + mp->b_cont = NULL; + if (is_probe) { + err = AR_NOTFOUND; + } else { + if (plen != 4) { + arp_drop_packet("bad protocol len", mp, ill); + return; + } + err = ip_nce_resolve_all(ill, src_haddr, hlen, &src_paddr, + &src_ncec, op); + switch (err) { + case AR_BOGON: + ASSERT(src_ncec != NULL); + arp_notify(src_paddr, mp1, AR_CN_BOGON, + &iras, src_ncec); + break; + case AR_FAILED: + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + src_ncec); + break; + case AR_LOOPBACK: + DTRACE_PROBE2(rput_loopback, ill_t *, ill, arh_t *, + arh); + freemsg(mp1); + break; + default: + goto update; + } + freemsg(mp); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } +update: + /* + * Now look up the destination address. 
By RFC 826, we ignore the + * packet at this step if the target isn't one of our addresses (i.e., + * one we have been asked to PUBLISH). This is true even if the + * target is something we're trying to resolve and the packet + * is a response. + */ + dst_ncec = ncec_lookup_illgrp_v4(ill, &dst_paddr); + if (dst_ncec == NULL || !NCE_PUBLISH(dst_ncec)) { + /* + * Let the client know if the source mapping has changed, even + * if the destination provides no useful information for the + * client. + */ + if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + freemsg(mp); + } else { + freemsg(mp); + arp_drop_packet("Target is not interesting", mp1, ill); + } + if (dst_ncec != NULL) + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + + if (dst_ncec->ncec_flags & NCE_F_UNVERIFIED) { + /* + * Check for a reflection. Some misbehaving bridges will + * reflect our own transmitted packets back to us. + */ + ASSERT(NCE_PUBLISH(dst_ncec)); + if (hlen != dst_ncec->ncec_ill->ill_phys_addr_length) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("bad arh_len", mp1, ill); + return; + } + if (!nce_cmp_ll_addr(dst_ncec, src_haddr, hlen)) { + DTRACE_PROBE3(rput_probe_reflected, ill_t *, ill, + arh_t *, arh, ncec_t *, dst_ncec); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("Reflected probe", mp1, ill); + return; + } + /* + * Responses targeting our HW address that are not responses to + * our DAD probe must be ignored as they are related to requests + * sent before DAD was restarted. 
+ */ + if (op == ARP_RESPONSE && + (nce_cmp_ll_addr(dst_ncec, dst_haddr, hlen) == 0)) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet( + "Response to request that was sent before DAD", + mp1, ill); + return; + } + /* + * Responses targeted to HW addresses which are not ours but + * sent to our unverified proto address are also conflicts. + * These may be reported by a proxy rather than the interface + * with the conflicting address, dst_paddr is in conflict + * rather than src_paddr. To ensure IP can locate the correct + * ipif to take down, it is necessary to copy dst_paddr to + * the src_paddr field before sending it to IP. The same is + * required for probes, where src_paddr will be INADDR_ANY. + */ + if (is_probe || op == ARP_RESPONSE) { + bcopy(dp, sp, plen); + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + NULL); + ncec_delete(dst_ncec); + } else if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + } else { + DTRACE_PROBE3(rput_request_unverified, + ill_t *, ill, arh_t *, arh, ncec_t *, dst_ncec); + arp_drop_packet("Unverified request", mp1, ill); + } + freemsg(mp); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + /* + * If it's a request, then we reply to this, and if we think the + * sender's unknown, then we create an entry to avoid unnecessary ARPs. + * The design assumption is that someone ARPing us is likely to send us + * a packet soon, and that we'll want to reply to it. + */ + if (op == ARP_REQUEST) { + const uchar_t *nce_hwaddr; + struct in_addr nce_paddr; + clock_t now; + ill_t *under_ill = ill; + boolean_t send_unicast = B_TRUE; + + ASSERT(NCE_PUBLISH(dst_ncec)); + + if ((dst_ncec->ncec_flags & (NCE_F_BCAST|NCE_F_MCAST)) != 0) { + /* + * Ignore senders who are deliberately or accidentally + * confused. 
+ */ + goto bail; + } + + if (!is_probe && err == AR_NOTFOUND) { + ASSERT(src_ncec == NULL); + + if (IS_UNDER_IPMP(under_ill)) { + /* + * create the ncec for the sender on ipmp_ill. + * We pass in the ipmp_ill itself to avoid + * creating an nce_t on the under_ill. + */ + ill = ipmp_ill_hold_ipmp_ill(under_ill); + if (ill == NULL) + ill = under_ill; + else + need_ill_refrele = B_TRUE; + } + + err = nce_lookup_then_add_v4(ill, src_haddr, hlen, + &src_paddr, 0, ND_STALE, &nce); + + switch (err) { + case 0: + case EEXIST: + ip1dbg(("added ncec %p in state %d ill %s\n", + (void *)src_ncec, src_ncec->ncec_state, + ill->ill_name)); + src_ncec = nce->nce_common; + break; + default: + /* + * Either no memory, or the outgoing interface + * is in the process of down/unplumb. In the + * latter case, we will fail the send anyway, + * and in the former case, we should try to send + * the ARP response. + */ + src_lladdr = src_haddr; + goto send_response; + } + ncec_refhold(src_ncec); + nce_refrele(nce); + /* set up cleanup interval on ncec */ + } + + /* + * This implements periodic address defense based on a modified + * version of the RFC 3927 requirements. Instead of sending a + * broadcasted reply every time, as demanded by the RFC, we + * send at most one broadcast reply per arp_broadcast_interval. + */ + now = ddi_get_lbolt(); + if ((now - dst_ncec->ncec_last_time_defended) > + MSEC_TO_TICK(ipst->ips_ipv4_dad_announce_interval)) { + dst_ncec->ncec_last_time_defended = now; + /* + * If this is one of the long-suffering entries, + * pull it out now. It no longer needs separate + * defense, because we're now doing that with this + * broadcasted reply. 
+ */ + dst_ncec->ncec_flags &= ~NCE_F_DELAYED; + send_unicast = B_FALSE; + } + if (src_ncec != NULL && send_unicast) { + src_lladdr = src_ncec->ncec_lladdr; + } else { + src_lladdr = under_ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(under_ill); + } +send_response: + nce_hwaddr = dst_ncec->ncec_lladdr; + IN6_V4MAPPED_TO_INADDR(&dst_ncec->ncec_addr, &nce_paddr); + + (void) arp_output(under_ill, ARP_RESPONSE, + nce_hwaddr, (uchar_t *)&nce_paddr, src_haddr, + (uchar_t *)&src_paddr, src_lladdr); + } +bail: + if (dst_ncec != NULL) { + ncec_refrele(dst_ncec); + } + if (src_ncec != NULL) { + ncec_refrele(src_ncec); + } + if (err == AR_CHANGED) { + mp->b_cont = NULL; + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, NULL); + mp1 = NULL; + } + if (need_ill_refrele) + ill_refrele(ill); +done: + freemsg(mp); + freemsg(mp1); +} + +/* + * Basic initialization of the arl_t and the arl_common structure shared with + * the ill_t that is done after SLIFNAME/IF_UNITSEL. + */ +static int +arl_ill_init(arl_t *arl, char *ill_name) +{ + ill_t *ill; + arl_ill_common_t *ai; + + ill = ill_lookup_on_name(ill_name, B_FALSE, B_FALSE, B_FALSE, + arl->arl_ipst); + + if (ill == NULL) + return (ENXIO); + + /* + * By the time we set up the arl, we expect the ETHERTYPE_IP + * stream to be fully bound and attached. So we copy/verify + * relevant information as possible from/against the ill. + * + * The following should have been set up in arp_ll_set_defaults() + * after the first DL_INFO_ACK was received. + */ + ASSERT(arl->arl_phys_addr_length == ill->ill_phys_addr_length); + ASSERT(arl->arl_sap == ETHERTYPE_ARP); + ASSERT(arl->arl_mactype == ill->ill_mactype); + ASSERT(arl->arl_sap_length == ill->ill_sap_length); + + ai = kmem_zalloc(sizeof (*ai), KM_SLEEP); + mutex_enter(&ill->ill_lock); + /* First ensure that the ill is not CONDEMNED. 
*/ + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (ENXIO); + } + if (ill->ill_common != NULL || arl->arl_common != NULL) { + mutex_exit(&ill->ill_lock); + ip0dbg(("%s: PPA already exists", ill->ill_name)); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (EEXIST); + } + mutex_init(&ai->ai_lock, NULL, MUTEX_DEFAULT, NULL); + ai->ai_arl = arl; + ai->ai_ill = ill; + ill->ill_common = ai; + arl->arl_common = ai; + mutex_exit(&ill->ill_lock); + (void) strlcpy(arl->arl_name, ill->ill_name, LIFNAMSIZ); + arl->arl_name_length = ill->ill_name_length; + ill_refrele(ill); + arp_ifname_notify(arl); + return (0); +} + +/* Allocate and do common initializations for DLPI messages. */ +static mblk_t * +ip_ar_dlpi_comm(t_uscalar_t prim, size_t size) +{ + mblk_t *mp; + + if ((mp = allocb(size, BPRI_HI)) == NULL) + return (NULL); + + /* + * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter + * of which we don't seem to use) are sent with M_PCPROTO, and + * that other DLPI are M_PROTO. + */ + DB_TYPE(mp) = (prim == DL_INFO_REQ) ? 
M_PCPROTO : M_PROTO; + + mp->b_wptr = mp->b_rptr + size; + bzero(mp->b_rptr, size); + DL_PRIM(mp) = prim; + return (mp); +} + + +int +ip_sioctl_ifunitsel_arp(queue_t *q, int *ppa) +{ + arl_t *arl; + char *cp, ill_name[LIFNAMSIZ]; + + if (q->q_next == NULL) + return (EINVAL); + + do { + q = q->q_next; + } while (q->q_next != NULL); + cp = q->q_qinfo->qi_minfo->mi_idname; + + arl = (arl_t *)q->q_ptr; + (void) snprintf(ill_name, sizeof (ill_name), "%s%d", cp, *ppa); + arl->arl_ppa = *ppa; + return (arl_ill_init(arl, ill_name)); +} + +int +ip_sioctl_slifname_arp(queue_t *q, void *lifreq) +{ + arl_t *arl; + struct lifreq *lifr = lifreq; + + /* ioctl not valid when IP opened as a device */ + if (q->q_next == NULL) + return (EINVAL); + + arl = (arl_t *)q->q_ptr; + arl->arl_ppa = lifr->lifr_ppa; + return (arl_ill_init(arl, lifr->lifr_name)); +} + +arl_t * +ill_to_arl(ill_t *ill) +{ + arl_ill_common_t *ai = ill->ill_common; + arl_t *arl = NULL; + + if (ai == NULL) + return (NULL); + /* + * Find the arl_t that corresponds to this ill_t from the shared + * ill_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. + */ + mutex_enter(&ai->ai_lock); + if ((arl = ai->ai_arl) != NULL) { + mutex_enter(&arl->arl_lock); + if (!(arl->arl_state_flags & ARL_CONDEMNED)) { + arl_refhold_locked(arl); + mutex_exit(&arl->arl_lock); + } else { + mutex_exit(&arl->arl_lock); + arl = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (arl); +} + +ill_t * +arl_to_ill(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill = NULL; + + if (ai == NULL) { + /* + * happens when the arp stream is just being opened, and + * arl_ill_init has not been executed yet. + */ + return (NULL); + } + /* + * Find the ill_t that corresponds to this arl_t from the shared + * arl_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. 
+ */ + mutex_enter(&ai->ai_lock); + if ((ill = ai->ai_ill) != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + } else { + mutex_exit(&ill->ill_lock); + ill = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (ill); +} + +int +arp_ll_up(ill_t *ill) +{ + mblk_t *attach_mp = NULL; + mblk_t *bind_mp = NULL; + mblk_t *unbind_mp = NULL; + arl_t *arl; + + ASSERT(IAM_WRITER_ILL(ill)); + arl = ill_to_arl(ill); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_up", ill_t *, ill); + if (arl == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_up", arl_t *, arl); + if ((arl->arl_state_flags & ARL_LL_UP) != 0) { + arl_refrele(arl); + return (0); + } + if (arl->arl_needs_attach) { /* DL_STYLE2 */ + attach_mp = + ip_ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t)); + if (attach_mp == NULL) + goto bad; + ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = arl->arl_ppa; + } + + /* Allocate and initialize a bind message. 
*/ + bind_mp = ip_ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t)); + if (bind_mp == NULL) + goto bad; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; + + unbind_mp = ip_ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t)); + if (unbind_mp == NULL) + goto bad; + if (arl->arl_needs_attach) { + arp_dlpi_send(arl, attach_mp); + } + arl->arl_unbind_mp = unbind_mp; + + arl->arl_state_flags |= ARL_LL_BIND_PENDING; + arp_dlpi_send(arl, bind_mp); + arl_refrele(arl); + return (EINPROGRESS); + +bad: + freemsg(attach_mp); + freemsg(bind_mp); + freemsg(unbind_mp); + arl_refrele(arl); + return (ENOMEM); +} + +/* + * consumes/frees mp + */ +static void +arp_notify(in_addr_t src, mblk_t *mp, uint32_t arcn_code, + ip_recv_attr_t *ira, ncec_t *ncec) +{ + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + arh_t *arh = (arh_t *)mp->b_rptr; + + switch (arcn_code) { + case AR_CN_BOGON: + /* + * Someone is sending ARP packets with a source protocol + * address that we have published and for which we believe our + * entry is authoritative and verified to be unique on + * the network. + * + * arp_process_packet() sends AR_CN_FAILED for the case when + * a DAD probe is received and the hardware address of a + * non-authoritative entry has changed. Thus, AR_CN_BOGON + * indicates a real conflict, and we have to do resolution. + * + * We back away quickly from the address if it's from DHCP or + * otherwise temporary and hasn't been used recently (or at + * all). We'd like to include "deprecated" addresses here as + * well (as there's no real reason to defend something we're + * discarding), but IPMP "reuses" this flag to mean something + * other than the standard meaning. 
+ */ + if (ip_nce_conflict(mp, ira, ncec)) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + cmn_err(CE_WARN, + "proxy ARP problem? Node '%s' is using %s on %s", + hbuf, sbuf, ill->ill_name); + if (!arp_no_defense) + (void) arp_announce(ncec); + /* + * ncec_last_time_defended has been adjusted in + * ip_nce_conflict. + */ + } else { + ncec_delete(ncec); + } + freemsg(mp); + break; + case AR_CN_ANNOUNCE: { + nce_hw_map_t hwm; + /* + * ARP gives us a copy of any packet where it thinks + * the address has changed, so that we can update our + * caches. We're responsible for caching known answers + * in the current design. We check whether the + * hardware address really has changed in all of our + * entries that have cached this mapping, and if so, we + * blow them away. This way we will immediately pick + * up the rare case of a host changing hardware + * address. + */ + if (src == 0) { + freemsg(mp); + break; + } + hwm.hwm_addr = src; + hwm.hwm_hwlen = arh->arh_hlen; + hwm.hwm_hwaddr = (uchar_t *)(arh + 1); + hwm.hwm_flags = 0; + ncec_walk_common(ipst->ips_ndp4, NULL, + (pfi_t)nce_update_hw_changed, &hwm, B_TRUE); + freemsg(mp); + break; + } + case AR_CN_FAILED: + if (arp_no_defense) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + + cmn_err(CE_WARN, + "node %s is using our IP address %s on %s", + hbuf, sbuf, ill->ill_name); + freemsg(mp); + break; + } + /* + * mp will be freed by arp_excl. + */ + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + return; + default: + ASSERT(0); + freemsg(mp); + break; + } +} + +/* + * arp_output is called to transmit an ARP Request or Response. The mapping + * to RFC 826 variables is: + * haddr1 == ar$sha + * paddr1 == ar$spa + * haddr2 == ar$tha + * paddr2 == ar$tpa + * The ARP frame is sent to the ether_dst in dst_lladdr. 
+ */ +static int +arp_output(ill_t *ill, uint32_t operation, + const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, + const uchar_t *paddr2, uchar_t *dst_lladdr) +{ + arh_t *arh; + uint8_t *cp; + uint_t hlen; + uint32_t plen = IPV4_ADDR_LEN; /* ar$pln from RFC 826 */ + uint32_t proto = IP_ARP_PROTO_TYPE; + mblk_t *mp; + arl_t *arl; + + ASSERT(dst_lladdr != NULL); + hlen = ill->ill_phys_addr_length; /* ar$hln from RFC 826 */ + mp = ill_dlur_gen(dst_lladdr, hlen, ETHERTYPE_ARP, ill->ill_sap_length); + + if (mp == NULL) + return (ENOMEM); + + /* IFF_NOARP flag is set or link down: do not send arp messages */ + if ((ill->ill_flags & ILLF_NOARP) || !ill->ill_dl_up) { + freemsg(mp); + return (ENXIO); + } + + mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + + plen + plen, BPRI_MED); + if (mp->b_cont == NULL) { + freeb(mp); + return (ENOMEM); + } + + /* Fill in the ARP header. */ + cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); + mp->b_cont->b_rptr = cp; + arh = (arh_t *)cp; + U16_TO_BE16(arp_hw_type(ill->ill_mactype), arh->arh_hardware); + U16_TO_BE16(proto, arh->arh_proto); + arh->arh_hlen = (uint8_t)hlen; + arh->arh_plen = (uint8_t)plen; + U16_TO_BE16(operation, arh->arh_operation); + cp += ARH_FIXED_LEN; + bcopy(haddr1, cp, hlen); + cp += hlen; + if (paddr1 == NULL) + bzero(cp, plen); + else + bcopy(paddr1, cp, plen); + cp += plen; + if (haddr2 == NULL) + bzero(cp, hlen); + else + bcopy(haddr2, cp, hlen); + cp += hlen; + bcopy(paddr2, cp, plen); + cp += plen; + mp->b_cont->b_wptr = cp; + + DTRACE_PROBE3(arp__physical__out__start, + ill_t *, ill, arh_t *, arh, mblk_t *, mp); + ARP_HOOK_OUT(ill->ill_ipst->ips_arp_physical_out_event, + ill->ill_ipst->ips_arp_physical_out, + ill->ill_phyint->phyint_ifindex, arh, mp, mp->b_cont, + ill->ill_ipst); + DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp); + if (mp == NULL) + return (0); + + /* Ship it out. 
*/ + arl = ill_to_arl(ill); + if (arl == NULL) { + freemsg(mp); + return (0); + } + if (canputnext(arl->arl_wq)) + putnext(arl->arl_wq, mp); + else + freemsg(mp); + arl_refrele(arl); + return (0); +} + +/* + * Process resolve requests. + * If we are not yet reachable then we check and decrease ncec_rcnt; otherwise + * we leave it alone (the caller will check and manage ncec_pcnt in those + * cases.) + */ +int +arp_request(ncec_t *ncec, in_addr_t sender, ill_t *ill) +{ + int err; + const uchar_t *target_hwaddr; + struct in_addr nce_paddr; + uchar_t *dst_lladdr; + boolean_t use_rcnt = !NCE_ISREACHABLE(ncec); + + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + ASSERT(!IS_IPMP(ill)); + + if (use_rcnt && ncec->ncec_rcnt == 0) { + /* not allowed any more retransmits. */ + return (0); + } + + if ((ill->ill_flags & ILLF_NOARP) != 0) + return (0); + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nce_paddr); + + target_hwaddr = + ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + if (NCE_ISREACHABLE(ncec)) { + dst_lladdr = ncec->ncec_lladdr; + } else { + dst_lladdr = ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill); + } + + mutex_exit(&ncec->ncec_lock); + err = arp_output(ill, ARP_REQUEST, + ill->ill_phys_addr, (uchar_t *)&sender, target_hwaddr, + (uchar_t *)&nce_paddr, dst_lladdr); + mutex_enter(&ncec->ncec_lock); + + if (err != 0) { + /* + * Some transient error such as ENOMEM or a down link was + * encountered. If the link has been taken down permanently, + * the ncec will eventually be cleaned up (ipif_down_tail() + * will call ipif_nce_down() and flush the ncec), to terminate + * recurring attempts to send ARP requests. In all other cases, + * allow the caller another chance at success next time. 
+ */ + return (ncec->ncec_ill->ill_reachable_retrans_time); + } + + if (use_rcnt) + ncec->ncec_rcnt--; + + return (ncec->ncec_ill->ill_reachable_retrans_time); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_announce(ncec_t *ncec) +{ + ill_t *ill; + int err; + uchar_t *sphys_addr, *bcast_addr; + struct in_addr ncec_addr; + boolean_t need_refrele = B_FALSE; + + ASSERT((ncec->ncec_flags & NCE_F_BCAST) == 0); + ASSERT((ncec->ncec_flags & NCE_F_MCAST) == 0); + + if (IS_IPMP(ncec->ncec_ill)) { + /* sent on the cast_ill */ + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + need_refrele = B_TRUE; + } else { + ill = ncec->ncec_ill; + } + + /* + * broadcast an announce to ill_bcast address. + */ + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + bcast_addr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + err = arp_output(ill, ARP_REQUEST, + sphys_addr, (uchar_t *)&ncec_addr, bcast_addr, + (uchar_t *)&ncec_addr, bcast_addr); + + if (need_refrele) + ill_refrele(ill); + return (err != 0); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_probe(ncec_t *ncec) +{ + ill_t *ill; + int err; + struct in_addr ncec_addr; + uchar_t *sphys_addr, *dst_lladdr; + + if (IS_IPMP(ncec->ncec_ill)) { + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + } else { + ill = ncec->ncec_ill; + } + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + dst_lladdr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + err = arp_output(ill, ARP_REQUEST, + sphys_addr, NULL, NULL, (uchar_t *)&ncec_addr, dst_lladdr); + + if (IS_IPMP(ncec->ncec_ill)) + ill_refrele(ill); + return (err != 0); +} + +static mblk_t * +arl_unbind(arl_t *arl) +{ + mblk_t *mp; + + if ((mp = arl->arl_unbind_mp) != NULL) { + arl->arl_unbind_mp = NULL; + arl->arl_state_flags |= ARL_DL_UNBIND_IN_PROGRESS; + } + return (mp); +} + +int 
+arp_ll_down(ill_t *ill) +{ + arl_t *arl; + mblk_t *unbind_mp; + int err = 0; + boolean_t replumb = (ill->ill_replumbing == 1); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_down", ill_t *, ill); + if ((arl = ill_to_arl(ill)) == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_down", arl_t *, arl); + mutex_enter(&arl->arl_lock); + unbind_mp = arl_unbind(arl); + if (unbind_mp != NULL) { + ASSERT(arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS); + DTRACE_PROBE2(arp__unbinding, mblk_t *, unbind_mp, + arl_t *, arl); + err = EINPROGRESS; + if (replumb) + arl->arl_state_flags |= ARL_LL_REPLUMBING; + } + mutex_exit(&arl->arl_lock); + if (unbind_mp != NULL) + arp_dlpi_send(arl, unbind_mp); + arl_refrele(arl); + return (err); +} + +/* ARGSUSED */ +int +arp_close(queue_t *q, int flags) +{ + if (WR(q)->q_next != NULL) { + /* This is a module close */ + return (arp_modclose(q->q_ptr)); + } + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +static int +arp_modclose(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill; + queue_t *q = arl->arl_rq; + mblk_t *mp, *nextmp; + ipsq_t *ipsq = NULL; + + ill = arl_to_ill(arl); + if (ill != NULL) { + if (!ill_waiter_inc(ill)) { + ill_refrele(ill); + } else { + ill_refrele(ill); + if (ipsq_enter(ill, B_FALSE, NEW_OP)) + ipsq = ill->ill_phyint->phyint_ipsq; + ill_waiter_dcr(ill); + } + if (ipsq == NULL) { + /* + * could not enter the ipsq because ill is already + * marked CONDEMNED. + */ + ill = NULL; + } + } + if (ai != NULL && ipsq == NULL) { + /* + * Either we did not get an ill because it was marked CONDEMNED + * or we could not enter the ipsq because it was unplumbing. + * In both cases, wait for the ill to complete ip_modclose(). + * + * If the arp_modclose happened even before SLIFNAME, the ai + * itself would be NULL, in which case we can complete the close + * without waiting. 
+ */ + mutex_enter(&ai->ai_lock); + while (ai->ai_ill != NULL) + cv_wait(&ai->ai_ill_unplumb_done, &ai->ai_lock); + mutex_exit(&ai->ai_lock); + } + ASSERT(ill == NULL || IAM_WRITER_ILL(ill)); + + mutex_enter(&arl->arl_lock); + /* + * If the ill had completed unplumbing before arp_modclose(), there + * would be no ill (and therefore, no ipsq) to serialize arp_modclose() + * so that we need to explicitly check for ARL_CONDEMNED and back off + * if it is set. + */ + if ((arl->arl_state_flags & ARL_CONDEMNED) != 0) { + mutex_exit(&arl->arl_lock); + ASSERT(ipsq == NULL); + return (0); + } + arl->arl_state_flags |= ARL_CONDEMNED; + + /* + * send out all pending dlpi messages, don't wait for the ack (which + * will be ignored in arp_rput when CONDEMNED is set) + * + * We have to check for pending DL_UNBIND_REQ because, in the case + * that ip_modclose() executed before arp_modclose(), the call to + * ill_delete_tail->ipif_arp_down() would have triggered a + * DL_UNBIND_REQ. When arp_modclose() executes ipsq_enter() will fail + * (since ip_modclose() is in the ipsq) but the DL_UNBIND_ACK may not + * have been processed yet. In this scenario, we cannot reset + * arl_dlpi_pending, because the setting/clearing of arl_state_flags + * related to unbind, and the associated cv_waits must be allowed to + * continue. + */ + if (arl->arl_dlpi_pending != DL_UNBIND_REQ) + arl->arl_dlpi_pending = DL_PRIM_INVAL; + mp = arl->arl_dlpi_deferred; + arl->arl_dlpi_deferred = NULL; + mutex_exit(&arl->arl_lock); + + for (; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + mp->b_next = NULL; + putnext(arl->arl_wq, mp); + } + + /* Wait for data paths to quiesce */ + mutex_enter(&arl->arl_lock); + while (arl->arl_refcnt != 0) + cv_wait(&arl->arl_cv, &arl->arl_lock); + + /* + * unbind, so that nothing else can come up from driver. 
+ */ + mp = arl_unbind(arl); + mutex_exit(&arl->arl_lock); + if (mp != NULL) + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + + /* wait for unbind ack */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + + qprocsoff(q); + + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + } + + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + ai->ai_arl = NULL; + if (ai->ai_ill == NULL) { + mutex_destroy(&ai->ai_lock); + kmem_free(ai, sizeof (*ai)); + } else { + mutex_exit(&ai->ai_lock); + } + } + + /* free up the rest */ + arp_mod_close_tail(arl); + + q->q_ptr = WR(q)->q_ptr = NULL; + + if (ipsq != NULL) + ipsq_exit(ipsq); + + return (0); +} + +static void +arp_mod_close_tail(arl_t *arl) +{ + ip_stack_t *ipst = arl->arl_ipst; + mblk_t **mpp; + + netstack_hold(ipst->ips_netstack); + + mutex_enter(&ipst->ips_ip_mi_lock); + mi_close_unlink(&ipst->ips_arp_g_head, (IDP)arl); + mutex_exit(&ipst->ips_ip_mi_lock); + + /* + * credp could be null if the open didn't succeed and ip_modopen + * itself calls ip_close. + */ + if (arl->arl_credp != NULL) + crfree(arl->arl_credp); + + /* Free all retained control messages. */ + mpp = &arl->arl_first_mp_to_free; + do { + while (mpp[0]) { + mblk_t *mp; + mblk_t *mp1; + + mp = mpp[0]; + mpp[0] = mp->b_next; + for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { + mp1->b_next = NULL; + mp1->b_prev = NULL; + } + freemsg(mp); + } + } while (mpp++ != &arl->arl_last_mp_to_free); + + netstack_rele(ipst->ips_netstack); + mi_free(arl->arl_name); + mi_close_free((IDP)arl); +} + +/* + * DAD failed. Tear down ipifs with the specified srce address. Note that + * tearing down the ipif also meas deleting the ncec through ipif_down, + * so it is not possible to use nce_timer for recovery. Instead we start + * a timer on the ipif. Caller has to free the mp. 
+ */ +void +arp_failure(mblk_t *mp, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + + if ((mp = copymsg(mp)) != NULL) { + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + } +} + +/* + * This is for exclusive changes due to ARP. Tear down an interface due + * to AR_CN_FAILED and AR_CN_BOGON. + */ +/* ARGSUSED */ +static void +arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + arh_t *arh; + ipaddr_t src; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + uchar_t *haddr; + uint_t haddrlen; + + /* first try src = ar$spa */ + arh = (arh_t *)mp->b_rptr; + bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); + + haddrlen = arh->arh_hlen; + haddr = (uint8_t *)(arh + 1); + + if (haddrlen == ill->ill_phys_addr_length) { + /* + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. For IPMP, we may + * see reflections across any ill in the illgrp. + */ + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && ill->ill_grp != NULL && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, + haddrlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto ignore_conflict; + } + rw_exit(&ipst->ips_ill_g_lock); + } + + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr(src, ill, ALL_ZONES, ipst); + if (ipif == NULL) + goto ignore_conflict; + + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; + + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } + + /* + * If we failed on a recovery probe, then restart the timer to + * try again later. 
+ */ + if (!ipif->ipif_was_dup) { + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + char ibuf[LIFNAMSIZ]; + + (void) mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, sbuf, hbuf); + } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + (void) ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); + +ignore_conflict: + freemsg(mp); +} + +/* + * This is a place for a dtrace hook. + * Note that mp can be either the DL_UNITDATA_IND with a b_cont payload, + * or just the ARP packet payload as an M_DATA. + */ +/* ARGSUSED */ +static void +arp_drop_packet(const char *str, mblk_t *mp, ill_t *ill) +{ + freemsg(mp); +} + +static boolean_t +arp_over_driver(queue_t *q) +{ + queue_t *qnext = STREAM(q)->sd_wrq->q_next; + + /* + * check if first module below stream head is IP or UDP. + */ + ASSERT(qnext != NULL); + if (strcmp(Q2NAME(qnext), "ip") != 0 && + strcmp(Q2NAME(qnext), "udp") != 0) { + /* + * module below is not ip or udp, so arp has been pushed + * on the driver. 
+ */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +arp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + + ASSERT(sflag & MODOPEN); + if (!arp_over_driver(q)) { + q->q_qinfo = dummymodinfo.st_rdinit; + WR(q)->q_qinfo = dummymodinfo.st_wrinit; + return ((*dummymodinfo.st_rdinit->qi_qopen)(q, devp, flag, + sflag, credp)); + } + err = arp_modopen(q, devp, flag, sflag, credp); + return (err); +} + +/* + * In most cases we must be a writer on the IP stream before coming to + * arp_dlpi_send(), to serialize DLPI sends to the driver. The exceptions + * when we are not a writer are very early duing initialization (in + * arl_init, before the arl has done a SLIFNAME, so that we don't yet know + * the associated ill) or during arp_mod_close, when we could not enter the + * ipsq because the ill has already unplumbed. + */ +static void +arp_dlpi_send(arl_t *arl, mblk_t *mp) +{ + mblk_t **mpp; + t_uscalar_t prim; + arl_ill_common_t *ai; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + +#ifdef DEBUG + ai = arl->arl_common; + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + if (ai->ai_ill != NULL) + ASSERT(IAM_WRITER_ILL(ai->ai_ill)); + mutex_exit(&ai->ai_lock); + } +#endif /* DEBUG */ + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending != DL_PRIM_INVAL) { + /* Must queue message. Tail insertion */ + mpp = &arl->arl_dlpi_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&arl->arl_lock); + return; + } + mutex_exit(&arl->arl_lock); + if ((prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive) + == DL_BIND_REQ) { + ASSERT((arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) == 0); + } + /* + * No need to take the arl_lock to examine ARL_CONDEMNED at this point + * because the only thread that can see ARL_CONDEMNED here is the + * closing arp_modclose() thread which sets the flag after becoming a + * writer on the ipsq. 
Threads from IP must have finished and + * cannot be active now. + */ + if (!(arl->arl_state_flags & ARL_CONDEMNED) || + (prim == DL_UNBIND_REQ)) { + if (prim != DL_NOTIFY_CONF) { + ill_t *ill = arl_to_ill(arl); + + arl->arl_dlpi_pending = prim; + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + } + } + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_dlpi_send", + char *, dl_primstr(prim), char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp); +} + +static void +arl_defaults_common(arl_t *arl, mblk_t *mp) +{ + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + /* + * Till the ill is fully up the ill is not globally visible. + * So no need for a lock. + */ + arl->arl_mactype = dlia->dl_mac_type; + arl->arl_sap_length = dlia->dl_sap_length; + + if (!arl->arl_dlpi_style_set) { + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_dlpi_style_set == 0); + arl->arl_dlpi_style_set = 1; + arl->arl_state_flags &= ~ARL_LL_SUBNET_PENDING; + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + } +} + +int +arl_init(queue_t *q, arl_t *arl) +{ + mblk_t *info_mp; + dl_info_req_t *dlir; + + /* subset of ill_init */ + mutex_init(&arl->arl_lock, NULL, MUTEX_DEFAULT, 0); + + arl->arl_rq = q; + arl->arl_wq = WR(q); + + info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (info_mp == NULL) + return (ENOMEM); + /* + * allocate sufficient space to contain device name. + */ + arl->arl_name = (char *)(mi_zalloc(2 * LIFNAMSIZ)); + arl->arl_ppa = UINT_MAX; + arl->arl_state_flags |= (ARL_LL_SUBNET_PENDING | ARL_LL_UNBOUND); + + /* Send down the Info Request to the driver. 
*/ + info_mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)info_mp->b_rptr; + info_mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + arl->arl_dlpi_pending = DL_PRIM_INVAL; + qprocson(q); + + arp_dlpi_send(arl, info_mp); + return (0); +} + +int +arl_wait_for_info_ack(arl_t *arl) +{ + int err; + + mutex_enter(&arl->arl_lock); + while (arl->arl_state_flags & ARL_LL_SUBNET_PENDING) { + /* + * Return value of 0 indicates a pending signal. + */ + err = cv_wait_sig(&arl->arl_cv, &arl->arl_lock); + if (err == 0) { + mutex_exit(&arl->arl_lock); + return (EINTR); + } + } + mutex_exit(&arl->arl_lock); + /* + * ip_rput_other could have set an error in ill_error on + * receipt of M_ERROR. + */ + return (arl->arl_error); +} + +void +arl_set_muxid(ill_t *ill, int muxid) +{ + arl_t *arl; + + arl = ill_to_arl(ill); + if (arl != NULL) { + arl->arl_muxid = muxid; + arl_refrele(arl); + } +} + +int +arl_get_muxid(ill_t *ill) +{ + arl_t *arl; + int muxid = 0; + + arl = ill_to_arl(ill); + if (arl != NULL) { + muxid = arl->arl_muxid; + arl_refrele(arl); + } + return (muxid); +} + +static int +arp_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + zoneid_t zoneid; + netstack_t *ns; + ip_stack_t *ipst; + arl_t *arl = NULL; + + /* + * Prevent unprivileged processes from pushing IP so that + * they can't send raw IP. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + ipst = ns->netstack_ip; + ASSERT(ipst != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make IP operate as if in the global zone. 
+ */ + if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + + arl = (arl_t *)mi_open_alloc_sleep(sizeof (arl_t)); + q->q_ptr = WR(q)->q_ptr = arl; + arl->arl_ipst = ipst; + arl->arl_zoneid = zoneid; + err = arl_init(q, arl); + + if (err != 0) { + mi_free(arl->arl_name); + mi_free(arl); + netstack_rele(ipst->ips_netstack); + q->q_ptr = NULL; + WR(q)->q_ptr = NULL; + return (err); + } + + /* + * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent. + */ + err = arl_wait_for_info_ack(arl); + if (err == 0) + arl->arl_credp = credp; + else + goto fail; + + crhold(credp); + + mutex_enter(&ipst->ips_ip_mi_lock); + err = mi_open_link(&ipst->ips_arp_g_head, (IDP)q->q_ptr, devp, flag, + sflag, credp); + mutex_exit(&ipst->ips_ip_mi_lock); +fail: + if (err) { + (void) arp_close(q, 0); + return (err); + } + return (0); +} + +/* + * Notify any downstream modules (esp softmac and hitbox) of the name + * of this interface using an M_CTL. + */ +static void +arp_ifname_notify(arl_t *arl) +{ + mblk_t *mp1, *mp2; + struct iocblk *iocp; + struct lifreq *lifr; + + if ((mp1 = mkiocb(SIOCSLIFNAME)) == NULL) + return; + if ((mp2 = allocb(sizeof (struct lifreq), BPRI_HI)) == NULL) { + freemsg(mp1); + return; + } + + lifr = (struct lifreq *)mp2->b_rptr; + mp2->b_wptr += sizeof (struct lifreq); + bzero(lifr, sizeof (struct lifreq)); + + (void) strncpy(lifr->lifr_name, arl->arl_name, LIFNAMSIZ); + lifr->lifr_ppa = arl->arl_ppa; + lifr->lifr_flags = ILLF_IPV4; + + /* Use M_CTL to avoid confusing anyone else who might be listening. 
*/ + DB_TYPE(mp1) = M_CTL; + mp1->b_cont = mp2; + iocp = (struct iocblk *)mp1->b_rptr; + iocp->ioc_count = msgsize(mp1->b_cont); + DTRACE_PROBE4(arl__dlpi, char *, "arp_ifname_notify", + char *, "SIOCSLIFNAME", char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp1); +} + +void +arp_send_replumb_conf(ill_t *ill) +{ + mblk_t *mp; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + /* + * arl_got_replumb and arl_got_unbind to be cleared after we complete + * arp_cmd_done. + */ + mp = mexchange(NULL, NULL, sizeof (dl_notify_conf_t), M_PROTO, + DL_NOTIFY_CONF); + ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification = + DL_NOTE_REPLUMB_DONE; + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_REPLUMBING; + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} + +/* + * The unplumb code paths call arp_unbind_complete() to make sure that it is + * safe to tear down the ill. We wait for DL_UNBIND_ACK to complete, and also + * for the arl_refcnt to fall to one so that, when we return from + * arp_unbind_complete(), we know for certain that there are no threads in + * arp_rput() that might access the arl_ill. + */ +void +arp_unbind_complete(ill_t *ill) +{ + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + mutex_enter(&arl->arl_lock); + /* + * wait for unbind ack and arl_refcnt to drop to 1. Note that the + * quiescent arl_refcnt for this function is 1 (and not 0) because + * ill_to_arl() will itself return after taking a ref on the arl_t. 
+ */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + while (arl->arl_refcnt != 1) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c new file mode 100644 index 0000000000..a46a82c85f --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -0,0 +1,1338 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. 
*/ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +/* + * Release a reference on ip_xmit_attr. 
+ * The reference is acquired by conn_get_ixa() + */ +#define IXA_REFRELE(ixa) \ +{ \ + if (atomic_add_32_nv(&(ixa)->ixa_refcnt, -1) == 0) \ + ixa_inactive(ixa); \ +} + +#define IXA_REFHOLD(ixa) \ +{ \ + ASSERT((ixa)->ixa_refcnt != 0); \ + atomic_add_32(&(ixa)->ixa_refcnt, 1); \ +} + +/* + * When we need to handle a transmit side asynchronous operation, then we need + * to save sufficient information so that we can call the fragment and postfrag + * functions. That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. + */ +typedef struct ixamblk_s { + boolean_t ixm_inbound; /* B_FALSE */ + iaflags_t ixm_flags; /* ixa_flags */ + netstackid_t ixm_stackid; /* Verify it didn't go away */ + uint_t ixm_ifindex; /* Used to find the nce */ + in6_addr_t ixm_nceaddr_v6; /* Used to find nce */ +#define ixm_nceaddr_v4 V4_PART_OF_V6(ixm_nceaddr_v6) + uint32_t ixm_fragsize; + uint_t ixm_pktlen; + uint16_t ixm_ip_hdr_length; /* Points to ULP header */ + uint8_t ixm_protocol; /* Protocol number for ULP cksum */ + pfirepostfrag_t ixm_postfragfn; + + zoneid_t ixm_zoneid; /* Needed for ipobs */ + zoneid_t ixm_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ + + uint_t ixm_scopeid; /* For IPv6 link-locals */ + + uint32_t ixm_ident; /* For IPv6 fragment header */ + uint32_t ixm_xmit_hint; + + cred_t *ixm_cred; /* For getpeerucred - refhold if set */ + pid_t ixm_cpid; /* For getpeerucred */ + + ts_label_t *ixm_tsl; /* Refhold if set. */ + + /* + * When the pointers below are set they have a refhold on the struct. + */ + ipsec_latch_t *ixm_ipsec_latch; + struct ipsa_s *ixm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *ixm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_policy_s *ixm_ipsec_policy; /* why are we here? 
*/ + struct ipsec_action_s *ixm_ipsec_action; /* For reflected packets */ + + ipsa_ref_t ixm_ipsec_ref[2]; /* Soft reference to SA */ + + /* Need these while waiting for SA */ + uint16_t ixm_ipsec_src_port; /* Source port number of d-gram. */ + uint16_t ixm_ipsec_dst_port; /* Destination port number of d-gram. */ + uint8_t ixm_ipsec_icmp_type; /* ICMP type of d-gram */ + uint8_t ixm_ipsec_icmp_code; /* ICMP code of d-gram */ + + sa_family_t ixm_ipsec_inaf; /* Inner address family */ + uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ + uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ + uint8_t ixm_ipsec_insrcpfx; /* Inner source prefix */ + uint8_t ixm_ipsec_indstpfx; /* Inner destination prefix */ + + uint8_t ixm_ipsec_proto; /* IP protocol number for d-gram. */ +} ixamblk_t; + + +/* + * When we need to handle a receive side asynchronous operation, then we need + * to save sufficient information so that we can call ip_fanout. + * That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. 
+ */ +typedef struct iramblk_s { + boolean_t irm_inbound; /* B_TRUE */ + iaflags_t irm_flags; /* ira_flags */ + netstackid_t irm_stackid; /* Verify it didn't go away */ + uint_t irm_ifindex; /* To find ira_ill */ + + uint_t irm_rifindex; /* ira_rifindex */ + uint_t irm_ruifindex; /* ira_ruifindex */ + uint_t irm_pktlen; + uint16_t irm_ip_hdr_length; /* Points to ULP header */ + uint8_t irm_protocol; /* Protocol number for ULP cksum */ + zoneid_t irm_zoneid; /* ALL_ZONES unless local delivery */ + + squeue_t *irm_sqp; + ill_rx_ring_t *irm_ring; + + ipaddr_t irm_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ + zoneid_t irm_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ + uint32_t irm_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ + + char irm_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ + + cred_t *irm_cred; /* For getpeerucred - refhold if set */ + pid_t irm_cpid; /* For getpeerucred */ + + ts_label_t *irm_tsl; /* Refhold if set. */ + + /* + * When set these correspond to a refhold on the object. + */ + struct ipsa_s *irm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *irm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_action_s *irm_ipsec_action; /* For reflected packets */ +} iramblk_t; + + +/* + * Take the information in ip_xmit_attr_t and stick it in an mblk + * that can later be passed to ip_xmit_attr_from_mblk to recreate the + * ip_xmit_attr_t. + * + * Returns NULL on memory allocation failure. 
+ */ +mblk_t * +ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa) +{ + mblk_t *ixamp; + ixamblk_t *ixm; + nce_t *nce = ixa->ixa_nce; + + ASSERT(nce != NULL); + ixamp = allocb(sizeof (*ixm), BPRI_MED); + if (ixamp == NULL) + return (NULL); + + ixamp->b_datap->db_type = M_BREAK; + ixamp->b_wptr += sizeof (*ixm); + ixm = (ixamblk_t *)ixamp->b_rptr; + + bzero(ixm, sizeof (*ixm)); + ixm->ixm_inbound = B_FALSE; + ixm->ixm_flags = ixa->ixa_flags; + ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid; + ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex; + ixm->ixm_nceaddr_v6 = nce->nce_addr; + ixm->ixm_fragsize = ixa->ixa_fragsize; + ixm->ixm_pktlen = ixa->ixa_pktlen; + ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length; + ixm->ixm_protocol = ixa->ixa_protocol; + ixm->ixm_postfragfn = ixa->ixa_postfragfn; + ixm->ixm_zoneid = ixa->ixa_zoneid; + ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid; + ixm->ixm_scopeid = ixa->ixa_scopeid; + ixm->ixm_ident = ixa->ixa_ident; + ixm->ixm_xmit_hint = ixa->ixa_xmit_hint; + + if (ixa->ixa_tsl != NULL) { + ixm->ixm_tsl = ixa->ixa_tsl; + label_hold(ixm->ixm_tsl); + } + if (ixa->ixa_cred != NULL) { + ixm->ixm_cred = ixa->ixa_cred; + crhold(ixa->ixa_cred); + } + ixm->ixm_cpid = ixa->ixa_cpid; + + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { + if (ixa->ixa_ipsec_ah_sa != NULL) { + ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); + } + if (ixa->ixa_ipsec_esp_sa != NULL) { + ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); + } + if (ixa->ixa_ipsec_policy != NULL) { + ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy; + IPPOL_REFHOLD(ixa->ixa_ipsec_policy); + } + if (ixa->ixa_ipsec_action != NULL) { + ixm->ixm_ipsec_action = ixa->ixa_ipsec_action; + IPACT_REFHOLD(ixa->ixa_ipsec_action); + } + if (ixa->ixa_ipsec_latch != NULL) { + ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch; + IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); + } + ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0]; + 
ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1]; + ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port; + ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port; + ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type; + ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code; + ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf; + ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0]; + ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1]; + ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2]; + ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3]; + ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0]; + ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1]; + ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2]; + ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3]; + ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx; + ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx; + ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto; + } + return (ixamp); +} + +/* + * Extract the ip_xmit_attr_t from the mblk, checking that the + * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is + * not the case. + * + * Otherwise ixa is updated. + * Caller needs to release references on the ixa by calling ixa_refrele() + * which will imediately call ixa_inactive to release the references. 
+ */ +boolean_t +ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa) +{ + ixamblk_t *ixm; + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + nce_t *nce; + + /* We assume the caller hasn't initialized ixa */ + bzero(ixa, sizeof (*ixa)); + + ASSERT(DB_TYPE(ixamp) == M_BREAK); + ASSERT(ixamp->b_cont == NULL); + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(ixm->ixm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(ixm->ixm_ifindex, + !(ixm->ixm_flags & IXAF_IS_IPV4), ipst); + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + /* + * Find the nce. We don't load-spread (only lookup nce's on the ill) + * because we want to find the same nce as the one we had when + * ip_xmit_attr_to_mblk was called. + */ + if (ixm->ixm_flags & IXAF_IS_IPV4) { + nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4); + } else { + nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6); + } + + /* We have the nce, hence the ill can't go away */ + ill_refrele(ill); + if (nce == NULL) { + /* + * Since this is unusual and we don't know what type of + * nce it was, we drop the packet. 
+ */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + + ixa->ixa_flags = ixm->ixm_flags; + ixa->ixa_refcnt = 1; + ixa->ixa_ipst = ipst; + ixa->ixa_fragsize = ixm->ixm_fragsize; + ixa->ixa_pktlen = ixm->ixm_pktlen; + ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length; + ixa->ixa_protocol = ixm->ixm_protocol; + ixa->ixa_nce = nce; + ixa->ixa_postfragfn = ixm->ixm_postfragfn; + ixa->ixa_zoneid = ixm->ixm_zoneid; + ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid; + ixa->ixa_scopeid = ixm->ixm_scopeid; + ixa->ixa_ident = ixm->ixm_ident; + ixa->ixa_xmit_hint = ixm->ixm_xmit_hint; + + if (ixm->ixm_tsl != NULL) { + ixa->ixa_tsl = ixm->ixm_tsl; + ixa->ixa_free_flags |= IXA_FREE_TSL; + } + if (ixm->ixm_cred != NULL) { + ixa->ixa_cred = ixm->ixm_cred; + ixa->ixa_free_flags |= IXA_FREE_CRED; + } + ixa->ixa_cpid = ixm->ixm_cpid; + + ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa; + ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa; + ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy; + ixa->ixa_ipsec_action = ixm->ixm_ipsec_action; + ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch; + + ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0]; + ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1]; + ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port; + ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port; + ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type; + ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code; + ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf; + ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0]; + ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1]; + ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2]; + ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3]; + ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0]; + ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1]; + ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2]; + ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3]; + ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx; + ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx; + 
ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto; + + freeb(ixamp); + return (B_TRUE); +} + +/* + * Free the ixm mblk and any references it holds + * Returns b_cont. + */ +mblk_t * +ip_xmit_attr_free_mblk(mblk_t *ixamp) +{ + ixamblk_t *ixm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(ixamp) == M_BREAK); + mp = ixamp->b_cont; + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + if (ixm->ixm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_ah_sa); + ixm->ixm_ipsec_ah_sa = NULL; + } + if (ixm->ixm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_esp_sa); + ixm->ixm_ipsec_esp_sa = NULL; + } + if (ixm->ixm_ipsec_policy != NULL) { + IPPOL_REFRELE(ixm->ixm_ipsec_policy); + ixm->ixm_ipsec_policy = NULL; + } + if (ixm->ixm_ipsec_action != NULL) { + IPACT_REFRELE(ixm->ixm_ipsec_action); + ixm->ixm_ipsec_action = NULL; + } + if (ixm->ixm_ipsec_latch) { + IPLATCH_REFRELE(ixm->ixm_ipsec_latch); + ixm->ixm_ipsec_latch = NULL; + } + + if (ixm->ixm_tsl != NULL) { + label_rele(ixm->ixm_tsl); + ixm->ixm_tsl = NULL; + } + if (ixm->ixm_cred != NULL) { + crfree(ixm->ixm_cred); + ixm->ixm_cred = NULL; + } + freeb(ixamp); + return (mp); +} + +/* + * Take the information in ip_recv_attr_t and stick it in an mblk + * that can later be passed to ip_recv_attr_from_mblk to recreate the + * ip_recv_attr_t. + * + * Returns NULL on memory allocation failure. 
+ */ +mblk_t * +ip_recv_attr_to_mblk(ip_recv_attr_t *ira) +{ + mblk_t *iramp; + iramblk_t *irm; + ill_t *ill = ira->ira_ill; + + ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0); + + iramp = allocb(sizeof (*irm), BPRI_MED); + if (iramp == NULL) + return (NULL); + + iramp->b_datap->db_type = M_BREAK; + iramp->b_wptr += sizeof (*irm); + irm = (iramblk_t *)iramp->b_rptr; + + bzero(irm, sizeof (*irm)); + irm->irm_inbound = B_TRUE; + irm->irm_flags = ira->ira_flags; + if (ill != NULL) { + /* Internal to IP - preserve ip_stack_t, ill and rill */ + irm->irm_stackid = + ill->ill_ipst->ips_netstack->netstack_stackid; + irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex; + ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex == + ira->ira_rifindex); + } else { + /* Let ip_recv_attr_from_stackid know there isn't one */ + irm->irm_stackid = -1; + } + irm->irm_rifindex = ira->ira_rifindex; + irm->irm_ruifindex = ira->ira_ruifindex; + irm->irm_pktlen = ira->ira_pktlen; + irm->irm_ip_hdr_length = ira->ira_ip_hdr_length; + irm->irm_protocol = ira->ira_protocol; + + irm->irm_sqp = ira->ira_sqp; + irm->irm_ring = ira->ira_ring; + + irm->irm_zoneid = ira->ira_zoneid; + irm->irm_mroute_tunnel = ira->ira_mroute_tunnel; + irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid; + irm->irm_esp_udp_ports = ira->ira_esp_udp_ports; + + if (ira->ira_tsl != NULL) { + irm->irm_tsl = ira->ira_tsl; + label_hold(irm->irm_tsl); + } + if (ira->ira_cred != NULL) { + irm->irm_cred = ira->ira_cred; + crhold(ira->ira_cred); + } + irm->irm_cpid = ira->ira_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE); + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (ira->ira_ipsec_ah_sa != NULL) { + irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa; + IPSA_REFHOLD(ira->ira_ipsec_ah_sa); + } + if (ira->ira_ipsec_esp_sa != NULL) { + irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa; + IPSA_REFHOLD(ira->ira_ipsec_esp_sa); + } + if (ira->ira_ipsec_action != NULL) { + 
irm->irm_ipsec_action = ira->ira_ipsec_action; + IPACT_REFHOLD(ira->ira_ipsec_action); + } + } + return (iramp); +} + +/* + * Extract the ip_recv_attr_t from the mblk. If we are used inside IP + * then irm_stackid is not -1, in which case we check that the + * ip_stack_t and ill_t still exist. Returns B_FALSE if that is + * not the case. + * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter) + * and we just proceed with ira_ill and ira_rill as NULL. + * + * The caller needs to release any references on the pointers inside the ire + * by calling ira_cleanup. + */ +boolean_t +ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira) +{ + iramblk_t *irm; + netstack_t *ns; + ip_stack_t *ipst = NULL; + ill_t *ill = NULL, *rill = NULL; + + /* We assume the caller hasn't initialized ira */ + bzero(ira, sizeof (*ira)); + + ASSERT(DB_TYPE(iramp) == M_BREAK); + ASSERT(iramp->b_cont == NULL); + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_stackid != -1) { + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(irm->irm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(irm->irm_ifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + + if (irm->irm_ifindex == irm->irm_rifindex) { + rill = ill; + } else { + rill = ill_lookup_on_ifindex(irm->irm_rifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + } + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL || rill == NULL) { + /* Disappeared on us */ + if (ill != NULL) + ill_refrele(ill); + if (rill != NULL && rill != ill) + ill_refrele(rill); + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + } + + ira->ira_flags = irm->irm_flags; + /* Caller must ill_refele(ira_ill) by using ira_cleanup() */ + ira->ira_ill = ill; + ira->ira_rill = 
rill; + + ira->ira_rifindex = irm->irm_rifindex; + ira->ira_ruifindex = irm->irm_ruifindex; + ira->ira_pktlen = irm->irm_pktlen; + ira->ira_ip_hdr_length = irm->irm_ip_hdr_length; + ira->ira_protocol = irm->irm_protocol; + + ira->ira_sqp = irm->irm_sqp; + /* The rest of IP assumes that the rings never go away. */ + ira->ira_ring = irm->irm_ring; + + ira->ira_zoneid = irm->irm_zoneid; + ira->ira_mroute_tunnel = irm->irm_mroute_tunnel; + ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid; + ira->ira_esp_udp_ports = irm->irm_esp_udp_ports; + + if (irm->irm_tsl != NULL) { + ira->ira_tsl = irm->irm_tsl; + ira->ira_free_flags |= IRA_FREE_TSL; + } + if (irm->irm_cred != NULL) { + ira->ira_cred = irm->irm_cred; + ira->ira_free_flags |= IRA_FREE_CRED; + } + ira->ira_cpid = irm->irm_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE); + + ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa; + ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa; + ira->ira_ipsec_action = irm->irm_ipsec_action; + + freeb(iramp); + return (B_TRUE); +} + +/* + * Free the irm mblk and any references it holds + * Returns b_cont. 
+ */ +mblk_t * +ip_recv_attr_free_mblk(mblk_t *iramp) +{ + iramblk_t *irm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(iramp) == M_BREAK); + mp = iramp->b_cont; + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_ah_sa); + irm->irm_ipsec_ah_sa = NULL; + } + if (irm->irm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_esp_sa); + irm->irm_ipsec_esp_sa = NULL; + } + if (irm->irm_ipsec_action != NULL) { + IPACT_REFRELE(irm->irm_ipsec_action); + irm->irm_ipsec_action = NULL; + } + if (irm->irm_tsl != NULL) { + label_rele(irm->irm_tsl); + irm->irm_tsl = NULL; + } + if (irm->irm_cred != NULL) { + crfree(irm->irm_cred); + irm->irm_cred = NULL; + } + + freeb(iramp); + return (mp); +} + +/* + * Returns true if the mblk contains an ip_recv_attr_t + * For now we just check db_type. + */ +boolean_t +ip_recv_attr_is_mblk(mblk_t *mp) +{ + /* + * Need to handle the various forms of tcp_timermp which are tagged + * with b_wptr and might have a NULL b_datap. 
+ */ + if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1) + return (B_FALSE); + +#ifdef DEBUG + iramblk_t *irm; + + if (DB_TYPE(mp) != M_BREAK) + return (B_FALSE); + + irm = (iramblk_t *)mp->b_rptr; + ASSERT(irm->irm_inbound); + return (B_TRUE); +#else + return (DB_TYPE(mp) == M_BREAK); +#endif +} + +static ip_xmit_attr_t * +conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag) +{ + ip_xmit_attr_t *ixa; + ip_xmit_attr_t *oldixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + if (atomic_add_32_nv(&ixa->ixa_refcnt, 1) == 2) { + /* No other thread using conn_ixa */ + mutex_exit(&connp->conn_lock); + return (ixa); + } + ixa = kmem_alloc(sizeof (*ixa), kmflag); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + + /* Make sure we drop conn_lock before any refrele */ + if (replace) { + ixa->ixa_refcnt++; /* No atomic needed - not visible */ + oldixa = connp->conn_ixa; + connp->conn_ixa = ixa; + mutex_exit(&connp->conn_lock); + IXA_REFRELE(oldixa); /* Undo refcnt from conn_t */ + } else { + oldixa = connp->conn_ixa; + mutex_exit(&connp->conn_lock); + } + IXA_REFRELE(oldixa); /* Undo above atomic_add_32_nv */ + + return (ixa); +} + +/* + * Return an ip_xmit_attr_t to use with a conn_t that ensures that only + * the caller can access the ip_xmit_attr_t. + * + * If nobody else is using conn_ixa we return it. + * Otherwise we make a "safe" copy of conn_ixa + * and return it. The "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. + * + * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t. + * That is used when we connect() the ULP. 
+ */ +ip_xmit_attr_t * +conn_get_ixa(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP)); +} + +/* + * Used only when the option is to have the kernel hang due to not + * cleaning up ixa references on ills etc. + */ +ip_xmit_attr_t * +conn_get_ixa_tryhard(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_SLEEP)); +} + +/* + * Replace conn_ixa with the ixa argument. + * + * The caller must hold conn_lock. + * + * We return the old ixa; the caller must ixa_refrele that after conn_lock + * has been dropped. + */ +ip_xmit_attr_t * +conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa) +{ + ip_xmit_attr_t *oldixa; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + oldixa = connp->conn_ixa; + IXA_REFHOLD(ixa); + connp->conn_ixa = ixa; + return (oldixa); +} + +/* + * Return a ip_xmit_attr_t to use with a conn_t that is based on but + * separate from conn_ixa. + * + * This "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. 
+ */ +ip_xmit_attr_t * +conn_get_ixa_exclusive(conn_t *connp) +{ + ip_xmit_attr_t *ixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + + /* Make sure conn_ixa doesn't disappear while we copy it */ + atomic_add_32(&ixa->ixa_refcnt, 1); + + ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + mutex_exit(&connp->conn_lock); + IXA_REFRELE(connp->conn_ixa); + return (ixa); +} + +void +ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) +{ + bcopy(src, ixa, sizeof (*ixa)); + ixa->ixa_refcnt = 1; + /* + * Clear any pointers that have references and might be changed + * by ip_set_destination or the ULP + */ + ixa->ixa_ire = NULL; + ixa->ixa_nce = NULL; + ixa->ixa_dce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; +#ifdef DEBUG + ixa->ixa_curthread = NULL; +#endif + /* Clear all the IPsec pointers and the flag as well. */ + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + + ixa->ixa_ipsec_latch = NULL; + ixa->ixa_ipsec_ah_sa = NULL; + ixa->ixa_ipsec_esp_sa = NULL; + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_ipsec_action = NULL; + + /* + * We leave ixa_tsl unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) + label_hold(ixa->ixa_tsl); + + /* + * We leave ixa_cred unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_CRED) + crhold(ixa->ixa_cred); +} + +/* + * Duplicate an ip_xmit_attr_t. + * Assumes that the caller controls the ixa, hence we do not need to use + * a safe copy. We just have to increase the refcnt on any pointers. 
 */
ip_xmit_attr_t *
ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
{
	ip_xmit_attr_t *ixa;

	ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
	if (ixa == NULL)
		return (NULL);	/* caller must handle allocation failure */
	bcopy(src_ixa, ixa, sizeof (*ixa));
	ixa->ixa_refcnt = 1;

	/* Take our own hold on every cached pointer we just copied. */
	if (ixa->ixa_ire != NULL)
		ire_refhold_notr(ixa->ixa_ire);
	if (ixa->ixa_nce != NULL)
		nce_refhold(ixa->ixa_nce);
	if (ixa->ixa_dce != NULL)
		dce_refhold_notr(ixa->ixa_dce);

#ifdef DEBUG
	/* The duplicate is not (yet) exclusively owned by any thread. */
	ixa->ixa_curthread = NULL;
#endif

	/* IPsec state is shared by reference; bump each refcnt. */
	if (ixa->ixa_ipsec_latch != NULL)
		IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
	if (ixa->ixa_ipsec_ah_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
	if (ixa->ixa_ipsec_esp_sa != NULL)
		IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
	if (ixa->ixa_ipsec_policy != NULL)
		IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
	if (ixa->ixa_ipsec_action != NULL)
		IPACT_REFHOLD(ixa->ixa_ipsec_action);

	/*
	 * Unlike ixa_safe_copy(), hold tsl/cred unconditionally (when
	 * present) and mark them for release, since the duplicate may
	 * outlive whatever the source's holds were tied to.
	 */
	if (ixa->ixa_tsl != NULL) {
		label_hold(ixa->ixa_tsl);
		ixa->ixa_free_flags |= IXA_FREE_TSL;
	}
	if (ixa->ixa_cred != NULL) {
		crhold(ixa->ixa_cred);
		ixa->ixa_free_flags |= IXA_FREE_CRED;
	}
	return (ixa);
}

/*
 * Used to replace the ixa_label field.
 * The caller should have a reference on the label, which we transfer to
 * the attributes so that when the attribute is freed/cleaned up
 * we will release that reference.
 */
void
ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
{
	ASSERT(tsl != NULL);

	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
	} else {
		/* No old reference to drop; just note we now own one. */
		ixa->ixa_free_flags |= IXA_FREE_TSL;
	}
	ixa->ixa_tsl = tsl;
}

/*
 * Replace the ip_recv_attr_t's label.
 * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 * This can fail (and return B_FALSE) due to lack of memory.
 */
boolean_t
ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
{
	cred_t *newcr;

	if (ira->ira_free_flags & IRA_FREE_TSL) {
		ASSERT(ira->ira_tsl != NULL);
		label_rele(ira->ira_tsl);
	}
	label_hold(tsl);
	ira->ira_tsl = tsl;
	ira->ira_free_flags |= IRA_FREE_TSL;

	/*
	 * Reset zoneid if we have a shared address. That allows
	 * ip_fanout_tx_v4/v6 to determine the zoneid again.
	 */
	if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
		ira->ira_zoneid = ALL_ZONES;

	/* We update ira_cred for RPC */
	newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
	if (newcr == NULL)
		return (B_FALSE);
	/*
	 * NOTE(review): on the failure return above, ira_tsl has already
	 * been replaced while ira_cred has not — confirm callers treat a
	 * B_FALSE return as fatal for the packet so the mismatch is benign.
	 */
	if (ira->ira_free_flags & IRA_FREE_CRED)
		crfree(ira->ira_cred);
	ira->ira_cred = newcr;
	ira->ira_free_flags |= IRA_FREE_CRED;
	return (B_TRUE);
}

/*
 * This needs to be called after ip_set_destination/tsol_check_dest might
 * have changed ixa_tsl to be specific for a destination, and we now want to
 * send to a different destination.
 * We have to restart with crgetlabel() since ip_set_destination/
 * tsol_check_dest will start with ixa_tsl.
 */
void
ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
{
	if (!is_system_labeled())
		return;

	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
	}
	/* No hold taken: the label now borrows the credential's reference. */
	ixa->ixa_tsl = crgetlabel(cr);
}

/*
 * Release a reference on an ixa; the IXA_REFRELE macro frees it via
 * ixa_inactive() when the last reference is dropped.
 */
void
ixa_refrele(ip_xmit_attr_t *ixa)
{
	IXA_REFRELE(ixa);
}

/*
 * Called once the refcnt has dropped to zero: release everything the
 * ixa points at and free the structure itself.
 */
void
ixa_inactive(ip_xmit_attr_t *ixa)
{
	ASSERT(ixa->ixa_refcnt == 0);

	ixa_cleanup(ixa);
	kmem_free(ixa, sizeof (*ixa));
}

/*
 * Release any references contained in the ixa.
 * Also clear any fields that are not controlled by ixa_flags.
 */
void
ixa_cleanup(ip_xmit_attr_t *ixa)
{
	/* Drop cached routing state: IRE, DCE, and NCE references. */
	if (ixa->ixa_ire != NULL) {
		ire_refrele_notr(ixa->ixa_ire);
		ixa->ixa_ire = NULL;
	}
	if (ixa->ixa_dce != NULL) {
		dce_refrele_notr(ixa->ixa_dce);
		ixa->ixa_dce = NULL;
	}
	if (ixa->ixa_nce != NULL) {
		nce_refrele(ixa->ixa_nce);
		ixa->ixa_nce = NULL;
	}
	/* Force the next sender to re-verify/re-establish the pointers. */
	ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
	if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
		ipsec_out_release_refs(ixa);
	}
	if (ixa->ixa_free_flags & IXA_FREE_TSL) {
		ASSERT(ixa->ixa_tsl != NULL);
		label_rele(ixa->ixa_tsl);
		ixa->ixa_tsl = NULL;
		ixa->ixa_free_flags &= ~IXA_FREE_TSL;
	}
	if (ixa->ixa_free_flags & IXA_FREE_CRED) {
		ASSERT(ixa->ixa_cred != NULL);
		crfree(ixa->ixa_cred);
		ixa->ixa_cred = NULL;
		ixa->ixa_free_flags &= ~IXA_FREE_CRED;
	}
	/* Reset per-send fields that are not covered by ixa_flags. */
	ixa->ixa_src_preferences = 0;
	ixa->ixa_ifindex = 0;
	ixa->ixa_multicast_ifindex = 0;
	ixa->ixa_multicast_ifaddr = INADDR_ANY;
}

/*
 * Release any references contained in the ira.
 * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
 * argument.
 */
void
ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
{
	if (ira->ira_ill != NULL) {
		if (ira->ira_rill != ira->ira_ill) {
			/* Caused by async processing */
			ill_refrele(ira->ira_rill);
		}
		if (refrele_ill)
			ill_refrele(ira->ira_ill);
	}
	if (ira->ira_flags & IRAF_IPSEC_SECURE) {
		ipsec_in_release_refs(ira);
	}
	if (ira->ira_free_flags & IRA_FREE_TSL) {
		ASSERT(ira->ira_tsl != NULL);
		label_rele(ira->ira_tsl);
		ira->ira_tsl = NULL;
		ira->ira_free_flags &= ~IRA_FREE_TSL;
	}
	if (ira->ira_free_flags & IRA_FREE_CRED) {
		ASSERT(ira->ira_cred != NULL);
		crfree(ira->ira_cred);
		ira->ira_cred = NULL;
		ira->ira_free_flags &= ~IRA_FREE_CRED;
	}
}

/*
 * Function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * The caller is responsible for any serialization which is different
 * for TCP, SCTP, and others.
 */
static void
ixa_cleanup_stale(ip_xmit_attr_t *ixa)
{
	ire_t *ire;
	nce_t *nce;
	dce_t *dce;

	ire = ixa->ixa_ire;
	nce = ixa->ixa_nce;
	dce = ixa->ixa_dce;

	if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
		/* Substitute a blackhole IRE so ixa_ire is never NULL. */
		ire_refrele_notr(ire);
		ire = ire_blackhole(ixa->ixa_ipst,
		    !(ixa->ixa_flags & IXAF_IS_IPV4));
		ASSERT(ire != NULL);
#ifdef DEBUG
		/*
		 * Presumably converts the tracked hold from ire_blackhole()
		 * into the untracked kind the ixa keeps — confirm against
		 * the reference-tracing convention.
		 */
		ire_refhold_notr(ire);
		ire_refrele(ire);
#endif
		ixa->ixa_ire = ire;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}
	if (nce != NULL && nce->nce_is_condemned) {
		/* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
		nce_refrele(nce);
		ixa->ixa_nce = NULL;
		ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
	}
	if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
		/* The default DCE always exists, so ixa_dce stays non-NULL. */
		dce_refrele_notr(dce);
		dce = dce_get_default(ixa->ixa_ipst);
		ASSERT(dce != NULL);
#ifdef DEBUG
		dce_refhold_notr(dce);
		dce_refrele(dce);
#endif
		ixa->ixa_dce = dce;
		ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
	}
}

/*
 * Used to run ixa_cleanup_stale inside the tcp squeue.
 * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
 * and waking up the caller.
 */
/* ARGSUSED2 */
static void
tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy)
{
	conn_t *connp = (conn_t *)arg;
	tcp_stack_t *tcps;

	tcps = connp->conn_netstack->netstack_tcp;

	ixa_cleanup_stale(connp->conn_ixa);

	/* Return the mblk and wake the thread waiting in conn_ixa_cleanup. */
	mutex_enter(&tcps->tcps_ixa_cleanup_lock);
	ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
	tcps->tcps_ixa_cleanup_mp = mp;
	cv_signal(&tcps->tcps_ixa_cleanup_cv);
	mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}


/*
 * ipcl_walk() function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * Note that we can't cleanup the pointers since there can be threads
 * in conn_ip_output() sending while we are called.
+ */ +void +conn_ixa_cleanup(conn_t *connp, void *arg) +{ + boolean_t tryhard = (boolean_t)arg; + + if (IPCL_IS_TCP(connp)) { + mblk_t *mp; + tcp_stack_t *tcps; + + tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + if (connp->conn_sqp->sq_run == curthread) { + /* Already on squeue */ + tcp_ixa_cleanup(connp, mp, NULL, NULL); + } else { + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, + connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); + + /* Wait until tcp_ixa_cleanup has run */ + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while (tcps->tcps_ixa_cleanup_mp == NULL) { + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + } + } else if (IPCL_IS_SCTP(connp)) { + sctp_t *sctp; + sctp_faddr_t *fp; + + sctp = CONN2SCTP(connp); + RUN_SCTP(sctp); + ixa_cleanup_stale(connp->conn_ixa); + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) + ixa_cleanup_stale(fp->ixa); + WAKE_SCTP(sctp); + } else { + ip_xmit_attr_t *ixa; + + /* + * If there is a different thread using conn_ixa then we get a + * new copy and cut the old one loose from conn_ixa. Otherwise + * we use conn_ixa and prevent any other thread from + * using/changing it. Anybody using conn_ixa (e.g., a thread in + * conn_ip_output) will do an ixa_refrele which will remove any + * references on the ire etc. + * + * Once we are done other threads can use conn_ixa since the + * refcnt will be back at one. + * + * We are called either because an ill is going away, or + * due to memory reclaim. 
In the former case we wait for + * memory since we must remove the refcnts on the ill. + */ + if (tryhard) { + ixa = conn_get_ixa_tryhard(connp, B_TRUE); + ASSERT(ixa != NULL); + } else { + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) { + /* + * Somebody else was using it and kmem_alloc + * failed! Next memory reclaim will try to + * clean up. + */ + DTRACE_PROBE1(conn__ixa__cleanup__bail, + conn_t *, connp); + return; + } + } + ixa_cleanup_stale(ixa); + ixa_refrele(ixa); + } +} + +/* + * ixa needs to be an exclusive copy so that no one changes the cookie + * or the ixa_nce. + */ +boolean_t +ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa) +{ + uintptr_t cookie = ixa->ixa_cookie; + ill_dld_direct_t *idd; + idl_tx_list_t *idl_txl; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t inserted = B_FALSE; + + idd = &(ill)->ill_dld_capab->idc_direct; + idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + if (cookie == 0) { + /* + * ip_xmit failed the canputnext check + */ + connp->conn_did_putbq = 1; + ASSERT(cookie == 0); + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + return (B_TRUE); + } + ASSERT(ILL_DIRECT_CAPABLE(ill)); + mutex_enter(&idl_txl->txl_lock); + if (connp->conn_direct_blocked || + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0)) { + DTRACE_PROBE1(ill__tx__not__blocked, boolean, + connp->conn_direct_blocked); + } else if (idl_txl->txl_cookie != NULL && + idl_txl->txl_cookie != ixa->ixa_cookie) { + DTRACE_PROBE2(ill__send__tx__collision, uintptr_t, cookie, + uintptr_t, idl_txl->txl_cookie); + /* bump kstat for cookie collision */ + } else { + connp->conn_direct_blocked = B_TRUE; + idl_txl->txl_cookie = cookie; + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + inserted = B_TRUE; + } + mutex_exit(&idl_txl->txl_lock); + return (inserted); +} diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c 
new file mode 100644 index 0000000000..839c5ae0d0 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_dce.c @@ -0,0 +1,873 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/snmpcom.h> + +#include <netinet/ip6.h> +#include <netinet/icmp6.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ipclassifier.h> +#include <inet/ip_listutils.h> + +#include <sys/sunddi.h> + +/* + * Routines for handling destination cache entries. + * There is always one DCEF_DEFAULT for each ip_stack_t created at init time. 
+ * That entry holds both the IP ident value and the dce generation number. + * + * Any time a DCE is changed significantly (different path MTU, but NOT + * different ULP info!), the dce_generation number is increased. + * Also, when a new DCE is created, the dce_generation number in the default + * DCE is bumped. That allows the dce_t information to be cached efficiently + * as long as the entity caching the dce_t also caches the dce_generation, + * and compares the cached generation to detect any changes. + * Furthermore, when a DCE is deleted, if there are any outstanding references + * to the DCE it will be marked as condemned. The condemned mark is + * a designated generation number which is never otherwise used, hence + * the single comparison with the generation number captures that as well. + * + * An example of code which caches is as follows: + * + * if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) { + * The DCE has changed + * mystruct->my_dce = dce_lookup_pkt(mp, ixa, + * &mystruct->my_dce_generation); + * Not needed in practice, since we have the default DCE: + * if (DCE_IS_CONDEMNED(mystruct->my_dce)) + * return failure; + * } + * + * Note that for IPv6 link-local addresses we record the ifindex since the + * link-locals are not globally unique. + */ + +/* + * Hash bucket structure for DCEs + */ +typedef struct dcb_s { + krwlock_t dcb_lock; + uint32_t dcb_cnt; + dce_t *dcb_dce; +} dcb_t; + +static void dce_delete_locked(dcb_t *, dce_t *); +static void dce_make_condemned(dce_t *); + +static kmem_cache_t *dce_cache; + + +/* Operates on a uint64_t */ +#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) + +/* + * Reclaim a fraction of dce's in the dcb. + * For now we have a higher probability to delete DCEs without DCE_PMTU. 
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
	/* PMTU entries are 4x less likely to be reclaimed (1/(4*fraction)) */
	uint_t fraction_pmtu = fraction*4;
	uint_t hash;
	dce_t *dce, *nextdce;

	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) {
		nextdce = dce->dce_next;
		/* Clear DCEF_PMTU if the pmtu is too old */
		mutex_enter(&dce->dce_lock);
		if ((dce->dce_flags & DCEF_PMTU) &&
		    TICK_TO_SEC(lbolt64) - dce->dce_last_change_time >
		    ipst->ips_ip_pathmtu_interval) {
			dce->dce_flags &= ~DCEF_PMTU;
			mutex_exit(&dce->dce_lock);
			/* Let cached users notice the PMTU went away */
			dce_increment_generation(dce);
		} else {
			mutex_exit(&dce->dce_lock);
		}
		/* Pseudo-random selection keyed off the dce's address */
		hash = RANDOM_HASH((uint64_t)(uintptr_t)dce);
		if (dce->dce_flags & DCEF_PMTU) {
			if (hash % fraction_pmtu != 0)
				continue;
		} else {
			if (hash % fraction != 0)
				continue;
		}

		IP_STAT(ipst, ip_dce_reclaim_deleted);
		dce_delete_locked(dcb, dce);
		dce_refrele(dce);
	}
	rw_exit(&dcb->dcb_lock);
}

/*
 * Reclaim a fraction of the DCEs in every bucket of one IP stack, then
 * get CONNs to drop any stale cached references; invoked from the
 * dce_cache kmem reclaim callback (ip_dce_reclaim) under memory pressure.
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
	int i;

	IP_STAT(ipst, ip_dce_reclaim_calls);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb_reclaim(&ipst->ips_dce_hash_v4[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);

		dcb_reclaim(&ipst->ips_dce_hash_v6[i], ipst,
		    ipst->ips_ip_dce_reclaim_fraction);
	}

	/*
	 * Walk all CONNs that can have a reference on an ire, nce or dce.
	 * Get them to update any stale references to drop any refholds they
	 * have.
	 */
	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Called by the memory allocator subsystem directly, when the system
 * is running low on memory.
 */
/* ARGSUSED */
void
ip_dce_reclaim(void *args)
{
	netstack_handle_t nh;
	netstack_t *ns;

	/* Reclaim in every network stack instance on the system. */
	netstack_next_init(&nh);
	while ((ns = netstack_next(&nh)) != NULL) {
		ip_dce_reclaim_stack(ns->netstack_ip);
		netstack_rele(ns);
	}
	netstack_next_fini(&nh);
}

/*
 * Create the global dce kmem cache; ip_dce_reclaim is registered as the
 * cache's reclaim callback for memory pressure.
 */
void
dce_g_init(void)
{
	dce_cache = kmem_cache_create("dce_cache",
	    sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0);
}

void
dce_g_destroy(void)
{
	kmem_cache_destroy(dce_cache);
}


/*
 * Allocate a default DCE and a hash table for per-IP address DCEs
 */
void
dce_stack_init(ip_stack_t *ipst)
{
	int i;

	ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
	bzero(ipst->ips_dce_default, sizeof (dce_t));
	ipst->ips_dce_default->dce_flags = DCEF_DEFAULT;
	ipst->ips_dce_default->dce_generation = DCE_GENERATION_INITIAL;
	ipst->ips_dce_default->dce_last_change_time = TICK_TO_SEC(lbolt64);
	ipst->ips_dce_default->dce_refcnt = 1;	/* Should never go away */
	ipst->ips_dce_default->dce_ipst = ipst;

	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
	ipst->ips_dce_hashsize = 256;
	ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize *
	    sizeof (dcb_t), KM_SLEEP);
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_init(&ipst->ips_dce_hash_v4[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
		rw_init(&ipst->ips_dce_hash_v6[i].dcb_lock, NULL, RW_DEFAULT,
		    NULL);
	}
}

/*
 * Tear down the per-stack DCE state; all per-address entries must already
 * be gone (only the default DCE's own reference may remain).
 */
void
dce_stack_destroy(ip_stack_t *ipst)
{
	int i;
	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		rw_destroy(&ipst->ips_dce_hash_v4[i].dcb_lock);
		rw_destroy(&ipst->ips_dce_hash_v6[i].dcb_lock);
	}
	kmem_free(ipst->ips_dce_hash_v4,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v4 = NULL;
	kmem_free(ipst->ips_dce_hash_v6,
	    ipst->ips_dce_hashsize * sizeof (dcb_t));
	ipst->ips_dce_hash_v6 = NULL;
	ipst->ips_dce_hashsize = 0;

	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
	kmem_cache_free(dce_cache, ipst->ips_dce_default);
	ipst->ips_dce_default = NULL;
}

/* When any DCE is good enough */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
	dce_t *dce;

	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Generic for IPv4 and IPv6.
 *
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		/*
		 * If we have a source route we need to look for the final
		 * destination in the source route option.
		 */
		ipaddr_t final_dst;
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		final_dst = ip_get_dst(ipha);
		return (dce_lookup_v4(final_dst, ixa->ixa_ipst, generationp));
	} else {
		uint_t ifindex;
		/*
		 * If we have a routing header we need to look for the final
		 * destination in the routing extension header.
		 */
		in6_addr_t final_dst;
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;

		final_dst = ip_get_dst_v6(ip6h, mp, NULL);
		/* Link-locals are not globally unique: qualify by ifindex */
		ifindex = 0;
		if (IN6_IS_ADDR_LINKSCOPE(&final_dst) && ixa->ixa_nce != NULL) {
			ifindex = ixa->ixa_nce->nce_common->ncec_ill->
			    ill_phyint->phyint_ifindex;
		}
		return (dce_lookup_v6(&final_dst, ifindex, ixa->ixa_ipst,
		    generationp));
	}
}

/*
 * Used by callers that need to cache e.g., the datapath
 * Returns the generation number in the last argument.
+ */ +dce_t * +dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp) +{ + uint_t hash; + dcb_t *dcb; + dce_t *dce; + + /* Set *generationp before dropping the lock(s) that allow additions */ + if (generationp != NULL) + *generationp = ipst->ips_dce_default->dce_generation; + + hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); + dcb = &ipst->ips_dce_hash_v4[hash]; + rw_enter(&dcb->dcb_lock, RW_READER); + for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { + if (dce->dce_v4addr == dst) { + mutex_enter(&dce->dce_lock); + if (!DCE_IS_CONDEMNED(dce)) { + dce_refhold(dce); + if (generationp != NULL) + *generationp = dce->dce_generation; + mutex_exit(&dce->dce_lock); + rw_exit(&dcb->dcb_lock); + return (dce); + } + mutex_exit(&dce->dce_lock); + } + } + rw_exit(&dcb->dcb_lock); + /* Not found */ + dce = ipst->ips_dce_default; + dce_refhold(dce); + return (dce); +} + +/* + * Used by callers that need to cache e.g., the datapath + * Returns the generation number in the last argument. 
 * ifindex should only be set for link-locals
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* Set *generationp before dropping the lock(s) that allow additions */
	if (generationp != NULL)
		*generationp = ipst->ips_dce_default->dce_generation;

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	rw_enter(&dcb->dcb_lock, RW_READER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		/* Match both address and ifindex (0 for non-link-locals) */
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				if (generationp != NULL)
					*generationp = dce->dce_generation;
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}
	rw_exit(&dcb->dcb_lock);
	/* Not found: fall back to the always-present default DCE */
	dce = ipst->ips_dce_default;
	dce_refhold(dce);
	return (dce);
}

/*
 * Atomically looks for a non-default DCE, and if not found tries to create one.
 * If there is no memory it returns NULL.
 * When an entry is created we increase the generation number on
 * the default DCE so that conn_ip_output will detect there is a new DCE.
+ */ +dce_t * +dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) +{ + uint_t hash; + dcb_t *dcb; + dce_t *dce; + + hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); + dcb = &ipst->ips_dce_hash_v4[hash]; + rw_enter(&dcb->dcb_lock, RW_WRITER); + for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { + if (dce->dce_v4addr == dst) { + mutex_enter(&dce->dce_lock); + if (!DCE_IS_CONDEMNED(dce)) { + dce_refhold(dce); + mutex_exit(&dce->dce_lock); + rw_exit(&dcb->dcb_lock); + return (dce); + } + mutex_exit(&dce->dce_lock); + } + } + dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP); + if (dce == NULL) { + rw_exit(&dcb->dcb_lock); + return (NULL); + } + bzero(dce, sizeof (dce_t)); + dce->dce_ipst = ipst; /* No netstack_hold */ + dce->dce_v4addr = dst; + dce->dce_generation = DCE_GENERATION_INITIAL; + dce->dce_ipversion = IPV4_VERSION; + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + dce_refhold(dce); /* For the hash list */ + + /* Link into list */ + if (dcb->dcb_dce != NULL) + dcb->dcb_dce->dce_ptpn = &dce->dce_next; + dce->dce_next = dcb->dcb_dce; + dce->dce_ptpn = &dcb->dcb_dce; + dcb->dcb_dce = dce; + dce->dce_bucket = dcb; + dce_refhold(dce); /* For the caller */ + rw_exit(&dcb->dcb_lock); + + /* Initialize dce_ident to be different than for the last packet */ + dce->dce_ident = ipst->ips_dce_default->dce_ident + 1; + + dce_increment_generation(ipst->ips_dce_default); + return (dce); +} + +/* + * Atomically looks for a non-default DCE, and if not found tries to create one. + * If there is no memory it returns NULL. + * When an entry is created we increase the generation number on + * the default DCE so that conn_ip_output will detect there is a new DCE. + * ifindex should only be used with link-local addresses. 
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
	uint_t hash;
	dcb_t *dcb;
	dce_t *dce;

	/* We should not create entries for link-locals w/o an ifindex */
	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

	hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
	dcb = &ipst->ips_dce_hash_v6[hash];
	/* Writer lock makes the lookup-then-insert atomic */
	rw_enter(&dcb->dcb_lock, RW_WRITER);
	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
		if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) &&
		    dce->dce_ifindex == ifindex) {
			mutex_enter(&dce->dce_lock);
			if (!DCE_IS_CONDEMNED(dce)) {
				dce_refhold(dce);
				mutex_exit(&dce->dce_lock);
				rw_exit(&dcb->dcb_lock);
				return (dce);
			}
			mutex_exit(&dce->dce_lock);
		}
	}

	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
	if (dce == NULL) {
		rw_exit(&dcb->dcb_lock);
		return (NULL);
	}
	bzero(dce, sizeof (dce_t));
	dce->dce_ipst = ipst;	/* No netstack_hold */
	dce->dce_v6addr = *dst;
	dce->dce_ifindex = ifindex;
	dce->dce_generation = DCE_GENERATION_INITIAL;
	dce->dce_ipversion = IPV6_VERSION;
	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	dce_refhold(dce);	/* For the hash list */

	/* Link into list */
	if (dcb->dcb_dce != NULL)
		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
	dce->dce_next = dcb->dcb_dce;
	dce->dce_ptpn = &dcb->dcb_dce;
	dcb->dcb_dce = dce;
	dce->dce_bucket = dcb;
	atomic_add_32(&dcb->dcb_cnt, 1);
	dce_refhold(dce);	/* For the caller */
	rw_exit(&dcb->dcb_lock);

	/*
	 * Initialize dce_ident to be different than for the last packet.
	 * NOTE(review): this store happens after dcb_lock is dropped, when
	 * the entry is already visible to other threads — confirm no reader
	 * depends on dce_ident being set under the lock.
	 */
	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
	dce_increment_generation(ipst->ips_dce_default);
	return (dce);
}

/*
 * Set/update uinfo. Creates a per-destination dce if none exists.
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
	/*
	 * Update the round trip time estimate and/or the max frag size
	 * and/or the slow start threshold.
	 *
	 * We serialize multiple advises using dce_lock.
	 */
	mutex_enter(&dce->dce_lock);
	/* Guard against setting to zero */
	if (uinfo->iulp_rtt != 0) {
		if (dce->dce_uinfo.iulp_rtt != 0) {
			/* Average the new estimate with the cached one. */
			dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt +
			    uinfo->iulp_rtt) >> 1;
		} else {
			/*
			 * No cached value yet: initialize conservatively
			 * to 1.5 * the new value.
			 */
			dce->dce_uinfo.iulp_rtt = uinfo->iulp_rtt +
			    (uinfo->iulp_rtt >> 1);
		}
		if (dce->dce_uinfo.iulp_rtt_sd != 0) {
			/* Average the new deviation with the cached one. */
			dce->dce_uinfo.iulp_rtt_sd =
			    (dce->dce_uinfo.iulp_rtt_sd +
			    uinfo->iulp_rtt_sd) >> 1;
		} else {
			/* No cached value: initialize to 1.5 * new value. */
			dce->dce_uinfo.iulp_rtt_sd = uinfo->iulp_rtt_sd +
			    (uinfo->iulp_rtt_sd >> 1);
		}
	}
	if (uinfo->iulp_mtu != 0) {
		/* Only ever shrink an existing path MTU estimate. */
		if (dce->dce_flags & DCEF_PMTU) {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
		} else {
			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
			dce->dce_flags |= DCEF_PMTU;
		}
		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
	}
	if (uinfo->iulp_ssthresh != 0) {
		if (dce->dce_uinfo.iulp_ssthresh != 0)
			dce->dce_uinfo.iulp_ssthresh =
			    (uinfo->iulp_ssthresh +
			    dce->dce_uinfo.iulp_ssthresh) >> 1;
		else
			dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
	}
	/* We have uinfo for sure */
	dce->dce_flags |= DCEF_UINFO;
	mutex_exit(&dce->dce_lock);
}


/*
 * Record uinfo for an IPv4 destination, creating the dce if needed.
 * Returns ENOMEM if the entry could not be created.
 */
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v4(dst, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/*
 * Record uinfo for an IPv6 destination, creating the dce if needed.
 * Returns ENOMEM if the entry could not be created.
 */
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	dce_t *dce;

	dce = dce_lookup_and_add_v6(dst, ifindex, ipst);
	if (dce == NULL)
		return (ENOMEM);

	dce_setuinfo(dce, uinfo);
	dce_refrele(dce);
	return (0);
}

/* Common routine for IPv4 and IPv6 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
	ipaddr_t dst4;

	/* V4-mapped addresses are stored in the IPv4 table. */
	if (IN6_IS_ADDR_V4MAPPED_ANY(dst)) {
		IN6_V4MAPPED_TO_IPADDR(dst, dst4);
		return (dce_update_uinfo_v4(dst4, uinfo, ipst));
	} else {
		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
	}
}

/*
 * Mark a dce as deleted; the condemned generation number makes any
 * cached generation comparison fail, so cached users drop the entry.
 */
static void
dce_make_condemned(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	mutex_enter(&dce->dce_lock);
	ASSERT(!DCE_IS_CONDEMNED(dce));
	dce->dce_generation = DCE_GENERATION_CONDEMNED;
	mutex_exit(&dce->dce_lock);
	/* Count how many condemned dces for kmem_cache callback */
	atomic_add_32(&ipst->ips_num_dce_condemned, 1);
}

/*
 * Increment the generation avoiding the special condemned value
 */
void
dce_increment_generation(dce_t *dce)
{
	uint_t generation;

	mutex_enter(&dce->dce_lock);
	if (!DCE_IS_CONDEMNED(dce)) {
		generation = dce->dce_generation + 1;
		/* Skip over the two reserved generation values on wrap */
		if (generation == DCE_GENERATION_CONDEMNED)
			generation = DCE_GENERATION_INITIAL;
		ASSERT(generation != DCE_GENERATION_VERIFY);
		dce->dce_generation = generation;
	}
	mutex_exit(&dce->dce_lock);
}

/*
 * Increment the generation number on all dces that have a path MTU and
 * the default DCE. Used when ill_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
	int i;
	dcb_t *dcb;
	dce_t *dce;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		if (isv6)
			dcb = &ipst->ips_dce_hash_v6[i];
		else
			dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_WRITER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			/* Unlocked check; dce_increment_generation relocks */
			if (DCE_IS_CONDEMNED(dce))
				continue;
			dce_increment_generation(dce);
		}
		rw_exit(&dcb->dcb_lock);
	}
	dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Caller needs to do a dce_refrele since we can't do the
 * dce_refrele under dcb_lock.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
	/* Unlink from the bucket's doubly-linked (ptpn) chain. */
	dce->dce_bucket = NULL;
	*dce->dce_ptpn = dce->dce_next;
	if (dce->dce_next != NULL)
		dce->dce_next->dce_ptpn = dce->dce_ptpn;
	dce->dce_ptpn = NULL;
	dce->dce_next = NULL;
	atomic_add_32(&dcb->dcb_cnt, -1);
	dce_make_condemned(dce);
}

/*
 * Last reference is gone: return the dce to the kmem cache.
 * Must never be called on the default DCE or a still-linked entry.
 */
static void
dce_inactive(dce_t *dce)
{
	ip_stack_t *ipst = dce->dce_ipst;

	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
	ASSERT(dce->dce_ptpn == NULL);
	ASSERT(dce->dce_bucket == NULL);

	/* Count how many condemned dces for kmem_cache callback */
	if (DCE_IS_CONDEMNED(dce))
		atomic_add_32(&ipst->ips_num_dce_condemned, -1);

	kmem_cache_free(dce_cache, dce);
}

void
dce_refrele(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* No tracing support yet hence the same as the above functions */
void
dce_refrele_notr(dce_t *dce)
{
	ASSERT(dce->dce_refcnt != 0);
	if (atomic_add_32_nv(&dce->dce_refcnt, -1) == 0)
		dce_inactive(dce);
}

void
dce_refhold_notr(dce_t *dce)
{
	atomic_add_32(&dce->dce_refcnt, 1);
	ASSERT(dce->dce_refcnt != 0);
}

/* Report both the IPv4 and IPv6 DCEs.
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
	struct opthdr *optp;
	mblk_t *mp2ctl;
	dest_cache_entry_t dest_cache;
	mblk_t *mp_tail = NULL;
	dce_t *dce;
	dcb_t *dcb;
	int i;
	uint64_t current_time;

	current_time = TICK_TO_SEC(lbolt64);

	/*
	 * make a copy of the original message
	 */
	mp2ctl = copymsg(mpctl);

	/* First we do IPv4 entries */
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v4[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv4Address = dce->dce_v4addr;
			dest_cache.DestFlags = dce->dce_flags;
			/* Report a PMTU of 0 when none has been learned */
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	if (mp2ctl == NULL) {
		/* Copymsg failed above */
		return (NULL);
	}

	/* Now for IPv6 */
	mpctl = mp2ctl;
	mp_tail = NULL;
	/* A second copy is handed back for the caller's next table */
	mp2ctl = copymsg(mpctl);
	optp = (struct opthdr *)&mpctl->b_rptr[
	    sizeof (struct T_optmgmt_ack)];
	optp->level = MIB2_IP6;
	optp->name = EXPER_IP_DCE;

	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
		dcb = &ipst->ips_dce_hash_v6[i];
		rw_enter(&dcb->dcb_lock, RW_READER);
		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
			dest_cache.DestIpv6Address = dce->dce_v6addr;
			dest_cache.DestFlags = dce->dce_flags;
			if (dce->dce_flags & DCEF_PMTU)
				dest_cache.DestPmtu = dce->dce_pmtu;
			else
				dest_cache.DestPmtu = 0;
			dest_cache.DestIdent = dce->dce_ident;
			/* ifindex is only meaningful for link-locals */
			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
				dest_cache.DestIfindex = dce->dce_ifindex;
			else
				dest_cache.DestIfindex = 0;
			dest_cache.DestAge = current_time -
			    dce->dce_last_change_time;
			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
				    "failed to allocate %u bytes\n",
				    (uint_t)sizeof (dest_cache)));
			}
		}
		rw_exit(&dcb->dcb_lock);
	}
	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
	    (int)optp->level, (int)optp->name, (int)optp->len));
	qreply(q, mpctl);

	return (mp2ctl);
}

/*
 * Remove IPv6 DCEs which refer to an ifindex that is going away.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
+ */ +void +dce_cleanup(uint_t ifindex, ip_stack_t *ipst) +{ + uint_t i; + dcb_t *dcb; + dce_t *dce, *nextdce; + + for (i = 0; i < ipst->ips_dce_hashsize; i++) { + dcb = &ipst->ips_dce_hash_v6[i]; + rw_enter(&dcb->dcb_lock, RW_WRITER); + + for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { + nextdce = dce->dce_next; + if (dce->dce_ifindex == ifindex) { + dce_delete_locked(dcb, dce); + dce_refrele(dce); + } + } + rw_exit(&dcb->dcb_lock); + } +} diff --git a/usr/src/uts/common/inet/ip/ip_ftable.c b/usr/src/uts/common/inet/ip/ip_ftable.c index 9e228c2925..771dd9f62f 100644 --- a/usr/src/uts/common/inet/ip/ip_ftable.c +++ b/usr/src/uts/common/inet/ip/ip_ftable.c @@ -42,7 +42,6 @@ #include <sys/param.h> #include <sys/socket.h> #include <sys/strsubr.h> -#include <sys/pattr.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> @@ -50,6 +49,7 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/mib2.h> @@ -65,7 +65,6 @@ #include <inet/nd.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> @@ -78,87 +77,34 @@ (((ire)->ire_type & IRE_DEFAULT) || \ (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) -/* - * structure for passing args between ire_ftable_lookup and ire_find_best_route - */ -typedef struct ire_ftable_args_s { - ipaddr_t ift_addr; - ipaddr_t ift_mask; - ipaddr_t ift_gateway; - int ift_type; - const ipif_t *ift_ipif; - zoneid_t ift_zoneid; - uint32_t ift_ihandle; - const ts_label_t *ift_tsl; - int ift_flags; - ire_t *ift_best_ire; -} ire_ftable_args_t; - static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); -static ire_t *ire_round_robin(irb_t *, zoneid_t, ire_ftable_args_t *, - ip_stack_t *); -static void ire_del_host_redir(ire_t *, char *); -static boolean_t ire_find_best_route(struct radix_node *, void *); -static int 
ip_send_align_hcksum_flags(mblk_t *, ill_t *); -static ire_t *ire_ftable_lookup_simple(ipaddr_t, - ire_t **, zoneid_t, int, ip_stack_t *); +static void ire_del_host_redir(ire_t *, char *); +static boolean_t ire_find_best_route(struct radix_node *, void *); /* * Lookup a route in forwarding table. A specific lookup is indicated by * passing the required parameters and indicating the match required in the * flag field. * - * Looking for default route can be done in three ways - * 1) pass mask as 0 and set MATCH_IRE_MASK in flags field - * along with other matches. - * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags - * field along with other matches. - * 3) if the destination and mask are passed as zeros. - * - * A request to return a default route if no route - * is found, can be specified by setting MATCH_IRE_DEFAULT - * in flags. - * - * It does not support recursion more than one level. It - * will do recursive lookup only when the lookup maps to - * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed. - * - * If the routing table is setup to allow more than one level - * of recursion, the cleaning up cache table will not work resulting - * in invalid routing. - * * Supports IP_BOUND_IF by following the ipif/ill when recursing. - * - * NOTE : When this function returns NULL, pire has already been released. - * pire is valid only when this function successfully returns an - * ire. 
*/ ire_t * -ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, - uint32_t ihandle, const ts_label_t *tsl, int flags, ip_stack_t *ipst) +ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, + int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, + int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { - ire_t *ire = NULL; - ipaddr_t gw_addr; + ire_t *ire; struct rt_sockaddr rdst, rmask; struct rt_entry *rt; ire_ftable_args_t margs; - boolean_t found_incomplete = B_FALSE; - ASSERT(ipif == NULL || !ipif->ipif_isv6); + ASSERT(ill == NULL || !ill->ill_isv6); /* - * When we return NULL from this function, we should make - * sure that *pire is NULL so that the callers will not - * wrongly REFRELE the pire. - */ - if (pire != NULL) - *pire = NULL; - /* - * ire_match_args() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. + * ire_match_args() will dereference ill if MATCH_IRE_ILL + * is set. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) + if ((flags & MATCH_IRE_ILL) && (ill == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); @@ -176,9 +122,8 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, margs.ift_mask = mask; margs.ift_gateway = gateway; margs.ift_type = type; - margs.ift_ipif = ipif; + margs.ift_ill = ill; margs.ift_zoneid = zoneid; - margs.ift_ihandle = ihandle; margs.ift_tsl = tsl; margs.ift_flags = flags; @@ -191,232 +136,93 @@ ire_ftable_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, * each matching leaf in the radix tree. ire_match_args is * invoked by the callback function ire_find_best_route() * We hold the global tree lock in read mode when calling - * rn_match_args.Before dropping the global tree lock, ensure + * rn_match_args. Before dropping the global tree lock, ensure * that the radix node can't be deleted by incrementing ire_refcnt. 
*/ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, ire_find_best_route, &margs); ire = margs.ift_best_ire; - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - if (rt == NULL) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (NULL); - } else { - ASSERT(ire != NULL); } + ASSERT(ire != NULL); DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); - if (!IS_DEFAULT_ROUTE(ire)) - goto found_ire_held; - /* - * If default route is found, see if default matching criteria - * are satisfied. - */ - if (flags & MATCH_IRE_MASK) { - /* - * we were asked to match a 0 mask, and came back with - * a default route. Ok to return it. - */ - goto found_default_ire; - } - if ((flags & MATCH_IRE_TYPE) && - (type & (IRE_DEFAULT | IRE_INTERFACE))) { - /* - * we were asked to match a default ire type. Ok to return it. - */ - goto found_default_ire; - } - if (flags & MATCH_IRE_DEFAULT) { - goto found_default_ire; - } - /* - * we found a default route, but default matching criteria - * are not specified and we are not explicitly looking for - * default. - */ - IRE_REFRELE(ire); - return (NULL); -found_default_ire: /* * round-robin only if we have more than one route in the bucket. 
+ * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never */ - if ((ire->ire_bucket->irb_ire_cnt > 1) && - IS_DEFAULT_ROUTE(ire) && - ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) == - MATCH_IRE_DEFAULT)) { - ire_t *next_ire; - - next_ire = ire_round_robin(ire->ire_bucket, zoneid, &margs, - ipst); - IRE_REFRELE(ire); - if (next_ire != NULL) { + if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE(ire))) { + ire_t *next_ire; + + margs.ift_best_ire = NULL; + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + goto done; + } + ire_refrele(ire); ire = next_ire; - } else { - /* no route */ - return (NULL); } } -found_ire_held: - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); - } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * type. If this is a recursive lookup and an IRE_INTERFACE type was - * found, return that. If it was some other IRE_FORWARDTABLE type of - * IRE (one of the prefix types), then it is necessary to fill in the - * parent IRE pointed to by pire, and then lookup the gateway address of - * the parent. For backwards compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - if (flags & MATCH_IRE_RECURSIVE) { - ipif_t *gw_ipif; - int match_flags = MATCH_IRE_DSTONLY; - ire_t *save_ire; - if (ire->ire_type & IRE_INTERFACE) - return (ire); - if (pire != NULL) - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the save_ire. 
- */ - save_ire = ire; +done: + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - /* - * ire_ftable_lookup may end up with an incomplete IRE_CACHE - * entry for the gateway (i.e., one for which the - * ire_nce->nce_state is not yet ND_REACHABLE). If the caller - * has specified MATCH_IRE_COMPLETE, such entries will not - * be returned; instead, we return the IF_RESOLVER ire. - */ - ire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, 0, - ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); - DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL || - ((ire->ire_type & IRE_CACHE) && ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE && - (flags & MATCH_IRE_COMPLETE))) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - found_incomplete = B_TRUE; - } - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the parent - * ire is returned via both ire and - * pire. - */ - IRE_REFHOLD(save_ire); - } - ire = save_ire; - } else { - ire_refrele(save_ire); - if (pire != NULL) - *pire = NULL; - } - if (!found_incomplete) - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. - */ - if (pire == NULL) { - ire_refrele(save_ire); - } - return (ire); - } - match_flags |= MATCH_IRE_TYPE; - gw_addr = ire->ire_gateway_addr; - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - ire = ire_route_lookup(gw_addr, 0, 0, - (found_incomplete? 
IRE_INTERFACE : - (IRE_CACHETABLE | IRE_INTERFACE)), - gw_ipif, NULL, zoneid, tsl, match_flags, ipst); - DTRACE_PROBE2(ftable__route__lookup2, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL || - ((ire->ire_type & IRE_CACHE) && ire->ire_nce && - ire->ire_nce->nce_state != ND_REACHABLE && - (flags & MATCH_IRE_COMPLETE))) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - if (ire != NULL) { - ire_refrele(ire); - ire = NULL; - } - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(save_ire); - } - ire = save_ire; - } else { - ire_refrele(save_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } else if (pire == NULL) { - /* - * If the caller did not ask for pire, release - * it now. - */ - ire_refrele(save_ire); - } - return (ire); + /* + * For shared-IP zones we need additional checks to what was + * done in ire_match_args to make sure IRE_LOCALs are handled. + * + * When ip_restrict_interzone_loopback is set, then + * we ensure that IRE_LOCAL are only used for loopback + * between zones when the logical "Ethernet" would + * have looped them back. That is, if in the absense of + * the IRE_LOCAL we would have sent to packet out the + * same ill. + */ + if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && + ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && + ipst->ips_ip_restrict_interzone_loopback) { + ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); + ASSERT(ire != NULL); } - ASSERT(pire == NULL || *pire == NULL); return (ire); } /* * This function is called by - * ip_fast_forward->ire_forward_simple + * ip_input/ire_route_recursive when doing a route lookup on only the + * destination address. 
+ * * The optimizations of this function over ire_ftable_lookup are: * o removing unnecessary flag matching * o doing longest prefix match instead of overloading it further * with the unnecessary "best_prefix_match" - * o Does not do round robin of default route for every packet - * o inlines code of ire_ctable_lookup to look for nexthop cache - * entry before calling ire_route_lookup + * + * If no route is found we return IRE_NOROUTE. */ -static ire_t * -ire_ftable_lookup_simple(ipaddr_t addr, - ire_t **pire, zoneid_t zoneid, int flags, - ip_stack_t *ipst) +ire_t * +ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, + uint_t *generationp) { - ire_t *ire = NULL; - ire_t *tmp_ire = NULL; + ire_t *ire; struct rt_sockaddr rdst; struct rt_entry *rt; - irb_t *irb_ptr; - ire_t *save_ire; - int match_flags; + irb_t *irb; rdst.rt_sin_len = sizeof (rdst); rdst.rt_sin_family = AF_INET; @@ -430,263 +236,125 @@ ire_ftable_lookup_simple(ipaddr_t addr, rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, NULL, NULL); - if (rt == NULL) { - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); - } - irb_ptr = &rt->rt_irb; - if (irb_ptr == NULL || irb_ptr->irb_ire_cnt == 0) { - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); - } + if (rt == NULL) + goto bad; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_zoneid == zoneid) - break; - } + irb = &rt->rt_irb; + if (irb->irb_ire_cnt == 0) + goto bad; - if (ire == NULL || (ire->ire_marks & IRE_MARK_CONDEMNED)) { - rw_exit(&irb_ptr->irb_lock); - RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - return (NULL); + rw_enter(&irb->irb_lock, RW_READER); + ire = irb->irb_ire; + if (ire == NULL) { + rw_exit(&irb->irb_lock); + goto bad; } - /* we have a ire that matches */ - if (ire != NULL) - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - 
RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); + while (IRE_IS_CONDEMNED(ire)) { + ire = ire->ire_next; + if (ire == NULL) { + rw_exit(&irb->irb_lock); + goto bad; + } } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * type. If this is a recursive lookup and an IRE_INTERFACE type was - * found, return that. If it was some other IRE_FORWARDTABLE type of - * IRE (one of the prefix types), then it is necessary to fill in the - * parent IRE pointed to by pire, and then lookup the gateway address of - * the parent. For backwards compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - match_flags = MATCH_IRE_DSTONLY; - if (ire->ire_type & IRE_INTERFACE) - return (ire); - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the save_ire. - */ - save_ire = ire; + /* we have a ire that matches */ + ire_refhold(ire); + rw_exit(&irb->irb_lock); /* - * Currently MATCH_IRE_ILL is never used with - * (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT) while - * sending out packets as MATCH_IRE_ILL is used only - * for communicating with on-link hosts. We can't assert - * that here as RTM_GET calls this function with - * MATCH_IRE_ILL | MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE. - * We have already used the MATCH_IRE_ILL in determining - * the right prefix route at this point. To match the - * behavior of how we locate routes while sending out - * packets, we don't want to use MATCH_IRE_ILL below - * while locating the interface route. + * round-robin only if we have more than one route in the bucket. 
+ * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never * - * ire_ftable_lookup may end up with an incomplete IRE_CACHE - * entry for the gateway (i.e., one for which the - * ire_nce->nce_state is not yet ND_REACHABLE). If the caller - * has specified MATCH_IRE_COMPLETE, such entries will not - * be returned; instead, we return the IF_RESOLVER ire. + * Note: if we found an IRE_IF_CLONE we won't look at the bucket with + * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match + * and the IRE_INTERFACESs are likely to be shorter matches. */ - - if (ire->ire_ipif == NULL) { - tmp_ire = ire; - /* - * Look to see if the nexthop entry is in the cachetable - */ - ire = ire_cache_lookup(ire->ire_gateway_addr, zoneid, NULL, - ipst); - if (ire == NULL) { - /* Try ire_route_lookup */ - ire = tmp_ire; - } else { - goto solved; - } - } - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - ire = ire_route_lookup(ire->ire_gateway_addr, 0, - 0, 0, ire->ire_ipif, NULL, zoneid, NULL, match_flags, ipst); -solved: - DTRACE_PROBE2(ftable__route__lookup1, (ire_t *), ire, - (ire_t *), save_ire); - if (ire == NULL) { - /* - * Do not release the parent ire if MATCH_IRE_PARENT - * is set. Also return it via ire. - */ - ire_refrele(save_ire); - *pire = NULL; - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. 
- */ - if (pire == NULL) { - ire_refrele(save_ire); + if (ire->ire_bucket->irb_ire_cnt > 1) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE(ire))) { + ire_t *next_ire; + ire_ftable_args_t margs; + + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_addr = addr; + margs.ift_zoneid = ALL_ZONES; + + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + if (generationp != NULL) + *generationp = ire->ire_generation; + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (ire); + } + ire_refrele(ire); + ire = next_ire; } } - return (ire); -} - -/* - * Find an IRE_OFFSUBNET IRE entry for the multicast address 'group' - * that goes through 'ipif'. As a fallback, a route that goes through - * ipif->ipif_ill can be returned. - */ -ire_t * -ipif_lookup_multi_ire(ipif_t *ipif, ipaddr_t group) -{ - ire_t *ire; - ire_t *save_ire = NULL; - ire_t *gw_ire; - irb_t *irb; - ipaddr_t gw_addr; - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(CLASSD(group)); - - ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_DEFAULT, ipst); - - if (ire == NULL) - return (NULL); + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; - irb = ire->ire_bucket; - ASSERT(irb); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - IRB_REFHOLD(irb); - ire_refrele(ire); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_addr != group || - ipif->ipif_zoneid != ire->ire_zoneid && - ire->ire_zoneid != ALL_ZONES) { - continue; - } + /* + * Since we only did ALL_ZONES matches there is no special handling + * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 
+ */ + return (ire); - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - gw_addr = ire->ire_gateway_addr; - gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, - ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - - if (gw_ire != NULL) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - if (gw_ire->ire_ipif == ipif) { - ire_refrele(gw_ire); - - IRB_REFRELE(irb); - return (ire); - } - ire_refrele(gw_ire); - save_ire = ire; - } - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - if (ire->ire_ipif == ipif) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - - IRB_REFRELE(irb); - return (ire); - } - break; - } - } - IRB_REFRELE(irb); +bad: + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; - return (save_ire); + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return (ire_reject(ipst, B_FALSE)); } /* - * Find an IRE_INTERFACE for the multicast group. + * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to 224.0.0.0 but could be more specific) * which point at different interfaces. This is used when IP_MULTICAST_IF * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't * specify the interface to join on. * - * Supports IP_BOUND_IF by following the ipif/ill when recursing. + * Supports link-local addresses by using ire_route_recursive which follows + * the ill when recursing. + * + * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group + * and the MULTIRT property can be different for different groups, we + * extract RTF_MULTIRT from the special unicast route added for a group + * with CGTP and pass that back in the multirtp argument. + * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. + * We have a setsrcp argument for the same reason. 
*/ -ire_t * -ire_lookup_multi(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, ipaddr_t *setsrcp) { ire_t *ire; - ipif_t *ipif = NULL; - int match_flags = MATCH_IRE_TYPE; - ipaddr_t gw_addr; - - ire = ire_ftable_lookup(group, 0, 0, 0, NULL, NULL, zoneid, - 0, NULL, MATCH_IRE_DEFAULT, ipst); + ill_t *ill; - /* We search a resolvable ire in case of multirouting. */ - if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { - ire_t *cire = NULL; - /* - * If the route is not resolvable, the looked up ire - * may be changed here. In that case, ire_multirt_lookup() - * IRE_REFRELE the original ire and change it. - */ - (void) ire_multirt_lookup(&cire, &ire, MULTIRT_CACHEGW, NULL, - NULL, ipst); - if (cire != NULL) - ire_refrele(cire); - } - if (ire == NULL) - return (NULL); - /* - * Make sure we follow ire_ipif. - * - * We need to determine the interface route through - * which the gateway will be reached. 
- */ - if (ire->ire_ipif != NULL) { - ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL; - } - - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - gw_addr = ire->ire_gateway_addr; - ire_refrele(ire); - ire = ire_ftable_lookup(gw_addr, 0, 0, - IRE_INTERFACE, ipif, NULL, zoneid, 0, - NULL, match_flags, ipst); - return (ire); - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - return (ire); - default: + ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, + MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } + + if (multirtp != NULL) + *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; + + ill = ire_nexthop_ill(ire); + ire_refrele(ire); + return (ill); } /* @@ -701,7 +369,7 @@ ire_del_host_redir(ire_t *ire, char *gateway) } /* - * Search for all HOST REDIRECT routes that are + * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are * pointing at the specified gateway and * delete them. This routine is called only * when a default gateway is going away. @@ -718,732 +386,6 @@ ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } -struct ihandle_arg { - uint32_t ihandle; - ire_t *ire; -}; - -static int -ire_ihandle_onlink_match(struct radix_node *rn, void *arg) -{ - struct rt_entry *rt; - irb_t *irb; - ire_t *ire; - struct ihandle_arg *ih = arg; - - rt = (struct rt_entry *)rn; - ASSERT(rt != NULL); - irb = &rt->rt_irb; - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_ihandle == ih->ihandle)) { - ih->ire = ire; - IRE_REFHOLD(ire); - return (1); - } - } - return (0); -} - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an onlink destn. or - * gateway in 'cire'. 
We are called from ire_add_v4() in the IRE_IF_RESOLVER - * case, after the ire has come back from ARP. - */ -ire_t * -ire_ihandle_lookup_onlink(ire_t *cire) -{ - ire_t *ire; - int match_flags; - struct ihandle_arg ih; - ip_stack_t *ipst; - - ASSERT(cire != NULL); - ipst = cire->ire_ipst; - - /* - * We don't need to specify the zoneid to ire_ftable_lookup() below - * because the ihandle refers to an ipif which can be in only one zone. - */ - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute() created 'cire' for an on-link destn. it set its - * cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * In the resolver/noresolver case, ip_newroute() thinks it is creating - * the cache ire for an onlink destination in 'cire'. But 'cire' is - * not actually onlink, because ire_ftable_lookup() cheated it, by - * doing ire_route_lookup() twice and returning an interface ire. - * - * Eg. default - gw1 (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * In the above example, ip_newroute() tried to create the cache ire - * 'cire' for gw1, based on the interface route in line 3. The - * ire_ftable_lookup() above fails, because there is no interface route - * to reach gw1. (it is gw2). We fall thru below. - * - * Do a brute force search based on the ihandle in a subset of the - * forwarding tables, corresponding to cire->ire_cmask. Otherwise - * things become very complex, since we don't have 'pire' in this - * case. 
(Also note that this method is not possible in the offlink - * case because we don't know the mask) - */ - (void) memset(&ih, 0, sizeof (ih)); - ih.ihandle = cire->ire_ihandle; - (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, - ire_ihandle_onlink_match, &ih, irb_refhold_rn, irb_refrele_rn); - return (ih.ire); -} - -/* - * IRE iterator used by ire_ftable_lookup[_v6]() to process multiple default - * routes. Given a starting point in the hash list (ire_origin), walk the IREs - * in the bucket skipping default interface routes and deleted entries. - * Returns the next IRE (unheld), or NULL when we're back to the starting point. - * Assumes that the caller holds a reference on the IRE bucket. - */ -ire_t * -ire_get_next_default_ire(ire_t *ire, ire_t *ire_origin) -{ - ASSERT(ire_origin->ire_bucket != NULL); - ASSERT(ire != NULL); - - do { - ire = ire->ire_next; - if (ire == NULL) - ire = ire_origin->ire_bucket->irb_ire; - if (ire == ire_origin) - return (NULL); - } while ((ire->ire_type & IRE_INTERFACE) || - (ire->ire_marks & IRE_MARK_CONDEMNED)); - ASSERT(ire != NULL); - return (ire); -} - -static ipif_t * -ire_forward_src_ipif(ipaddr_t dst, ire_t *sire, ire_t *ire, - int zoneid, ushort_t *marks) -{ - ipif_t *src_ipif; - ill_t *ill = ire->ire_ipif->ipif_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * Pick the best source address from ill. - * - * 1) Try to pick the source address from the destination - * route. Clustering assumes that when we have multiple - * prefixes hosted on an interface, the prefix of the - * source address matches the prefix of the destination - * route. We do this only if the address is not - * DEPRECATED. - * - * 2) If the conn is in a different zone than the ire, we - * need to pick a source address from the right zone. - */ - if ((sire != NULL) && (sire->ire_flags & RTF_SETSRC)) { - /* - * The RTF_SETSRC flag is set in the parent ire (sire). - * Check that the ipif matching the requested source - * address still exists. 
- */ - src_ipif = ipif_lookup_addr(sire->ire_src_addr, NULL, - zoneid, NULL, NULL, NULL, NULL, ipst); - return (src_ipif); - } - *marks |= IRE_MARK_USESRC_CHECK; - if (IS_IPMP(ill) || - (ire->ire_ipif->ipif_flags & IPIF_DEPRECATED) || - (ill->ill_usesrc_ifindex != 0)) { - src_ipif = ipif_select_source(ill, dst, zoneid); - } else { - src_ipif = ire->ire_ipif; - ASSERT(src_ipif != NULL); - /* hold src_ipif for uniformity */ - ipif_refhold(src_ipif); - } - return (src_ipif); -} - -/* - * This function is called by ip_rput_noire() and ip_fast_forward() - * to resolve the route of incoming packet that needs to be forwarded. - * If the ire of the nexthop is not already in the cachetable, this - * routine will insert it to the table, but won't trigger ARP resolution yet. - * Thus unlike ip_newroute, this function adds incomplete ires to - * the cachetable. ARP resolution for these ires are delayed until - * after all of the packet processing is completed and its ready to - * be sent out on the wire, Eventually, the packet transmit routine - * ip_xmit_v4() attempts to send a packet to the driver. If it finds - * that there is no link layer information, it will do the arp - * resolution and queue the packet in ire->ire_nce->nce_qd_mp and - * then send it out once the arp resolution is over - * (see ip_xmit_v4()->ire_arpresolve()). This scheme is similar to - * the model of BSD/SunOS 4 - * - * In future, the insertion of incomplete ires in the cachetable should - * be implemented in hostpath as well, as doing so will greatly reduce - * the existing complexity for code paths that depend on the context of - * the sender (such as IPsec). - * - * Thus this scheme of adding incomplete ires in cachetable in forwarding - * path can be used as a template for simplifying the hostpath. 
- */ - -ire_t * -ire_forward(ipaddr_t dst, enum ire_forward_action *ret_action, - ire_t *supplied_ire, ire_t *supplied_sire, const struct ts_label_s *tsl, - ip_stack_t *ipst) -{ - ipaddr_t gw = 0; - ire_t *ire = NULL; - ire_t *sire = NULL, *save_ire; - ill_t *dst_ill = NULL; - int error; - zoneid_t zoneid; - ipif_t *src_ipif = NULL; - mblk_t *res_mp; - ushort_t ire_marks = 0; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - - zoneid = GLOBAL_ZONEID; - - if (supplied_ire != NULL) { - /* We have arrived here from ipfil_sendpkt */ - ire = supplied_ire; - sire = supplied_sire; - goto create_irecache; - } - - ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, zoneid, 0, - tsl, MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_RJ_BHOLE | MATCH_IRE_PARENT|MATCH_IRE_SECATTR, ipst); - - if (ire == NULL) { - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * If we encounter CGTP, we should have the caller use - * ip_newroute to resolve multirt instead of this function. - * CGTP specs explicitly state that it can't be used with routers. - * This essentially prevents insertion of incomplete RTF_MULTIRT - * ires in cachetable. - */ - if (ipst->ips_ip_cgtp_filter && - ((ire->ire_flags & RTF_MULTIRT) || - ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) { - ip3dbg(("ire_forward: packet is to be multirouted- " - "handing it to ip_newroute\n")); - if (sire != NULL) - ire_refrele(sire); - ire_refrele(ire); - /* - * Inform caller about encountering of multirt so that - * ip_newroute() can be called. - */ - *ret_action = Forward_check_multirt; - return (NULL); - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. 
- */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0) { - ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", - (void *)ire)); - goto icmp_err_ret; - } - - /* - * If we already have a fully resolved IRE CACHE of the - * nexthop router, just hand over the cache entry - * and we are done. - */ - - if (ire->ire_type & IRE_CACHE) { - - /* - * If we are using this ire cache entry as a - * gateway to forward packets, chances are we - * will be using it again. So turn off - * the temporary flag, thus reducing its - * chances of getting deleted frequently. - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb_t *irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - /* - * We need to recheck for IRE_MARK_TEMPORARY after - * acquiring the lock in order to guarantee - * irb_tmp_ire_cnt - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); - } - - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - ire_refrele(sire); - } - *ret_action = Forward_ok; - return (ire); - } -create_irecache: - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). 
- */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* - * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type - */ - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & - (IRE_CACHETABLE | IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - } - - dst_ill = ire->ire_ipif->ipif_ill; - if (IS_IPMP(dst_ill)) - dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); - else - ill_refhold(dst_ill); - - if (dst_ill == NULL) { - ip2dbg(("ire_forward no dst ill; ire 0x%p\n", (void *)ire)); - goto icmp_err_ret; - } - - ASSERT(src_ipif == NULL); - /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); - if (src_ipif == NULL) - goto icmp_err_ret; - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: - /* create ire_cache for ire_addr endpoint */ - if (dst_ill->ill_resolver_mp == NULL) { - ip1dbg(("ire_forward: dst_ill %p " - "for IRE_IF_NORESOLVER ire %p has " - "no ill_resolver_mp\n", - (void *)dst_ill, (void *)ire)); - goto icmp_err_ret; - } - /* FALLTHRU */ - case IRE_IF_RESOLVER: - /* - * We have the IRE_IF_RESOLVER of the nexthop gateway - * and now need to build a IRE_CACHE for it. - * In this case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. 
- */ - res_mp = dst_ill->ill_resolver_mp; - if (ire->ire_type == IRE_IF_RESOLVER && - (!OK_RESOLVER_MP(res_mp))) { - goto icmp_err_ret; - } - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. - */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - if (src_ipif == NULL) - goto icmp_err_ret; - } - dst = gw; - gw = INADDR_ANY; - } - /* - * dst has been set to the address of the nexthop. - * - * TSol note: get security attributes of the nexthop; - * Note that the nexthop may either be a gateway, or the - * packet destination itself; Detailed explanation of - * issues involved is provided in the IRE_IF_NORESOLVER - * logic in ip_newroute(). - */ - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(dst, &ga.ga_addr); - gcgrp = gcgrp_lookup(&ga, B_FALSE); - - if (ire->ire_type == IRE_IF_NORESOLVER) - dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ - - save_ire = ire; - /* - * create an incomplete IRE_CACHE. - * An areq_mp will be generated in ire_arpresolve() for - * RESOLVER interfaces. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - (save_ire->ire_type == IRE_IF_RESOLVER ? 
NULL: - &save_ire->ire_max_frag), - NULL, - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - ire->ire_mask, /* Parent mask */ - 0, - ire->ire_ihandle, /* Interface handle */ - 0, - &(ire->ire_uinfo), - NULL, - gcgrp, - ipst); - ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); - if (ire != NULL) { - gcgrp = NULL; /* reference now held by IRE */ - ire->ire_marks |= ire_marks; - /* add the incomplete ire: */ - error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); - if (error == 0 && ire != NULL) { - ire->ire_max_frag = save_ire->ire_max_frag; - ip1dbg(("setting max_frag to %d in ire 0x%p\n", - ire->ire_max_frag, (void *)ire)); - } else { - ire_refrele(save_ire); - goto icmp_err_ret; - } - } else { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - } - - ire_refrele(save_ire); - break; - default: - break; - } - - *ret_action = Forward_ok; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return (ire); -icmp_err_ret: - *ret_action = Forward_ret_icmp_err; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) - *ret_action = Forward_blackhole; - ire_refrele(ire); - } - return (NULL); -} - -/* - * Since caller is ip_fast_forward, there is no CGTP or Tsol test - * Also we dont call ftable lookup with MATCH_IRE_PARENT - */ - -ire_t * -ire_forward_simple(ipaddr_t dst, enum ire_forward_action *ret_action, - ip_stack_t *ipst) -{ - ipaddr_t gw = 0; - ire_t *ire = NULL; - ire_t *sire = NULL, *save_ire; - ill_t *dst_ill = NULL; - int error; - zoneid_t zoneid = GLOBAL_ZONEID; - ipif_t *src_ipif = NULL; - mblk_t *res_mp; - ushort_t ire_marks = 0; - - ire = ire_ftable_lookup_simple(dst, &sire, zoneid, - MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE, 
ipst); - if (ire == NULL) { - ip_rts_change(RTM_MISS, dst, 0, 0, 0, 0, 0, 0, RTA_DST, ipst); - goto icmp_err_ret; - } - - /* - * Verify that the returned IRE does not have either - * the RTF_REJECT or RTF_BLACKHOLE flags set and that the IRE is - * either an IRE_CACHE, IRE_IF_NORESOLVER or IRE_IF_RESOLVER. - */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - ((ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) { - ip3dbg(("ire 0x%p is not cache/resolver/noresolver\n", - (void *)ire)); - goto icmp_err_ret; - } - - /* - * If we already have a fully resolved IRE CACHE of the - * nexthop router, just hand over the cache entry - * and we are done. - */ - if (ire->ire_type & IRE_CACHE) { - /* - * If we are using this ire cache entry as a - * gateway to forward packets, chances are we - * will be using it again. So turn off - * the temporary flag, thus reducing its - * chances of getting deleted frequently. - */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb_t *irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - rw_exit(&irb->irb_lock); - } - - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - ire_refrele(sire); - } - *ret_action = Forward_ok; - return (ire); - } - /* - * Increment the ire_ob_pkt_count field for ire if it is an - * INTERFACE (IF_RESOLVER or IF_NORESOLVER) IRE type, and - * increment the same for the parent IRE, sire, if it is some - * sort of prefix IRE (which includes DEFAULT, PREFIX, and HOST). 
- */ - if ((ire->ire_type & IRE_INTERFACE) != 0) { - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - } - - /* - * sire must be either IRE_CACHETABLE OR IRE_INTERFACE type - */ - if (sire != NULL) { - gw = sire->ire_gateway_addr; - ASSERT((sire->ire_type & - (IRE_CACHETABLE | IRE_INTERFACE)) == 0); - UPDATE_OB_PKT_COUNT(sire); - } - - dst_ill = ire->ire_ipif->ipif_ill; - if (IS_IPMP(dst_ill)) - dst_ill = ipmp_illgrp_hold_next_ill(dst_ill->ill_grp); - else - ill_refhold(dst_ill); /* for symmetry */ - - if (dst_ill == NULL) { - ip2dbg(("ire_forward_simple: no dst ill; ire 0x%p\n", - (void *)ire)); - goto icmp_err_ret; - } - - ASSERT(src_ipif == NULL); - /* Now obtain the src_ipif */ - src_ipif = ire_forward_src_ipif(dst, sire, ire, zoneid, &ire_marks); - if (src_ipif == NULL) - goto icmp_err_ret; - - switch (ire->ire_type) { - case IRE_IF_NORESOLVER: - /* create ire_cache for ire_addr endpoint */ - case IRE_IF_RESOLVER: - /* - * We have the IRE_IF_RESOLVER of the nexthop gateway - * and now need to build a IRE_CACHE for it. - * In this case, we have the following : - * - * 1) src_ipif - used for getting a source address. - * - * 2) dst_ill - from which we derive ire_stq/ire_rfq. This - * means packets using the IRE_CACHE that we will build - * here will go out on dst_ill. - * - * 3) sire may or may not be NULL. But, the IRE_CACHE that is - * to be created will only be tied to the IRE_INTERFACE - * that was derived from the ire_ihandle field. - * - * If sire is non-NULL, it means the destination is - * off-link and we will first create the IRE_CACHE for the - * gateway. - */ - res_mp = dst_ill->ill_resolver_mp; - if (ire->ire_type == IRE_IF_RESOLVER && - (!OK_RESOLVER_MP(res_mp))) { - ire_refrele(ire); - ire = NULL; - goto out; - } - /* - * To be at this point in the code with a non-zero gw - * means that dst is reachable through a gateway that - * we have never resolved. By changing dst to the gw - * addr we resolve the gateway first. 
- */ - if (gw != INADDR_ANY) { - /* - * The source ipif that was determined above was - * relative to the destination address, not the - * gateway's. If src_ipif was not taken out of - * the IRE_IF_RESOLVER entry, we'll need to call - * ipif_select_source() again. - */ - if (src_ipif != ire->ire_ipif) { - ipif_refrele(src_ipif); - src_ipif = ipif_select_source(dst_ill, - gw, zoneid); - if (src_ipif == NULL) - goto icmp_err_ret; - } - dst = gw; - gw = INADDR_ANY; - } - - if (ire->ire_type == IRE_IF_NORESOLVER) - dst = ire->ire_addr; /* ire_cache for tunnel endpoint */ - - save_ire = ire; - /* - * create an incomplete IRE_CACHE. - * An areq_mp will be generated in ire_arpresolve() for - * RESOLVER interfaces. - */ - ire = ire_create( - (uchar_t *)&dst, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - (uchar_t *)&gw, /* gateway address */ - (save_ire->ire_type == IRE_IF_RESOLVER ? NULL: - &save_ire->ire_max_frag), - NULL, - dst_ill->ill_rq, /* recv-from queue */ - dst_ill->ill_wq, /* send-to queue */ - IRE_CACHE, /* IRE type */ - src_ipif, - ire->ire_mask, /* Parent mask */ - 0, - ire->ire_ihandle, /* Interface handle */ - 0, - &(ire->ire_uinfo), - NULL, - NULL, - ipst); - ip1dbg(("incomplete ire_cache 0x%p\n", (void *)ire)); - if (ire != NULL) { - ire->ire_marks |= ire_marks; - /* add the incomplete ire: */ - error = ire_add(&ire, NULL, NULL, NULL, B_TRUE); - if (error == 0 && ire != NULL) { - ire->ire_max_frag = save_ire->ire_max_frag; - ip1dbg(("setting max_frag to %d in ire 0x%p\n", - ire->ire_max_frag, (void *)ire)); - } else { - ire_refrele(save_ire); - goto icmp_err_ret; - } - } - - ire_refrele(save_ire); - break; - default: - break; - } - -out: - *ret_action = Forward_ok; - if (sire != NULL) - ire_refrele(sire); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (src_ipif != NULL) - ipif_refrele(src_ipif); - return (ire); -icmp_err_ret: - *ret_action = Forward_ret_icmp_err; - if (src_ipif != 
NULL) - ipif_refrele(src_ipif); - if (dst_ill != NULL) - ill_refrele(dst_ill); - if (sire != NULL) - ire_refrele(sire); - if (ire != NULL) { - if (ire->ire_flags & RTF_BLACKHOLE) - *ret_action = Forward_blackhole; - ire_refrele(ire); - } - /* caller needs to send icmp error message */ - return (NULL); - -} - /* * Obtain the rt_entry and rt_irb for the route to be added to * the ips_ip_ftable. @@ -1489,7 +431,7 @@ ire_get_bucket(ire_t *ire) rt->rt_nodes->rn_key = (char *)&rt->rt_dst; rt->rt_dst = rdst; irb = &rt->rt_irb; - irb->irb_marks |= IRB_MARK_FTABLE; /* dynamically allocated/freed */ + irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ irb->irb_ipst = ipst; rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); @@ -1510,7 +452,7 @@ ire_get_bucket(ire_t *ire) } if (rt != NULL) { irb = &rt->rt_irb; - IRB_REFHOLD(irb); + irb_refhold(irb); } RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (irb); @@ -1551,10 +493,12 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); - if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { - ill = ire_to_ill(ire); - if (ill != NULL) + if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { + ill = ire_nexthop_ill(ire); + if (ill != NULL) { ifindex = ill->ill_phyint->phyint_ifindex; + ill_refrele(ill); + } ire_refrele(ire); } netstack_rele(ns); @@ -1563,7 +507,7 @@ ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) /* * Routine to find the route to a destination. 
If a ifindex is supplied - * it tries to match the the route to the corresponding ipif for the ifindex + * it tries to match the route to the corresponding ipif for the ifindex */ static ire_t * route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) @@ -1571,27 +515,33 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) ire_t *ire = NULL; int match_flags; - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE); + match_flags = MATCH_IRE_DSTONLY; /* XXX pass NULL tsl for now */ if (dst_addr->sa_family == AF_INET) { - ire = ire_route_lookup( - ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, - 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst); + ire = ire_route_recursive_v4( + ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, + zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, + NULL); } else { - ire = ire_route_lookup_v6( - &((struct sockaddr_in6 *)dst_addr)->sin6_addr, - 0, 0, 0, NULL, NULL, zoneid, NULL, match_flags, ipst); + ire = ire_route_recursive_v6( + &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, + zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, + NULL); + } + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire); + return (NULL); } return (ire); } /* * This routine is called by IP Filter to send a packet out on the wire - * to a specified V4 dst (which may be onlink or offlink). The ifindex may or - * may not be 0. A non-null ifindex indicates IP Filter has stipulated + * to a specified dstination (which may be onlink or offlink). The ifindex may + * or may not be 0. A non-null ifindex indicates IP Filter has stipulated * an outgoing interface and requires the nexthop to be on that interface. * IP WILL NOT DO the following to the data packet before sending it out: * a. 
manipulate ttl @@ -1611,21 +561,18 @@ route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) * of the offlink dst's nexthop needs to get * resolved before packet can be sent to dst. * Thus transmission is not guaranteed. - * + * Note: No longer have visibility to the ARP queue + * hence no EINPROGRESS. */ - int ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, zoneid_t zoneid) { - ire_t *ire = NULL, *sire = NULL; - ire_t *ire_cache = NULL; - int value; - int match_flags; - ipaddr_t dst; + ipaddr_t nexthop; netstack_t *ns; ip_stack_t *ipst; - enum ire_forward_action ret_action; + ip_xmit_attr_t ixas; + int error; ASSERT(mp != NULL); @@ -1646,429 +593,57 @@ ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, ASSERT(dst_addr->sa_family == AF_INET || dst_addr->sa_family == AF_INET6); - if (dst_addr->sa_family == AF_INET) { - dst = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; - } else { - /* - * We dont have support for V6 yet. It will be provided - * once RFE 6399103 has been delivered. - * Until then, for V6 dsts, IP Filter will not call - * this function. Instead the netinfo framework provides - * its own code path, in ip_inject_impl(), to achieve - * what it needs to do, for the time being. - */ - ip1dbg(("ipfil_sendpkt: no V6 support \n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - - /* - * Lets get the ire. We might get the ire cache entry, - * or the ire,sire pair needed to create the cache entry. - * XXX pass NULL tsl for now. - */ - - if (ifindex == 0) { - /* There is no supplied index. 
So use the FIB info */ - - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE | MATCH_IRE_RJ_BHOLE); - ire = ire_route_lookup(dst, - 0, 0, 0, NULL, &sire, zoneid, msg_getlabel(mp), - match_flags, ipst); - } else { - ipif_t *supplied_ipif; - ill_t *ill; - - match_flags = (MATCH_IRE_DSTONLY | MATCH_IRE_DEFAULT | - MATCH_IRE_RECURSIVE| MATCH_IRE_RJ_BHOLE| - MATCH_IRE_SECATTR | MATCH_IRE_ILL); - - /* - * If supplied ifindex is non-null, the only valid - * nexthop is one off of the interface corresponding - * to the specified ifindex. - */ - ill = ill_lookup_on_ifindex(ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip1dbg(("ipfil_sendpkt: Could not find" - " route to dst\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } - - supplied_ipif = ipif_get_next_ipif(NULL, ill); - ire = ire_route_lookup(dst, 0, 0, 0, supplied_ipif, - &sire, zoneid, msg_getlabel(mp), match_flags, ipst); - if (supplied_ipif != NULL) - ipif_refrele(supplied_ipif); - ill_refrele(ill); - } - + bzero(&ixas, sizeof (ixas)); /* - * Verify that the returned IRE is non-null and does - * not have either the RTF_REJECT or RTF_BLACKHOLE - * flags set and that the IRE is either an IRE_CACHE, - * IRE_IF_NORESOLVER or IRE_IF_RESOLVER. + * No IPsec, no fragmentation, and don't let any hooks see + * the packet. */ - if (ire == NULL || - ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) || - (ire->ire_type & (IRE_CACHE | IRE_INTERFACE)) == 0)) { - /* - * Either ire could not be found or we got - * an invalid one - */ - ip1dbg(("ipfil_sendpkt: Could not find route to dst\n")); - value = ENONET; - freemsg(mp); - goto discard; - } - - /* IP Filter and CGTP dont mix. 
So bail out if CGTP is on */ - if (ipst->ips_ip_cgtp_filter && - ((ire->ire_flags & RTF_MULTIRT) || - ((sire != NULL) && (sire->ire_flags & RTF_MULTIRT)))) { - ip1dbg(("ipfil_sendpkt: IPFilter does not work with CGTP\n")); - value = ECOMM; - freemsg(mp); - goto discard; - } + ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = ifindex; - ASSERT(ire->ire_type != IRE_CACHE || ire->ire_nce != NULL); - - /* - * If needed, we will create the ire cache entry for the - * nexthop, resolve its link-layer address and then send - * the packet out without ttl or IPSec processing. - */ - switch (ire->ire_type) { - case IRE_CACHE: - if (sire != NULL) { - UPDATE_OB_PKT_COUNT(sire); - sire->ire_last_used_time = lbolt; - ire_refrele(sire); - } - ire_cache = ire; - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - /* - * Call ire_forward(). This function - * will, create the ire cache entry of the - * the nexthop and adds this incomplete ire - * to the ire cache table - */ - ire_cache = ire_forward(dst, &ret_action, ire, sire, - msg_getlabel(mp), ipst); - if (ire_cache == NULL) { - ip1dbg(("ipfil_sendpkt: failed to create the" - " ire cache entry \n")); - value = ENONET; - freemsg(mp); - sire = NULL; - ire = NULL; - goto discard; - } - break; - } - - if (DB_CKSUMFLAGS(mp)) { - if (ip_send_align_hcksum_flags(mp, ire_to_ill(ire_cache))) - goto cleanup; - } - - /* - * Now that we have the ire cache entry of the nexthop, call - * ip_xmit_v4() to trigger mac addr resolution - * if necessary and send it once ready. - */ - - value = ip_xmit_v4(mp, ire_cache, NULL, B_FALSE, NULL); -cleanup: - ire_refrele(ire_cache); - /* - * At this point, the reference for these have already been - * released within ire_forward() and/or ip_xmit_v4(). 
So we set - * them to NULL to make sure we dont drop the references - * again in case ip_xmit_v4() returns with either SEND_FAILED - * or LLHDR_RESLV_FAILED - */ - sire = NULL; - ire = NULL; - - switch (value) { - case SEND_FAILED: - ip1dbg(("ipfil_sendpkt: Send failed\n")); - value = ECOMM; - break; - case LLHDR_RESLV_FAILED: - ip1dbg(("ipfil_sendpkt: Link-layer resolution" - " failed\n")); - value = ECOMM; - break; - case LOOKUP_IN_PROGRESS: - netstack_rele(ns); - return (EINPROGRESS); - case SEND_PASSED: - netstack_rele(ns); - return (0); - } -discard: if (dst_addr->sa_family == AF_INET) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - } else { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); - } - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - netstack_rele(ns); - return (value); -} - - -/* - * We don't check for dohwcksum in here because it should be being used - * elsewhere to control what flags are being set on the mblk. That is, - * if DB_CKSUMFLAGS() is non-zero then we assume dohwcksum to be true - * for this packet. - * - * This function assumes that it is *only* being called for TCP or UDP - * packets and nothing else. - */ -static int -ip_send_align_hcksum_flags(mblk_t *mp, ill_t *ill) -{ - int illhckflags; - int mbhckflags; - uint16_t *up; - uint32_t cksum; - ipha_t *ipha; - ip6_t *ip6; - int proto; - int ipversion; - int length; - int start; - ip6_pkt_t ipp; - - mbhckflags = DB_CKSUMFLAGS(mp); - ASSERT(mbhckflags != 0); - ASSERT(mp->b_datap->db_type == M_DATA); - /* - * Since this function only knows how to manage the hardware checksum - * issue, reject and packets that have flags set on the aside from - * checksum related attributes as we cannot necessarily safely map - * that packet onto the new NIC. Packets that can be potentially - * dropped here include those marked for LSO. 
- */ - if ((mbhckflags & - ~(HCK_FULLCKSUM|HCK_PARTIALCKSUM|HCK_IPV4_HDRCKSUM)) != 0) { - DTRACE_PROBE2(pbr__incapable, (mblk_t *), mp, (ill_t *), ill); - freemsg(mp); - return (-1); - } - - ipha = (ipha_t *)mp->b_rptr; - - /* - * Find out what the new NIC is capable of, if anything, and - * only allow it to be used with M_DATA mblks being sent out. - */ - if (ILL_HCKSUM_CAPABLE(ill)) { - illhckflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } else { - /* - * No capabilities, so turn off everything. - */ - illhckflags = 0; - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 0, 0); - mp->b_datap->db_struioflag &= ~STRUIO_IP; - } - - DTRACE_PROBE4(pbr__info__a, (mblk_t *), mp, (ill_t *), ill, - uint32_t, illhckflags, uint32_t, mbhckflags); - /* - * This block of code that looks for the position of the TCP/UDP - * checksum is early in this function because we need to know - * what needs to be blanked out for the hardware checksum case. - * - * That we're in this function implies that the packet is either - * TCP or UDP on Solaris, so checks are made for one protocol and - * if that fails, the other is therefore implied. - */ - ipversion = IPH_HDR_VERSION(ipha); + ipha_t *ipha = (ipha_t *)mp->b_rptr; - if (ipversion == IPV4_VERSION) { - proto = ipha->ipha_protocol; - if (proto == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - } else { - up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); + ixas.ixa_flags |= IXAF_IS_IPV4; + nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; + if (nexthop != ipha->ipha_dst) { + ixas.ixa_flags |= IXAF_NEXTHOP_SET; + ixas.ixa_nexthop_v4 = nexthop; } + ixas.ixa_multicast_ttl = ipha->ipha_ttl; } else { - uint8_t lasthdr; - - /* - * Nothing I've seen indicates that IPv6 checksum'ing - * precludes the presence of extension headers, so we - * can't just look at the next header value in the IPv6 - * packet header to see if it is TCP/UDP. 
- */ - ip6 = (ip6_t *)ipha; - (void) memset(&ipp, 0, sizeof (ipp)); - start = ip_find_hdr_v6(mp, ip6, &ipp, &lasthdr); - proto = lasthdr; - - if (proto == IPPROTO_TCP) { - up = IPH_TCPH_CHECKSUMP(ipha, start); - } else { - up = IPH_UDPH_CHECKSUMP(ipha, start); - } - } + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + in6_addr_t *nexthop6; - /* - * The first case here is easiest: - * mblk hasn't asked for full checksum, but the card supports it. - * - * In addition, check for IPv4 header capability. Note that only - * the mblk flag is checked and not ipversion. - */ - if ((((illhckflags & HCKSUM_INET_FULL_V4) && (ipversion == 4)) || - (((illhckflags & HCKSUM_INET_FULL_V6) && (ipversion == 6)))) && - ((mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) != 0)) { - int newflags = HCK_FULLCKSUM; - - if ((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) { - if ((illhckflags & HCKSUM_IPHDRCKSUM) != 0) { - newflags |= HCK_IPV4_HDRCKSUM; - } else { - /* - * Rather than call a function, just inline - * the computation of the basic IPv4 header. - */ - cksum = (ipha->ipha_dst >> 16) + - (ipha->ipha_dst & 0xFFFF) + - (ipha->ipha_src >> 16) + - (ipha->ipha_src & 0xFFFF); - IP_HDR_CKSUM(ipha, cksum, - ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } + nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; + if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { + ixas.ixa_flags |= IXAF_NEXTHOP_SET; + ixas.ixa_nexthop_v6 = *nexthop6; } - - *up = 0; - (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, - newflags, 0); - return (0); - } - - DTRACE_PROBE2(pbr__info__b, int, ipversion, int, proto); - - /* - * Start calculating the pseudo checksum over the IP packet header. - * Although the final pseudo checksum used by TCP/UDP consists of - * more than just the address fields, we can use the result of - * adding those together a little bit further down for IPv4. 
- */ - if (ipversion == IPV4_VERSION) { - cksum = (ipha->ipha_dst >> 16) + (ipha->ipha_dst & 0xFFFF) + - (ipha->ipha_src >> 16) + (ipha->ipha_src & 0xFFFF); - start = IP_SIMPLE_HDR_LENGTH; - length = ntohs(ipha->ipha_length); - DTRACE_PROBE3(pbr__info__e, uint32_t, ipha->ipha_src, - uint32_t, ipha->ipha_dst, int, cksum); - } else { - uint16_t *pseudo; - - pseudo = (uint16_t *)&ip6->ip6_src; - - /* calculate pseudo-header checksum */ - cksum = pseudo[0] + pseudo[1] + pseudo[2] + pseudo[3] + - pseudo[4] + pseudo[5] + pseudo[6] + pseudo[7] + - pseudo[8] + pseudo[9] + pseudo[10] + pseudo[11] + - pseudo[12] + pseudo[13] + pseudo[14] + pseudo[15]; - - length = ntohs(ip6->ip6_plen) + sizeof (ip6_t); - } - - /* Fold the initial sum */ - cksum = (cksum & 0xffff) + (cksum >> 16); - - /* - * If the packet was asking for an IPv4 header checksum to be - * calculated but the interface doesn't support that, fill it in - * using our pseudo checksum as a starting point. - */ - if (((mbhckflags & HCK_IPV4_HDRCKSUM) != 0) && - ((illhckflags & HCKSUM_IPHDRCKSUM) == 0)) { - /* - * IP_HDR_CKSUM uses the 2rd arg to the macro in a destructive - * way so pass in a copy of the checksum calculated thus far. - */ - uint32_t ipsum = cksum; - - DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; - - IP_HDR_CKSUM(ipha, ipsum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - - DTRACE_PROBE3(pbr__info__c, int, start, int, length, int, cksum); - - if (proto == IPPROTO_TCP) { - cksum += IP_TCP_CSUM_COMP; - } else { - cksum += IP_UDP_CSUM_COMP; + ixas.ixa_multicast_ttl = ip6h->ip6_hops; } - cksum += htons(length - start); - cksum = (cksum & 0xffff) + (cksum >> 16); - - /* - * For TCP/UDP, we either want to setup the packet for partial - * checksum or we want to do it all ourselves because the NIC - * offers no support for either partial or full checksum. 
- */ - if ((illhckflags & HCKSUM_INET_PARTIAL) != 0) { - /* - * The only case we care about here is if the mblk was - * previously set for full checksum offload. If it was - * marked for partial (and the NIC does partial), then - * we have nothing to do. Similarly if the packet was - * not set for partial or full, we do nothing as this - * is cheaper than more work to set something up. - */ - if ((mbhckflags & HCK_FULLCKSUM) != 0) { - uint32_t offset; - - if (proto == IPPROTO_TCP) { - offset = TCP_CHECKSUM_OFFSET; - } else { - offset = UDP_CHECKSUM_OFFSET; - } - *up = cksum; - - DTRACE_PROBE3(pbr__info__f, int, length - start, int, - cksum, int, offset); + error = ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); - (void) hcksum_assoc(mp, NULL, NULL, start, - start + offset, length, 0, - DB_CKSUMFLAGS(mp) | HCK_PARTIALCKSUM, 0); - } + netstack_rele(ns); + switch (error) { + case 0: + break; - } else if (mbhckflags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)) { - DB_CKSUMFLAGS(mp) &= ~(HCK_PARTIALCKSUM|HCK_FULLCKSUM); + case EHOSTUNREACH: + case ENETUNREACH: + error = ENONET; + break; - *up = 0; - *up = IP_CSUM(mp, start, cksum); + default: + error = ECOMM; + break; } - - DTRACE_PROBE4(pbr__info__d, (mblk_t *), mp, (ipha_t *), ipha, - (uint16_t *), up, int, cksum); - return (0); + return (error); } /* @@ -2094,18 +669,18 @@ ire_find_best_route(struct radix_node *rn, void *arg) rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; - if (margs->ift_flags & MATCH_IRE_MASK) + if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) match_mask = margs->ift_mask; else match_mask = ire->ire_mask; if (ire_match_args(ire, margs->ift_addr, match_mask, - margs->ift_gateway, margs->ift_type, margs->ift_ipif, - margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl, - margs->ift_flags, NULL)) { - IRE_REFHOLD(ire); + margs->ift_gateway, 
margs->ift_type, margs->ift_ill, + margs->ift_zoneid, margs->ift_tsl, + margs->ift_flags)) { + ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); margs->ift_best_ire = ire; return (B_TRUE); @@ -2198,107 +773,182 @@ irb_refrele_ftable(irb_t *irb) } /* - * IRE iterator used by ire_ftable_lookup() to process multiple default - * routes. Given a starting point in the hash list (ire_origin), walk the IREs - * in the bucket skipping default interface routes and deleted entries. - * Returns the next IRE (unheld), or NULL when we're back to the starting point. - * Assumes that the caller holds a reference on the IRE bucket. + * IRE iterator used by ire_ftable_lookup to process multiple equal + * routes. Given a starting point in the hash list (hash), walk the IREs + * in the bucket skipping deleted entries. We treat the bucket as a circular + * list for the purposes of walking it. + * Returns the IRE (held) that corresponds to the hash value. If that IRE is + * not applicable (ire_match_args failed) then it returns a subsequent one. + * If we fail to find an IRE we return NULL. * - * In the absence of good IRE_DEFAULT routes, this function will return - * the first IRE_INTERFACE route found (if any). + * Assumes that the caller holds a reference on the IRE bucket and a read lock + * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). + * + * Applies to IPv4 and IPv6. + * + * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same + * address and bucket, we compare against ire_type for the orig_ire. We also + * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being + * first in the bucket. Thus we compare that ire_flags match the orig_ire. + * + * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is + * reachable from the zone i.e., that the ire_gateway_addr is in a subnet + * in which the zone has an IP address. We check this for the global zone + * even if no shared-IP zones are configured. 
*/ ire_t * -ire_round_robin(irb_t *irb_ptr, zoneid_t zoneid, ire_ftable_args_t *margs, - ip_stack_t *ipst) +ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, + ire_t *orig_ire, ip_stack_t *ipst) { - ire_t *ire_origin; - ire_t *ire, *maybe_ire = NULL; + ire_t *ire, *maybe_ire = NULL; + uint_t maybe_badcnt; + uint_t maxwalk; - rw_enter(&irb_ptr->irb_lock, RW_WRITER); - ire_origin = irb_ptr->irb_rr_origin; - if (ire_origin != NULL) { - ire_origin = ire_origin->ire_next; - IRE_FIND_NEXT_ORIGIN(ire_origin); - } + /* Fold in more bits from the hint/hash */ + hash = hash ^ (hash >> 8) ^ (hash >> 16); - if (ire_origin == NULL) { - /* - * first time through routine, or we dropped off the end - * of list. - */ - ire_origin = irb_ptr->irb_ire; - IRE_FIND_NEXT_ORIGIN(ire_origin); - } - irb_ptr->irb_rr_origin = ire_origin; - IRB_REFHOLD_LOCKED(irb_ptr); + rw_enter(&irb_ptr->irb_lock, RW_WRITER); + maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ + hash %= maxwalk; + irb_refhold_locked(irb_ptr); rw_exit(&irb_ptr->irb_lock); - DTRACE_PROBE2(ire__rr__origin, (irb_t *), irb_ptr, - (ire_t *), ire_origin); - /* * Round-robin the routers list looking for a route that * matches the passed in parameters. - * We start with the ire we found above and we walk the hash - * list until we're back where we started. It doesn't matter if - * routes are added or deleted by other threads - we know this - * ire will stay in the list because we hold a reference on the - * ire bucket. + * First we skip "hash" number of non-condemned IREs. + * Then we match the IRE. + * If we find an ire which has a non-zero ire_badcnt then we remember + * it and keep on looking for a lower ire_badcnt. + * If we come to the end of the list we continue (treat the + * bucket list as a circular list) but we match less than "max" + * entries. 
*/ - ire = ire_origin; - while (ire != NULL) { - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; - ire_t *rire; + ire = irb_ptr->irb_ire; + while (maxwalk > 0) { + if (IRE_IS_CONDEMNED(ire)) + goto next_ire_skip; + + /* Skip the first "hash" entries to do ECMP */ + if (hash != 0) { + hash--; + goto next_ire_skip; + } - if (ire->ire_marks & IRE_MARK_CONDEMNED) + /* See CGTP comment above */ + if (ire->ire_type != orig_ire->ire_type || + ire->ire_flags != orig_ire->ire_flags) goto next_ire; - if (!ire_match_args(ire, margs->ift_addr, (ipaddr_t)0, - margs->ift_gateway, margs->ift_type, margs->ift_ipif, - margs->ift_zoneid, margs->ift_ihandle, margs->ift_tsl, - margs->ift_flags, NULL)) + /* + * Note: Since IPv6 has hash buckets instead of radix + * buckers we need to explicitly compare the addresses. + * That makes this less efficient since we will be called + * even if there is no alternatives just because the + * bucket has multiple IREs for different addresses. + */ + if (ire->ire_ipversion == IPV6_VERSION) { + if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, + &ire->ire_addr_v6)) + goto next_ire; + } + + /* + * For some reason find_best_route uses ire_mask. We do + * the same. + */ + if (ire->ire_ipversion == IPV4_VERSION ? + !ire_match_args(ire, margs->ift_addr, + ire->ire_mask, margs->ift_gateway, + margs->ift_type, margs->ift_ill, margs->ift_zoneid, + margs->ift_tsl, margs->ift_flags) : + !ire_match_args_v6(ire, &margs->ift_addr_v6, + &ire->ire_mask_v6, &margs->ift_gateway_v6, + margs->ift_type, margs->ift_ill, margs->ift_zoneid, + margs->ift_tsl, margs->ift_flags)) goto next_ire; - if (ire->ire_type & IRE_INTERFACE) { + if (margs->ift_zoneid != ALL_ZONES && + (ire->ire_type & IRE_OFFLINK)) { /* - * keep looking to see if there is a non-interface - * default ire, but save this one as a last resort. + * When we're in a zone, we're only + * interested in routers that are + * reachable through ipifs within our zone. 
*/ - if (maybe_ire == NULL) - maybe_ire = ire; - goto next_ire; + if (ire->ire_ipversion == IPV4_VERSION) { + if (!ire_gateway_ok_zone_v4( + ire->ire_gateway_addr, margs->ift_zoneid, + ire->ire_ill, margs->ift_tsl, ipst, + B_TRUE)) + goto next_ire; + } else { + if (!ire_gateway_ok_zone_v6( + &ire->ire_gateway_addr_v6, + margs->ift_zoneid, ire->ire_ill, + margs->ift_tsl, ipst, B_TRUE)) + goto next_ire; + } } - - if (zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); + mutex_enter(&ire->ire_lock); + /* Look for stale ire_badcnt and clear */ + if (ire->ire_badcnt != 0 && + (TICK_TO_SEC(lbolt64) - ire->ire_last_badcnt > + ipst->ips_ip_ire_badcnt_lifetime)) + ire->ire_badcnt = 0; + mutex_exit(&ire->ire_lock); + + if (ire->ire_badcnt == 0) { + /* We found one with a zero badcnt; done */ + ire_refhold(ire); + /* + * Care needed since irb_refrele grabs WLOCK to free + * the irb_t. + */ + if (ire->ire_ipversion == IPV4_VERSION) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + irb_refrele(irb_ptr); + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + } else { + rw_exit(&ipst->ips_ip6_ire_head_lock); + irb_refrele(irb_ptr); + rw_enter(&ipst->ips_ip6_ire_head_lock, + RW_READER); + } return (ire); } /* - * When we're in a non-global zone, we're only - * interested in routers that are - * reachable through ipifs within our zone. + * keep looking to see if there is a better (lower + * badcnt) matching IRE, but save this one as a last resort. + * If we find a lower badcnt pick that one as the last* resort. 
*/ - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - rire = ire_route_lookup(ire->ire_gateway_addr, 0, 0, - IRE_INTERFACE, ire->ire_ipif, NULL, zoneid, margs->ift_tsl, - match_flags, ipst); - if (rire != NULL) { - ire_refrele(rire); - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - return (ire); + if (maybe_ire == NULL) { + maybe_ire = ire; + maybe_badcnt = ire->ire_badcnt; + } else if (ire->ire_badcnt < maybe_badcnt) { + maybe_ire = ire; + maybe_badcnt = ire->ire_badcnt; } + next_ire: - ire = (ire->ire_next ? ire->ire_next : irb_ptr->irb_ire); - if (ire == ire_origin) - break; + maxwalk--; +next_ire_skip: + ire = ire->ire_next; + if (ire == NULL) + ire = irb_ptr->irb_ire; } if (maybe_ire != NULL) - IRE_REFHOLD(maybe_ire); - IRB_REFRELE(irb_ptr); + ire_refhold(maybe_ire); + + /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ + if (ire->ire_ipversion == IPV4_VERSION) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + irb_refrele(irb_ptr); + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); + } else { + rw_exit(&ipst->ips_ip6_ire_head_lock); + irb_refrele(irb_ptr); + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); + } return (maybe_ire); } @@ -2306,7 +956,7 @@ void irb_refhold_rn(struct radix_node *rn) { if ((rn->rn_flags & RNF_ROOT) == 0) - IRB_REFHOLD(&((rt_t *)(rn))->rt_irb); + irb_refhold(&((rt_t *)(rn))->rt_irb); } void @@ -2315,3 +965,587 @@ irb_refrele_rn(struct radix_node *rn) if ((rn->rn_flags & RNF_ROOT) == 0) irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); } + +/* + * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject + * routes this routine sets up a ire_nce_cache as well. The caller needs to + * lookup an nce for the multicast case. 
+ */ +ire_t * +ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, + uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) +{ + uint_t match_args; + uint_t ire_type; + ill_t *ill; + ire_t *ire; + ip_stack_t *ipst = ixa->ixa_ipst; + ipaddr_t v4dst; + in6_addr_t v6nexthop; + iaflags_t ixaflags = ixa->ixa_flags; + nce_t *nce; + + match_args = MATCH_IRE_SECATTR; + IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); + if (setsrcp != NULL) + ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); + if (errorp != NULL) + ASSERT(*errorp == 0); + + /* + * The content of the ixa will be different if IP_NEXTHOP, + * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set + */ + + if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) : + IN6_IS_ADDR_MULTICAST(v6dst)) { + /* Pick up the IRE_MULTICAST for the ill */ + if (ixa->ixa_multicast_ifindex != 0) { + ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixaflags & IXAF_SCOPEID_SET) { + /* sin6_scope_id takes precedence over ixa_ifindex */ + ASSERT(ixa->ixa_scopeid != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixa->ixa_ifindex != 0) { + /* + * In the ipmp case, the ixa_ifindex is set to + * point at an under_ill and we would return the + * ire_multicast() corresponding to that under_ill. 
+ */ + ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else if (ixaflags & IXAF_IS_IPV4) { + ipaddr_t v4setsrc = INADDR_ANY; + + ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, + multirtp, &v4setsrc); + if (setsrcp != NULL) + IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); + } else { + ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, + multirtp, setsrcp); + } + if (ill != NULL && IS_VNI(ill)) { + ill_refrele(ill); + ill = NULL; + } + if (ill == NULL) { + if (errorp != NULL) + *errorp = ENXIO; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + if (!(ill->ill_flags & ILLF_MULTICAST)) { + ill_refrele(ill); + if (errorp != NULL) + *errorp = EHOSTUNREACH; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + /* Get a refcnt on the single IRE_MULTICAST per ill */ + ire = ire_multicast(ill); + ill_refrele(ill); + if (generationp != NULL) + *generationp = ire->ire_generation; + if (errorp != NULL && + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + *errorp = EHOSTUNREACH; + } + return (ire); + } + + if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { + if (ixaflags & IXAF_SCOPEID_SET) { + /* sin6_scope_id takes precedence over ixa_ifindex */ + ASSERT(ixa->ixa_scopeid != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, + !(ixaflags & IXAF_IS_IPV4), ipst); + } else { + ASSERT(ixa->ixa_ifindex != 0); + ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, + !(ixaflags & IXAF_IS_IPV4), ipst); + } + if (ill != NULL && IS_VNI(ill)) { + ill_refrele(ill); + ill = NULL; + } + if (ill == NULL) { + if (errorp != NULL) + *errorp = ENXIO; + /* Get a hold on the IRE_NOROUTE */ + ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); + return (ire); + } + /* + * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF + * so for both of them we need to be able look for an under + * interface. 
+ */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; + } else { + ill = NULL; + } + + if (ixaflags & IXAF_NEXTHOP_SET) { + /* IP_NEXTHOP was set */ + v6nexthop = ixa->ixa_nexthop_v6; + } else { + v6nexthop = *v6dst; + } + + ire_type = 0; + /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ + + /* + * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then + * we only look for an onlink IRE. + */ + if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { + match_args |= MATCH_IRE_TYPE; + ire_type = IRE_ONLINK; + } + + if (ixaflags & IXAF_IS_IPV4) { + ipaddr_t v4nexthop; + ipaddr_t v4setsrc = INADDR_ANY; + + IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); + ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, + ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, + ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); + if (setsrcp != NULL) + IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); + } else { + ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, + ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, + ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); + } + +#ifdef DEBUG + if (match_args & MATCH_IRE_TESTHIDDEN) { + ip3dbg(("looking for hidden; dst %x ire %p\n", + v4dst, (void *)ire)); + } +#endif + + if (ill != NULL) + ill_refrele(ill); + + if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + (ire->ire_type & IRE_MULTICAST)) { + /* No ire_nce_cache */ + return (ire); + } + + /* Setup ire_nce_cache if it doesn't exist or is condemned. */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL || nce->nce_is_condemned) { + mutex_exit(&ire->ire_lock); + (void) ire_revalidate_nce(ire); + } else { + mutex_exit(&ire->ire_lock); + } + return (ire); +} + +/* + * Find a route given some xmit attributes and a packet. + * Generic for IPv4 and IPv6 + * + * This never returns NULL. But when it returns the IRE_NOROUTE + * it might set errorp. 
+ */ +ire_t * +ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, + int *errorp, boolean_t *multirtp) +{ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + in6_addr_t v6dst; + + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); + + return (ip_select_route(&v6dst, ixa, generationp, + NULL, errorp, multirtp)); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, + NULL, errorp, multirtp)); + } +} + +ire_t * +ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, + ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) +{ + in6_addr_t v6dst; + ire_t *ire; + in6_addr_t setsrc; + + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); + + IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); + + setsrc = ipv6_all_zeros; + ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, + multirtp); + if (v4setsrcp != NULL) + IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); + return (ire); +} + +/* + * Recursively look for a route to the destination. Can also match on + * the zoneid, ill, and label. Used for the data paths. See also + * ire_route_recursive. + * + * If ill is set this means we will match it by adding MATCH_IRE_ILL. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. 
+ */ +ire_t * +ire_route_recursive_impl_v4(ire_t *ire, + ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, + tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) +{ + int i, j; + ire_t *ires[MAX_IRE_RECURSION]; + uint_t generation; + uint_t generations[MAX_IRE_RECURSION]; + boolean_t need_refrele = B_FALSE; + boolean_t invalidate = B_FALSE; + int prefs[MAX_IRE_RECURSION]; + ill_t *ill = NULL; + + if (setsrcp != NULL) + ASSERT(*setsrcp == INADDR_ANY); + if (gwattrp != NULL) + ASSERT(*gwattrp == NULL); + + if (ill_arg != NULL) + match_args |= MATCH_IRE_ILL; + + /* + * We iterate up to three times to resolve a route, even though + * we have four slots in the array. The extra slot is for an + * IRE_IF_CLONE we might need to create. + */ + i = 0; + while (i < MAX_IRE_RECURSION - 1) { + /* ire_ftable_lookup handles round-robin/ECMP */ + if (ire == NULL) { + ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, + (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + match_args, xmit_hint, ipst, &generation); + } else { + /* Caller passed it; extra hold since we will rele */ + ire_refhold(ire); + if (generationp != NULL) + generation = *generationp; + else + generation = IRE_GENERATION_VERIFY; + } + if (ire == NULL) + ire = ire_reject(ipst, B_FALSE); + + /* Need to return the ire with RTF_REJECT|BLACKHOLE */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + goto error; + + ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ + + prefs[i] = ire_pref(ire); + if (i != 0) { + /* + * Don't allow anything unusual past the first + * iteration. 
+ */ + if ((ire->ire_type & + (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || + prefs[i] <= prefs[i-1]) { + ire_refrele(ire); + ire = ire_reject(ipst, B_FALSE); + goto error; + } + } + /* We have a usable IRE */ + ires[i] = ire; + generations[i] = generation; + i++; + + /* The first RTF_SETSRC address is passed back if setsrcp */ + if ((ire->ire_flags & RTF_SETSRC) && + setsrcp != NULL && *setsrcp == INADDR_ANY) { + ASSERT(ire->ire_setsrc_addr != INADDR_ANY); + *setsrcp = ire->ire_setsrc_addr; + } + + /* The first ire_gw_secattr is passed back if gwattrp */ + if (ire->ire_gw_secattr != NULL && + gwattrp != NULL && *gwattrp == NULL) + *gwattrp = ire->ire_gw_secattr; + + /* + * Check if we have a short-cut pointer to an IRE for this + * destination, and that the cached dependency isn't stale. + * In that case we've rejoined an existing tree towards a + * parent, thus we don't need to continue the loop to + * discover the rest of the tree. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + ire = NULL; + goto done; + } + mutex_exit(&ire->ire_lock); + + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 32 bit mask. + */ + if (ire->ire_nce_capable) { + ire = NULL; + goto done; + } + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + /* + * For an IRE_INTERFACE we create an IRE_IF_CLONE for this + * particular destination + */ + if (ire->ire_type & IRE_INTERFACE) { + in6_addr_t v6nexthop; + ire_t *clone; + + ASSERT(ire->ire_masklen != IPV4_ABITS); + + /* + * In the case of ip_input and ILLF_FORWARDING not + * being set, and in the case of RTM_GET, + * there is no point in allocating + * an IRE_IF_CLONE. We return the IRE_INTERFACE. + * Note that !allocate can result in a ire_dep_parent + * which is IRE_IF_* without an IRE_IF_CLONE. 
+ * We recover from that when we need to send packets + * by ensuring that the generations become + * IRE_GENERATION_VERIFY in this case. + */ + if (!allocate) { + invalidate = B_TRUE; + ire = NULL; + goto done; + } + + IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); + + clone = ire_create_if_clone(ire, &v6nexthop, + &generation); + if (clone == NULL) { + /* + * Temporary failure - no memory. + * Don't want caller to cache IRE_NOROUTE. + */ + invalidate = B_TRUE; + ire = ire_blackhole(ipst, B_FALSE); + goto error; + } + /* + * Make clone next to last entry and the + * IRE_INTERFACE the last in the dependency + * chain since the clone depends on the + * IRE_INTERFACE. + */ + ASSERT(i >= 1); + ASSERT(i < MAX_IRE_RECURSION); + + ires[i] = ires[i-1]; + generations[i] = generations[i-1]; + ires[i-1] = clone; + generations[i-1] = generation; + i++; + + ire = NULL; + goto done; + } + + /* + * We only match on the type and optionally ILL when + * recursing. The type match is used by some callers + * to exclude certain types (such as IRE_IF_CLONE or + * IRE_LOCAL|IRE_LOOPBACK). + */ + match_args &= MATCH_IRE_TYPE; + nexthop = ire->ire_gateway_addr; + if (ill == NULL && ire->ire_ill != NULL) { + ill = ire->ire_ill; + need_refrele = B_TRUE; + ill_refhold(ill); + match_args |= MATCH_IRE_ILL; + } + ire = NULL; + } + ASSERT(ire == NULL); + ire = ire_reject(ipst, B_FALSE); + +error: + ASSERT(ire != NULL); + if (need_refrele) + ill_refrele(ill); + + /* + * In the case of MULTIRT we want to try a different IRE the next + * time. We let the next packet retry in that case. + */ + if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) + (void) ire_no_good(ires[0]); + +cleanup: + /* cleanup ires[i] */ + ire_dep_unbuild(ires, i); + for (j = 0; j < i; j++) + ire_refrele(ires[j]); + + ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); + /* + * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the + * ip_select_route since the reject or lack of memory might be gone. 
+ */ + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; + return (ire); + +done: + ASSERT(ire == NULL); + if (need_refrele) { + ill_refrele(ill); + ill = NULL; + } + + /* Build dependencies */ + if (!ire_dep_build(ires, generations, i)) { + /* Something in chain was condemned; tear it apart */ + ire = ire_reject(ipst, B_FALSE); + goto cleanup; + } + + /* + * Release all refholds except the one for ires[0] that we + * will return to the caller. + */ + for (j = 1; j < i; j++) + ire_refrele(ires[j]); + + if (invalidate) { + /* + * Since we needed to allocate but couldn't we need to make + * sure that the dependency chain is rebuilt the next time. + */ + ire_dep_invalidate_generations(ires[0]); + generation = IRE_GENERATION_VERIFY; + } else { + /* + * IREs can have been added or deleted while we did the + * recursive lookup and we can't catch those until we've built + * the dependencies. We verify the stored + * ire_dep_parent_generation to catch any such changes and + * return IRE_GENERATION_VERIFY (which will cause + * ip_select_route to be called again so we can redo the + * recursive lookup next time we send a packet. + */ + generation = ire_dep_validate_generations(ires[0]); + if (generations[0] != ires[0]->ire_generation) { + /* Something changed at the top */ + generation = IRE_GENERATION_VERIFY; + } + } + if (generationp != NULL) + *generationp = generation; + + return (ires[0]); +} + +ire_t * +ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, + tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) +{ + return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, + zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, + gwattrp, generationp)); +} + +/* + * Recursively look for a route to the destination. 
+ * We only handle a destination match here, yet we have the same arguments + * as the full match to allow function pointers to select between the two. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. + */ +ire_t * +ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, + uint32_t xmit_hint, ip_stack_t *ipst) +{ + ire_t *ire; + ire_t *ire1; + uint_t generation; + + /* ire_ftable_lookup handles round-robin/ECMP */ + ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, + &generation); + ASSERT(ire != NULL); + + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 32 bit mask. + */ + if (ire->ire_nce_capable) + return (ire); + + /* + * If the IRE has a current cached parent we know that the whole + * parent chain is current, hence we don't need to discover and + * build any dependencies by doing a recursive lookup. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + return (ire); + } + mutex_exit(&ire->ire_lock); + + /* + * Fallback to loop in the normal code starting with the ire + * we found. Normally this would return the same ire. 
+ */ + ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, + &generation); + ire_refrele(ire); + return (ire1); +} diff --git a/usr/src/uts/common/inet/ip/ip_helper_stream.c b/usr/src/uts/common/inet/ip/ip_helper_stream.c index 6f5608e950..3fa6364417 100644 --- a/usr/src/uts/common/inet/ip/ip_helper_stream.c +++ b/usr/src/uts/common/inet/ip/ip_helper_stream.c @@ -58,14 +58,14 @@ static struct qinit ip_helper_stream_winit = { &ip_helper_stream_info, NULL, NULL, NULL, STRUIOT_NONE }; -#define IP_USE_HELPER_CACHE (ip_helper_stream_cache != NULL) - /* * set the q_ptr of the 'q' to the conn_t pointer passed in */ static void ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp) { + conn_t *connp = *((conn_t **)mp->b_cont->b_rptr); + /* * This operation is allowed only on helper streams with kcred */ @@ -75,24 +75,12 @@ ip_helper_share_conn(queue_t *q, mblk_t *mp, cred_t *crp) return; } - if (IP_USE_HELPER_CACHE) { - ip_helper_stream_info_t *ip_helper_info; - - ip_helper_info = *((ip_helper_stream_info_t **) - mp->b_cont->b_rptr); - ip_helper_info->iphs_minfo = q->q_ptr; - ip_helper_info->iphs_rq = RD(q); - ip_helper_info->iphs_wq = WR(q); - } else { - conn_t *connp = *((conn_t **)mp->b_cont->b_rptr); - - connp->conn_helper_info->iphs_minfo = q->q_ptr; - connp->conn_helper_info->iphs_rq = RD(q); - connp->conn_helper_info->iphs_wq = WR(q); - WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp; - connp->conn_rq = RD(q); - connp->conn_wq = WR(q); - } + connp->conn_helper_info->iphs_minfo = q->q_ptr; + connp->conn_helper_info->iphs_rq = RD(q); + connp->conn_helper_info->iphs_wq = WR(q); + WR(q)->q_ptr = RD(q)->q_ptr = (void *)connp; + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); miocack(q, mp, 0, 0); } @@ -104,17 +92,13 @@ ip_helper_wput(queue_t *q, mblk_t *mp) iocp->ioc_cmd == SIOCSQPTR) { ip_helper_share_conn(q, mp, iocp->ioc_cr); } else { - conn_t *connp = (conn_t *)q->q_ptr; - - if 
(connp->conn_af_isv6) { - ip_wput_v6(q, mp); - } else { - ip_wput(q, mp); - } + /* We only handle ioctl related messages here */ + ASSERT(DB_TYPE(mp) != M_DATA); + ip_wput_nondata(q, mp); } } -/* ARGSUSED */ +/* ARGSUSED3 */ int ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6) @@ -126,10 +110,8 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, ASSERT(RD(q) == q); - ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_NOSLEEP); - if (ip_minfop == NULL) { - return (ENOMEM); - } + ip_minfop = kmem_alloc(sizeof (ip_helper_minfo_t), KM_SLEEP); + ASSERT(ip_minfop != NULL); ip_minfop->ip_minfo_dev = 0; ip_minfop->ip_minfo_arena = NULL; @@ -171,7 +153,7 @@ ip_helper_stream_setup(queue_t *q, dev_t *devp, int flag, int sflag, return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int ip_helper_stream_close(queue_t *q, int flag) { @@ -189,305 +171,91 @@ ip_helper_stream_close(queue_t *q, int flag) /* * Public interface for creating an IP stream with shared conn_t + * Handles multiple callers in parallel by using conn_lock. + * Note that we allocate the helper stream without any locks, which means + * we might need to free it if we had two threads doing this concurrently + * for the conn_t. */ -/* ARGSUSED */ int ip_create_helper_stream(conn_t *connp, ldi_ident_t li) { + ip_helper_stream_info_t *helper; int error; int ret; ASSERT(!servicing_interrupt()); - error = 0; - if (IP_USE_HELPER_CACHE) { - connp->conn_helper_info = kmem_cache_alloc( - ip_helper_stream_cache, KM_NOSLEEP); - if (connp->conn_helper_info == NULL) - return (EAGAIN); - connp->conn_rq = connp->conn_helper_info->iphs_rq; - connp->conn_wq = connp->conn_helper_info->iphs_wq; - /* - * Doesn't need to hold the QLOCK for there is no one else - * should have a pointer to this queue. 
- */ - connp->conn_rq->q_flag |= QWANTR; - connp->conn_wq->q_flag |= QWANTR; - - connp->conn_rq->q_ptr = connp; - connp->conn_wq->q_ptr = connp; - } else { - ASSERT(connp->conn_helper_info == NULL); - connp->conn_helper_info = kmem_alloc( - sizeof (ip_helper_stream_info_t), KM_SLEEP); - /* - * open ip device via the layered interface. - * pass in kcred as some threads do not have the - * priviledge to open /dev/ip and the check in - * secpolicy_spec_open() will fail the open - */ - error = ldi_open_by_name(connp->conn_af_isv6 ? - DEV_IP6 : DEV_IP, IP_HELPER_STR, - kcred, &connp->conn_helper_info->iphs_handle, li); - - if (error != 0) { - kmem_free(connp->conn_helper_info, - (sizeof (ip_helper_stream_info_t))); - connp->conn_helper_info = NULL; - return (error); - } - /* - * Share connp with the helper stream - */ - error = ldi_ioctl(connp->conn_helper_info->iphs_handle, - SIOCSQPTR, (intptr_t)connp, FKIOCTL, kcred, &ret); - - if (error != 0) { - /* - * Passing in a zero flag indicates that an error - * occured and stream was not shared - */ - (void) ldi_close(connp->conn_helper_info->iphs_handle, - 0, kcred); - kmem_free(connp->conn_helper_info, - (sizeof (ip_helper_stream_info_t))); - connp->conn_helper_info = NULL; - } + if (connp->conn_helper_info != NULL) { + /* Already allocated */ + return (0); } - return (error); -} - -/* - * Public interface for freeing IP helper stream - */ -/* ARGSUSED */ -void -ip_free_helper_stream(conn_t *connp) -{ - ASSERT(!servicing_interrupt()); - if (IP_USE_HELPER_CACHE) { - - if (connp->conn_helper_info == NULL) - return; - ASSERT(connp->conn_helper_info->iphs_rq != NULL); - ASSERT(connp->conn_helper_info->iphs_wq != NULL); - - /* Prevent service procedures from being called */ - disable_svc(connp->conn_helper_info->iphs_rq); - - /* Wait until service procedure of each queue is run */ - wait_svc(connp->conn_helper_info->iphs_rq); - - /* Cleanup any pending ioctls */ - conn_ioctl_cleanup(connp); - - /* Allow service procedures to 
be called again */ - enable_svc(connp->conn_helper_info->iphs_rq); - - /* Flush the queues */ - flushq(connp->conn_helper_info->iphs_rq, FLUSHALL); - flushq(connp->conn_helper_info->iphs_wq, FLUSHALL); - - connp->conn_helper_info->iphs_rq->q_ptr = NULL; - connp->conn_helper_info->iphs_wq->q_ptr = NULL; - - kmem_cache_free(ip_helper_stream_cache, - connp->conn_helper_info); - } else { - ASSERT( - connp->conn_helper_info->iphs_handle != NULL); - - connp->conn_helper_info->iphs_rq->q_ptr = - connp->conn_helper_info->iphs_wq->q_ptr = - connp->conn_helper_info->iphs_minfo; - (void) ldi_close(connp->conn_helper_info->iphs_handle, - IP_HELPER_STR, kcred); - kmem_free(connp->conn_helper_info, - sizeof (ip_helper_stream_info_t)); - } - connp->conn_helper_info = NULL; -} - -/* - * create a T_SVR4_OPTMGMT_REQ TPI message and send down the IP stream - */ -static int -ip_send_option_request(conn_t *connp, uint_t optset_context, int level, - int option_name, const void *optval, t_uscalar_t optlen, cred_t *cr) -{ - struct T_optmgmt_req *optmgmt_reqp; - struct opthdr *ohp; - ssize_t size; - mblk_t *mp; - - size = sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) + optlen; - /* Not used to generate UCRED, thus don't need correct pid */ - mp = allocb_cred(size, cr, NOPID); - if (mp == NULL) - return (ENOMEM); - - mp->b_datap->db_type = M_PROTO; - optmgmt_reqp = (struct T_optmgmt_req *)mp->b_wptr; - - optmgmt_reqp->PRIM_type = T_SVR4_OPTMGMT_REQ; - optmgmt_reqp->MGMT_flags = optset_context; - optmgmt_reqp->OPT_length = (t_scalar_t)sizeof (struct opthdr) + optlen; - optmgmt_reqp->OPT_offset = (t_scalar_t)sizeof (struct T_optmgmt_req); - - mp->b_wptr += sizeof (struct T_optmgmt_req); - - ohp = (struct opthdr *)mp->b_wptr; - ohp->level = level; - ohp->name = option_name; - ohp->len = optlen; - - mp->b_wptr += sizeof (struct opthdr); - - if (optval != NULL) { - bcopy(optval, mp->b_wptr, optlen); - } else { - bzero(mp->b_wptr, optlen); - } - mp->b_wptr += optlen; + error = 0; + 
helper = kmem_alloc(sizeof (ip_helper_stream_info_t), KM_SLEEP); /* - * Send down the primitive + * open ip device via the layered interface. + * pass in kcred as some threads do not have the + * priviledge to open /dev/ip and the check in + * secpolicy_spec_open() will fail the open */ - return (ldi_putmsg(connp->conn_helper_info->iphs_handle, mp)); -} + error = ldi_open_by_name((connp->conn_family == AF_INET6 ? DEV_IP6 : + DEV_IP), IP_HELPER_STR, kcred, &helper->iphs_handle, li); -/* - * wait/process the response to T_SVR4_OPTMGMT_REQ TPI message - */ -static int -ip_get_option_response(conn_t *connp, uint_t optset_context, void *optval, - t_uscalar_t *optlenp) -{ - union T_primitives *tpr; - int error; - mblk_t *mp; - - mp = NULL; - - ASSERT(optset_context == T_CHECK || optset_context == T_NEGOTIATE); - error = ldi_getmsg(connp->conn_helper_info->iphs_handle, &mp, NULL); if (error != 0) { + kmem_free(helper, sizeof (ip_helper_stream_info_t)); return (error); } - - if (DB_TYPE(mp) != M_PCPROTO || MBLKL(mp) < sizeof (tpr->type)) { - error = EPROTO; - goto done; - } - - tpr = (union T_primitives *)mp->b_rptr; - - switch (tpr->type) { - case T_OPTMGMT_ACK: - if (MBLKL(mp) < TOPTMGMTACKSZ) - error = EPROTO; - break; - case T_ERROR_ACK: - if (MBLKL(mp) < TERRORACKSZ) { - error = EPROTO; - break; - } - - if (tpr->error_ack.TLI_error == TSYSERR) - error = tpr->error_ack.UNIX_error; - else - error = proto_tlitosyserr(tpr->error_ack.TLI_error); - break; - default: - error = EPROTO; - break; + /* Make sure we are the only one */ + mutex_enter(&connp->conn_lock); + if (connp->conn_helper_info != NULL) { + /* Some other thread won - discard this stream */ + mutex_exit(&connp->conn_lock); + (void) ldi_close(helper->iphs_handle, 0, kcred); + kmem_free(helper, sizeof (ip_helper_stream_info_t)); + return (0); } + connp->conn_helper_info = helper; + /* + * Share connp with the helper stream. We hold conn_lock across this + * operation. 
 + */ + error = ldi_ioctl(helper->iphs_handle, SIOCSQPTR, (intptr_t)connp, + FKIOCTL, kcred, &ret); - if ((optset_context == T_CHECK) && (error == 0)) { - struct opthdr *opt_res; - t_uscalar_t len; - t_uscalar_t size; - t_uscalar_t maxlen = *optlenp; - void *option; - struct T_optmgmt_ack *optmgmt_ack; - - optmgmt_ack = (struct T_optmgmt_ack *)mp->b_rptr; - opt_res = (struct opthdr *) - ((uintptr_t)mp->b_rptr + optmgmt_ack->OPT_offset); - /* - * Check mblk boundary - */ - if (!MBLKIN(mp, optmgmt_ack->OPT_offset, - optmgmt_ack->OPT_length)) { - error = EPROTO; - goto done; - } - - /* - * Check alignment - */ - if ((((uintptr_t)opt_res) & (__TPI_ALIGN_SIZE - 1)) != 0) { - error = EPROTO; - goto done; - } - - option = &opt_res[1]; - - /* check to ensure that the option is within bounds */ - if ((((uintptr_t)option + opt_res->len) < (uintptr_t)option) || - !MBLKIN(mp, sizeof (struct opthdr), opt_res->len)) { - error = EPROTO; - goto done; - } - - len = opt_res->len; - size = MIN(len, maxlen); - + if (error != 0) { /* - * Copy data + * Passing in a zero flag indicates that an error + * occurred and stream was not shared */ - bcopy(option, optval, size); - bcopy(&size, optlenp, sizeof (size)); + (void) ldi_close(helper->iphs_handle, 0, kcred); + kmem_free(helper, sizeof (ip_helper_stream_info_t)); + connp->conn_helper_info = NULL; } - -done: - freemsg(mp); + mutex_exit(&connp->conn_lock); return (error); } /* - * Public interface to get socketoptions via the ip helper stream. - */ -int -ip_get_options(conn_t *connp, int level, int option_name, void *optval, - t_uscalar_t *optlenp, cred_t *cr) -{ - int error; - - error = ip_send_option_request(connp, T_CHECK, level, option_name, NULL, - *optlenp, cr); - if (error) - return (error); - - return (ip_get_option_response(connp, T_CHECK, optval, optlenp)); -} - -/* - * Public interface to set socket options via the ip helper stream. 
+ * Public interface for freeing IP helper stream + * Caller must ensure no concurrent use of the conn_t, which is normally + * done by calling this from the close routine when the conn_t is quiesced. */ -int -ip_set_options(conn_t *connp, int level, int option_name, const void *optval, - t_uscalar_t optlen, cred_t *cr) +void +ip_free_helper_stream(conn_t *connp) { + ASSERT(!servicing_interrupt()); - int error; + if (connp->conn_helper_info == NULL) + return; - error = ip_send_option_request(connp, T_NEGOTIATE, level, option_name, - optval, optlen, cr); - if (error) - return (error); + ASSERT(connp->conn_helper_info->iphs_handle != NULL); - return (ip_get_option_response(connp, T_NEGOTIATE, (void *)optval, - &optlen)); + connp->conn_helper_info->iphs_rq->q_ptr = + connp->conn_helper_info->iphs_wq->q_ptr = + connp->conn_helper_info->iphs_minfo; + (void) ldi_close(connp->conn_helper_info->iphs_handle, + IP_HELPER_STR, kcred); + kmem_free(connp->conn_helper_info, sizeof (ip_helper_stream_info_t)); + connp->conn_helper_info = NULL; } diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index b175f4530f..6066da35b4 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -72,6 +72,7 @@ #include <inet/mi.h> #include <inet/nd.h> #include <inet/arp.h> +#include <inet/ip_arp.h> #include <inet/mib2.h> #include <inet/ip.h> #include <inet/ip6.h> @@ -88,12 +89,6 @@ #include <inet/ip_netinfo.h> #include <inet/ilb_ip.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> -#include <inet/ipsec_impl.h> -#include <sys/iphada.h> - #include <netinet/igmp.h> #include <inet/ip_listutils.h> #include <inet/ipclassifier.h> @@ -119,15 +114,6 @@ typedef struct ipft_s { #define IPFT_F_NO_REPLY 0x1 /* IP ioctl does not expect any reply */ #define IPFT_F_SELF_REPLY 0x2 /* ioctl callee does the ioctl reply */ -typedef struct ip_sock_ar_s { - union { - area_t ip_sock_area; - ared_t ip_sock_ared; - 
areq_t ip_sock_areq; - } ip_sock_ar_u; - queue_t *ip_sock_ar_q; -} ip_sock_ar_t; - static int nd_ill_forward_get(queue_t *, mblk_t *, caddr_t, cred_t *); static int nd_ill_forward_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *ioc_cr); @@ -148,7 +134,7 @@ static int ip_sioctl_netmask_tail(ipif_t *ipif, sin_t *sin, queue_t *q, static int ip_sioctl_subnet_tail(ipif_t *ipif, in6_addr_t, in6_addr_t, queue_t *q, mblk_t *mp, boolean_t need_up); static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, - int ioccmd, struct linkblk *li, boolean_t doconsist); + int ioccmd, struct linkblk *li); static ipaddr_t ip_subnet_mask(ipaddr_t addr, ipif_t **, ip_stack_t *); static void ip_wput_ioctl(queue_t *q, mblk_t *mp); static void ipsq_flush(ill_t *ill); @@ -159,17 +145,14 @@ static void ipsq_delete(ipsq_t *); static ipif_t *ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, boolean_t insert); -static void ipif_check_bcast_ires(ipif_t *test_ipif); static ire_t **ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep); +static void ipif_delete_bcast_ires(ipif_t *ipif); +static int ipif_add_ires_v4(ipif_t *, boolean_t); static boolean_t ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6); -static void ipif_down_delete_ire(ire_t *ire, char *ipif); -static void ipif_delete_cache_ire(ire_t *, char *); static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); -static void ipif_mtu_change(ire_t *ire, char *ipif_arg); -static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); static void ipif_set_default(ipif_t *ipif); static int ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *ppa); @@ -177,17 +160,13 @@ static int ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); static ipif_t *ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, boolean_t *exists, boolean_t isv6, 
zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *); -static void ipif_update_other_ipifs(ipif_t *old_ipif); + ip_stack_t *); static int ill_alloc_ppa(ill_if_t *, ill_t *); -static int ill_arp_off(ill_t *ill); -static int ill_arp_on(ill_t *ill); static void ill_delete_interface_type(ill_if_t *); static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); -static void ill_downi(ire_t *ire, char *ill_arg); static void ill_free_mib(ill_t *ill); static void ill_glist_delete(ill_t *); static void ill_phyint_reinit(ill_t *ill); @@ -199,38 +178,22 @@ static ip_v6intfid_func_t ip_ether_v6intfid, ip_ib_v6intfid; static ip_v6intfid_func_t ip_ipv4_v6intfid, ip_ipv6_v6intfid; static ip_v6intfid_func_t ip_ipmp_v6intfid, ip_nodef_v6intfid; static ip_v6intfid_func_t ip_ipv4_v6destintfid, ip_ipv6_v6destintfid; -static ip_v6mapinfo_func_t ip_ether_v6mapinfo, ip_ib_v6mapinfo; -static ip_v6mapinfo_func_t ip_nodef_v6mapinfo; -static ip_v4mapinfo_func_t ip_ether_v4mapinfo, ip_ib_v4mapinfo; -static ip_v4mapinfo_func_t ip_nodef_v4mapinfo; -static void ipif_save_ire(ipif_t *, ire_t *); -static void ipif_remove_ire(ipif_t *, ire_t *); -static void ip_cgtp_bcast_add(ire_t *, ire_t *, ip_stack_t *); +static ip_v4mapinfo_func_t ip_ether_v4_mapping; +static ip_v6mapinfo_func_t ip_ether_v6_mapping; +static ip_v4mapinfo_func_t ip_ib_v4_mapping; +static ip_v6mapinfo_func_t ip_ib_v6_mapping; +static ip_v4mapinfo_func_t ip_mbcast_mapping; +static void ip_cgtp_bcast_add(ire_t *, ip_stack_t *); static void ip_cgtp_bcast_delete(ire_t *, ip_stack_t *); static void phyint_free(phyint_t *); -/* - * Per-ill IPsec capabilities management. 
- */ -static ill_ipsec_capab_t *ill_ipsec_capab_alloc(void); -static void ill_ipsec_capab_free(ill_ipsec_capab_t *); -static void ill_ipsec_capab_add(ill_t *, uint_t, boolean_t); -static void ill_ipsec_capab_delete(ill_t *, uint_t); -static boolean_t ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *, int); -static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *, - boolean_t); +static void ill_capability_dispatch(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_id_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_mdt_reset_fill(ill_t *, mblk_t *); -static void ill_capability_ipsec_ack(ill_t *, mblk_t *, dl_capability_sub_t *); -static void ill_capability_ipsec_reset_fill(ill_t *, mblk_t *); static void ill_capability_hcksum_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_hcksum_reset_fill(ill_t *, mblk_t *); static void ill_capability_zerocopy_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_zerocopy_reset_fill(ill_t *, mblk_t *); -static int ill_capability_ipsec_reset_size(ill_t *, int *, int *, int *, - int *); static void ill_capability_dld_reset_fill(ill_t *, mblk_t *); static void ill_capability_dld_ack(ill_t *, mblk_t *, dl_capability_sub_t *); @@ -242,11 +205,11 @@ static void ill_capability_send(ill_t *, mblk_t *); static ill_t *ill_prev_usesrc(ill_t *); static int ill_relink_usesrc_ills(ill_t *, ill_t *, uint_t); static void ill_disband_usesrc_group(ill_t *); -static void conn_cleanup_stale_ire(conn_t *, caddr_t); +static void ip_sioctl_garp_reply(mblk_t *, ill_t *, void *, int); #ifdef DEBUG -static void ill_trace_cleanup(const ill_t *); -static void ipif_trace_cleanup(const ipif_t *); +static void ill_trace_cleanup(const ill_t *); +static void ipif_trace_cleanup(const ipif_t *); #endif /* @@ -255,182 +218,10 @@ static void ipif_trace_cleanup(const ipif_t *); 
*/ int ip_min_frag_prune_time = 0; -/* - * max # of IPsec algorithms supported. Limited to 1 byte by PF_KEY - * and the IPsec DOI - */ -#define MAX_IPSEC_ALGS 256 - -#define BITSPERBYTE 8 -#define BITS(type) (BITSPERBYTE * (long)sizeof (type)) - -#define IPSEC_ALG_ENABLE(algs, algid) \ - ((algs)[(algid) / BITS(ipsec_capab_elem_t)] |= \ - (1 << ((algid) % BITS(ipsec_capab_elem_t)))) - -#define IPSEC_ALG_IS_ENABLED(algid, algs) \ - ((algs)[(algid) / BITS(ipsec_capab_elem_t)] & \ - (1 << ((algid) % BITS(ipsec_capab_elem_t)))) - -typedef uint8_t ipsec_capab_elem_t; - -/* - * Per-algorithm parameters. Note that at present, only encryption - * algorithms have variable keysize (IKE does not provide a way to negotiate - * auth algorithm keysize). - * - * All sizes here are in bits. - */ -typedef struct -{ - uint16_t minkeylen; - uint16_t maxkeylen; -} ipsec_capab_algparm_t; - -/* - * Per-ill capabilities. - */ -struct ill_ipsec_capab_s { - ipsec_capab_elem_t *encr_hw_algs; - ipsec_capab_elem_t *auth_hw_algs; - uint32_t algs_size; /* size of _hw_algs in bytes */ - /* algorithm key lengths */ - ipsec_capab_algparm_t *encr_algparm; - uint32_t encr_algparm_size; - uint32_t encr_algparm_end; -}; - -/* - * The field values are larger than strictly necessary for simple - * AR_ENTRY_ADDs but the padding lets us accomodate the socket ioctls. 
- */ -static area_t ip_area_template = { - AR_ENTRY_ADD, /* area_cmd */ - sizeof (ip_sock_ar_t) + (IP_ADDR_LEN*2) + sizeof (struct sockaddr_dl), - /* area_name_offset */ - /* area_name_length temporarily holds this structure length */ - sizeof (area_t), /* area_name_length */ - IP_ARP_PROTO_TYPE, /* area_proto */ - sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ - IP_ADDR_LEN, /* area_proto_addr_length */ - sizeof (ip_sock_ar_t) + IP_ADDR_LEN, - /* area_proto_mask_offset */ - 0, /* area_flags */ - sizeof (ip_sock_ar_t) + IP_ADDR_LEN + IP_ADDR_LEN, - /* area_hw_addr_offset */ - /* Zero length hw_addr_length means 'use your idea of the address' */ - 0 /* area_hw_addr_length */ -}; - -/* - * AR_ENTRY_ADD/DELETE templates have been added for IPv6 external resolver - * support - */ -static area_t ip6_area_template = { - AR_ENTRY_ADD, /* area_cmd */ - sizeof (ip_sock_ar_t) + (IPV6_ADDR_LEN*2) + sizeof (sin6_t), - /* area_name_offset */ - /* area_name_length temporarily holds this structure length */ - sizeof (area_t), /* area_name_length */ - IP_ARP_PROTO_TYPE, /* area_proto */ - sizeof (ip_sock_ar_t), /* area_proto_addr_offset */ - IPV6_ADDR_LEN, /* area_proto_addr_length */ - sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN, - /* area_proto_mask_offset */ - 0, /* area_flags */ - sizeof (ip_sock_ar_t) + IPV6_ADDR_LEN + IPV6_ADDR_LEN, - /* area_hw_addr_offset */ - /* Zero length hw_addr_length means 'use your idea of the address' */ - 0 /* area_hw_addr_length */ -}; - -static ared_t ip_ared_template = { - AR_ENTRY_DELETE, - sizeof (ared_t) + IP_ADDR_LEN, - sizeof (ared_t), - IP_ARP_PROTO_TYPE, - sizeof (ared_t), - IP_ADDR_LEN, - 0 -}; - -static ared_t ip6_ared_template = { - AR_ENTRY_DELETE, - sizeof (ared_t) + IPV6_ADDR_LEN, - sizeof (ared_t), - IP_ARP_PROTO_TYPE, - sizeof (ared_t), - IPV6_ADDR_LEN, - 0 -}; - -/* - * A template for an IPv6 AR_ENTRY_QUERY template has not been created, as - * as the areq doesn't include an IP address in ill_dl_up() (the only place a - * areq is 
used). - */ -static areq_t ip_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t)+(2*IP_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len (filled by ill_arp_alloc) */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IP_ADDR_LEN, /* target addr_length */ - 0, /* flags */ - sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ - IP_ADDR_LEN, /* sender addr length */ - AR_EQ_DEFAULT_XMIT_COUNT, /* xmit_count */ - AR_EQ_DEFAULT_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ - AR_EQ_DEFAULT_MAX_BUFFERED /* max # of requests to buffer */ - /* anything else filled in by the code */ -}; - -static arc_t ip_aru_template = { - AR_INTERFACE_UP, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_ard_template = { - AR_INTERFACE_DOWN, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_aron_template = { - AR_INTERFACE_ON, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arc_t ip_aroff_template = { - AR_INTERFACE_OFF, - sizeof (arc_t), /* Name offset */ - sizeof (arc_t) /* Name length (set by ill_arp_alloc) */ -}; - -static arma_t ip_arma_multi_template = { - AR_MAPPING_ADD, - sizeof (arma_t) + 3*IP_ADDR_LEN + IP_MAX_HW_LEN, - /* Name offset */ - sizeof (arma_t), /* Name length (set by ill_arp_alloc) */ - IP_ARP_PROTO_TYPE, - sizeof (arma_t), /* proto_addr_offset */ - IP_ADDR_LEN, /* proto_addr_length */ - sizeof (arma_t) + IP_ADDR_LEN, /* proto_mask_offset */ - sizeof (arma_t) + 2*IP_ADDR_LEN, /* proto_extract_mask_offset */ - ACE_F_PERMANENT | ACE_F_MAPPING, /* flags */ - sizeof (arma_t) + 3*IP_ADDR_LEN, /* hw_addr_offset */ - IP_MAX_HW_LEN, /* hw_addr_length */ - 0, /* hw_mapping_start */ -}; - static ipft_t ip_ioctl_ftbl[] = { { IP_IOC_IRE_DELETE, ip_ire_delete, sizeof (ipid_t), 0 }, { 
IP_IOC_IRE_DELETE_NO_REPLY, ip_ire_delete, sizeof (ipid_t), IPFT_F_NO_REPLY }, - { IP_IOC_IRE_ADVISE_NO_REPLY, ip_ire_advise, sizeof (ipic_t), - IPFT_F_NO_REPLY }, { IP_IOC_RTS_REQUEST, ip_rts_request, 0, IPFT_F_SELF_REPLY }, { 0 } }; @@ -444,35 +235,38 @@ static uchar_t ip_six_byte_all_ones[] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static ip_m_t ip_m_tbl[] = { { DL_ETHER, IFT_ETHER, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, ip_nodef_v6intfid }, { DL_CSMACD, IFT_ISO88023, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_TPB, IFT_ISO88024, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_TPR, IFT_ISO88025, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid }, { DL_FDDI, IFT_FDDI, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_ether_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_ether_v6intfid, ip_nodef_v6intfid }, { DL_IB, IFT_IB, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ib_v4mapinfo, ip_ib_v6mapinfo, ip_ib_v6intfid, + ip_ib_v4_mapping, ip_ib_v6_mapping, ip_ib_v6intfid, + ip_nodef_v6intfid }, + { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, + ip_ipv4_v6destintfid }, + { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv6_v6intfid, + ip_ipv6_v6destintfid }, + { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, + ip_mbcast_mapping, ip_mbcast_mapping, ip_ipv4_v6intfid, ip_nodef_v6intfid }, - { DL_IPV4, IFT_IPV4, IPPROTO_ENCAP, IPPROTO_IPV6, 
ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_ipv4_v6destintfid }, - { DL_IPV6, IFT_IPV6, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv6_v6intfid, ip_ipv6_v6destintfid }, - { DL_6TO4, IFT_6TO4, IPPROTO_ENCAP, IPPROTO_IPV6, ip_nodef_v4mapinfo, - ip_nodef_v6mapinfo, ip_ipv4_v6intfid, ip_nodef_v6intfid }, { SUNW_DL_VNI, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, NULL, NULL, ip_nodef_v6intfid, ip_nodef_v6intfid }, { SUNW_DL_IPMP, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, NULL, NULL, ip_ipmp_v6intfid, ip_nodef_v6intfid }, { DL_OTHER, IFT_OTHER, ETHERTYPE_IP, ETHERTYPE_IPV6, - ip_ether_v4mapinfo, ip_ether_v6mapinfo, ip_nodef_v6intfid, + ip_ether_v4_mapping, ip_ether_v6_mapping, ip_nodef_v6intfid, ip_nodef_v6intfid } }; @@ -567,149 +361,6 @@ ill_allocate_mibs(ill_t *ill) } /* - * Common code for preparation of ARP commands. Two points to remember: - * 1) The ill_name is tacked on at the end of the allocated space so - * the templates name_offset field must contain the total space - * to allocate less the name length. - * - * 2) The templates name_length field should contain the *template* - * length. We use it as a parameter to bcopy() and then write - * the real ill_name_length into the name_length field of the copy. - * (Always called as writer.) 
- */ -mblk_t * -ill_arp_alloc(ill_t *ill, const uchar_t *template, caddr_t addr) -{ - arc_t *arc = (arc_t *)template; - char *cp; - int len; - mblk_t *mp; - uint_t name_length = ill->ill_name_length; - uint_t template_len = arc->arc_name_length; - - len = arc->arc_name_offset + name_length; - mp = allocb(len, BPRI_HI); - if (mp == NULL) - return (NULL); - cp = (char *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&cp[len]; - if (template_len) - bcopy(template, cp, template_len); - if (len > template_len) - bzero(&cp[template_len], len - template_len); - mp->b_datap->db_type = M_PROTO; - - arc = (arc_t *)cp; - arc->arc_name_length = name_length; - cp = (char *)arc + arc->arc_name_offset; - bcopy(ill->ill_name, cp, name_length); - - if (addr) { - area_t *area = (area_t *)mp->b_rptr; - - cp = (char *)area + area->area_proto_addr_offset; - bcopy(addr, cp, area->area_proto_addr_length); - if (area->area_cmd == AR_ENTRY_ADD) { - cp = (char *)area; - len = area->area_proto_addr_length; - if (area->area_proto_mask_offset) - cp += area->area_proto_mask_offset; - else - cp += area->area_proto_addr_offset + len; - while (len-- > 0) - *cp++ = (char)~0; - } - } - return (mp); -} - -mblk_t * -ipif_area_alloc(ipif_t *ipif, uint_t optflags) -{ - caddr_t addr; - mblk_t *mp; - area_t *area; - uchar_t *areap; - ill_t *ill = ipif->ipif_ill; - - if (ill->ill_isv6) { - ASSERT(ill->ill_flags & ILLF_XRESOLV); - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - areap = (uchar_t *)&ip6_area_template; - } else { - addr = (caddr_t)&ipif->ipif_lcl_addr; - areap = (uchar_t *)&ip_area_template; - } - - if ((mp = ill_arp_alloc(ill, areap, addr)) == NULL) - return (NULL); - - /* - * IPMP requires that the hardware address be included in all - * AR_ENTRY_ADD requests so that ARP can deduce the arl to send on. - * If there are no active underlying ills in the group (and thus no - * hardware address, DAD will be deferred until an underlying ill - * becomes active. 
- */ - if (IS_IPMP(ill)) { - if ((ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { - freemsg(mp); - return (NULL); - } - } else { - ill_refhold(ill); - } - - area = (area_t *)mp->b_rptr; - area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; - area->area_flags |= optflags; - area->area_hw_addr_length = ill->ill_phys_addr_length; - bcopy(ill->ill_phys_addr, mp->b_rptr + area->area_hw_addr_offset, - area->area_hw_addr_length); - - ill_refrele(ill); - return (mp); -} - -mblk_t * -ipif_ared_alloc(ipif_t *ipif) -{ - caddr_t addr; - uchar_t *aredp; - - if (ipif->ipif_ill->ill_isv6) { - ASSERT(ipif->ipif_ill->ill_flags & ILLF_XRESOLV); - addr = (caddr_t)&ipif->ipif_v6lcl_addr; - aredp = (uchar_t *)&ip6_ared_template; - } else { - addr = (caddr_t)&ipif->ipif_lcl_addr; - aredp = (uchar_t *)&ip_ared_template; - } - - return (ill_arp_alloc(ipif->ipif_ill, aredp, addr)); -} - -mblk_t * -ill_ared_alloc(ill_t *ill, ipaddr_t addr) -{ - return (ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, - (char *)&addr)); -} - -mblk_t * -ill_arie_alloc(ill_t *ill, const char *grifname, const void *template) -{ - mblk_t *mp = ill_arp_alloc(ill, template, 0); - arie_t *arie; - - if (mp != NULL) { - arie = (arie_t *)mp->b_rptr; - (void) strlcpy(arie->arie_grifname, grifname, LIFNAMSIZ); - } - return (mp); -} - -/* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control * stream is being closed. @@ -735,8 +386,8 @@ ill_delete(ill_t *ill) * remove it from the list, and free the data structure. * Walk down the ipif list and remove the logical interfaces * first before removing the main ipif. We can't unplumb - * zeroth interface first in the case of IPv6 as reset_conn_ill - * -> ip_ll_delmulti_v6 de-references ill_ipif for checking + * zeroth interface first in the case of IPv6 as update_conn_ill + * -> ip_ll_multireq de-references ill_ipif for checking * POINTOPOINT. 
* * If ill_ipif was not properly initialized (i.e low on memory), @@ -747,22 +398,15 @@ ill_delete(ill_t *ill) ipif_free(ipif); /* - * Used only by ill_arp_on and ill_arp_off, which are writers. - * So nobody can be using this mp now. Free the mp allocated for - * honoring ILLF_NOARP + * clean out all the nce_t entries that depend on this + * ill for the ill_phys_addr. */ - freemsg(ill->ill_arp_on_mp); - ill->ill_arp_on_mp = NULL; + nce_flush(ill, B_TRUE); /* Clean up msgs on pending upcalls for mrouted */ reset_mrt_ill(ill); - /* - * ipif_free -> reset_conn_ipif will remove all multicast - * references for IPv4. For IPv6, we need to do it here as - * it points only at ills. - */ - reset_conn_ill(ill); + update_conn_ill(ill, ipst); /* * Remove multicast references added as a result of calls to @@ -786,6 +430,16 @@ ill_delete(ill_t *ill) sctp_update_ill(ill, SCTP_ILL_REMOVE); /* + * Walk all CONNs that can have a reference on an ire or nce for this + * ill (we actually walk all that now have stale references). + */ + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); + + /* With IPv6 we have dce_ifindex. Cleanup for neatness */ + if (ill->ill_isv6) + dce_cleanup(ill->ill_phyint->phyint_ifindex, ipst); + + /* * If an address on this ILL is being used as a source address then * clear out the pointers in other ILLs that point to this ILL. */ @@ -828,12 +482,10 @@ ill_delete_tail(ill_t *ill) for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } - ASSERT(ill->ill_ipif_dup_count == 0 && - ill->ill_arp_down_mp == NULL && - ill->ill_arp_del_mapping_mp == NULL); + ASSERT(ill->ill_ipif_dup_count == 0); /* * If polling capability is enabled (which signifies direct @@ -864,23 +516,6 @@ ill_delete_tail(ill_t *ill) /* * Free capabilities. 
*/ - if (ill->ill_ipsec_capab_ah != NULL) { - ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_AH); - ill_ipsec_capab_free(ill->ill_ipsec_capab_ah); - ill->ill_ipsec_capab_ah = NULL; - } - - if (ill->ill_ipsec_capab_esp != NULL) { - ill_ipsec_capab_delete(ill, DL_CAPAB_IPSEC_ESP); - ill_ipsec_capab_free(ill->ill_ipsec_capab_esp); - ill->ill_ipsec_capab_esp = NULL; - } - - if (ill->ill_mdt_capab != NULL) { - kmem_free(ill->ill_mdt_capab, sizeof (ill_mdt_capab_t)); - ill->ill_mdt_capab = NULL; - } - if (ill->ill_hcksum_capab != NULL) { kmem_free(ill->ill_hcksum_capab, sizeof (ill_hcksum_capab_t)); ill->ill_hcksum_capab = NULL; @@ -911,11 +546,10 @@ ill_delete_tail(ill_t *ill) * * We don't walk conns, mrts and ires because * - * 1) reset_conn_ill and reset_mrt_ill cleans up conns and mrts. + * 1) update_conn_ill and reset_mrt_ill cleans up conns and mrts. * 2) ill_down ->ill_downi walks all the ires and cleans up * ill references. */ - ASSERT(ilm_walk_ill(ill) == 0); /* * If this ill is an IPMP meta-interface, blow away the illgrp. This @@ -974,6 +608,9 @@ ill_delete_tail(ill_t *ill) ill_trace_cleanup(ill); #endif + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ipst, ill->ill_isv6); + /* Drop refcnt here */ netstack_rele(ill->ill_ipst->ips_netstack); ill->ill_ipst = NULL; @@ -1077,97 +714,6 @@ ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, } /* - * Add the 'mp' to the list of pending mp's headed by ill_pending_mp. Return - * an error if we already have 1 or more ioctls in progress. This is only - * needed for SIOCG*ARP. - */ -boolean_t -ill_pending_mp_add(ill_t *ill, conn_t *connp, mblk_t *add_mp) -{ - ASSERT(MUTEX_HELD(&ill->ill_lock)); - ASSERT((add_mp->b_next == NULL) && (add_mp->b_prev == NULL)); - /* We should only see M_IOCDATA arp ioctls here. */ - ASSERT(add_mp->b_datap->db_type == M_IOCDATA); - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - /* - * Return error if the conn has started closing. 
The conn - * could have finished cleaning up the pending mp list, - * If so we should not add another mp to the list negating - * the cleanup. - */ - if (connp->conn_state_flags & CONN_CLOSING) - return (B_FALSE); - /* - * Add the pending mp to the head of the list, chained by b_next. - * Note down the conn on which the ioctl request came, in b_prev. - * This will be used to later get the conn, when we get a response - * on the ill queue, from some other module (typically arp) - */ - add_mp->b_next = (void *)ill->ill_pending_mp; - add_mp->b_queue = CONNP_TO_WQ(connp); - ill->ill_pending_mp = add_mp; - if (connp != NULL) - connp->conn_oper_pending_ill = ill; - return (B_TRUE); -} - -/* - * Retrieve the ill_pending_mp and return it. We have to walk the list - * of mblks starting at ill_pending_mp, and match based on the ioc_id. - */ -mblk_t * -ill_pending_mp_get(ill_t *ill, conn_t **connpp, uint_t ioc_id) -{ - mblk_t *prev = NULL; - mblk_t *curr = NULL; - uint_t id; - conn_t *connp; - - /* - * When the conn closes, conn_ioctl_cleanup needs to clean - * up the pending mp, but it does not know the ioc_id and - * passes in a zero for it. - */ - mutex_enter(&ill->ill_lock); - if (ioc_id != 0) - *connpp = NULL; - - /* Search the list for the appropriate ioctl based on ioc_id */ - for (prev = NULL, curr = ill->ill_pending_mp; curr != NULL; - prev = curr, curr = curr->b_next) { - id = ((struct iocblk *)curr->b_rptr)->ioc_id; - connp = Q_TO_CONN(curr->b_queue); - /* Match based on the ioc_id or based on the conn */ - if ((id == ioc_id) || (ioc_id == 0 && connp == *connpp)) - break; - } - - if (curr != NULL) { - /* Unlink the mblk from the pending mp list */ - if (prev != NULL) { - prev->b_next = curr->b_next; - } else { - ASSERT(ill->ill_pending_mp == curr); - ill->ill_pending_mp = curr->b_next; - } - - /* - * conn refcnt must have been bumped up at the start of - * the ioctl. So we can safely access the conn. 
- */ - ASSERT(CONN_Q(curr->b_queue)); - *connpp = Q_TO_CONN(curr->b_queue); - curr->b_next = NULL; - curr->b_queue = NULL; - } - - mutex_exit(&ill->ill_lock); - - return (curr); -} - -/* * Add the pending mp to the list. There can be only 1 pending mp * in the list. Any exclusive ioctl that needs to wait for a response * from another module or driver needs to use this function to set @@ -1283,6 +829,7 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) ipxop_t *ipx; queue_t *q; ipif_t *ipif; + int cmd; ASSERT(IAM_WRITER_ILL(ill)); ipx = ill->ill_phyint->phyint_ipsq->ipsq_xop; @@ -1312,11 +859,16 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) ipx->ipx_pending_ipif = NULL; ipx->ipx_waitfor = 0; ipx->ipx_current_ipif = NULL; + cmd = ipx->ipx_current_ioctl; ipx->ipx_current_ioctl = 0; ipx->ipx_current_done = B_TRUE; mutex_exit(&ipx->ipx_lock); if (DB_TYPE(mp) == M_IOCTL || DB_TYPE(mp) == M_IOCDATA) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ipsq_pending_mp_cleanup", + int, cmd, ill_t *, ipif == NULL ? NULL : ipif->ipif_ill, + ipif_t *, ipif); if (connp == NULL) { ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); } else { @@ -1337,43 +889,6 @@ ipsq_pending_mp_cleanup(ill_t *ill, conn_t *connp) } /* - * The ill is closing. Cleanup all the pending mps. Called exclusively - * towards the end of ill_delete. The refcount has gone to 0. So nobody - * knows this ill, and hence nobody can add an mp to this list - */ -static void -ill_pending_mp_cleanup(ill_t *ill) -{ - mblk_t *mp; - queue_t *q; - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&ill->ill_lock); - /* - * Every mp on the pending mp list originating from an ioctl - * added 1 to the conn refcnt, at the start of the ioctl. - * So bump it down now. 
See comments in ip_wput_nondata() - */ - while (ill->ill_pending_mp != NULL) { - mp = ill->ill_pending_mp; - ill->ill_pending_mp = mp->b_next; - mutex_exit(&ill->ill_lock); - - q = mp->b_queue; - ASSERT(CONN_Q(q)); - mp->b_next = NULL; - mp->b_prev = NULL; - mp->b_queue = NULL; - ip_ioctl_finish(q, mp, ENXIO, NO_COPYOUT, NULL); - mutex_enter(&ill->ill_lock); - } - ill->ill_pending_ipif = NULL; - - mutex_exit(&ill->ill_lock); -} - -/* * Called in the conn close path and ill delete path */ static void @@ -1435,6 +950,9 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) curr->b_prev = NULL; curr->b_queue = NULL; if (DB_TYPE(curr) == M_IOCTL || DB_TYPE(curr) == M_IOCDATA) { + DTRACE_PROBE4(ipif__ioctl, + char *, "ipsq_xopq_mp_cleanup", + int, 0, ill_t *, NULL, ipif_t *, NULL); ip_ioctl_finish(q, curr, ENXIO, connp != NULL ? CONN_CLOSE : NO_COPYOUT, NULL); } else { @@ -1455,7 +973,6 @@ ipsq_xopq_mp_cleanup(ill_t *ill, conn_t *connp) void conn_ioctl_cleanup(conn_t *connp) { - mblk_t *curr; ipsq_t *ipsq; ill_t *ill; boolean_t refheld; @@ -1476,13 +993,6 @@ conn_ioctl_cleanup(conn_t *connp) return; } - curr = ill_pending_mp_get(ill, &connp, 0); - if (curr != NULL) { - mutex_exit(&connp->conn_lock); - CONN_DEC_REF(connp); - inet_freemsg(curr); - return; - } /* * We may not be able to refhold the ill if the ill/ipif * is changing. But we need to make sure that the ill will @@ -1522,58 +1032,43 @@ conn_ioctl_cleanup(conn_t *connp) /* * ipcl_walk function for cleaning up conn_*_ill fields. + * Note that we leave ixa_multicast_ifindex, conn_incoming_ifindex, and + * conn_bound_if in place. We prefer dropping + * packets instead of sending them out the wrong interface, or accepting + * packets from the wrong ifindex. 
*/ static void conn_cleanup_ill(conn_t *connp, caddr_t arg) { ill_t *ill = (ill_t *)arg; - ire_t *ire; mutex_enter(&connp->conn_lock); - if (connp->conn_multicast_ill == ill) { - /* Revert to late binding */ - connp->conn_multicast_ill = NULL; - } - if (connp->conn_incoming_ill == ill) - connp->conn_incoming_ill = NULL; - if (connp->conn_outgoing_ill == ill) - connp->conn_outgoing_ill = NULL; if (connp->conn_dhcpinit_ill == ill) { connp->conn_dhcpinit_ill = NULL; ASSERT(ill->ill_dhcpinit != 0); atomic_dec_32(&ill->ill_dhcpinit); - } - if (connp->conn_ire_cache != NULL) { - ire = connp->conn_ire_cache; - /* - * Source address selection makes it possible for IRE_CACHE - * entries to be created with ire_stq coming from interface X - * and ipif coming from interface Y. Thus whenever interface - * X goes down, remove all references to it by checking both - * on ire_ipif and ire_stq. - */ - if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || - (ire->ire_type == IRE_CACHE && - ire->ire_stq == ill->ill_wq)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - ire_refrele_notr(ire); - return; - } + ill_set_inputfn(ill); } mutex_exit(&connp->conn_lock); } -static void +static int ill_down_ipifs_tail(ill_t *ill) { ipif_t *ipif; + int err; ASSERT(IAM_WRITER_ILL(ill)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + /* + * ipif_down_tail will call arp_ll_down on the last ipif + * and typically return EINPROGRESS when the DL_UNBIND is sent. 
+ */ + if ((err = ipif_down_tail(ipif)) != 0) + return (err); } + return (0); } /* ARGSUSED */ @@ -1581,7 +1076,7 @@ void ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { ASSERT(IAM_WRITER_IPSQ(ipsq)); - ill_down_ipifs_tail(q->q_ptr); + (void) ill_down_ipifs_tail(q->q_ptr); freemsg(mp); ipsq_current_finish(ipsq); } @@ -1598,12 +1093,27 @@ ill_down_start(queue_t *q, mblk_t *mp) ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) (void) ipif_down(ipif, NULL, NULL); ill_down(ill); + /* + * Walk all CONNs that can have a reference on an ire or nce for this + * ill (we actually walk all that now have stale references). + */ + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ill->ill_ipst); + + /* With IPv6 we have dce_ifindex. Cleanup for neatness */ + if (ill->ill_isv6) + dce_cleanup(ill->ill_phyint->phyint_ifindex, ill->ill_ipst); + + (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_current_start(ill->ill_phyint->phyint_ipsq, ill->ill_ipif, 0); @@ -1626,44 +1136,68 @@ ill_down_start(queue_t *q, mblk_t *mp) static void ill_down(ill_t *ill) { + mblk_t *mp; ip_stack_t *ipst = ill->ill_ipst; - /* Blow off any IREs dependent on this ILL. */ - ire_walk(ill_downi, ill, ipst); + /* + * Blow off any IREs dependent on this ILL. + * The caller needs to handle conn_ixa_cleanup + */ + ill_delete_ires(ill); + + ire_walk_ill(0, 0, ill_downi, ill, ill); /* Remove any conn_*_ill depending on this ill */ ipcl_walk(conn_cleanup_ill, (caddr_t)ill, ipst); + + /* + * Free state for additional IREs. 
+ */ + mutex_enter(&ill->ill_saved_ire_lock); + mp = ill->ill_saved_ire_mp; + ill->ill_saved_ire_mp = NULL; + ill->ill_saved_ire_cnt = 0; + mutex_exit(&ill->ill_saved_ire_lock); + freemsg(mp); } /* - * ire_walk routine used to delete every IRE that depends on queues - * associated with 'ill'. (Always called as writer.) + * ire_walk routine used to delete every IRE that depends on + * 'ill'. (Always called as writer.) + * + * Note: since the routes added by the kernel are deleted separately, + * this will only be 1) IRE_IF_CLONE and 2) manually added IRE_INTERFACE. + * + * We also remove references on ire_nce_cache entries that refer to the ill. */ -static void +void ill_downi(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; + nce_t *nce; - /* - * Source address selection makes it possible for IRE_CACHE - * entries to be created with ire_stq coming from interface X - * and ipif coming from interface Y. Thus whenever interface - * X goes down, remove all references to it by checking both - * on ire_ipif and ire_stq. - */ - if ((ire->ire_ipif != NULL && ire->ire_ipif->ipif_ill == ill) || - (ire->ire_type == IRE_CACHE && ire->ire_stq == ill->ill_wq)) { + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce != NULL && nce->nce_ill == ill) + ire->ire_nce_cache = NULL; + else + nce = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + if (ire->ire_ill == ill) ire_delete(ire); - } } -/* - * Remove ire/nce from the fastpath list. - */ +/* Remove IRE_IF_CLONE on this ill */ void -ill_fastpath_nack(ill_t *ill) +ill_downi_if_clone(ire_t *ire, char *ill_arg) { - nce_fastpath_list_dispatch(ill, NULL, NULL); + ill_t *ill = (ill_t *)ill_arg; + + ASSERT(ire->ire_type & IRE_IF_CLONE); + if (ire->ire_ill == ill) + ire_delete(ire); } /* Consume an M_IOCACK of the fastpath probe. 
*/ @@ -1685,20 +1219,11 @@ ill_fastpath_ack(ill_t *ill, mblk_t *mp) freeb(mp1); if (mp == NULL) return; - if (mp->b_cont != NULL) { - /* - * Update all IRE's or NCE's that are waiting for - * fastpath update. - */ - nce_fastpath_list_dispatch(ill, ndp_fastpath_update, mp); - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - } else { + if (mp->b_cont != NULL) + nce_fastpath_update(ill, mp); + else ip0dbg(("ill_fastpath_ack: no b_cont\n")); - } - - freeb(mp); + freemsg(mp); } /* @@ -1745,6 +1270,8 @@ ill_fastpath_probe(ill_t *ill, mblk_t *dlur_mp) ioc = (struct iocblk *)mp->b_rptr; ioc->ioc_count = msgdsize(mp->b_cont); + DTRACE_PROBE3(ill__dlpi, char *, "ill_fastpath_probe", + char *, "DL_IOC_HDR_INFO", ill_t *, ill); putnext(ill->ill_wq, mp); return (0); } @@ -1797,8 +1324,7 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) * direct function call capabilities viz. ILL_CAPAB_DLD* * which will be turned off by the corresponding reset functions. */ - ill->ill_capabilities &= ~(ILL_CAPAB_MDT | ILL_CAPAB_HCKSUM | - ILL_CAPAB_ZEROCOPY | ILL_CAPAB_AH | ILL_CAPAB_ESP); + ill->ill_capabilities &= ~(ILL_CAPAB_HCKSUM | ILL_CAPAB_ZEROCOPY); } static void @@ -1812,9 +1338,6 @@ ill_capability_reset_alloc(ill_t *ill) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(ill->ill_capab_reset_mp == NULL); - if (ILL_MDT_CAPABLE(ill)) - size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); - if (ILL_HCKSUM_CAPABLE(ill)) { size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_hcksum_t); @@ -1825,12 +1348,6 @@ ill_capability_reset_alloc(ill_t *ill) sizeof (dl_capab_zerocopy_t); } - if (ill->ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP)) { - size += sizeof (dl_capability_sub_t); - size += ill_capability_ipsec_reset_size(ill, NULL, NULL, - NULL, NULL); - } - if (ill->ill_capabilities & ILL_CAPAB_DLD) { size += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); @@ -1853,10 +1370,8 @@ ill_capability_reset_alloc(ill_t *ill) * Each handler fills in the corresponding dl_capability_sub_t * 
inside the mblk, */ - ill_capability_mdt_reset_fill(ill, mp); ill_capability_hcksum_reset_fill(ill, mp); ill_capability_zerocopy_reset_fill(ill, mp); - ill_capability_ipsec_reset_fill(ill, mp); ill_capability_dld_reset_fill(ill, mp); ill->ill_capab_reset_mp = mp; @@ -1906,162 +1421,7 @@ ill_capability_id_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *outers) } /* Process the encapsulated sub-capability */ - ill_capability_dispatch(ill, mp, inners, B_TRUE); -} - -/* - * Process Multidata Transmit capability negotiation ack received from a - * DLS Provider. isub must point to the sub-capability (DL_CAPAB_MDT) of a - * DL_CAPABILITY_ACK message. - */ -static void -ill_capability_mdt_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - mblk_t *nmp = NULL; - dl_capability_req_t *oc; - dl_capab_mdt_t *mdt_ic, *mdt_oc; - ill_mdt_capab_t **ill_mdt_capab; - uint_t sub_dl_cap = isub->dl_cap; - uint8_t *capend; - - ASSERT(sub_dl_cap == DL_CAPAB_MDT); - - ill_mdt_capab = (ill_mdt_capab_t **)&ill->ill_mdt_capab; - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. 
- * (Remember, it's coming from something else in the kernel - * address space) - */ - - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - mdt_ic = (dl_capab_mdt_t *)(isub + 1); - - if (mdt_ic->mdt_version != MDT_VERSION_2) { - cmn_err(CE_CONT, "ill_capability_mdt_ack: " - "unsupported MDT sub-capability (version %d, expected %d)", - mdt_ic->mdt_version, MDT_VERSION_2); - return; - } - - if (!dlcapabcheckqid(&mdt_ic->mdt_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_mdt_ack: mid token for MDT " - "capability isn't as expected; pass-thru module(s) " - "detected, discarding capability\n")); - return; - } - - if (mdt_ic->mdt_flags & DL_CAPAB_MDT_ENABLE) { - - if (*ill_mdt_capab == NULL) { - *ill_mdt_capab = kmem_zalloc(sizeof (ill_mdt_capab_t), - KM_NOSLEEP); - if (*ill_mdt_capab == NULL) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "could not enable MDT version %d " - "for %s (ENOMEM)\n", MDT_VERSION_2, - ill->ill_name); - return; - } - } - - ip1dbg(("ill_capability_mdt_ack: interface %s supports " - "MDT version %d (%d bytes leading, %d bytes trailing " - "header spaces, %d max pld bufs, %d span limit)\n", - ill->ill_name, MDT_VERSION_2, - mdt_ic->mdt_hdr_head, mdt_ic->mdt_hdr_tail, - mdt_ic->mdt_max_pld, mdt_ic->mdt_span_limit)); - - (*ill_mdt_capab)->ill_mdt_version = MDT_VERSION_2; - (*ill_mdt_capab)->ill_mdt_on = 1; - /* - * Round the following values to the nearest 32-bit; ULP - * may further adjust them to accomodate for additional - * protocol headers. We pass these values to ULP during - * bind time. 
- */ - (*ill_mdt_capab)->ill_mdt_hdr_head = - roundup(mdt_ic->mdt_hdr_head, 4); - (*ill_mdt_capab)->ill_mdt_hdr_tail = - roundup(mdt_ic->mdt_hdr_tail, 4); - (*ill_mdt_capab)->ill_mdt_max_pld = mdt_ic->mdt_max_pld; - (*ill_mdt_capab)->ill_mdt_span_limit = mdt_ic->mdt_span_limit; - - ill->ill_capabilities |= ILL_CAPAB_MDT; - } else { - uint_t size; - uchar_t *rptr; - - size = sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + sizeof (dl_capab_mdt_t); - - if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_mdt_ack: " - "could not enable MDT for %s (ENOMEM)\n", - ill->ill_name); - return; - } - - rptr = nmp->b_rptr; - /* initialize dl_capability_req_t */ - oc = (dl_capability_req_t *)nmp->b_rptr; - oc->dl_sub_offset = sizeof (dl_capability_req_t); - oc->dl_sub_length = sizeof (dl_capability_sub_t) + - sizeof (dl_capab_mdt_t); - nmp->b_rptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, nmp->b_rptr, sizeof (*isub)); - nmp->b_rptr += sizeof (*isub); - - /* initialize dl_capab_mdt_t */ - mdt_oc = (dl_capab_mdt_t *)nmp->b_rptr; - bcopy(mdt_ic, mdt_oc, sizeof (*mdt_ic)); - - nmp->b_rptr = rptr; - - ip1dbg(("ill_capability_mdt_ack: asking interface %s " - "to enable MDT version %d\n", ill->ill_name, - MDT_VERSION_2)); - - /* set ENABLE flag */ - mdt_oc->mdt_flags |= DL_CAPAB_MDT_ENABLE; - - /* nmp points to a DL_CAPABILITY_REQ message to enable MDT */ - ill_capability_send(ill, nmp); - } -} - -static void -ill_capability_mdt_reset_fill(ill_t *ill, mblk_t *mp) -{ - dl_capab_mdt_t *mdt_subcap; - dl_capability_sub_t *dl_subcap; - - if (!ILL_MDT_CAPABLE(ill)) - return; - - ASSERT(ill->ill_mdt_capab != NULL); - - dl_subcap = (dl_capability_sub_t *)mp->b_wptr; - dl_subcap->dl_cap = DL_CAPAB_MDT; - dl_subcap->dl_length = sizeof (*mdt_subcap); - - mdt_subcap = (dl_capab_mdt_t *)(dl_subcap + 1); - mdt_subcap->mdt_version = ill->ill_mdt_capab->ill_mdt_version; - mdt_subcap->mdt_flags = 
0; - mdt_subcap->mdt_hdr_head = 0; - mdt_subcap->mdt_hdr_tail = 0; - - mp->b_wptr += sizeof (*dl_subcap) + sizeof (*mdt_subcap); + ill_capability_dispatch(ill, mp, inners); } static void @@ -2083,503 +1443,10 @@ ill_capability_dld_reset_fill(ill_t *ill, mblk_t *mp) mp->b_wptr += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dld_t); } -/* - * Allocate an IPsec capability request which will be filled by our - * caller to turn on support for one or more algorithms. - */ -/* ARGSUSED */ -static mblk_t * -ill_alloc_ipsec_cap_req(ill_t *ill, dl_capability_sub_t *isub) -{ - mblk_t *nmp; - dl_capability_req_t *ocap; - dl_capab_ipsec_t *ocip; - dl_capab_ipsec_t *icip; - uint8_t *ptr; - icip = (dl_capab_ipsec_t *)(isub + 1); - - /* - * Allocate new mblk which will contain a new capability - * request to enable the capabilities. - */ - - nmp = ip_dlpi_alloc(sizeof (dl_capability_req_t) + - sizeof (dl_capability_sub_t) + isub->dl_length, DL_CAPABILITY_REQ); - if (nmp == NULL) - return (NULL); - - ptr = nmp->b_rptr; - - /* initialize dl_capability_req_t */ - ocap = (dl_capability_req_t *)ptr; - ocap->dl_sub_offset = sizeof (dl_capability_req_t); - ocap->dl_sub_length = sizeof (dl_capability_sub_t) + isub->dl_length; - ptr += sizeof (dl_capability_req_t); - - /* initialize dl_capability_sub_t */ - bcopy(isub, ptr, sizeof (*isub)); - ptr += sizeof (*isub); - - /* initialize dl_capab_ipsec_t */ - ocip = (dl_capab_ipsec_t *)ptr; - bcopy(icip, ocip, sizeof (*icip)); - - nmp->b_wptr = (uchar_t *)(&ocip->cip_data[0]); - return (nmp); -} - -/* - * Process an IPsec capability negotiation ack received from a DLS Provider. - * isub must point to the sub-capability (DL_CAPAB_IPSEC_AH or - * DL_CAPAB_IPSEC_ESP) of a DL_CAPABILITY_ACK message. - */ static void -ill_capability_ipsec_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) -{ - dl_capab_ipsec_t *icip; - dl_capab_ipsec_alg_t *ialg; /* ptr to input alg spec. */ - dl_capab_ipsec_alg_t *oalg; /* ptr to output alg spec. 
*/ - uint_t cipher, nciphers; - mblk_t *nmp; - uint_t alg_len; - boolean_t need_sadb_dump; - uint_t sub_dl_cap = isub->dl_cap; - ill_ipsec_capab_t **ill_capab; - uint64_t ill_capab_flag; - uint8_t *capend, *ciphend; - boolean_t sadb_resync; - - ASSERT(sub_dl_cap == DL_CAPAB_IPSEC_AH || - sub_dl_cap == DL_CAPAB_IPSEC_ESP); - - if (sub_dl_cap == DL_CAPAB_IPSEC_AH) { - ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_ah; - ill_capab_flag = ILL_CAPAB_AH; - } else { - ill_capab = (ill_ipsec_capab_t **)&ill->ill_ipsec_capab_esp; - ill_capab_flag = ILL_CAPAB_ESP; - } - - /* - * If the ill capability structure exists, then this incoming - * DL_CAPABILITY_ACK is a response to a "renegotiation" cycle. - * If this is so, then we'd need to resynchronize the SADB - * after re-enabling the offloaded ciphers. - */ - sadb_resync = (*ill_capab != NULL); - - /* - * Note: range checks here are not absolutely sufficient to - * make us robust against malformed messages sent by drivers; - * this is in keeping with the rest of IP's dlpi handling. - * (Remember, it's coming from something else in the kernel - * address space) - */ - - capend = (uint8_t *)(isub + 1) + isub->dl_length; - if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_ipsec_ack: " - "malformed sub-capability too long for mblk"); - return; - } - - /* - * There are two types of acks we process here: - * 1. acks in reply to a (first form) generic capability req - * (no ENABLE flag set) - * 2. acks in reply to a ENABLE capability req. 
- * (ENABLE flag set) - * - * We process the subcapability passed as argument as follows: - * 1 do initializations - * 1.1 initialize nmp = NULL - * 1.2 set need_sadb_dump to B_FALSE - * 2 for each cipher in subcapability: - * 2.1 if ENABLE flag is set: - * 2.1.1 update per-ill ipsec capabilities info - * 2.1.2 set need_sadb_dump to B_TRUE - * 2.2 if ENABLE flag is not set: - * 2.2.1 if nmp is NULL: - * 2.2.1.1 allocate and initialize nmp - * 2.2.1.2 init current pos in nmp - * 2.2.2 copy current cipher to current pos in nmp - * 2.2.3 set ENABLE flag in nmp - * 2.2.4 update current pos - * 3 if nmp is not equal to NULL, send enable request - * 3.1 send capability request - * 4 if need_sadb_dump is B_TRUE - * 4.1 enable promiscuous on/off notifications - * 4.2 call ill_dlpi_send(isub->dlcap) to send all - * AH or ESP SA's to interface. - */ - - nmp = NULL; - oalg = NULL; - need_sadb_dump = B_FALSE; - icip = (dl_capab_ipsec_t *)(isub + 1); - ialg = (dl_capab_ipsec_alg_t *)(&icip->cip_data[0]); - - nciphers = icip->cip_nciphers; - ciphend = (uint8_t *)(ialg + icip->cip_nciphers); - - if (ciphend > capend) { - cmn_err(CE_WARN, "ill_capability_ipsec_ack: " - "too many ciphers for sub-capability len"); - return; - } - - for (cipher = 0; cipher < nciphers; cipher++) { - alg_len = sizeof (dl_capab_ipsec_alg_t); - - if (ialg->alg_flag & DL_CAPAB_ALG_ENABLE) { - /* - * TBD: when we provide a way to disable capabilities - * from above, need to manage the request-pending state - * and fail if we were not expecting this ACK. 
- */ - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: got ENABLE ACK\n")); - - /* - * Update IPsec capabilities for this ill - */ - - if (*ill_capab == NULL) { - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: " - "allocating ipsec_capab for ill\n")); - *ill_capab = ill_ipsec_capab_alloc(); - - if (*ill_capab == NULL) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "could not enable IPsec Hardware " - "acceleration for %s (ENOMEM)\n", - ill->ill_name); - return; - } - } - - ASSERT(ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH || - ialg->alg_type == DL_CAPAB_IPSEC_ALG_ENCR); - - if (ialg->alg_prim >= MAX_IPSEC_ALGS) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "malformed IPsec algorithm id %d", - ialg->alg_prim); - continue; - } - - if (ialg->alg_type == DL_CAPAB_IPSEC_ALG_AUTH) { - IPSEC_ALG_ENABLE((*ill_capab)->auth_hw_algs, - ialg->alg_prim); - } else { - ipsec_capab_algparm_t *alp; - - IPSEC_ALG_ENABLE((*ill_capab)->encr_hw_algs, - ialg->alg_prim); - if (!ill_ipsec_capab_resize_algparm(*ill_capab, - ialg->alg_prim)) { - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "no space for IPsec alg id %d", - ialg->alg_prim); - continue; - } - alp = &((*ill_capab)->encr_algparm[ - ialg->alg_prim]); - alp->minkeylen = ialg->alg_minbits; - alp->maxkeylen = ialg->alg_maxbits; - } - ill->ill_capabilities |= ill_capab_flag; - /* - * indicate that a capability was enabled, which - * will be used below to kick off a SADB dump - * to the ill. - */ - need_sadb_dump = B_TRUE; - } else { - IPSECHW_DEBUG(IPSECHW_CAPAB, - ("ill_capability_ipsec_ack: enabling alg 0x%x\n", - ialg->alg_prim)); - - if (nmp == NULL) { - nmp = ill_alloc_ipsec_cap_req(ill, isub); - if (nmp == NULL) { - /* - * Sending the PROMISC_ON/OFF - * notification request failed. - * We cannot enable the algorithms - * since the Provider will not - * notify IP of promiscous mode - * changes, which could lead - * to leakage of packets. 
- */ - cmn_err(CE_WARN, - "ill_capability_ipsec_ack: " - "could not enable IPsec Hardware " - "acceleration for %s (ENOMEM)\n", - ill->ill_name); - return; - } - /* ptr to current output alg specifier */ - oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; - } - - /* - * Copy current alg specifier, set ENABLE - * flag, and advance to next output alg. - * For now we enable all IPsec capabilities. - */ - ASSERT(oalg != NULL); - bcopy(ialg, oalg, alg_len); - oalg->alg_flag |= DL_CAPAB_ALG_ENABLE; - nmp->b_wptr += alg_len; - oalg = (dl_capab_ipsec_alg_t *)nmp->b_wptr; - } - - /* move to next input algorithm specifier */ - ialg = (dl_capab_ipsec_alg_t *) - ((char *)ialg + alg_len); - } - - if (nmp != NULL) - /* - * nmp points to a DL_CAPABILITY_REQ message to enable - * IPsec hardware acceleration. - */ - ill_capability_send(ill, nmp); - - if (need_sadb_dump) - /* - * An acknowledgement corresponding to a request to - * enable acceleration was received, notify SADB. - */ - ill_ipsec_capab_add(ill, sub_dl_cap, sadb_resync); -} - -/* - * Given an mblk with enough space in it, create sub-capability entries for - * DL_CAPAB_IPSEC_{AH,ESP} types which consist of previously-advertised - * offloaded ciphers (both AUTH and ENCR) with their enable flags cleared, - * in preparation for the reset the DL_CAPABILITY_REQ message. 
- */ -static void -ill_fill_ipsec_reset(uint_t nciphers, int stype, uint_t slen, - ill_ipsec_capab_t *ill_cap, mblk_t *mp) -{ - dl_capab_ipsec_t *oipsec; - dl_capab_ipsec_alg_t *oalg; - dl_capability_sub_t *dl_subcap; - int i, k; - - ASSERT(nciphers > 0); - ASSERT(ill_cap != NULL); - ASSERT(mp != NULL); - ASSERT(MBLKTAIL(mp) >= sizeof (*dl_subcap) + sizeof (*oipsec) + slen); - - /* dl_capability_sub_t for "stype" */ - dl_subcap = (dl_capability_sub_t *)mp->b_wptr; - dl_subcap->dl_cap = stype; - dl_subcap->dl_length = sizeof (dl_capab_ipsec_t) + slen; - mp->b_wptr += sizeof (dl_capability_sub_t); - - /* dl_capab_ipsec_t for "stype" */ - oipsec = (dl_capab_ipsec_t *)mp->b_wptr; - oipsec->cip_version = 1; - oipsec->cip_nciphers = nciphers; - mp->b_wptr = (uchar_t *)&oipsec->cip_data[0]; - - /* create entries for "stype" AUTH ciphers */ - for (i = 0; i < ill_cap->algs_size; i++) { - for (k = 0; k < BITSPERBYTE; k++) { - if ((ill_cap->auth_hw_algs[i] & (1 << k)) == 0) - continue; - - oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; - bzero((void *)oalg, sizeof (*oalg)); - oalg->alg_type = DL_CAPAB_IPSEC_ALG_AUTH; - oalg->alg_prim = k + (BITSPERBYTE * i); - mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); - } - } - /* create entries for "stype" ENCR ciphers */ - for (i = 0; i < ill_cap->algs_size; i++) { - for (k = 0; k < BITSPERBYTE; k++) { - if ((ill_cap->encr_hw_algs[i] & (1 << k)) == 0) - continue; - - oalg = (dl_capab_ipsec_alg_t *)mp->b_wptr; - bzero((void *)oalg, sizeof (*oalg)); - oalg->alg_type = DL_CAPAB_IPSEC_ALG_ENCR; - oalg->alg_prim = k + (BITSPERBYTE * i); - mp->b_wptr += sizeof (dl_capab_ipsec_alg_t); - } - } -} - -/* - * Macro to count number of 1s in a byte (8-bit word). The total count is - * accumulated into the passed-in argument (sum). We could use SPARCv9's - * POPC instruction, but our macro is more flexible for an arbitrary length - * of bytes, such as {auth,encr}_hw_algs. 
These variables are currently - * 256-bits long (MAX_IPSEC_ALGS), so if we know for sure that the length - * stays that way, we can reduce the number of iterations required. - */ -#define COUNT_1S(val, sum) { \ - uint8_t x = val & 0xff; \ - x = (x & 0x55) + ((x >> 1) & 0x55); \ - x = (x & 0x33) + ((x >> 2) & 0x33); \ - sum += (x & 0xf) + ((x >> 4) & 0xf); \ -} - -/* ARGSUSED */ -static int -ill_capability_ipsec_reset_size(ill_t *ill, int *ah_cntp, int *ah_lenp, - int *esp_cntp, int *esp_lenp) -{ - ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; - ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; - uint64_t ill_capabilities = ill->ill_capabilities; - int ah_cnt = 0, esp_cnt = 0; - int ah_len = 0, esp_len = 0; - int i, size = 0; - - if (!(ill_capabilities & (ILL_CAPAB_AH | ILL_CAPAB_ESP))) - return (0); - - ASSERT(cap_ah != NULL || !(ill_capabilities & ILL_CAPAB_AH)); - ASSERT(cap_esp != NULL || !(ill_capabilities & ILL_CAPAB_ESP)); - - /* Find out the number of ciphers for AH */ - if (cap_ah != NULL) { - for (i = 0; i < cap_ah->algs_size; i++) { - COUNT_1S(cap_ah->auth_hw_algs[i], ah_cnt); - COUNT_1S(cap_ah->encr_hw_algs[i], ah_cnt); - } - if (ah_cnt > 0) { - size += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_ipsec_t); - /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ - ah_len = (ah_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); - size += ah_len; - } - } - - /* Find out the number of ciphers for ESP */ - if (cap_esp != NULL) { - for (i = 0; i < cap_esp->algs_size; i++) { - COUNT_1S(cap_esp->auth_hw_algs[i], esp_cnt); - COUNT_1S(cap_esp->encr_hw_algs[i], esp_cnt); - } - if (esp_cnt > 0) { - size += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_ipsec_t); - /* dl_capab_ipsec_t contains one dl_capab_ipsec_alg_t */ - esp_len = (esp_cnt - 1) * sizeof (dl_capab_ipsec_alg_t); - size += esp_len; - } - } - - if (ah_cntp != NULL) - *ah_cntp = ah_cnt; - if (ah_lenp != NULL) - *ah_lenp = ah_len; - if (esp_cntp != NULL) - *esp_cntp = esp_cnt; - if 
(esp_lenp != NULL) - *esp_lenp = esp_len; - - return (size); -} - -/* ARGSUSED */ -static void -ill_capability_ipsec_reset_fill(ill_t *ill, mblk_t *mp) +ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp) { - ill_ipsec_capab_t *cap_ah = ill->ill_ipsec_capab_ah; - ill_ipsec_capab_t *cap_esp = ill->ill_ipsec_capab_esp; - int ah_cnt = 0, esp_cnt = 0; - int ah_len = 0, esp_len = 0; - int size; - - size = ill_capability_ipsec_reset_size(ill, &ah_cnt, &ah_len, - &esp_cnt, &esp_len); - if (size == 0) - return; - - /* - * Clear the capability flags for IPsec HA but retain the ill - * capability structures since it's possible that another thread - * is still referring to them. The structures only get deallocated - * when we destroy the ill. - * - * Various places check the flags to see if the ill is capable of - * hardware acceleration, and by clearing them we ensure that new - * outbound IPsec packets are sent down encrypted. - */ - - /* Fill in DL_CAPAB_IPSEC_AH sub-capability entries */ - if (ah_cnt > 0) { - ill_fill_ipsec_reset(ah_cnt, DL_CAPAB_IPSEC_AH, ah_len, - cap_ah, mp); - } - - /* Fill in DL_CAPAB_IPSEC_ESP sub-capability entries */ - if (esp_cnt > 0) { - ill_fill_ipsec_reset(esp_cnt, DL_CAPAB_IPSEC_ESP, esp_len, - cap_esp, mp); - } - - /* - * At this point we've composed a bunch of sub-capabilities to be - * encapsulated in a DL_CAPABILITY_REQ and later sent downstream - * by the caller. Upon receiving this reset message, the driver - * must stop inbound decryption (by destroying all inbound SAs) - * and let the corresponding packets come in encrypted. 
- */ -} - -static void -ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, - boolean_t encapsulated) -{ - boolean_t legacy = B_FALSE; - - /* - * Note that only the following two sub-capabilities may be - * considered as "legacy", since their original definitions - * do not incorporate the dl_mid_t module ID token, and hence - * may require the use of the wrapper sub-capability. - */ switch (subp->dl_cap) { - case DL_CAPAB_IPSEC_AH: - case DL_CAPAB_IPSEC_ESP: - legacy = B_TRUE; - break; - } - - /* - * For legacy sub-capabilities which don't incorporate a queue_t - * pointer in their structures, discard them if we detect that - * there are intermediate modules in between IP and the driver. - */ - if (!encapsulated && legacy && ill->ill_lmod_cnt > 1) { - ip1dbg(("ill_capability_dispatch: unencapsulated capab type " - "%d discarded; %d module(s) present below IP\n", - subp->dl_cap, ill->ill_lmod_cnt)); - return; - } - - switch (subp->dl_cap) { - case DL_CAPAB_IPSEC_AH: - case DL_CAPAB_IPSEC_ESP: - ill_capability_ipsec_ack(ill, mp, subp); - break; - case DL_CAPAB_MDT: - ill_capability_mdt_ack(ill, mp, subp); - break; case DL_CAPAB_HCKSUM: ill_capability_hcksum_ack(ill, mp, subp); break; @@ -3104,7 +1971,7 @@ ill_capability_lso_enable(ill_t *ill) DLD_ENABLE)) == 0) { ill->ill_lso_capab->ill_lso_flags = lso.lso_flags; ill->ill_lso_capab->ill_lso_max = lso.lso_max; - ill->ill_capabilities |= ILL_CAPAB_DLD_LSO; + ill->ill_capabilities |= ILL_CAPAB_LSO; ip1dbg(("ill_capability_lso_enable: interface %s " "has enabled LSO\n ", ill->ill_name)); } else { @@ -3180,7 +2047,7 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } - if ((ill->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) { + if ((ill->ill_capabilities & ILL_CAPAB_LSO) != 0) { ASSERT(ill->ill_lso_capab != NULL); /* * Clear the capability flag for LSO but retain the @@ -3189,7 +2056,7 @@ ill_capability_dld_disable(ill_t *ill) * deallocated when we destroy the ill. 
*/ - ill->ill_capabilities &= ~ILL_CAPAB_DLD_LSO; + ill->ill_capabilities &= ~ILL_CAPAB_LSO; (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_LSO, NULL, DLD_DISABLE); } @@ -3335,7 +2202,7 @@ ill_capability_ack_thr(void *arg) ill_capability_id_ack(ill, mp, subp); break; default: - ill_capability_dispatch(ill, mp, subp, B_FALSE); + ill_capability_dispatch(ill, mp, subp); break; } } @@ -3410,8 +2277,14 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) uint32_t hdr_length; mblk_t *send_icmp_head; mblk_t *send_icmp_head_v6; - zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; + ip_recv_attr_t iras; + + bzero(&iras, sizeof (iras)); + iras.ira_flags = 0; + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; ipfb = ill->ill_frag_hash_tbl; if (ipfb == NULL) @@ -3483,6 +2356,7 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) } } BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); + ip_drop_input("ipIfStatsReasmFails", ipf->ipf_mp, ill); freeb(ipf->ipf_mp); } mutex_exit(&ipfb->ipfb_lock); @@ -3496,19 +2370,21 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) mp = send_icmp_head_v6; send_icmp_head_v6 = send_icmp_head_v6->b_next; mp->b_next = NULL; - if (mp->b_datap->db_type == M_CTL) - ip6h = (ip6_t *)mp->b_cont->b_rptr; - else - ip6h = (ip6_t *)mp->b_rptr; - zoneid = ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, + ip6h = (ip6_t *)mp->b_rptr; + iras.ira_flags = 0; + /* + * This will result in an incorrect ALL_ZONES zoneid + * for multicast packets, but we + * don't send ICMP errors for those in any case. 
+ */ + iras.ira_zoneid = + ipif_lookup_addr_zoneid_v6(&ip6h->ip6_dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - } else { - icmp_time_exceeded_v6(ill->ill_wq, mp, - ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, - B_FALSE, zoneid, ipst); - } + ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); + icmp_time_exceeded_v6(mp, + ICMP_REASSEMBLY_TIME_EXCEEDED, B_FALSE, + &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } while (send_icmp_head != NULL) { ipaddr_t dst; @@ -3517,19 +2393,20 @@ ill_frag_timeout(ill_t *ill, time_t dead_interval) send_icmp_head = send_icmp_head->b_next; mp->b_next = NULL; - if (mp->b_datap->db_type == M_CTL) - dst = ((ipha_t *)mp->b_cont->b_rptr)->ipha_dst; - else - dst = ((ipha_t *)mp->b_rptr)->ipha_dst; + dst = ((ipha_t *)mp->b_rptr)->ipha_dst; - zoneid = ipif_lookup_addr_zoneid(dst, ill, ipst); - if (zoneid == ALL_ZONES) { - freemsg(mp); - } else { - icmp_time_exceeded(ill->ill_wq, mp, - ICMP_REASSEMBLY_TIME_EXCEEDED, zoneid, - ipst); - } + iras.ira_flags = IRAF_IS_IPV4; + /* + * This will result in an incorrect ALL_ZONES zoneid + * for broadcast and multicast packets, but we + * don't send ICMP errors for those in any case. 
+ */ + iras.ira_zoneid = ipif_lookup_addr_zoneid(dst, + ill, ipst); + ip_drop_input("ICMP_TIME_EXCEEDED reass", mp, ill); + icmp_time_exceeded(mp, + ICMP_REASSEMBLY_TIME_EXCEEDED, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } } /* @@ -3647,8 +2524,9 @@ ill_frag_free_pkts(ill_t *ill, ipfb_t *ipfb, ipf_t *ipf, int free_cnt) ipfb->ipfb_count -= count; ASSERT(ipfb->ipfb_frag_pkts > 0); ipfb->ipfb_frag_pkts--; - freemsg(mp); BUMP_MIB(ill->ill_ip_mib, ipIfStatsReasmFails); + ip_drop_input("ipIfStatsReasmFails", mp, ill); + freemsg(mp); } if (ipf) @@ -3776,6 +2654,7 @@ static void ill_set_nce_router_flags(ill_t *ill, boolean_t enable) { ipif_t *ipif; + ncec_t *ncec; nce_t *nce; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -3784,16 +2663,16 @@ ill_set_nce_router_flags(ill_t *ill, boolean_t enable) * addresses on IPMP interfaces have an nce_ill that points to * the bound underlying ill. */ - nce = ndp_lookup_v6(ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); + nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); if (nce != NULL) { - mutex_enter(&nce->nce_lock); + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); if (enable) - nce->nce_flags |= NCE_F_ISROUTER; + ncec->ncec_flags |= NCE_F_ISROUTER; else - nce->nce_flags &= ~NCE_F_ISROUTER; - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + ncec->ncec_flags &= ~NCE_F_ISROUTER; + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); } } } @@ -3986,8 +2865,7 @@ ill_get_ppa_ptr(char *name) * use avl tree to locate the ill. 
*/ static ill_t * -ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) +ill_find_by_name(char *name, boolean_t isv6, ip_stack_t *ipst) { char *ppa_ptr = NULL; int len; @@ -3995,10 +2873,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, ill_t *ill = NULL; ill_if_t *ifp; int list; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * get ppa ptr @@ -4009,8 +2883,6 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, list = IP_V4_G_HEAD; if ((ppa_ptr = ill_get_ppa_ptr(name)) == NULL) { - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -4038,42 +2910,19 @@ ill_find_by_name(char *name, boolean_t isv6, queue_t *q, mblk_t *mp, /* * Even the interface type does not exist. */ - if (error != NULL) - *error = ENXIO; return (NULL); } ill = avl_find(&ifp->illif_avl_by_ppa, (void *) &ppa, NULL); if (ill != NULL) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); if (ILL_CAN_LOOKUP(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); return (ill); - } else if (ILL_CAN_WAIT(ill, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -4474,6 +3323,8 @@ ill_init(queue_t *q, ill_t *ill) * ip_open(), before we reach here. 
*/ mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, 0); + mutex_init(&ill->ill_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); + ill->ill_saved_ire_cnt = 0; ill->ill_rq = q; ill->ill_wq = WR(q); @@ -4521,7 +3372,9 @@ ill_init(queue_t *q, ill_t *ill) */ ill->ill_phyint->phyint_illv4 = ill; ill->ill_ppa = UINT_MAX; - ill->ill_fastpath_list = &ill->ill_fastpath_list; + list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); + + ill_set_inputfn(ill); if (!ipsq_init(ill, B_TRUE)) { freemsg(info_mp); @@ -4536,6 +3389,8 @@ ill_init(queue_t *q, ill_t *ill) ill->ill_frag_count = 0; ill->ill_ipf_gen = 0; + rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); ill->ill_global_timer = INFINITY; ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; @@ -4550,7 +3405,6 @@ ill_init(queue_t *q, ill_t *ill) * IPv6. */ ill->ill_reachable_time = ND_REACHABLE_TIME; - ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; ill->ill_xmit_count = ND_MAX_MULTICAST_SOLICIT; ill->ill_max_buf = ND_MAX_Q; ill->ill_refcnt = 0; @@ -4574,15 +3428,14 @@ ill_init(queue_t *q, ill_t *ill) * creates datalink socket info from the device. 
*/ int -ill_dls_info(struct sockaddr_dl *sdl, const ipif_t *ipif) +ill_dls_info(struct sockaddr_dl *sdl, const ill_t *ill) { size_t len; - ill_t *ill = ipif->ipif_ill; sdl->sdl_family = AF_LINK; - sdl->sdl_index = ill->ill_phyint->phyint_ifindex; + sdl->sdl_index = ill_get_upper_ifindex(ill); sdl->sdl_type = ill->ill_type; - ipif_get_name(ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); + ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); len = strlen(sdl->sdl_data); ASSERT(len < 256); sdl->sdl_nlen = (uchar_t)len; @@ -4604,7 +3457,7 @@ ill_xarp_info(struct sockaddr_dl *sdl, ill_t *ill) sdl->sdl_family = AF_LINK; sdl->sdl_index = ill->ill_phyint->phyint_ifindex; sdl->sdl_type = ill->ill_type; - ipif_get_name(ill->ill_ipif, sdl->sdl_data, sizeof (sdl->sdl_data)); + ill_get_name(ill, sdl->sdl_data, sizeof (sdl->sdl_data)); sdl->sdl_nlen = (uchar_t)mi_strlen(sdl->sdl_data); sdl->sdl_alen = ill->ill_phys_addr_length; sdl->sdl_slen = 0; @@ -4646,7 +3499,7 @@ loopback_kstat_update(kstat_t *ksp, int rw) /* * Has ifindex been plumbed already? 
*/ -boolean_t +static boolean_t phyint_exists(uint_t index, ip_stack_t *ipst) { ASSERT(index != 0); @@ -4749,8 +3602,7 @@ phyint_flags_init(phyint_t *phyi, t_uscalar_t mactype) */ ill_t * ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, boolean_t *did_alloc, - ip_stack_t *ipst) + boolean_t *did_alloc, ip_stack_t *ipst) { ill_t *ill; ipif_t *ipif; @@ -4762,9 +3614,9 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, isloopback = mi_strcmp(name, ipif_loopback_name) == 0; rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); + ill = ill_find_by_name(name, isv6, ipst); rw_exit(&ipst->ips_ill_g_lock); - if (ill != NULL || (error != NULL && *error == EINPROGRESS)) + if (ill != NULL) return (ill); /* @@ -4775,9 +3627,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, return (NULL); rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - - ill = ill_find_by_name(name, isv6, q, mp, func, error, ipst); - if (ill != NULL || (error != NULL && *error == EINPROGRESS)) { + ill = ill_find_by_name(name, isv6, ipst); + if (ill != NULL) { rw_exit(&ipst->ips_ill_g_lock); return (ill); } @@ -4791,6 +3642,7 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, *ill = ill_null; mutex_init(&ill->ill_lock, NULL, MUTEX_DEFAULT, NULL); ill->ill_ipst = ipst; + list_create(&ill->ill_nce, sizeof (nce_t), offsetof(nce_t, nce_node)); netstack_hold(ipst->ips_netstack); /* * For exclusive stacks we set the zoneid to zero @@ -4809,17 +3661,16 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, mutex_init(&ill->ill_phyint->phyint_lock, NULL, MUTEX_DEFAULT, 0); phyint_flags_init(ill->ill_phyint, DL_LOOP); - ill->ill_max_frag = IP_LOOPBACK_MTU; - /* Add room for tcp+ip headers */ if (isv6) { ill->ill_isv6 = B_TRUE; - ill->ill_max_frag += IPV6_HDR_LEN + 20; /* for TCP */ + ill->ill_max_frag = 
ip_loopback_mtu_v6plus; } else { - ill->ill_max_frag += IP_SIMPLE_HDR_LENGTH + 20; + ill->ill_max_frag = ip_loopback_mtuplus; } if (!ill_allocate_mibs(ill)) goto done; - ill->ill_max_mtu = ill->ill_max_frag; + ill->ill_current_frag = ill->ill_max_frag; + ill->ill_mtu = ill->ill_max_frag; /* Initial value */ /* * ipif_loopback_name can't be pointed at directly because its used * by both the ipv4 and ipv6 interfaces. When the ill is removed @@ -4832,6 +3683,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, /* Set ill_dlpi_pending for ipsq_current_finish() to work properly */ ill->ill_dlpi_pending = DL_PRIM_INVAL; + rw_init(&ill->ill_mcast_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&ill->ill_mcast_serializer, NULL, MUTEX_DEFAULT, NULL); ill->ill_global_timer = INFINITY; ill->ill_mcast_v1_time = ill->ill_mcast_v2_time = 0; ill->ill_mcast_v1_tset = ill->ill_mcast_v2_tset = 0; @@ -4857,14 +3710,12 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ipaddr_t inaddr_loopback = htonl(INADDR_LOOPBACK); IN6_IPADDR_TO_V4MAPPED(inaddr_loopback, &ipif->ipif_v6lcl_addr); - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; V4MASK_TO_V6(htonl(IN_CLASSA_NET), ipif->ipif_v6net_mask); V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); ill->ill_flags |= ILLF_IPV4; } else { ipif->ipif_v6lcl_addr = ipv6_loopback; - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; ipif->ipif_v6net_mask = ipv6_all_ones; V6_MASK_COPY(ipif->ipif_v6lcl_addr, ipif->ipif_v6net_mask, ipif->ipif_v6subnet); @@ -4884,6 +3735,8 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, ipsq = ill->ill_phyint->phyint_ipsq; + ill_set_inputfn(ill); + if (ill_glist_insert(ill, "lo", isv6) != 0) cmn_err(CE_PANIC, "cannot insert loopback interface"); @@ -4924,8 +3777,6 @@ ill_lookup_on_name(char *name, boolean_t do_alloc, boolean_t isv6, } } - if (error != NULL) - *error = 0; *did_alloc = B_TRUE; rw_exit(&ipst->ips_ill_g_lock); 
ill_nic_event_dispatch(ill, MAP_IPIF_ID(ill->ill_ipif->ipif_id), @@ -4947,8 +3798,6 @@ done: mi_free(ill); } rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENOMEM; return (NULL); } @@ -4956,8 +3805,7 @@ done: * For IPP calls - use the ip_stack_t for global stack. */ ill_t * -ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *err) +ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) { ip_stack_t *ipst; ill_t *ill; @@ -4968,7 +3816,7 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, return (NULL); } - ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); + ill = ill_lookup_on_ifindex(index, isv6, ipst); netstack_rele(ipst->ips_netstack); return (ill); } @@ -4977,19 +3825,11 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6, * Return a pointer to the ill which matches the index and IP version type. */ ill_t * -ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *err, ip_stack_t *ipst) +ill_lookup_on_ifindex(uint_t index, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - ipsq_t *ipsq; phyint_t *phyi; - ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || - (q != NULL && mp != NULL && func != NULL && err != NULL)); - - if (err != NULL) - *err = 0; - /* * Indexes are stored in the phyint - a common structure * to both IPv4 and IPv6. @@ -5000,43 +3840,45 @@ ill_lookup_on_ifindex(uint_t index, boolean_t isv6, queue_t *q, mblk_t *mp, if (phyi != NULL) { ill = isv6 ? 
phyi->phyint_illv6: phyi->phyint_illv4; if (ill != NULL) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { + if (!ILL_IS_CONDEMNED(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ill); - } else if (ILL_CAN_WAIT(ill, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - rw_exit(&ipst->ips_ill_g_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (err != NULL) - *err = EINPROGRESS; - return (NULL); } - RELEASE_CONN_LOCK(q); mutex_exit(&ill->ill_lock); } } rw_exit(&ipst->ips_ill_g_lock); - if (err != NULL) - *err = ENXIO; return (NULL); } /* + * Verify whether or not an interface index is valid. + * It can be zero (meaning "reset") or an interface index assigned + * to a non-VNI interface. (We don't use VNI interface to send packets.) + */ +boolean_t +ip_ifindex_valid(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) +{ + ill_t *ill; + + if (ifindex == 0) + return (B_TRUE); + + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) + return (B_FALSE); + if (IS_VNI(ill)) { + ill_refrele(ill); + return (B_FALSE); + } + ill_refrele(ill); + return (B_TRUE); +} + +/* * Return the ifindex next in sequence after the passed in ifindex. * If there is no next ifindex for the given protocol, return 0. */ @@ -5118,6 +3960,20 @@ ill_get_ifindex_by_name(char *name, ip_stack_t *ipst) } /* + * Return the ifindex to be used by upper layer protocols for instance + * for IPV6_RECVPKTINFO. If IPMP this is the one for the upper ill. 
+ */ +uint_t +ill_get_upper_ifindex(const ill_t *ill) +{ + if (IS_UNDER_IPMP(ill)) + return (ipmp_ill_get_ipmp_ifindex(ill)); + else + return (ill->ill_phyint->phyint_ifindex); +} + + +/* * Obtain a reference to the ill. The ill_refcnt is a dynamic refcnt * that gives a running thread a reference to the ill. This reference must be * released by the thread when it is done accessing the ill and related @@ -5145,17 +4001,18 @@ ill_refhold_locked(ill_t *ill) ILL_TRACE_REF(ill); } -int +/* Returns true if we managed to get a refhold */ +boolean_t ill_check_and_refhold(ill_t *ill) { mutex_enter(&ill->ill_lock); - if (ILL_CAN_LOOKUP(ill)) { + if (!ILL_IS_CONDEMNED(ill)) { ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); - return (0); + return (B_TRUE); } mutex_exit(&ill->ill_lock); - return (ILL_LOOKUP_FAILED); + return (B_FALSE); } /* @@ -5234,8 +4091,8 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ASSERT(IAM_WRITER_ILL(ill)); /* - * Till the ill is fully up ILL_CHANGING will be set and - * the ill is not globally visible. So no need for a lock. + * Till the ill is fully up the ill is not globally visible. + * So no need for a lock. */ dlia = (dl_info_ack_t *)mp->b_rptr; ill->ill_mactype = dlia->dl_mac_type; @@ -5279,8 +4136,9 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * IP will fly apart otherwise. */ min_mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; - ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); - ill->ill_max_mtu = ill->ill_max_frag; + ill->ill_max_frag = MAX(min_mtu, dlia->dl_max_sdu); + ill->ill_current_frag = ill->ill_max_frag; + ill->ill_mtu = ill->ill_max_frag; ill->ill_type = ipm->ip_m_type; @@ -5320,14 +4178,6 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) */ ill->ill_sap = (ill->ill_isv6) ? ipm->ip_m_ipv6sap : ipm->ip_m_ipv4sap; /* - * Set ipif_mtu which is used to set the IRE's - * ire_max_frag value. The driver could have sent - * a different mtu from what it sent last time. 
No - * need to call ipif_mtu_change because IREs have - * not yet been created. - */ - ill->ill_ipif->ipif_mtu = ill->ill_max_mtu; - /* * Clear all the flags that were set based on ill_bcast_addr_length * and ill_phys_addr_length (in ipif_set_values) as these could have * changed now and we need to re-evaluate. @@ -5336,8 +4186,7 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) ill->ill_ipif->ipif_flags &= ~(IPIF_BROADCAST | IPIF_POINTOPOINT); /* - * Free ill_resolver_mp and ill_bcast_mp as things could have - * changed now. + * Free ill_bcast_mp as things could have changed now. * * NOTE: The IPMP meta-interface is special-cased because it starts * with no underlying interfaces (and thus an unknown broadcast @@ -5345,19 +4194,14 @@ ip_ll_subnet_defaults(ill_t *ill, mblk_t *mp) * capable as part of allowing it to join a group. */ if (ill->ill_bcast_addr_length == 0 && !IS_IPMP(ill)) { - if (ill->ill_resolver_mp != NULL) - freemsg(ill->ill_resolver_mp); if (ill->ill_bcast_mp != NULL) freemsg(ill->ill_bcast_mp); - if (ill->ill_flags & ILLF_XRESOLV) - ill->ill_net_type = IRE_IF_RESOLVER; - else - ill->ill_net_type = IRE_IF_NORESOLVER; - ill->ill_resolver_mp = ill_dlur_gen(NULL, + ill->ill_net_type = IRE_IF_NORESOLVER; + + ill->ill_bcast_mp = ill_dlur_gen(NULL, ill->ill_phys_addr_length, ill->ill_sap, ill->ill_sap_length); - ill->ill_bcast_mp = copymsg(ill->ill_resolver_mp); if (ill->ill_isv6) /* @@ -5520,7 +4364,7 @@ ipif_comp_multi(ipif_t *old_ipif, ipif_t *new_ipif, boolean_t isv6) * 3b. link local, but deprecated * 4. loopback. 
*/ -ipif_t * +static ipif_t * ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) { ill_t *ill; @@ -5537,7 +4381,8 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) for (; ill != NULL; ill = ill_next(&ctx, ill)) { mutex_enter(&ill->ill_lock); - if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || !ILL_CAN_LOOKUP(ill) || + if (IS_VNI(ill) || IS_UNDER_IPMP(ill) || + ILL_IS_CONDEMNED(ill) || !(ill->ill_flags & ILLF_MULTICAST)) { mutex_exit(&ill->ill_lock); continue; @@ -5550,7 +4395,7 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) continue; } if (!(ipif->ipif_flags & IPIF_UP) || - !IPIF_CAN_LOOKUP(ipif)) { + IPIF_IS_CONDEMNED(ipif)) { continue; } @@ -5618,6 +4463,22 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) } } +ill_t * +ill_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) +{ + ipif_t *ipif; + ill_t *ill; + + ipif = ipif_lookup_multicast(ipst, zoneid, isv6); + if (ipif == NULL) + return (NULL); + + ill = ipif->ipif_ill; + ill_refhold(ill); + ipif_refrele(ipif); + return (ill); +} + /* * This function is called when an application does not specify an interface * to be used for multicast traffic (joining a group/sending data). It @@ -5629,22 +4490,21 @@ ipif_lookup_multicast(ip_stack_t *ipst, zoneid_t zoneid, boolean_t isv6) * anything in between. If there is no such multicast route, we just find * any multicast capable interface and return it. The returned ipif * is refhold'ed. + * + * We support MULTIRT and RTF_SETSRC on the multicast routes added to the + * unicast table. This is used by CGTP. 
*/ -ipif_t * -ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ill_lookup_group_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, + boolean_t *multirtp, ipaddr_t *setsrcp) { - ire_t *ire; - ipif_t *ipif; + ill_t *ill; - ire = ire_lookup_multi(group, zoneid, ipst); - if (ire != NULL) { - ipif = ire->ire_ipif; - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } + ill = ire_lookup_multi_ill_v4(group, zoneid, ipst, multirtp, setsrcp); + if (ill != NULL) + return (ill); - return (ipif_lookup_multicast(ipst, zoneid, B_FALSE)); + return (ill_lookup_multicast(ipst, zoneid, B_FALSE)); } /* @@ -5652,16 +4512,11 @@ ipif_lookup_group(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst) * The destination address is used only for matching point-to-point interfaces. */ ipif_t * -ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, - ipsq_func_t func, int *error, ip_stack_t *ipst) +ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; ill_walk_context_t ctx; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * First match all the point-to-point interfaces @@ -5672,7 +4527,6 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill = ILL_START_WALK_V4(&ctx, ipst); for (; ill != NULL; ill = ill_next(&ctx, ill)) { - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -5680,41 +4534,20 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, if ((ipif->ipif_flags & IPIF_POINTOPOINT) && (ipif->ipif_lcl_addr == if_addr) && (ipif->ipif_pp_dst_addr == dst)) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); 
rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } rw_exit(&ipst->ips_ill_g_lock); /* lookup the ipif based on interface address */ - ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, q, mp, func, error, - ipst); + ipif = ipif_lookup_addr(if_addr, NULL, ALL_ZONES, ipst); ASSERT(ipif == NULL || !ipif->ipif_isv6); return (ipif); } @@ -5723,18 +4556,15 @@ ipif_lookup_interface(ipaddr_t if_addr, ipaddr_t dst, queue_t *q, mblk_t *mp, * Common function for ipif_lookup_addr() and ipif_lookup_addr_exact(). */ static ipif_t * -ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, boolean_t match_illgrp, - zoneid_t zoneid, queue_t *q, mblk_t *mp, ipsq_func_t func, int *error, - ip_stack_t *ipst) +ipif_lookup_addr_common(ipaddr_t addr, ill_t *match_ill, uint32_t match_flags, + zoneid_t zoneid, ip_stack_t *ipst) { ipif_t *ipif; ill_t *ill; boolean_t ptp = B_FALSE; - ipsq_t *ipsq; ill_walk_context_t ctx; - - if (error != NULL) - *error = 0; + boolean_t match_illgrp = (match_flags & IPIF_MATCH_ILLGRP); + boolean_t no_duplicate = (match_flags & IPIF_MATCH_NONDUP); rw_enter(&ipst->ips_ill_g_lock, RW_READER); /* @@ -5748,7 +4578,6 @@ repeat: (!match_illgrp || !IS_IN_SAME_ILLGRP(ill, match_ill))) { continue; } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -5756,47 +4585,29 @@ repeat: zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; + + if (no_duplicate && !(ipif->ipif_flags & IPIF_UP)) + continue; + /* Allow the 
ipif to be down */ if ((!ptp && (ipif->ipif_lcl_addr == addr) && ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || (ptp && (ipif->ipif_flags & IPIF_POINTOPOINT) && (ipif->ipif_pp_dst_addr == addr))) { - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ - if (IPIF_CAN_LOOKUP(ipif)) { + if (!IPIF_IS_CONDEMNED(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); rw_exit(&ipst->ips_ill_g_lock); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, - ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); } /* If we already did the ptp case, then we are done */ if (ptp) { rw_exit(&ipst->ips_ill_g_lock); - if (error != NULL) - *error = ENXIO; return (NULL); } ptp = B_TRUE; @@ -5804,55 +4615,6 @@ repeat: } /* - * Check if the address exists in the system. - * We don't hold the conn_lock as we will not perform defered ipsqueue - * operation. - */ -boolean_t -ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) -{ - ipif_t *ipif; - ill_t *ill; - ill_walk_context_t ctx; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - - ill = ILL_START_WALK_V4(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (zoneid != ALL_ZONES && - zoneid != ipif->ipif_zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* Allow the ipif to be down */ - /* - * XXX Different from ipif_lookup_addr(), we don't do - * twice lookups. As from bind()'s point of view, we - * may return once we find a match. 
- */ - if (((ipif->ipif_lcl_addr == addr) && - ((ipif->ipif_flags & IPIF_UNNUMBERED) == 0)) || - ((ipif->ipif_flags & IPIF_POINTOPOINT) && - (ipif->ipif_pp_dst_addr == addr))) { - /* - * Allow bind() to be successful even if the - * ipif is with IPIF_CHANGING bit set. - */ - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - return (B_TRUE); - } - } - mutex_exit(&ill->ill_lock); - } - - rw_exit(&ipst->ips_ill_g_lock); - return (B_FALSE); -} - -/* * Lookup an ipif with the specified address. For point-to-point links we * look for matches on either the destination address or the local address, * but we skip the local address check if IPIF_UNNUMBERED is set. If the @@ -5860,11 +4622,25 @@ ip_addr_exists(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) * (or illgrp if `match_ill' is in an IPMP group). */ ipif_t * -ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) +ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, + ip_stack_t *ipst) +{ + return (ipif_lookup_addr_common(addr, match_ill, IPIF_MATCH_ILLGRP, + zoneid, ipst)); +} + +/* + * Lookup an ipif with the specified address. Similar to ipif_lookup_addr, + * except that we will only return an address if it is not marked as + * IPIF_DUPLICATE + */ +ipif_t * +ipif_lookup_addr_nondup(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, + ip_stack_t *ipst) { - return (ipif_lookup_addr_common(addr, match_ill, B_TRUE, zoneid, q, mp, - func, error, ipst)); + return (ipif_lookup_addr_common(addr, match_ill, + (IPIF_MATCH_ILLGRP | IPIF_MATCH_NONDUP), + zoneid, ipst)); } /* @@ -5872,12 +4648,12 @@ ipif_lookup_addr(ipaddr_t addr, ill_t *match_ill, zoneid_t zoneid, queue_t *q, * `match_ill' across the IPMP group. This function is only needed in some * corner-cases; almost everything should use ipif_lookup_addr(). 
*/ -static ipif_t * +ipif_t * ipif_lookup_addr_exact(ipaddr_t addr, ill_t *match_ill, ip_stack_t *ipst) { ASSERT(match_ill != NULL); - return (ipif_lookup_addr_common(addr, match_ill, B_FALSE, ALL_ZONES, - NULL, NULL, NULL, NULL, ipst)); + return (ipif_lookup_addr_common(addr, match_ill, 0, ALL_ZONES, + ipst)); } /* @@ -5951,13 +4727,13 @@ repeat: * IRE lookup and pick the first ipif corresponding to the source address in the * ire. * Returns: held ipif + * + * This is only used for ICMP_ADDRESS_MASK_REQUESTs */ ipif_t * ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) { ipif_t *ipif; - ire_t *ire; - ip_stack_t *ipst = ill->ill_ipst; ASSERT(!ill->ill_isv6); @@ -5970,7 +4746,7 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) */ mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (zoneid != ALL_ZONES && zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) @@ -5991,24 +4767,11 @@ ipif_lookup_remote(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) } } mutex_exit(&ill->ill_lock); - ire = ire_route_lookup(addr, 0, 0, 0, NULL, NULL, zoneid, - NULL, MATCH_IRE_RECURSIVE, ipst); - if (ire != NULL) { - /* - * The callers of this function wants to know the - * interface on which they have to send the replies - * back. For IREs that have ire_stq and ire_ipif - * derived from different ills, we really don't care - * what we return here. - */ - ipif = ire->ire_ipif; - if (ipif != NULL) { - ipif_refhold(ipif); - ire_refrele(ire); - return (ipif); - } - ire_refrele(ire); - } + /* + * For a remote destination it isn't possible to nail down a particular + * ipif. 
+ */ + /* Pick the first interface */ ipif = ipif_get_next_ipif(NULL, ill); return (ipif); @@ -6027,9 +4790,8 @@ ill_is_quiescent(ill_t *ill) ASSERT(MUTEX_HELD(&ill->ill_lock)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { + if (ipif->ipif_refcnt != 0) return (B_FALSE); - } } if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { return (B_FALSE); @@ -6045,7 +4807,7 @@ ill_is_freeable(ill_t *ill) ASSERT(MUTEX_HELD(&ill->ill_lock)); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_refcnt != 0 || !IPIF_FREE_OK(ipif)) { + if (ipif->ipif_refcnt != 0) { return (B_FALSE); } } @@ -6067,9 +4829,8 @@ ipif_is_quiescent(ipif_t *ipif) ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - if (ipif->ipif_refcnt != 0 || !IPIF_DOWN_OK(ipif)) { + if (ipif->ipif_refcnt != 0) return (B_FALSE); - } ill = ipif->ipif_ill; if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || @@ -6078,7 +4839,7 @@ ipif_is_quiescent(ipif_t *ipif) } /* This is the last ipif going down or being deleted on this ill */ - if (!ILL_DOWN_OK(ill) || ill->ill_refcnt != 0) { + if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { return (B_FALSE); } @@ -6087,14 +4848,14 @@ ipif_is_quiescent(ipif_t *ipif) /* * return true if the ipif can be destroyed: the ipif has to be quiescent - * with zero references from ire/nce/ilm to it. + * with zero references from ire/ilm to it. */ static boolean_t ipif_is_freeable(ipif_t *ipif) { ASSERT(MUTEX_HELD(&ipif->ipif_ill->ill_lock)); ASSERT(ipif->ipif_id != 0); - return (ipif->ipif_refcnt == 0 && IPIF_FREE_OK(ipif)); + return (ipif->ipif_refcnt == 0); } /* @@ -6275,7 +5036,7 @@ th_trace_gethash(ip_stack_t *ipst) * block. 
*/ objsize = MAX(MAX(sizeof (ill_t), sizeof (ipif_t)), - MAX(sizeof (ire_t), sizeof (nce_t))); + MAX(sizeof (ire_t), sizeof (ncec_t))); rshift = highbit(objsize); mh = mod_hash_create_extended(name, 64, mod_hash_null_keydtor, th_trace_free, mod_hash_byptr, (void *)rshift, @@ -6509,7 +5270,7 @@ ipif_get_next_ipif(ipif_t *curr, ill_t *ill) mutex_enter(&ill->ill_lock); for (ipif = (curr == NULL ? ill->ill_ipif : curr->ipif_next); ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); @@ -6535,28 +5296,53 @@ ip_m_lookup(t_uscalar_t mac_type) } /* + * Make a link layer address from the multicast IP address *addr. + * To form the link layer address, invoke the ip_m_v*mapping function + * associated with the link-layer type. + */ +void +ip_mcast_mapping(ill_t *ill, uchar_t *addr, uchar_t *hwaddr) +{ + ip_m_t *ipm; + + if (ill->ill_net_type == IRE_IF_NORESOLVER) + return; + + ASSERT(addr != NULL); + + ipm = ip_m_lookup(ill->ill_mactype); + if (ipm == NULL || + (ill->ill_isv6 && ipm->ip_m_v6mapping == NULL) || + (!ill->ill_isv6 && ipm->ip_m_v4mapping == NULL)) { + ip0dbg(("no mapping for ill %s mactype 0x%x\n", + ill->ill_name, ill->ill_mactype)); + return; + } + if (ill->ill_isv6) + (*ipm->ip_m_v6mapping)(ill, addr, hwaddr); + else + (*ipm->ip_m_v4mapping)(ill, addr, hwaddr); +} + +/* * ip_rt_add is called to add an IPv4 route to the forwarding table. - * ipif_arg is passed in to associate it with the correct interface. - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is specified by 'func' + * ill is passed in to associate it with the correct interface. + * If ire_arg is set, then we return the held IRE in that location. 
*/ int ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, - ipaddr_t src_addr, int flags, ipif_t *ipif_arg, ire_t **ire_arg, - boolean_t ioctl_msg, queue_t *q, mblk_t *mp, ipsq_func_t func, - struct rtsa_s *sp, ip_stack_t *ipst) + ipaddr_t src_addr, int flags, ill_t *ill, ire_t **ire_arg, + boolean_t ioctl_msg, struct rtsa_s *sp, ip_stack_t *ipst, zoneid_t zoneid) { - ire_t *ire; + ire_t *ire, *nire; ire_t *gw_ire = NULL; ipif_t *ipif = NULL; - boolean_t ipif_refheld = B_FALSE; uint_t type; int match_flags = MATCH_IRE_TYPE; - int error; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; boolean_t gcgrp_xtraref = B_FALSE; + boolean_t cgtp_broadcast; ip1dbg(("ip_rt_add:")); @@ -6579,27 +5365,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, return (ENETUNREACH); /* * Get the ipif, if any, corresponding to the gw_addr + * If -ifp was specified we restrict ourselves to the ill, otherwise + * we match on the gatway and destination to handle unnumbered pt-pt + * interfaces. 
*/ - ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &error, - ipst); + if (ill != NULL) + ipif = ipif_lookup_addr(gw_addr, ill, ALL_ZONES, ipst); + else + ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); if (ipif != NULL) { if (IS_VNI(ipif->ipif_ill)) { ipif_refrele(ipif); return (EINVAL); } - ipif_refheld = B_TRUE; - } else if (error == EINPROGRESS) { - ip1dbg(("ip_rt_add: null and EINPROGRESS")); - return (EINPROGRESS); - } else { - error = 0; - } - - if (ipif != NULL) { - ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif nonnull")); - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - } else { - ip1dbg(("ip_rt_add: ipif_lookup_interface done ipif is null")); } /* @@ -6612,12 +5390,12 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, flags &= ~RTF_GATEWAY; if (gw_addr == INADDR_LOOPBACK && dst_addr == INADDR_LOOPBACK && mask == IP_HOST_MASK) { - ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, - ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, + NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } ip1dbg(("ip_rt_add: 0x%p creating IRE 0x%x" @@ -6627,40 +5405,58 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ire = ire_create( (uchar_t *)&dst_addr, /* dest address */ (uchar_t *)&mask, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, NULL, /* no gateway */ - &ipif->ipif_mtu, - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOOPBACK */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? - RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + zoneid, + (ipif->ipif_flags & IPIF_PRIVATE) ? 
RTF_PRIVATE : 0, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; - if (ipif_refheld) - ipif_refrele(ipif); - return (error); + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); + ire_refrele(nire); + ipif_refrele(ipif); + return (EEXIST); + } + ire = nire; + goto save_ire; } } /* + * The routes for multicast with CGTP are quite special in that + * the gateway is the local interface address, yet RTF_GATEWAY + * is set. We turn off RTF_GATEWAY to provide compatibility with + * this undocumented and unusual use of multicast routes. + */ + if ((flags & RTF_MULTIRT) && ipif != NULL) + flags &= ~RTF_GATEWAY; + + /* * Traditionally, interface routes are ones where RTF_GATEWAY isn't set * and the gateway address provided is one of the system's interface * addresses. By using the routing socket interface and supplying an @@ -6694,8 +5490,8 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * logical interfaces * * 192.0.2.32 255.255.255.224 192.0.2.33 U if0 - * 192.0.2.32 255.255.255.224 192.0.2.34 U if0:1 - * 192.0.2.32 255.255.255.224 192.0.2.35 U if0:2 + * 192.0.2.32 255.255.255.224 192.0.2.34 U if0 + * 192.0.2.32 255.255.255.224 192.0.2.35 U if0 * * the ipif's corresponding to each of these interface routes can be * uniquely identified by the "gateway" (actually interface address). 
@@ -6710,47 +5506,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, /* RTF_GATEWAY not set */ if (!(flags & RTF_GATEWAY)) { - queue_t *stq; - if (sp != NULL) { ip2dbg(("ip_rt_add: gateway security attributes " "cannot be set with interface route\n")); - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (EINVAL); } /* - * As the interface index specified with the RTA_IFP sockaddr is - * the same for all ipif's off of an ill, the matching logic - * below uses MATCH_IRE_ILL if such an index was specified. - * This means that routes sharing the same prefix when added - * using a RTA_IFP sockaddr must have distinct interface - * indices (namely, they must be on distinct ill's). - * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix - * can be created if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * Whether or not ill (RTA_IFP) is set, we require that + * the gateway is one of our local addresses. */ - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - /* - * Check the ipif corresponding to the gw_addr - */ - if (ipif == NULL) - return (ENETUNREACH); - match_flags |= MATCH_IRE_IPIF; + if (ipif == NULL) + return (ENETUNREACH); + + /* + * We use MATCH_IRE_ILL here. If the caller specified an + * interface (from the RTA_IFP sockaddr) we use it, otherwise + * we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. + * We don't allow RTA_IFP to specify a different ill than the + * one matching the ipif to make sure we can delete the route. 
+ */ + match_flags |= MATCH_IRE_GW | MATCH_IRE_ILL; + if (ill == NULL) { + ill = ipif->ipif_ill; + } else if (ill != ipif->ipif_ill) { + ipif_refrele(ipif); + return (EINVAL); } - ASSERT(ipif != NULL); /* * We check for an existing entry at this point. @@ -6761,45 +5547,32 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, */ if (!ioctl_msg) match_flags |= MATCH_IRE_MASK; - ire = ire_ftable_lookup(dst_addr, mask, 0, IRE_INTERFACE, ipif, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); if (ire != NULL) { ire_refrele(ire); - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (EEXIST); } - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - /* - * Create a copy of the IRE_LOOPBACK, - * IRE_IF_NORESOLVER or IRE_IF_RESOLVER with - * the modified address and netmask. + * Create a copy of the IRE_LOOPBACK, IRE_IF_NORESOLVER or + * IRE_IF_RESOLVER with the modified address, netmask, and + * gateway. */ ire = ire_create( (uchar_t *)&dst_addr, (uint8_t *)&mask, - (uint8_t *)&ipif->ipif_src_addr, - NULL, - &ipif->ipif_mtu, - NULL, - NULL, - stq, - ipif->ipif_net_type, - ipif, - 0, - 0, - 0, + (uint8_t *)&gw_addr, + ill->ill_net_type, + ill, + zoneid, flags, - &ire_uinfo_null, - NULL, NULL, ipst); if (ire == NULL) { - if (ipif_refheld) - ipif_refrele(ipif); + ipif_refrele(ipif); return (ENOMEM); } @@ -6810,7 +5583,7 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * set up prefixes with the RTF_REJECT flag set (for example, * when generating aggregate routes.) * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. We also OR in the RTF_BLACKHOLE flag as * these interface routes, by definition, can only be that. 
@@ -6819,27 +5592,37 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * routine, but rather using ire_create() directly. * */ - if (ipif->ipif_net_type == IRE_LOOPBACK) { + if (ill->ill_net_type == IRE_LOOPBACK) { ire->ire_type = IRE_IF_NORESOLVER; ire->ire_flags |= RTF_BLACKHOLE; } - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error == 0) - goto save_ire; + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + nire = ire_add(ire); + if (nire == NULL) { + /* + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. + */ + ipif_refrele(ipif); + return (ENOMEM); + } /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route */ - if (ipif_refheld) + if (nire != ire) { + ire_delete(nire); + ire_refrele(nire); ipif_refrele(ipif); - return (error); - } - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; + return (EEXIST); + } + ire = nire; + goto save_ire; } /* @@ -6847,13 +5630,19 @@ ip_rt_add(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * If we don't have an IRE_IF_NORESOLVER or IRE_IF_RESOLVER for the * gateway, it is currently unreachable and we fail the request * accordingly. + * If RTA_IFP was specified we look on that particular ill. */ - ipif = ipif_arg; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; + + /* Check whether the gateway is reachable. 
*/ again: - gw_ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, ipif_arg, NULL, - ALL_ZONES, 0, NULL, match_flags, ipst); + type = IRE_INTERFACE; + if (flags & RTF_INDIRECT) + type |= IRE_OFFLINK; + + gw_ire = ire_ftable_lookup_v4(gw_addr, 0, 0, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); if (gw_ire == NULL) { /* * With IPMP, we allow host routes to influence in.mpathd's @@ -6862,10 +5651,13 @@ again: * underlying IRE_INTERFACEs are marked hidden. So allow * hidden test IREs to be found and try again. */ - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) { - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) { + match_flags |= MATCH_IRE_TESTHIDDEN; goto again; } + + if (ipif != NULL) + ipif_refrele(ipif); return (ENETUNREACH); } @@ -6885,10 +5677,12 @@ again: type = IRE_PREFIX; /* check for a duplicate entry */ - ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, - match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, match_flags | MATCH_IRE_MASK | MATCH_IRE_GW, + 0, ipst, NULL); if (ire != NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_refrele(ire); return (EEXIST); @@ -6905,6 +5699,8 @@ again: /* we hold reference to it upon success */ gcgrp = gcgrp_lookup(&ga, B_TRUE); if (gcgrp == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } @@ -6918,6 +5714,8 @@ again: */ gc = gc_create(sp, gcgrp, &gcgrp_xtraref); if (gc == NULL) { + if (ipif != NULL) + ipif_refrele(ipif); /* release reference held by gcgrp_lookup */ GCGRP_REFRELE(gcgrp); ire_refrele(gw_ire); @@ -6929,23 +5727,12 @@ again: ire = ire_create( (uchar_t *)&dst_addr, /* dest address */ (uchar_t *)&mask, /* mask */ - /* src address assigned by the caller? */ - (uchar_t *)(((src_addr != INADDR_ANY) && - (flags & RTF_SETSRC)) ? 
&src_addr : NULL), (uchar_t *)&gw_addr, /* gateway address */ - &gw_ire->ire_max_frag, - NULL, /* no src nce */ - NULL, /* no recv-from queue */ - NULL, /* no send-to queue */ (ushort_t)type, /* IRE type */ - ipif_arg, - 0, - 0, - 0, + ill, + zoneid, flags, - &gw_ire->ire_uinfo, /* Inherit ULP info from gw */ gc, /* security attribute */ - NULL, ipst); /* @@ -6958,26 +5745,51 @@ again: if (ire == NULL) { if (gc != NULL) GC_REFRELE(gc); + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); return (ENOMEM); } + /* Before we add, check if an extra CGTP broadcast is needed */ + cgtp_broadcast = ((flags & RTF_MULTIRT) && + ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST); + + /* src address assigned by the caller? */ + if ((src_addr != INADDR_ANY) && (flags & RTF_SETSRC)) + ire->ire_setsrc_addr = src_addr; + /* * POLICY: should we allow an RTF_HOST with address INADDR_ANY? * SUN/OS socket stuff does but do we really want to allow 0.0.0.0? */ /* Add the new IRE. */ - error = ire_add(&ire, q, mp, func, B_FALSE); - if (error != 0) { + nire = ire_add(ire); + if (nire == NULL) { /* - * In the result of failure, ire_add() will have already - * deleted the ire in question, so there is no need to - * do that here. + * In the result of failure, ire_add() will have + * already deleted the ire in question, so there + * is no need to do that here. */ + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); - return (error); + return (ENOMEM); + } + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire != ire) { + ire_delete(nire); + ire_refrele(nire); + if (ipif != NULL) + ipif_refrele(ipif); + ire_refrele(gw_ire); + return (EEXIST); } + ire = nire; if (flags & RTF_MULTIRT) { /* @@ -6990,45 +5802,47 @@ again: * because an IP source address cannot be a broadcast * or a multicast. 
*/ - ire_t *ire_dst = ire_ctable_lookup(ire->ire_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire_dst != NULL) { - ip_cgtp_bcast_add(ire, ire_dst, ipst); - ire_refrele(ire_dst); + if (cgtp_broadcast) { + ip_cgtp_bcast_add(ire, ipst); goto save_ire; } if (ipst->ips_ip_cgtp_filter_ops != NULL && !CLASSD(ire->ire_addr)) { - int res = ipst->ips_ip_cgtp_filter_ops->cfo_add_dest_v4( - ipst->ips_netstack->netstack_stackid, - ire->ire_addr, - ire->ire_gateway_addr, - ire->ire_src_addr, - gw_ire->ire_src_addr); + int res; + ipif_t *src_ipif; + + /* Find the source address corresponding to gw_ire */ + src_ipif = ipif_lookup_addr(gw_ire->ire_gateway_addr, + NULL, zoneid, ipst); + if (src_ipif != NULL) { + res = ipst->ips_ip_cgtp_filter_ops-> + cfo_add_dest_v4( + ipst->ips_netstack->netstack_stackid, + ire->ire_addr, + ire->ire_gateway_addr, + ire->ire_setsrc_addr, + src_ipif->ipif_lcl_addr); + ipif_refrele(src_ipif); + } else { + res = EADDRNOTAVAIL; + } if (res != 0) { + if (ipif != NULL) + ipif_refrele(ipif); ire_refrele(gw_ire); ire_delete(ire); + ire_refrele(ire); /* Held in ire_add */ return (res); } } } - /* - * Now that the prefix IRE entry has been created, delete any - * existing gateway IRE cache entries as well as any IRE caches - * using the gateway, and force them to be created through - * ip_newroute. - */ - if (gc != NULL) { - ASSERT(gcgrp != NULL); - ire_clookup_delete_cache_gw(gw_addr, ALL_ZONES, ipst); - } - save_ire: if (gw_ire != NULL) { ire_refrele(gw_ire); + gw_ire = NULL; } - if (ipif != NULL) { + if (ill != NULL) { /* * Save enough information so that we can recreate the IRE if * the interface goes down and then up. The metrics associated @@ -7037,7 +5851,7 @@ save_ire: * memory cannot be allocated, none of this information will be * saved. 
*/ - ipif_save_ire(ipif, ire); + ill_save_ire(ill, ire); } if (ioctl_msg) ip_rts_rtmsg(RTM_OLDADD, ire, 0, ipst); @@ -7052,27 +5866,23 @@ save_ire: } else { ire_refrele(ire); /* Held in ire_add */ } - if (ipif_refheld) + if (ipif != NULL) ipif_refrele(ipif); return (0); } /* * ip_rt_delete is called to delete an IPv4 route. - * ipif_arg is passed in to associate it with the correct interface. - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is specified by 'func' + * ill is passed in to associate it with the correct interface. */ /* ARGSUSED4 */ int ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, - uint_t rtm_addrs, int flags, ipif_t *ipif_arg, boolean_t ioctl_msg, - queue_t *q, mblk_t *mp, ipsq_func_t func, ip_stack_t *ipst) + uint_t rtm_addrs, int flags, ill_t *ill, boolean_t ioctl_msg, + ip_stack_t *ipst, zoneid_t zoneid) { ire_t *ire = NULL; ipif_t *ipif; - boolean_t ipif_refheld = B_FALSE; uint_t type; uint_t match_flags = MATCH_IRE_TYPE; int err = 0; @@ -7096,52 +5906,47 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * * This makes it possible to delete an original * IRE_IF_NORESOLVER/IRE_IF_RESOLVER - consistent with SunOS 4.1. + * However, we have RTF_KERNEL set on the ones created by ipif_up + * and those can not be deleted here. * - * As the interface index specified with the RTA_IFP sockaddr is the - * same for all ipif's off of an ill, the matching logic below uses - * MATCH_IRE_ILL if such an index was specified. This means a route - * sharing the same prefix and interface index as the the route - * intended to be deleted might be deleted instead if a RTA_IFP sockaddr - * is specified in the request. 
- * - * On the other hand, since the gateway address will usually be - * different for each ipif on the system, the matching logic - * uses MATCH_IRE_IPIF in the case of a traditional interface - * route. This means that interface routes for the same prefix can be - * uniquely identified if they belong to distinct ipif's and if a - * RTA_IFP sockaddr is not present. + * We use MATCH_IRE_ILL if we know the interface. If the caller + * specified an interface (from the RTA_IFP sockaddr) we use it, + * otherwise we use the ill derived from the gateway address. + * We can always match the gateway address since we record it + * in ire_gateway_addr. * * For more detail on specifying routes by gateway address and by * interface index, see the comments in ip_rt_add(). */ - ipif = ipif_lookup_interface(gw_addr, dst_addr, q, mp, func, &err, - ipst); - if (ipif != NULL) - ipif_refheld = B_TRUE; - else if (err == EINPROGRESS) - return (err); - else - err = 0; + ipif = ipif_lookup_interface(gw_addr, dst_addr, ipst); if (ipif != NULL) { - if (ipif_arg != NULL) { - if (ipif_refheld) { - ipif_refrele(ipif); - ipif_refheld = B_FALSE; - } - ipif = ipif_arg; - match_flags |= MATCH_IRE_ILL; - } else { - match_flags |= MATCH_IRE_IPIF; - } + ill_t *ill_match; + + if (ill != NULL) + ill_match = ill; + else + ill_match = ipif->ipif_ill; + + match_flags |= MATCH_IRE_ILL; if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup(dst_addr, 0, IRE_LOOPBACK, ipif, - ALL_ZONES, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, 0, 0, IRE_LOOPBACK, + ill_match, ALL_ZONES, NULL, match_flags, 0, ipst, + NULL); } if (ire == NULL) { - ire = ire_ftable_lookup(dst_addr, mask, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, - match_flags, ipst); + match_flags |= MATCH_IRE_GW; + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, + IRE_INTERFACE, ill_match, ALL_ZONES, NULL, + match_flags, 0, ipst, NULL); } + /* Avoid deleting routes created by kernel from an ipif */ + if (ire 
!= NULL && (ire->ire_flags & RTF_KERNEL)) { + ire_refrele(ire); + ire = NULL; + } + + /* Restore in case we didn't find a match */ + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_ILL); } if (ire == NULL) { @@ -7151,15 +5956,11 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, * set the IRE type to lookup based on whether * this is a host route, a default route or just a prefix. * - * If an ipif_arg was passed in, then the lookup is based on an + * If an ill was passed in, then the lookup is based on an * interface index so MATCH_IRE_ILL is added to match_flags. - * In any case, MATCH_IRE_IPIF is cleared and MATCH_IRE_GW is - * set as the route being looked up is not a traditional - * interface route. */ - match_flags &= ~MATCH_IRE_IPIF; match_flags |= MATCH_IRE_GW; - if (ipif_arg != NULL) + if (ill != NULL) match_flags |= MATCH_IRE_ILL; if (mask == IP_HOST_MASK) type = IRE_HOST; @@ -7167,14 +5968,15 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, type = IRE_DEFAULT; else type = IRE_PREFIX; - ire = ire_ftable_lookup(dst_addr, mask, gw_addr, type, ipif_arg, - NULL, ALL_ZONES, 0, NULL, match_flags, ipst); + ire = ire_ftable_lookup_v4(dst_addr, mask, gw_addr, type, ill, + ALL_ZONES, NULL, match_flags, 0, ipst, NULL); } - if (ipif_refheld) + if (ipif != NULL) { ipif_refrele(ipif); + ipif = NULL; + } - /* ipif is not refheld anymore */ if (ire == NULL) return (ESRCH); @@ -7193,9 +5995,9 @@ ip_rt_delete(ipaddr_t dst_addr, ipaddr_t mask, ipaddr_t gw_addr, ip_cgtp_bcast_delete(ire, ipst); } - ipif = ire->ire_ipif; - if (ipif != NULL) - ipif_remove_ire(ipif, ire); + ill = ire->ire_ill; + if (ill != NULL) + ill_remove_saved_ire(ill, ire); if (ioctl_msg) ip_rts_rtmsg(RTM_OLDDEL, ire, 0, ipst); ire_delete(ire); @@ -7249,7 +6051,7 @@ ip_siocaddrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } error = ip_rt_add(dst_addr, mask, gw_addr, 0, rt->rt_flags, NULL, NULL, - B_TRUE, q, mp, ip_process_ioctl, NULL, ipst); + B_TRUE, NULL, 
ipst, ALL_ZONES); if (ipif != NULL) ipif_refrele(ipif); return (error); @@ -7301,8 +6103,8 @@ ip_siocdelrt(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } error = ip_rt_delete(dst_addr, mask, gw_addr, - RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, q, - mp, ip_process_ioctl, ipst); + RTA_DST | RTA_GATEWAY | RTA_NETMASK, rt->rt_flags, NULL, B_TRUE, + ipst, ALL_ZONES); if (ipif != NULL) ipif_refrele(ipif); return (error); @@ -7655,7 +6457,8 @@ ipsq_dlpi_done(ipsq_t *ipsq) if (phyi != NULL) { ill = phyi->phyint_illv4; if (ill != NULL && - ill->ill_dlpi_pending != DL_PRIM_INVAL) + (ill->ill_dlpi_pending != DL_PRIM_INVAL || + ill->ill_arl_dlpi_pending)) return (B_FALSE); ill = phyi->phyint_illv6; @@ -7819,8 +6622,8 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, /* * The ipsq_t (ipsq) is the synchronization data structure used to serialize - * certain critical operations like plumbing (i.e. most set ioctls), multicast - * joins, igmp/mld timers, etc. There is one ipsq per phyint. The ipsq + * certain critical operations like plumbing (i.e. most set ioctls), etc. + * There is one ipsq per phyint. The ipsq * serializes exclusive ioctls issued by applications on a per ipsq basis in * ipsq_xopq_mphead. It also protects against multiple threads executing in * the ipsq. Responses from the driver pertain to the current ioctl (say a @@ -7838,7 +6641,7 @@ ipsq_try_enter_internal(ill_t *ill, queue_t *q, mblk_t *mp, ipsq_func_t func, * proceeds to dequeue the next ioctl in ipsq_xopq_mphead and start the next * ioctl if the current ioctl has completed. If the current ioctl is still * in progress it simply returns. The current ioctl could be waiting for - * a response from another module (arp or the driver or could be waiting for + * a response from another module (the driver or could be waiting for * the ipif/ill/ire refcnts to drop to zero. In such a case the ipx_pending_mp * and ipx_pending_ipif are set. 
ipx_current_ipif is set throughout the * execution of the ioctl and ipsq_exit does not start the next ioctl unless @@ -7959,6 +6762,38 @@ ipsq_exit(ipsq_t *ipsq) } /* + * Used to start any igmp or mld timers that could not be started + * while holding ill_mcast_lock. The timers can't be started while holding + * the lock, since mld/igmp_start_timers may need to call untimeout() + * which can't be done while holding the lock which the timeout handler + * acquires. Otherwise + * there could be a deadlock since the timeout handlers + * mld_timeout_handler_per_ill/igmp_timeout_handler_per_ill also acquire + * ill_mcast_lock. + */ +void +ill_mcast_timer_start(ip_stack_t *ipst) +{ + int next; + + mutex_enter(&ipst->ips_igmp_timer_lock); + next = ipst->ips_igmp_deferred_next; + ipst->ips_igmp_deferred_next = INFINITY; + mutex_exit(&ipst->ips_igmp_timer_lock); + + if (next != INFINITY) + igmp_start_timers(next, ipst); + + mutex_enter(&ipst->ips_mld_timer_lock); + next = ipst->ips_mld_deferred_next; + ipst->ips_mld_deferred_next = INFINITY; + mutex_exit(&ipst->ips_mld_timer_lock); + + if (next != INFINITY) + mld_start_timers(next, ipst); +} + +/* * Start the current exclusive operation on `ipsq'; associate it with `ipif' * and `ioccmd'. 
*/ @@ -8101,7 +6936,6 @@ ipsq_flush(ill_t *ill) mutex_exit(&ipx->ipx_lock); (void) ipsq_pending_mp_cleanup(ill, NULL); ipsq_xopq_mp_cleanup(ill, NULL); - ill_pending_mp_cleanup(ill); } /* @@ -8114,7 +6948,7 @@ ipsq_flush(ill_t *ill) */ int ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) + cmd_info_t *ci) { char *name; struct ifreq *ifr; @@ -8124,7 +6958,6 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, conn_t *connp; boolean_t isv6; boolean_t exists; - int err; mblk_t *mp1; zoneid_t zoneid; ip_stack_t *ipst; @@ -8138,7 +6971,7 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, } else { ill = NULL; connp = Q_TO_CONN(q); - isv6 = connp->conn_af_isv6; + isv6 = (connp->conn_family == AF_INET6); zoneid = connp->conn_zoneid; if (zoneid == GLOBAL_ZONEID) { /* global zone can access ipifs in all zones */ @@ -8195,13 +7028,38 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, ipif_refhold(ipif); } else { ipif = ipif_lookup_on_name(name, mi_strlen(name), B_FALSE, - &exists, isv6, zoneid, - (connp == NULL) ? q : CONNP_TO_WQ(connp), mp, func, &err, - ipst); - if (ipif == NULL) { - if (err == EINPROGRESS) - return (err); - err = 0; /* Ensure we don't use it below */ + &exists, isv6, zoneid, ipst); + + /* + * Ensure that get ioctls don't see any internal state changes + * caused by set ioctls by deferring them if IPIF_CHANGING is + * set. 
+ */ + if (ipif != NULL && !(ipip->ipi_flags & IPI_WR) && + !IAM_WRITER_IPIF(ipif)) { + ipsq_t *ipsq; + + if (connp != NULL) + mutex_enter(&connp->conn_lock); + mutex_enter(&ipif->ipif_ill->ill_lock); + if (IPIF_IS_CHANGING(ipif) && + !IPIF_IS_CONDEMNED(ipif)) { + ipsq = ipif->ipif_ill->ill_phyint->phyint_ipsq; + mutex_enter(&ipsq->ipsq_lock); + mutex_enter(&ipsq->ipsq_xop->ipx_lock); + mutex_exit(&ipif->ipif_ill->ill_lock); + ipsq_enq(ipsq, q, mp, ip_process_ioctl, + NEW_OP, ipif->ipif_ill); + mutex_exit(&ipsq->ipsq_xop->ipx_lock); + mutex_exit(&ipsq->ipsq_lock); + if (connp != NULL) + mutex_exit(&connp->conn_lock); + ipif_refrele(ipif); + return (EINPROGRESS); + } + mutex_exit(&ipif->ipif_ill->ill_lock); + if (connp != NULL) + mutex_exit(&connp->conn_lock); } } @@ -8226,6 +7084,9 @@ ip_extract_lifreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipif == NULL) return (ENXIO); + DTRACE_PROBE4(ipif__ioctl, char *, "ip_extract_lifreq", + int, ipip->ipi_cmd, ill_t *, ipif->ipif_ill, ipif_t *, ipif); + ci->ci_ipif = ipif; return (0); } @@ -8544,7 +7405,6 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, struct iocblk *iocp = (struct iocblk *)mp->b_rptr; uint_t ifindex; zoneid_t zoneid; - int err = 0; boolean_t isv6 = B_FALSE; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; @@ -8571,13 +7431,12 @@ ip_sioctl_get_lifsrcof(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, return (EINVAL); ifindex = STRUCT_FGET(lifs, lifs_ifindex); - isv6 = (Q_TO_CONN(q))->conn_af_isv6; - ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, q, mp, - ip_process_ioctl, &err, ipst); + isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; + ipif = ipif_lookup_on_ifindex(ifindex, isv6, zoneid, ipst); if (ipif == NULL) { ip1dbg(("ip_sioctl_get_lifsrcof: no ipif for ifindex %d\n", ifindex)); - return (err); + return (ENXIO); } /* Allocate a buffer to hold requested information */ @@ -8943,17 +7802,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) in6_addr_t *daddr, 
*saddr; ipaddr_t v4daddr; ire_t *ire; + ipaddr_t v4setsrc; + in6_addr_t v6setsrc; char *slabel, *dlabel; boolean_t isipv4; int match_ire; ill_t *dst_ill; - ipif_t *src_ipif, *ire_ipif; struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - zoneid_t zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); + conn_t *connp = Q_TO_CONN(q); + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint64_t ipif_flags; ASSERT(q->q_next == NULL); /* this ioctl not allowed if ip is module */ - zoneid = Q_TO_CONN(q)->conn_zoneid; /* * This ioctl is I_STR only, and must have a @@ -8976,7 +7837,7 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) data_mp = new_data_mp; mp->b_cont = data_mp; } - match_ire = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_PARENT; + match_ire = MATCH_IRE_DSTONLY; for (cur = data_mp->b_rptr, end = data_mp->b_wptr; end - cur >= sizeof (struct dstinforeq); @@ -8987,8 +7848,8 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) /* * ip_addr_scope_v6() and ip6_asp_lookup() handle - * v4 mapped addresses; ire_ftable_lookup[_v6]() - * and ipif_select_source[_v6]() do not. + * v4 mapped addresses; ire_ftable_lookup_v6() + * and ip_select_source_v6() do not. 
*/ dir->dir_dscope = ip_addr_scope_v6(daddr); dlabel = ip6_asp_lookup(daddr, &dir->dir_precedence, ipst); @@ -8996,13 +7857,19 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) isipv4 = IN6_IS_ADDR_V4MAPPED(daddr); if (isipv4) { IN6_V4MAPPED_TO_IPADDR(daddr, v4daddr); - ire = ire_ftable_lookup(v4daddr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); + v4setsrc = INADDR_ANY; + ire = ire_route_recursive_v4(v4daddr, 0, NULL, zoneid, + NULL, match_ire, B_TRUE, 0, ipst, &v4setsrc, NULL, + NULL); } else { - ire = ire_ftable_lookup_v6(daddr, NULL, NULL, - 0, NULL, NULL, zoneid, 0, NULL, match_ire, ipst); + v6setsrc = ipv6_all_zeros; + ire = ire_route_recursive_v6(daddr, 0, NULL, zoneid, + NULL, match_ire, B_TRUE, 0, ipst, &v6setsrc, NULL, + NULL); } - if (ire == NULL) { + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(ire); dir->dir_dreachable = 0; /* move on to next dst addr */ @@ -9010,36 +7877,40 @@ ip_sioctl_dstinfo(queue_t *q, mblk_t *mp) } dir->dir_dreachable = 1; - ire_ipif = ire->ire_ipif; - if (ire_ipif == NULL) - goto next_dst; + dst_ill = ire_nexthop_ill(ire); + if (dst_ill == NULL) { + ire_refrele(ire); + continue; + } - /* - * We expect to get back an interface ire or a - * gateway ire cache entry. For both types, the - * output interface is ire_ipif->ipif_ill. 
- */ - dst_ill = ire_ipif->ipif_ill; + /* With ipmp we most likely look at the ipmp ill here */ dir->dir_dmactype = dst_ill->ill_mactype; if (isipv4) { - src_ipif = ipif_select_source(dst_ill, v4daddr, zoneid); + ipaddr_t v4saddr; + + if (ip_select_source_v4(dst_ill, v4setsrc, v4daddr, + connp->conn_ixa->ixa_multicast_ifaddr, zoneid, ipst, + &v4saddr, NULL, &ipif_flags) != 0) { + v4saddr = INADDR_ANY; + ipif_flags = 0; + } + IN6_IPADDR_TO_V4MAPPED(v4saddr, saddr); } else { - src_ipif = ipif_select_source_v6(dst_ill, - daddr, B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid); + if (ip_select_source_v6(dst_ill, &v6setsrc, daddr, + zoneid, ipst, B_FALSE, IPV6_PREFER_SRC_DEFAULT, + saddr, NULL, &ipif_flags) != 0) { + *saddr = ipv6_all_zeros; + ipif_flags = 0; + } } - if (src_ipif == NULL) - goto next_dst; - *saddr = src_ipif->ipif_v6lcl_addr; dir->dir_sscope = ip_addr_scope_v6(saddr); slabel = ip6_asp_lookup(saddr, NULL, ipst); dir->dir_labelmatch = ip6_asp_labelcmp(dlabel, slabel); - dir->dir_sdeprecated = - (src_ipif->ipif_flags & IPIF_DEPRECATED) ? 1 : 0; - ipif_refrele(src_ipif); -next_dst: + dir->dir_sdeprecated = (ipif_flags & IPIF_DEPRECATED) ? 
1 : 0; ire_refrele(ire); + ill_refrele(dst_ill); } miocack(q, mp, iocp->ioc_count, 0); } @@ -9088,16 +7959,16 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4_addr); - ire = ire_ctable_lookup(v4_addr, 0, - IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + ire = ire_ftable_lookup_v4(v4_addr, 0, 0, + IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); } else { in6_addr_t v6addr; v6addr = sin6->sin6_addr; - ire = ire_ctable_lookup_v6(&v6addr, 0, - IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + ire = ire_ftable_lookup_v6(&v6addr, 0, 0, + IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, NULL, + MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); } break; } @@ -9105,9 +7976,9 @@ ip_sioctl_tmyaddr(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, ipaddr_t v4addr; v4addr = sin->sin_addr.s_addr; - ire = ire_ctable_lookup(v4addr, 0, + ire = ire_ftable_lookup_v4(v4addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK, NULL, zoneid, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, ipst); + NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, 0, ipst, NULL); break; } default: @@ -9160,9 +8031,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, sin = (sin_t *)&sia->sa_addr; /* - * Match addresses with a zero gateway field to avoid - * routes going through a router. - * Exclude broadcast and multicast addresses. + * We check for IRE_ONLINK and exclude IRE_BROADCAST|IRE_MULTICAST + * to make sure we only look at on-link unicast address. 
*/ switch (sin->sin_family) { case AF_INET6: { @@ -9174,20 +8044,18 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4_addr); if (!CLASSD(v4_addr)) { - ire = ire_route_lookup(v4_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(v4_addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_DSTONLY, + 0, ipst, NULL); } } else { in6_addr_t v6addr; - in6_addr_t v6gw; v6addr = sin6->sin6_addr; - v6gw = ipv6_all_zeros; if (!IN6_IS_ADDR_MULTICAST(&v6addr)) { - ire = ire_route_lookup_v6(&v6addr, 0, - &v6gw, 0, NULL, NULL, zoneid, - NULL, MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v6(&v6addr, 0, 0, 0, + NULL, zoneid, NULL, MATCH_IRE_DSTONLY, 0, + ipst, NULL); } } break; @@ -9197,9 +8065,8 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, v4addr = sin->sin_addr.s_addr; if (!CLASSD(v4addr)) { - ire = ire_route_lookup(v4addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_GW, ipst); + ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); } break; } @@ -9208,10 +8075,11 @@ ip_sioctl_tonlink(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } sia->sa_res = 0; if (ire != NULL) { - if (ire->ire_type & (IRE_INTERFACE|IRE_CACHE| - IRE_LOCAL|IRE_LOOPBACK)) { + ASSERT(!(ire->ire_type & IRE_MULTICAST)); + + if ((ire->ire_type & IRE_ONLINK) && + !(ire->ire_type & IRE_BROADCAST)) sia->sa_res = 1; - } ire_refrele(ire); } return (0); @@ -9228,54 +8096,40 @@ ip_sioctl_tmysite(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (ENXIO); } -/* - * ARP IOCTLs. - * How does IP get in the business of fronting ARP configuration/queries? - * Well it's like this, the Berkeley ARP IOCTLs (SIOCGARP, SIOCDARP, SIOCSARP) - * are by tradition passed in through a datagram socket. That lands in IP. 
- * As it happens, this is just as well since the interface is quite crude in - * that it passes in no information about protocol or hardware types, or - * interface association. After making the protocol assumption, IP is in - * the position to look up the name of the ILL, which ARP will need, and - * format a request that can be handled by ARP. The request is passed up - * stream to ARP, and the original IOCTL is completed by IP when ARP passes - * back a response. ARP supports its own set of more general IOCTLs, in - * case anyone is interested. - */ +/* ARP IOCTLs. */ /* ARGSUSED */ int ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { - mblk_t *mp1; - mblk_t *mp2; - mblk_t *pending_mp; - ipaddr_t ipaddr; - area_t *area; - struct iocblk *iocp; - conn_t *connp; - struct arpreq *ar; - struct xarpreq *xar; - int flags, alength; - uchar_t *lladdr; - ire_t *ire; - ip_stack_t *ipst; - ill_t *ill = ipif->ipif_ill; - ill_t *proxy_ill = NULL; - ipmp_arpent_t *entp = NULL; - boolean_t if_arp_ioctl = B_FALSE; - boolean_t proxyarp = B_FALSE; + int err; + ipaddr_t ipaddr; + struct iocblk *iocp; + conn_t *connp; + struct arpreq *ar; + struct xarpreq *xar; + int arp_flags, flags, alength; + uchar_t *lladdr; + ip_stack_t *ipst; + ill_t *ill = ipif->ipif_ill; + ill_t *proxy_ill = NULL; + ipmp_arpent_t *entp = NULL; + boolean_t proxyarp = B_FALSE; + boolean_t if_arp_ioctl = B_FALSE; + ncec_t *ncec = NULL; + nce_t *nce; ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); ipst = connp->conn_netstack->netstack_ip; + iocp = (struct iocblk *)mp->b_rptr; if (ipip->ipi_cmd_type == XARP_CMD) { /* We have a chain - M_IOCTL-->MI_COPY_MBLK-->XARPREQ_MBLK */ xar = (struct xarpreq *)mp->b_cont->b_cont->b_rptr; ar = NULL; - flags = xar->xarp_flags; + arp_flags = xar->xarp_flags; lladdr = (uchar_t *)LLADDR(&xar->xarp_ha); if_arp_ioctl = (xar->xarp_ha.sdl_nlen != 0); /* @@ -9294,7 +8148,7 @@ ip_sioctl_arp(ipif_t 
*ipif, sin_t *sin, queue_t *q, mblk_t *mp, ar = (struct arpreq *)mp->b_cont->b_cont->b_rptr; xar = NULL; - flags = ar->arp_flags; + arp_flags = ar->arp_flags; lladdr = (uchar_t *)ar->arp_ha.sa_data; /* * Theoretically, the sa_family could tell us what link @@ -9315,7 +8169,14 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } - ipaddr = sin->sin_addr.s_addr; + /* Translate ATF* flags to NCE* flags */ + flags = 0; + if (arp_flags & ATF_AUTHORITY) + flags |= NCE_F_AUTHORITY; + if (arp_flags & ATF_PERM) + flags |= NCE_F_NONUD; /* not subject to aging */ + if (arp_flags & ATF_PUBL) + flags |= NCE_F_PUBLISH; /* * IPMP ARP special handling: @@ -9349,171 +8210,120 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lladdr = proxy_ill->ill_phys_addr; } /* FALLTHRU */ - case SIOCDARP: - case SIOCDXARP: - ire = ire_ctable_lookup(ipaddr, 0, IRE_LOCAL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) { - ire_refrele(ire); - return (EPERM); - } } } + ipaddr = sin->sin_addr.s_addr; /* - * We are going to pass up to ARP a packet chain that looks - * like: - * - * M_IOCTL-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK - * - * Get a copy of the original IOCTL mblk to head the chain, - * to be sent up (in mp1). Also get another copy to store - * in the ill_pending_mp list, for matching the response - * when it comes back from ARP. - */ - mp1 = copyb(mp); - pending_mp = copymsg(mp); - if (mp1 == NULL || pending_mp == NULL) { - if (mp1 != NULL) - freeb(mp1); - if (pending_mp != NULL) - inet_freemsg(pending_mp); - return (ENOMEM); - } - - mp2 = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, - (caddr_t)&ipaddr); - if (mp2 == NULL) { - freeb(mp1); - inet_freemsg(pending_mp); - return (ENOMEM); - } - /* Put together the chain. 
*/ - mp1->b_cont = mp2; - mp1->b_datap->db_type = M_IOCTL; - mp2->b_cont = mp; - mp2->b_datap->db_type = M_DATA; - - iocp = (struct iocblk *)mp1->b_rptr; - - /* - * An M_IOCDATA's payload (struct copyresp) is mostly the same as an - * M_IOCTL's payload (struct iocblk), but 'struct copyresp' has a - * cp_private field (or cp_rval on 32-bit systems) in place of the - * ioc_count field; set ioc_count to be correct. + * don't match across illgrp per case (1) and (2). + * XXX use IS_IPMP(ill) like ndp_sioc_update? */ - iocp->ioc_count = MBLKL(mp1->b_cont); + nce = nce_lookup_v4(ill, &ipaddr); + if (nce != NULL) + ncec = nce->nce_common; - /* - * Set the proper command in the ARP message. - * Convert the SIOC{G|S|D}ARP calls into our - * AR_ENTRY_xxx calls. - */ - area = (area_t *)mp2->b_rptr; switch (iocp->ioc_cmd) { case SIOCDARP: - case SIOCDXARP: + case SIOCDXARP: { /* - * We defer deleting the corresponding IRE until - * we return from arp. + * Delete the NCE if any. + */ + if (ncec == NULL) { + iocp->ioc_error = ENXIO; + break; + } + /* Don't allow changes to arp mappings of local addresses. */ + if (NCE_MYADDR(ncec)) { + nce_refrele(nce); + return (ENOTSUP); + } + iocp->ioc_error = 0; + + /* + * Delete the nce_common which has ncec_ill set to ipmp_ill. + * This will delete all the nce entries on the under_ills. + */ + ncec_delete(ncec); + /* + * Once the NCE has been deleted, then the ire_dep* consistency + * mechanism will find any IRE which depended on the now + * condemned NCE (as part of sending packets). + * That mechanism handles redirects by deleting redirects + * that refer to UNREACHABLE nces. 
*/ - area->area_cmd = AR_ENTRY_DELETE; - area->area_proto_mask_offset = 0; break; + } case SIOCGARP: case SIOCGXARP: - area->area_cmd = AR_ENTRY_SQUERY; - area->area_proto_mask_offset = 0; + if (ncec != NULL) { + lladdr = ncec->ncec_lladdr; + flags = ncec->ncec_flags; + iocp->ioc_error = 0; + ip_sioctl_garp_reply(mp, ncec->ncec_ill, lladdr, flags); + } else { + iocp->ioc_error = ENXIO; + } break; case SIOCSARP: case SIOCSXARP: - /* - * Delete the corresponding ire to make sure IP will - * pick up any change from arp. - */ + /* Don't allow changes to arp mappings of local addresses. */ + if (ncec != NULL && NCE_MYADDR(ncec)) { + nce_refrele(nce); + return (ENOTSUP); + } + + /* static arp entries will undergo NUD if ATF_PERM is not set */ + flags |= NCE_F_STATIC; if (!if_arp_ioctl) { - (void) ip_ire_clookup_and_delete(ipaddr, NULL, ipst); + ip_nce_lookup_and_update(&ipaddr, NULL, ipst, + lladdr, alength, flags); } else { ipif_t *ipif = ipif_get_next_ipif(NULL, ill); if (ipif != NULL) { - (void) ip_ire_clookup_and_delete(ipaddr, ipif, - ipst); + ip_nce_lookup_and_update(&ipaddr, ipif, ipst, + lladdr, alength, flags); ipif_refrele(ipif); } } - break; - } - iocp->ioc_cmd = area->area_cmd; - - /* - * Fill in the rest of the ARP operation fields. - */ - area->area_hw_addr_length = alength; - bcopy(lladdr, (char *)area + area->area_hw_addr_offset, alength); - - /* Translate the flags. */ - if (flags & ATF_PERM) - area->area_flags |= ACE_F_PERMANENT; - if (flags & ATF_PUBL) - area->area_flags |= ACE_F_PUBLISH; - if (flags & ATF_AUTHORITY) - area->area_flags |= ACE_F_AUTHORITY; - - /* - * If this is a permanent AR_ENTRY_ADD on the IPMP interface, track it - * so that IP can update ARP as the active ills in the group change. 
- */ - if (IS_IPMP(ill) && area->area_cmd == AR_ENTRY_ADD && - (area->area_flags & ACE_F_PERMANENT)) { - entp = ipmp_illgrp_create_arpent(ill->ill_grp, mp2, proxyarp); - + if (nce != NULL) { + nce_refrele(nce); + nce = NULL; + } /* - * The second part of the conditional below handles a corner - * case: if this is proxy ARP and the IPMP group has no active - * interfaces, we can't send the request to ARP now since it - * won't be able to build an ACE. So we return success and - * notify ARP about the proxy ARP entry once an interface - * becomes active. + * NCE_F_STATIC entries will be added in state ND_REACHABLE + * by nce_add_common() */ - if (entp == NULL || (proxyarp && proxy_ill == NULL)) { - mp2->b_cont = NULL; - inet_freemsg(mp1); - inet_freemsg(pending_mp); - return (entp == NULL ? ENOMEM : 0); + err = nce_lookup_then_add_v4(ill, lladdr, + ill->ill_phys_addr_length, &ipaddr, flags, ND_UNCHANGED, + &nce); + if (err == EEXIST) { + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_REACHABLE; + ncec->ncec_flags = flags; + nce_update(ncec, ND_UNCHANGED, lladdr); + mutex_exit(&ncec->ncec_lock); + err = 0; + } + if (nce != NULL) { + nce_refrele(nce); + nce = NULL; + } + if (IS_IPMP(ill) && err == 0) { + entp = ipmp_illgrp_create_arpent(ill->ill_grp, + proxyarp, ipaddr, lladdr, ill->ill_phys_addr_length, + flags); + if (entp == NULL || (proxyarp && proxy_ill == NULL)) { + iocp->ioc_error = (entp == NULL ? ENOMEM : 0); + break; + } } + iocp->ioc_error = err; } - /* - * Before sending 'mp' to ARP, we have to clear the b_next - * and b_prev. Otherwise if STREAMS encounters such a message - * in freemsg(), (because ARP can close any time) it can cause - * a panic. But mi code needs the b_next and b_prev values of - * mp->b_cont, to complete the ioctl. So we store it here - * in pending_mp->bcont, and restore it in ip_sioctl_iocack() - * when the response comes down from ARP. 
- */ - pending_mp->b_cont->b_next = mp->b_cont->b_next; - pending_mp->b_cont->b_prev = mp->b_cont->b_prev; - mp->b_cont->b_next = NULL; - mp->b_cont->b_prev = NULL; - - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - /* conn has not yet started closing, hence this can't fail */ - if (ipip->ipi_flags & IPI_WR) { - VERIFY(ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), - pending_mp, 0) != 0); - } else { - VERIFY(ill_pending_mp_add(ill, connp, pending_mp) != 0); + if (nce != NULL) { + nce_refrele(nce); } - mutex_exit(&ill->ill_lock); - mutex_exit(&connp->conn_lock); - - /* - * Up to ARP it goes. The response will come back in ip_wput() as an - * M_IOCACK, and will be handed to ip_sioctl_iocack() for completion. - */ - putnext(ill->ill_rq, mp1); /* * If we created an IPMP ARP entry, mark that we've notified ARP. @@ -9521,7 +8331,7 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (entp != NULL) ipmp_illgrp_mark_arpent(ill->ill_grp, entp); - return (EINPROGRESS); + return (iocp->ioc_error); } /* @@ -9530,10 +8340,9 @@ ip_sioctl_arp(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, */ int ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) + cmd_info_t *ci) { mblk_t *mp1; - int err; sin_t *sin; conn_t *connp; ipif_t *ipif; @@ -9548,7 +8357,7 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, /* ioctl comes down on a conn */ ASSERT(!(q->q_flag & QREADR) && q->q_next == NULL); connp = Q_TO_CONN(q); - if (connp->conn_af_isv6) + if (connp->conn_family == AF_INET6) return (ENXIO); ipst = connp->conn_netstack->netstack_ip; @@ -9575,10 +8384,9 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, if (ipip->ipi_cmd_type == XARP_CMD && sdl->sdl_nlen != 0) { ipif = ipif_lookup_on_name(sdl->sdl_data, sdl->sdl_nlen, - B_FALSE, &exists, B_FALSE, ALL_ZONES, CONNP_TO_WQ(connp), - mp, func, &err, ipst); + B_FALSE, &exists, B_FALSE, ALL_ZONES, ipst); if 
(ipif == NULL) - return (err); + return (ENXIO); if (ipif->ipif_id != 0) { ipif_refrele(ipif); return (ENXIO); @@ -9591,23 +8399,24 @@ ip_extract_arpreq(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, * find the wrong ill, so we first do an ipif_lookup_addr(). */ ipif = ipif_lookup_addr(sin->sin_addr.s_addr, NULL, ALL_ZONES, - CONNP_TO_WQ(connp), mp, func, &err, ipst); + ipst); if (ipif == NULL) { - ire = ire_ftable_lookup(sin->sin_addr.s_addr, 0, 0, - IRE_IF_RESOLVER, NULL, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_TYPE, ipst); - if (ire == NULL || ((ill = ire_to_ill(ire)) == NULL)) { + ire = ire_ftable_lookup_v4(sin->sin_addr.s_addr, + 0, 0, IRE_IF_RESOLVER, NULL, ALL_ZONES, + NULL, MATCH_IRE_TYPE, 0, ipst, NULL); + if (ire == NULL || ((ill = ire->ire_ill) == NULL)) { if (ire != NULL) ire_refrele(ire); return (ENXIO); } + ASSERT(ire != NULL && ill != NULL); ipif = ill->ill_ipif; ipif_refhold(ipif); ire_refrele(ire); } } - if (ipif->ipif_net_type != IRE_IF_RESOLVER) { + if (ipif->ipif_ill->ill_net_type != IRE_IF_RESOLVER) { ipif_refrele(ipif); return (ENXIO); } @@ -9700,123 +8509,20 @@ ip_sioctl_plink_ipmp(ill_t *ill, int ioccmd) void ip_sioctl_plink(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) { - mblk_t *mp1, *mp2; + mblk_t *mp1; struct linkblk *li; - struct ipmx_s *ipmxp; - ill_t *ill; int ioccmd = ((struct iocblk *)mp->b_rptr)->ioc_cmd; int err = 0; - boolean_t entered_ipsq = B_FALSE; - boolean_t islink; - ip_stack_t *ipst; - - if (CONN_Q(q)) - ipst = CONNQ_TO_IPST(q); - else - ipst = ILLQ_TO_IPST(q); ASSERT(ioccmd == I_PLINK || ioccmd == I_PUNLINK || ioccmd == I_LINK || ioccmd == I_UNLINK); - islink = (ioccmd == I_PLINK || ioccmd == I_LINK); - mp1 = mp->b_cont; /* This is the linkblk info */ li = (struct linkblk *)mp1->b_rptr; - /* - * ARP has added this special mblk, and the utility is asking us - * to perform consistency checks, and also atomically set the - * muxid. Ifconfig is an example. 
It achieves this by using - * /dev/arp as the mux to plink the arp stream, and pushes arp on - * to /dev/udp[6] stream for use as the mux when plinking the IP - * stream. SIOCSLIFMUXID is not required. See ifconfig.c, arp.c - * and other comments in this routine for more details. - */ - mp2 = mp1->b_cont; /* This is added by ARP */ - - /* - * If I_{P}LINK/I_{P}UNLINK is issued by a utility other than - * ifconfig which didn't push ARP on top of the dummy mux, we won't - * get the special mblk above. For backward compatibility, we - * request ip_sioctl_plink_ipmod() to skip the consistency checks. - * The utility will use SIOCSLIFMUXID to store the muxids. This is - * not atomic, and can leave the streams unplumbable if the utility - * is interrupted before it does the SIOCSLIFMUXID. - */ - if (mp2 == NULL) { - err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_FALSE); - if (err == EINPROGRESS) - return; - goto done; - } - - /* - * This is an I_{P}LINK sent down by ifconfig through the ARP module; - * ARP has appended this last mblk to tell us whether the lower stream - * is an arp-dev stream or an IP module stream. - */ - ipmxp = (struct ipmx_s *)mp2->b_rptr; - if (ipmxp->ipmx_arpdev_stream) { - /* - * The lower stream is the arp-dev stream. - */ - ill = ill_lookup_on_name(ipmxp->ipmx_name, B_FALSE, B_FALSE, - q, mp, ip_sioctl_plink, &err, NULL, ipst); - if (ill == NULL) { - if (err == EINPROGRESS) - return; - err = EINVAL; - goto done; - } - - if (ipsq == NULL) { - ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, - NEW_OP, B_FALSE); - if (ipsq == NULL) { - ill_refrele(ill); - return; - } - entered_ipsq = B_TRUE; - } - ASSERT(IAM_WRITER_ILL(ill)); - ill_refrele(ill); - - /* - * To ensure consistency between IP and ARP, the following - * LIFO scheme is used in plink/punlink. (IP first, ARP last). - * This is because the muxid's are stored in the IP stream on - * the ill. - * - * I_{P}LINK: ifconfig plinks the IP stream before plinking - * the ARP stream. 
On an arp-dev stream, IP checks that it is - * not yet plinked, and it also checks that the corresponding - * IP stream is already plinked. - * - * I_{P}UNLINK: ifconfig punlinks the ARP stream before - * punlinking the IP stream. IP does not allow punlink of the - * IP stream unless the arp stream has been punlinked. - */ - if ((islink && - (ill->ill_arp_muxid != 0 || ill->ill_ip_muxid == 0)) || - (!islink && ill->ill_arp_muxid != li->l_index)) { - err = EINVAL; - goto done; - } - - if (IS_IPMP(ill) && - (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) - goto done; - - ill->ill_arp_muxid = islink ? li->l_index : 0; - } else { - /* - * The lower stream is probably an IP module stream. Do - * consistency checking. - */ - err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li, B_TRUE); - if (err == EINPROGRESS) - return; - } + err = ip_sioctl_plink_ipmod(ipsq, q, mp, ioccmd, li); + if (err == EINPROGRESS) + return; done: if (err == 0) miocack(q, mp, 0, 0); @@ -9826,21 +8532,19 @@ done: /* Conn was refheld in ip_sioctl_copyin_setup */ if (CONN_Q(q)) CONN_OPER_PENDING_DONE(Q_TO_CONN(q)); - if (entered_ipsq) - ipsq_exit(ipsq); } /* * Process I_{P}LINK and I_{P}UNLINK requests named by `ioccmd' and pointed to * by `mp' and `li' for the IP module stream (if li->q_bot is in fact an IP * module stream). If `doconsist' is set, then do the extended consistency - * checks requested by ifconfig(1M) and (atomically) set ill_ip_muxid here. + * checks requested by ifconfig(1M) and (atomically) set ill_muxid here. * Returns zero on success, EINPROGRESS if the operation is still pending, or * an error code on failure. 
*/ static int ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, - struct linkblk *li, boolean_t doconsist) + struct linkblk *li) { int err = 0; ill_t *ill; @@ -9849,6 +8553,8 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, struct qinit *qinfo; boolean_t islink = (ioccmd == I_PLINK || ioccmd == I_LINK); boolean_t entered_ipsq = B_FALSE; + boolean_t is_ip = B_FALSE; + arl_t *arl; /* * Walk the lower stream to verify it's the IP module stream. @@ -9861,6 +8567,11 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, name = qinfo->qi_minfo->mi_idname; if (name != NULL && strcmp(name, ip_mod_info.mi_idname) == 0 && qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { + is_ip = B_TRUE; + break; + } + if (name != NULL && strcmp(name, arp_mod_info.mi_idname) == 0 && + qinfo->qi_putp != (pfi_t)ip_lwput && ipwq->q_next != NULL) { break; } } @@ -9871,30 +8582,46 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, if (ipwq == NULL) return (0); - ill = ipwq->q_ptr; + if (!is_ip) { + arl = (arl_t *)ipwq->q_ptr; + ill = arl_to_ill(arl); + if (ill == NULL) + return (0); + } else { + ill = ipwq->q_ptr; + } ASSERT(ill != NULL); if (ipsq == NULL) { ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_sioctl_plink, NEW_OP, B_FALSE); - if (ipsq == NULL) + if (ipsq == NULL) { + if (!is_ip) + ill_refrele(ill); return (EINPROGRESS); + } entered_ipsq = B_TRUE; } ASSERT(IAM_WRITER_ILL(ill)); - - if (doconsist) { - /* - * Consistency checking requires that I_{P}LINK occurs - * prior to setting ill_ip_muxid, and that I_{P}UNLINK - * occurs prior to clearing ill_arp_muxid. - */ - if ((islink && ill->ill_ip_muxid != 0) || - (!islink && ill->ill_arp_muxid != 0)) { - err = EINVAL; - goto done; + mutex_enter(&ill->ill_lock); + if (!is_ip) { + if (islink && ill->ill_muxid == 0) { + /* + * Plumbing has to be done with IP plumbed first, arp + * second, but here we have arp being plumbed first. 
+ */ + mutex_exit(&ill->ill_lock); + ipsq_exit(ipsq); + ill_refrele(ill); + return (EINVAL); } } + mutex_exit(&ill->ill_lock); + if (!is_ip) { + arl->arl_muxid = islink ? li->l_index : 0; + ill_refrele(ill); + goto done; + } if (IS_IPMP(ill) && (err = ip_sioctl_plink_ipmp(ill, ioccmd)) != 0) goto done; @@ -9912,8 +8639,7 @@ ip_sioctl_plink_ipmod(ipsq_t *ipsq, queue_t *q, mblk_t *mp, int ioccmd, ill->ill_lmod_cnt++; } - if (doconsist) - ill->ill_ip_muxid = islink ? li->l_index : 0; + ill->ill_muxid = islink ? li->l_index : 0; /* * Mark the ipsq busy until the capability operations initiated below @@ -9997,11 +8723,11 @@ ip_sioctl_copyin_resume(ipsq_t *dummy_ipsq, queue_t *q, mblk_t *mp, } /* - * ip_sioctl_copyin_setup is called by ip_wput with any M_IOCTL message + * ip_sioctl_copyin_setup is called by ip_wput_nondata with any M_IOCTL message * that arrives. Most of the IOCTLs are "socket" IOCTLs which we handle * in either I_STR or TRANSPARENT form, using the mi_copy facility. * We establish here the size of the block to be copied in. mi_copyin - * arranges for this to happen, an processing continues in ip_wput with + * arranges for this to happen, an processing continues in ip_wput_nondata with * an M_IOCDATA message. */ void @@ -10054,17 +8780,7 @@ ip_sioctl_copyin_setup(queue_t *q, mblk_t *mp) * will fail all ioctls). */ if ((q->q_next != NULL) && !(ipip->ipi_flags & IPI_MODOK)) { - if (ipip->ipi_flags & IPI_PASS_DOWN) { - /* - * Pass common Streams ioctls which the IP - * module does not own or consume along to - * be processed down stream. - */ - putnext(q, mp); - return; - } else { - goto nak; - } + goto nak; } /* Make sure we have ioctl data to process. 
*/ @@ -10216,286 +8932,62 @@ nak: qreply(q, mp); } -/* ip_wput hands off ARP IOCTL responses to us */ -/* ARGSUSED3 */ -void -ip_sioctl_iocack(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +static void +ip_sioctl_garp_reply(mblk_t *mp, ill_t *ill, void *hwaddr, int flags) { struct arpreq *ar; struct xarpreq *xar; - area_t *area; - mblk_t *area_mp; + mblk_t *tmp; struct iocblk *iocp; - mblk_t *orig_ioc_mp, *tmp; - struct iocblk *orig_iocp; - ill_t *ill; - conn_t *connp = NULL; - mblk_t *pending_mp; - int x_arp_ioctl = B_FALSE, ifx_arp_ioctl = B_FALSE; + int x_arp_ioctl = B_FALSE; int *flagsp; char *storage = NULL; - sin_t *sin; - ipaddr_t addr; - int err; - ip_stack_t *ipst; - ASSERT(ipsq == NULL || IAM_WRITER_IPSQ(ipsq)); - ill = q->q_ptr; ASSERT(ill != NULL); - ipst = ill->ill_ipst; - - /* - * We should get back from ARP a packet chain that looks like: - * M_IOCACK-->ARP_op_MBLK-->ORIG_M_IOCTL-->MI_COPY_MBLK-->[X]ARPREQ_MBLK - */ - if (!(area_mp = mp->b_cont) || - (area_mp->b_wptr - area_mp->b_rptr) < sizeof (ip_sock_ar_t) || - !(orig_ioc_mp = area_mp->b_cont) || - !orig_ioc_mp->b_cont || !orig_ioc_mp->b_cont->b_cont) { - freemsg(mp); - return; - } - orig_iocp = (struct iocblk *)orig_ioc_mp->b_rptr; + iocp = (struct iocblk *)mp->b_rptr; + ASSERT(iocp->ioc_cmd == SIOCGXARP || iocp->ioc_cmd == SIOCGARP); - tmp = (orig_ioc_mp->b_cont)->b_cont; - if ((orig_iocp->ioc_cmd == SIOCGXARP) || - (orig_iocp->ioc_cmd == SIOCSXARP) || - (orig_iocp->ioc_cmd == SIOCDXARP)) { + tmp = (mp->b_cont)->b_cont; /* xarpreq/arpreq */ + if ((iocp->ioc_cmd == SIOCGXARP) || + (iocp->ioc_cmd == SIOCSXARP)) { x_arp_ioctl = B_TRUE; xar = (struct xarpreq *)tmp->b_rptr; - sin = (sin_t *)&xar->xarp_pa; flagsp = &xar->xarp_flags; storage = xar->xarp_ha.sdl_data; - if (xar->xarp_ha.sdl_nlen != 0) - ifx_arp_ioctl = B_TRUE; } else { ar = (struct arpreq *)tmp->b_rptr; - sin = (sin_t *)&ar->arp_pa; flagsp = &ar->arp_flags; storage = ar->arp_ha.sa_data; } - iocp = (struct iocblk *)mp->b_rptr; - 
- /* - * Find the pending message; if we're exclusive, it'll be on our IPSQ. - * Otherwise, we can find it from our ioc_id. - */ - if (ipsq != NULL) - pending_mp = ipsq_pending_mp_get(ipsq, &connp); - else - pending_mp = ill_pending_mp_get(ill, &connp, iocp->ioc_id); - - if (pending_mp == NULL) { - ASSERT(connp == NULL); - inet_freemsg(mp); - return; - } - ASSERT(connp != NULL); - q = CONNP_TO_WQ(connp); - - /* Uncouple the internally generated IOCTL from the original one */ - area = (area_t *)area_mp->b_rptr; - area_mp->b_cont = NULL; - - /* - * Restore the b_next and b_prev used by mi code. This is needed - * to complete the ioctl using mi* functions. We stored them in - * the pending mp prior to sending the request to ARP. - */ - orig_ioc_mp->b_cont->b_next = pending_mp->b_cont->b_next; - orig_ioc_mp->b_cont->b_prev = pending_mp->b_cont->b_prev; - inet_freemsg(pending_mp); - /* - * We're done if there was an error or if this is not an SIOCG{X}ARP - * Catch the case where there is an IRE_CACHE by no entry in the - * arp table. - */ - addr = sin->sin_addr.s_addr; - if (iocp->ioc_error && iocp->ioc_cmd == AR_ENTRY_SQUERY) { - ire_t *ire; - dl_unitdata_req_t *dlup; - mblk_t *llmp; - int addr_len; - ill_t *ipsqill = NULL; - - if (ifx_arp_ioctl) { - /* - * There's no need to lookup the ill, since - * we've already done that when we started - * processing the ioctl and sent the message - * to ARP on that ill. So use the ill that - * is stored in q->q_ptr. 
- */ - ipsqill = ill; - ire = ire_ctable_lookup(addr, 0, IRE_CACHE, - ipsqill->ill_ipif, ALL_ZONES, - NULL, MATCH_IRE_TYPE | MATCH_IRE_ILL, ipst); - } else { - ire = ire_ctable_lookup(addr, 0, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire != NULL) - ipsqill = ire_to_ill(ire); - } - - if ((x_arp_ioctl) && (ipsqill != NULL)) - storage += ill_xarp_info(&xar->xarp_ha, ipsqill); - - if (ire != NULL) { - /* - * Since the ire obtained from cachetable is used for - * mac addr copying below, treat an incomplete ire as if - * as if we never found it. - */ - if (ire->ire_nce != NULL && - ire->ire_nce->nce_state != ND_REACHABLE) { - ire_refrele(ire); - ire = NULL; - ipsqill = NULL; - goto errack; - } - *flagsp = ATF_INUSE; - llmp = (ire->ire_nce != NULL ? - ire->ire_nce->nce_res_mp : NULL); - if (llmp != NULL && ipsqill != NULL) { - uchar_t *macaddr; - - addr_len = ipsqill->ill_phys_addr_length; - if (x_arp_ioctl && ((addr_len + - ipsqill->ill_name_length) > - sizeof (xar->xarp_ha.sdl_data))) { - ire_refrele(ire); - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, - EINVAL, NO_COPYOUT, ipsq); - return; - } - *flagsp |= ATF_COM; - dlup = (dl_unitdata_req_t *)llmp->b_rptr; - if (ipsqill->ill_sap_length < 0) - macaddr = llmp->b_rptr + - dlup->dl_dest_addr_offset; - else - macaddr = llmp->b_rptr + - dlup->dl_dest_addr_offset + - ipsqill->ill_sap_length; - /* - * For SIOCGARP, MAC address length - * validation has already been done - * before the ioctl was issued to ARP to - * allow it to progress only on 6 byte - * addressable (ethernet like) media. Thus - * the mac address copying can not overwrite - * the sa_data area below. - */ - bcopy(macaddr, storage, addr_len); - } - /* Ditch the internal IOCTL. 
*/ - freemsg(mp); - ire_refrele(ire); - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); - return; - } - } - - /* - * If this was a failed AR_ENTRY_ADD or a successful AR_ENTRY_DELETE - * on the IPMP meta-interface, ensure any ARP entries added in - * ip_sioctl_arp() are deleted. - */ - if (IS_IPMP(ill) && - ((iocp->ioc_error != 0 && iocp->ioc_cmd == AR_ENTRY_ADD) || - ((iocp->ioc_error == 0 && iocp->ioc_cmd == AR_ENTRY_DELETE)))) { - ipmp_illgrp_t *illg = ill->ill_grp; - ipmp_arpent_t *entp; - - if ((entp = ipmp_illgrp_lookup_arpent(illg, &addr)) != NULL) - ipmp_illgrp_destroy_arpent(illg, entp); - } - - /* - * Delete the coresponding IRE_CACHE if any. - * Reset the error if there was one (in case there was no entry - * in arp.) - */ - if (iocp->ioc_cmd == AR_ENTRY_DELETE) { - ipif_t *ipintf = NULL; - - if (ifx_arp_ioctl) { - /* - * There's no need to lookup the ill, since - * we've already done that when we started - * processing the ioctl and sent the message - * to ARP on that ill. So use the ill that - * is stored in q->q_ptr. - */ - ipintf = ill->ill_ipif; - } - if (ip_ire_clookup_and_delete(addr, ipintf, ipst)) { - /* - * The address in "addr" may be an entry for a - * router. If that's true, then any off-net - * IRE_CACHE entries that go through the router - * with address "addr" must be clobbered. Use - * ire_walk to achieve this goal. - */ - if (ifx_arp_ioctl) - ire_walk_ill_v4(MATCH_IRE_ILL, 0, - ire_delete_cache_gw, (char *)&addr, ill); - else - ire_walk_v4(ire_delete_cache_gw, (char *)&addr, - ALL_ZONES, ipst); - iocp->ioc_error = 0; - } - } -errack: - if (iocp->ioc_error || iocp->ioc_cmd != AR_ENTRY_SQUERY) { - err = iocp->ioc_error; - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, err, NO_COPYOUT, ipsq); - return; - } - - /* - * Completion of an SIOCG{X}ARP. Translate the information from - * the area_t into the struct {x}arpreq. 
+ * We're done if this is not an SIOCG{X}ARP */ if (x_arp_ioctl) { storage += ill_xarp_info(&xar->xarp_ha, ill); if ((ill->ill_phys_addr_length + ill->ill_name_length) > sizeof (xar->xarp_ha.sdl_data)) { - freemsg(mp); - ip_ioctl_finish(q, orig_ioc_mp, EINVAL, NO_COPYOUT, - ipsq); + iocp->ioc_error = EINVAL; return; } } *flagsp = ATF_INUSE; - if (area->area_flags & ACE_F_PERMANENT) - *flagsp |= ATF_PERM; - if (area->area_flags & ACE_F_PUBLISH) - *flagsp |= ATF_PUBL; - if (area->area_flags & ACE_F_AUTHORITY) + /* + * If /sbin/arp told us we are the authority using the "permanent" + * flag, or if this is one of my addresses print "permanent" + * in the /sbin/arp output. + */ + if ((flags & NCE_F_MYADDR) || (flags & NCE_F_AUTHORITY)) *flagsp |= ATF_AUTHORITY; - if (area->area_hw_addr_length != 0) { + if (flags & NCE_F_NONUD) + *flagsp |= ATF_PERM; /* not subject to aging */ + if (flags & NCE_F_PUBLISH) + *flagsp |= ATF_PUBL; + if (hwaddr != NULL) { *flagsp |= ATF_COM; - /* - * For SIOCGARP, MAC address length validation has - * already been done before the ioctl was issued to ARP - * to allow it to progress only on 6 byte addressable - * (ethernet like) media. Thus the mac address copying - * can not overwrite the sa_data area below. - */ - bcopy((char *)area + area->area_hw_addr_offset, - storage, area->area_hw_addr_length); + bcopy((char *)hwaddr, storage, ill->ill_phys_addr_length); } - - /* Ditch the internal IOCTL. */ - freemsg(mp); - /* Complete the original. 
*/ - ip_ioctl_finish(q, orig_ioc_mp, 0, COPYOUT, ipsq); } /* @@ -10552,7 +9044,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, name = lifr->lifr_name; ASSERT(CONN_Q(q)); connp = Q_TO_CONN(q); - isv6 = connp->conn_af_isv6; + isv6 = (connp->conn_family == AF_INET6); zoneid = connp->conn_zoneid; namelen = mi_strlen(name); if (namelen == 0) @@ -10567,7 +9059,7 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, * for the last 4 args to ipif_lookup_name. */ ipif = ipif_lookup_on_name(lifr->lifr_name, namelen, B_TRUE, - &exists, isv6, zoneid, NULL, NULL, NULL, NULL, ipst); + &exists, isv6, zoneid, ipst); /* Prevent any further action */ if (ipif == NULL) { return (ENOBUFS); @@ -10605,12 +9097,11 @@ ip_sioctl_addif(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, break; } } - ill = ill_lookup_on_name(name, B_FALSE, isv6, - CONNP_TO_WQ(connp), mp, ip_process_ioctl, &err, NULL, ipst); + ill = ill_lookup_on_name(name, B_FALSE, isv6, NULL, ipst); if (found_sep) *cp = IPIF_SEPARATOR_CHAR; if (ill == NULL) - return (err); + return (ENXIO); } ipsq = ipsq_try_enter(NULL, ill, q, mp, ip_process_ioctl, NEW_OP, @@ -10687,7 +9178,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(q->q_next == NULL); ip1dbg(("ip_sioctl_remove_if(%s:%u %p)\n", - ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + ill->ill_name, ipif->ipif_id, (void *)ipif)); ASSERT(IAM_WRITER_IPIF(ipif)); connp = Q_TO_CONN(q); @@ -10703,7 +9194,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * same as any other interface (meaning it skips the code directly * below). 
*/ - if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { + if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { if (sin->sin_family == AF_UNSPEC && (IN6_IS_ADDR_UNSPECIFIED(&((sin6_t *)sin)->sin6_addr))) { /* @@ -10802,7 +9293,7 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); ipif_free_tail(ipif); /* frees ipif */ return (0); } @@ -10833,7 +9324,7 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, ip1dbg(("ip_sioctl_removeif_restart(%s:%u %p)\n", ill->ill_name, ipif->ipif_id, (void *)ipif)); - if (ipif->ipif_id == 0 && ipif->ipif_net_type == IRE_LOOPBACK) { + if (ipif->ipif_id == 0 && ill->ill_net_type == IRE_LOOPBACK) { ASSERT(ill->ill_state_flags & ILL_CONDEMNED); ill_delete_tail(ill); mi_free(ill); @@ -10841,10 +9332,9 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, } ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); ipif_free_tail(ipif); - ILL_UNMARK_CHANGING(ill); return (0); } @@ -10930,8 +9420,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * we have net and subnet bcast ire's for * the old address if we need them. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); /* * If the interface is already marked up, * we call ipif_down which will take care @@ -10941,7 +9429,7 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = 1; } @@ -10988,11 +9476,6 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ov6addr = ipif->ipif_v6lcl_addr; ipif->ipif_v6lcl_addr = v6addr; sctp_update_ipif_addr(ipif, ov6addr); - if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { - ipif->ipif_v6src_addr = ipv6_all_zeros; - } else { - ipif->ipif_v6src_addr = v6addr; - } ipif->ipif_addr_ready = 0; /* @@ -11050,12 +9533,22 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * ip_rput_dlpi when we see the DL_BIND_ACK. */ err = ipif_up(ipif, q, mp); + } else { + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); } if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_resolver_down(ipif); + + if (need_arp_down && !ill->ill_isv6) + (void) ipif_arp_down(ipif); + + /* + * The default multicast interface might have changed (for + * instance if the IPv6 scope of the address changed) + */ + ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); return (err); } @@ -11072,7 +9565,7 @@ ip_sioctl_addr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_addr_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); ASSERT(IAM_WRITER_IPIF(ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_addr_tail(ipif, sin, q, mp, B_TRUE)); } @@ -11162,7 +9655,7 @@ ip_sioctl_dstaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } /* @@ -11254,8 +9747,8 @@ 
ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (need_dl_down) ill_dl_down(ill); - if (need_arp_down) - ipif_resolver_down(ipif); + if (need_arp_down && !ipif->ipif_isv6) + (void) ipif_arp_down(ipif); return (err); } @@ -11271,7 +9764,7 @@ ip_sioctl_dstaddr_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ip1dbg(("ip_sioctl_dstaddr_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_dstaddr_tail(ipif, sin, q, mp, B_TRUE)); } @@ -11333,7 +9826,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, struct ifreq *ifr; struct lifreq *lifr; boolean_t set_linklocal = B_FALSE; - boolean_t zero_source = B_FALSE; ip1dbg(("ip_sioctl_flags(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11345,7 +9837,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); + flags = (uint64_t)(ifr->ifr_flags & 0x0000ffff); } else { lifr = (struct lifreq *)if_req; flags = lifr->lifr_flags; @@ -11425,10 +9917,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } /* - * Only allow the IFF_XRESOLV and IFF_TEMPORARY flags to be set on + * Only allow IFF_TEMPORARY flag to be set on * IPv6 interfaces. */ - if ((turn_on & (IFF_XRESOLV|IFF_TEMPORARY)) && !(ipif->ipif_isv6)) + if ((turn_on & IFF_TEMPORARY) && !(ipif->ipif_isv6)) return (EINVAL); /* @@ -11444,9 +9936,6 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if ((turn_on & IFF_ROUTER) && (phyi->phyint_flags & PHYI_LOOPBACK)) return (EINVAL); - if (flags & (IFF_NOLOCAL|IFF_ANYCAST)) - zero_source = B_TRUE; - /* * For IPv6 ipif_id 0, don't allow the interface to be up without * a link local address if IFF_NOLOCAL or IFF_ANYCAST are not set. 
@@ -11454,7 +9943,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * set later on in this function. */ if (ipif->ipif_id == 0 && ipif->ipif_isv6 && - (flags & IFF_UP) && !zero_source && + (flags & IFF_UP) && !(flags & (IFF_NOLOCAL|IFF_ANYCAST)) && IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { if (ipif_cant_setlinklocal(ipif)) return (EINVAL); @@ -11560,13 +10049,15 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill_ipif, RTSQ_DEFAULT); } } + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ill->ill_ipst, + ill->ill_isv6); + return (0); - } else if (set_linklocal || zero_source) { + } else if (set_linklocal) { mutex_enter(&ill->ill_lock); if (set_linklocal) ipif->ipif_state_flags |= IPIF_SET_LINKLOCAL; - if (zero_source) - ipif->ipif_state_flags |= IPIF_ZERO_SOURCE; mutex_exit(&ill->ill_lock); } @@ -11610,13 +10101,10 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ILLF_NONUD|IPIF_PRIVATE|IPIF_ANYCAST|IPIF_PREFERRED| IPIF_NOFAILOVER)) { /* - * Taking this ipif down, make sure we have - * valid net and subnet bcast ire's for other - * logical interfaces, if we need them. + * ipif_down() will ire_delete bcast ire's for the subnet, + * while the ire_identical_ref tracks the case of IRE_BROADCAST + * entries shared between multiple ipifs on the same subnet. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - if (((ipif->ipif_flags | turn_on) & IPIF_UP) && !(turn_off & IPIF_UP)) { if (ipif->ipif_flags & IPIF_UP) @@ -11627,7 +10115,7 @@ ip_sioctl_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ipif_down returns %d err ", err)); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } return (ip_sioctl_flags_tail(ipif, flags, q, mp)); } @@ -11642,7 +10130,6 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) boolean_t phyint_flags_modified = B_FALSE; int err = 0; boolean_t set_linklocal = B_FALSE; - boolean_t zero_source = B_FALSE; ip1dbg(("ip_sioctl_flags_tail(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); @@ -11680,21 +10167,13 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) set_linklocal = B_TRUE; ipif->ipif_state_flags &= ~IPIF_SET_LINKLOCAL; } - if (ipif->ipif_state_flags & IPIF_ZERO_SOURCE) { - zero_source = B_TRUE; - ipif->ipif_state_flags &= ~IPIF_ZERO_SOURCE; - } + mutex_exit(&ill->ill_lock); mutex_exit(&phyi->phyint_lock); if (set_linklocal) (void) ipif_setlinklocal(ipif); - if (zero_source) - ipif->ipif_v6src_addr = ipv6_all_zeros; - else - ipif->ipif_v6src_addr = ipif->ipif_v6lcl_addr; - /* * PHYI_FAILED, PHYI_INACTIVE, and PHYI_OFFLINE are all the same to * the kernel: if any of them has been set by userland, the interface @@ -11744,6 +10223,9 @@ ip_sioctl_flags_tail(ipif_t *ipif, uint64_t flags, queue_t *q, mblk_t *mp) */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); } + + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ill->ill_ipst, ill->ill_isv6); return (err); } @@ -11762,7 +10244,7 @@ ip_sioctl_flags_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_flags_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); if (ipip->ipi_cmd_type 
== IF_CMD) { /* cast to uint16_t prevents unwanted sign extension */ flags = (uint16_t)ifr->ifr_flags; @@ -11814,6 +10296,10 @@ ip_sioctl_get_flags(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (0); } +/* + * We allow the MTU to be set on an ILL, but not have it be different + * for different IPIFs since we don't actually send packets on IPIFs. + */ /* ARGSUSED */ int ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, @@ -11823,8 +10309,7 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, int ip_min_mtu; struct ifreq *ifr; struct lifreq *lifr; - ire_t *ire; - ip_stack_t *ipst; + ill_t *ill; ip1dbg(("ip_sioctl_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -11835,48 +10320,35 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lifr = (struct lifreq *)if_req; mtu = lifr->lifr_mtu; } + /* Only allow for logical unit zero i.e. not on "bge0:17" */ + if (ipif->ipif_id != 0) + return (EINVAL); + ill = ipif->ipif_ill; if (ipif->ipif_isv6) ip_min_mtu = IPV6_MIN_MTU; else ip_min_mtu = IP_MIN_MTU; - if (mtu > ipif->ipif_ill->ill_max_frag || mtu < ip_min_mtu) + mutex_enter(&ill->ill_lock); + if (mtu > ill->ill_max_frag || mtu < ip_min_mtu) { + mutex_exit(&ill->ill_lock); return (EINVAL); + } + /* + * The dce and fragmentation code can handle changes to ill_mtu + * concurrent with sending/fragmenting packets. + */ + ill->ill_mtu = mtu; + ill->ill_flags |= ILLF_FIXEDMTU; + mutex_exit(&ill->ill_lock); /* - * Change the MTU size in all relevant ire's. - * Mtu change Vs. new ire creation - protocol below. - * First change ipif_mtu and the ire_max_frag of the - * interface ire. Then do an ire walk and change the - * ire_max_frag of all affected ires. During ire_add - * under the bucket lock, set the ire_max_frag of the - * new ire being created from the ipif/ire from which - * it is being derived. If an mtu change happens after - * the ire is added, the new ire will be cleaned up. 
- * Conversely if the mtu change happens before the ire - * is added, ire_add will see the new value of the mtu. + * Make sure all dce_generation checks find out + * that ill_mtu has changed. */ - ipif->ipif_mtu = mtu; - ipif->ipif_flags |= IPIF_FIXEDMTU; + dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); - if (ipif->ipif_isv6) - ire = ipif_to_ire_v6(ipif); - else - ire = ipif_to_ire(ipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - ipst = ipif->ipif_ill->ill_ipst; - if (ipif->ipif_flags & IPIF_UP) { - if (ipif->ipif_isv6) - ire_walk_v6(ipif_mtu_change, (char *)ipif, ALL_ZONES, - ipst); - else - ire_walk_v4(ipif_mtu_change, (char *)ipif, ALL_ZONES, - ipst); - } /* Update the MTU in SCTP's list */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); return (0); @@ -11893,12 +10365,17 @@ ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_get_mtu(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + + /* + * We allow a get on any logical interface even though the set + * can only be done on logical unit 0. + */ if (ipip->ipi_cmd_type == IF_CMD) { ifr = (struct ifreq *)if_req; - ifr->ifr_metric = ipif->ipif_mtu; + ifr->ifr_metric = ipif->ipif_ill->ill_mtu; } else { lifr = (struct lifreq *)if_req; - lifr->lifr_mtu = ipif->ipif_mtu; + lifr->lifr_mtu = ipif->ipif_ill->ill_mtu; } return (0); } @@ -11911,9 +10388,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ipaddr_t addr; ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; - ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ipif->ipif_ill->ill_name, + ip1dbg(("ip_sioctl_brdaddr(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); @@ -11931,12 +10409,10 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * If we are already up, make sure the new * broadcast address makes sense. 
If it does, * there should be an IRE for it already. - * Don't match on ipif, only on the ill - * since we are sharing these now. */ - ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, - ipif, ALL_ZONES, NULL, - (MATCH_IRE_ILL | MATCH_IRE_TYPE), ipst); + ire = ire_ftable_lookup_v4(addr, 0, 0, IRE_BROADCAST, + ill, ipif->ipif_zoneid, NULL, + (MATCH_IRE_ILL | MATCH_IRE_TYPE), 0, ipst, NULL); if (ire == NULL) { return (EINVAL); } else { @@ -11944,13 +10420,13 @@ ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } } /* - * Changing the broadcast addr for this ipif. - * Make sure we have valid net and subnet bcast - * ire's for other logical interfaces, if needed. + * Changing the broadcast addr for this ipif. Since the IRE_BROADCAST + * needs to already exist we never need to change the set of + * IRE_BROADCASTs when we are UP. */ if (addr != ipif->ipif_brd_addr) - ipif_check_bcast_ires(ipif); - IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); + IN6_IPADDR_TO_V4MAPPED(addr, &ipif->ipif_v6brd_addr); + return (0); } @@ -12026,13 +10502,10 @@ ip_sioctl_netmask(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * Make sure we have valid net and subnet broadcast ire's * for the old netmask, if needed by other logical interfaces. 
*/ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); err = ip_sioctl_netmask_tail(ipif, sin, q, mp); return (err); } @@ -12087,7 +10560,7 @@ ip_sioctl_netmask_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, { ip1dbg(("ip_sioctl_netmask_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_netmask_tail(ipif, sin, q, mp)); } @@ -12188,6 +10661,7 @@ int ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { + int arp_muxid; ip1dbg(("ip_sioctl_muxid(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -12197,14 +10671,15 @@ ip_sioctl_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr = (struct ifreq *)if_req; - ipif->ipif_ill->ill_ip_muxid = ifr->ifr_ip_muxid; - ipif->ipif_ill->ill_arp_muxid = ifr->ifr_arp_muxid; + ipif->ipif_ill->ill_muxid = ifr->ifr_ip_muxid; + arp_muxid = ifr->ifr_arp_muxid; } else { struct lifreq *lifr = (struct lifreq *)if_req; - ipif->ipif_ill->ill_ip_muxid = lifr->lifr_ip_muxid; - ipif->ipif_ill->ill_arp_muxid = lifr->lifr_arp_muxid; + ipif->ipif_ill->ill_muxid = lifr->lifr_ip_muxid; + arp_muxid = lifr->lifr_arp_muxid; } + arl_set_muxid(ipif->ipif_ill, arp_muxid); return (0); } @@ -12213,22 +10688,24 @@ int ip_sioctl_get_muxid(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *if_req) { + int arp_muxid = 0; ip1dbg(("ip_sioctl_get_muxid(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); /* * Get the muxid saved in ill for I_PUNLINK. 
*/ + arp_muxid = arl_get_muxid(ipif->ipif_ill); if (ipip->ipi_cmd_type == IF_CMD) { struct ifreq *ifr = (struct ifreq *)if_req; - ifr->ifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; - ifr->ifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; + ifr->ifr_ip_muxid = ipif->ipif_ill->ill_muxid; + ifr->ifr_arp_muxid = arp_muxid; } else { struct lifreq *lifr = (struct lifreq *)if_req; - lifr->lifr_ip_muxid = ipif->ipif_ill->ill_ip_muxid; - lifr->lifr_arp_muxid = ipif->ipif_ill->ill_arp_muxid; + lifr->lifr_ip_muxid = ipif->ipif_ill->ill_muxid; + lifr->lifr_arp_muxid = arp_muxid; } return (0); } @@ -12298,7 +10775,7 @@ ip_sioctl_subnet(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } @@ -12353,7 +10830,7 @@ ip_sioctl_subnet_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip1dbg(("ip_sioctl_subnet_restart(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); addrlen = lifr->lifr_addrlen; if (ipif->ipif_isv6) { @@ -12454,7 +10931,7 @@ ip_sioctl_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } err = ip_sioctl_token_tail(ipif, sin6, addrlen, q, mp, need_up); @@ -12538,24 +11015,6 @@ ip_sioctl_get_token(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Set (hardware) link specific information that might override * what was acquired through the DL_INFO_ACK. - * The logic is as follows. - * - * become exclusive - * set CHANGING flag - * change mtu on affected IREs - * clear CHANGING flag - * - * An ire add that occurs before the CHANGING flag is set will have its mtu - * changed by the ip_sioctl_lnkinfo. 
- * - * During the time the CHANGING flag is set, no new ires will be added to the - * bucket, and ire add will fail (due the CHANGING flag). - * - * An ire add that occurs after the CHANGING flag is set will have the right mtu - * before it is added to the bucket. - * - * Obviously only 1 thread can set the CHANGING flag and we need to become - * exclusive to set the flag. */ /* ARGSUSED */ int @@ -12563,19 +11022,16 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipi, void *if_req) { ill_t *ill = ipif->ipif_ill; - ipif_t *nipif; int ip_min_mtu; - boolean_t mtu_walk = B_FALSE; struct lifreq *lifr = (struct lifreq *)if_req; lif_ifinfo_req_t *lir; - ire_t *ire; ip1dbg(("ip_sioctl_lnkinfo(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); lir = &lifr->lifr_ifinfo; ASSERT(IAM_WRITER_IPIF(ipif)); - /* Only allow for logical unit zero i.e. not on "le0:17" */ + /* Only allow for logical unit zero i.e. not on "bge0:17" */ if (ipif->ipif_id != 0) return (EINVAL); @@ -12588,9 +11044,20 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* * Verify values before we set anything. Allow zero to * mean unspecified. + * + * XXX We should be able to set the user-defined lir_mtu to some value + * that is greater than ill_current_frag but less than ill_max_frag- the + * ill_max_frag value tells us the max MTU that can be handled by the + * datalink, whereas the ill_current_frag is dynamically computed for + * some link-types like tunnels, based on the tunnel PMTU. However, + * since there is currently no way of distinguishing between + * administratively fixed link mtu values (e.g., those set via + * /sbin/dladm) and dynamically discovered MTUs (e.g., those discovered + * for tunnels) we conservatively choose the ill_current_frag as the + * upper-bound. 
*/ if (lir->lir_maxmtu != 0 && - (lir->lir_maxmtu > ill->ill_max_frag || + (lir->lir_maxmtu > ill->ill_current_frag || lir->lir_maxmtu < ip_min_mtu)) return (EINVAL); if (lir->lir_reachtime != 0 && @@ -12601,18 +11068,12 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EINVAL); mutex_enter(&ill->ill_lock); - ill->ill_state_flags |= ILL_CHANGING; - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - nipif->ipif_state_flags |= IPIF_CHANGING; - } - - if (lir->lir_maxmtu != 0) { - ill->ill_max_mtu = lir->lir_maxmtu; + /* + * The dce and fragmentation code can handle changes to ill_mtu + * concurrent with sending/fragmenting packets. + */ + if (lir->lir_maxmtu != 0) ill->ill_user_mtu = lir->lir_maxmtu; - mtu_walk = B_TRUE; - } - mutex_exit(&ill->ill_lock); if (lir->lir_reachtime != 0) ill->ill_reachable_time = lir->lir_reachtime; @@ -12621,47 +11082,29 @@ ip_sioctl_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ill->ill_reachable_retrans_time = lir->lir_reachretrans; ill->ill_max_hops = lir->lir_maxhops; - ill->ill_max_buf = ND_MAX_Q; - - if (mtu_walk) { + if (!(ill->ill_flags & ILLF_FIXEDMTU) && ill->ill_user_mtu != 0) { /* - * Set the MTU on all ipifs associated with this ill except - * for those whose MTU was fixed via SIOCSLIFMTU. + * ill_mtu is the actual interface MTU, obtained as the min + * of user-configured mtu and the value announced by the + * driver (via DL_NOTE_SDU_SIZE/DL_INFO_ACK). Note that since + * we have already made the choice of requiring + * ill_user_mtu < ill_current_frag by the time we get here, + * the ill_mtu effectively gets assigned to the ill_user_mtu + * here. 
*/ - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - if (nipif->ipif_flags & IPIF_FIXEDMTU) - continue; - - nipif->ipif_mtu = ill->ill_max_mtu; - - if (!(nipif->ipif_flags & IPIF_UP)) - continue; - - if (nipif->ipif_isv6) - ire = ipif_to_ire_v6(nipif); - else - ire = ipif_to_ire(nipif); - if (ire != NULL) { - ire->ire_max_frag = ipif->ipif_mtu; - ire_refrele(ire); - } - - ire_walk_ill(MATCH_IRE_ILL, 0, ipif_mtu_change, - nipif, ill); - } - } - - mutex_enter(&ill->ill_lock); - for (nipif = ill->ill_ipif; nipif != NULL; - nipif = nipif->ipif_next) { - nipif->ipif_state_flags &= ~IPIF_CHANGING; + ill->ill_mtu = MIN(ill->ill_current_frag, ill->ill_user_mtu); } - ILL_UNMARK_CHANGING(ill); mutex_exit(&ill->ill_lock); /* + * Make sure all dce_generation checks find out + * that ill_mtu has changed. + */ + if (!(ill->ill_flags & ILLF_FIXEDMTU) && (lir->lir_maxmtu != 0)) + dce_increment_all_generations(ill->ill_isv6, ill->ill_ipst); + + /* * Refresh IPMP meta-interface MTU if necessary. */ if (IS_UNDER_IPMP(ill)) @@ -12687,7 +11130,7 @@ ip_sioctl_get_lnkinfo(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, lir->lir_maxhops = ill->ill_max_hops; lir->lir_reachtime = ill->ill_reachable_time; lir->lir_reachretrans = ill->ill_reachable_retrans_time; - lir->lir_maxmtu = ill->ill_max_mtu; + lir->lir_maxmtu = ill->ill_mtu; return (0); } @@ -12722,7 +11165,7 @@ ip_subnet_mask(ipaddr_t addr, ipif_t **ipifp, ip_stack_t *ipst) mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (!(ipif->ipif_flags & IPIF_UP)) continue; @@ -12848,29 +11291,9 @@ done: } /* - * Lookup an ipif using the sequence id (ipif_seqid) + * Assign a unique id for the ipif. This is used by sctp_addr.c + * Note: remove if sctp_addr.c is redone to not shadow ill/ipif data structures. 
*/ -ipif_t * -ipif_lookup_seqid(ill_t *ill, uint_t seqid) -{ - ipif_t *ipif; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif->ipif_seqid == seqid && IPIF_CAN_LOOKUP(ipif)) - return (ipif); - } - return (NULL); -} - -/* - * Assign a unique id for the ipif. This is used later when we send - * IRES to ARP for resolution where we initialize ire_ipif_seqid - * to the value pointed by ire_ipif->ipif_seqid. Later when the - * IRE is added, we verify that ipif has not disappeared. - */ - static void ipif_assign_seqid(ipif_t *ipif) { @@ -12893,41 +11316,21 @@ ipif_clone(const ipif_t *sipif, ipif_t *dipif) ASSERT(!(sipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); ASSERT(!(dipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE))); ASSERT(sipif->ipif_ire_type == dipif->ipif_ire_type); - ASSERT(sipif->ipif_arp_del_mp == NULL); - ASSERT(dipif->ipif_arp_del_mp == NULL); - ASSERT(sipif->ipif_igmp_rpt == NULL); - ASSERT(dipif->ipif_igmp_rpt == NULL); - ASSERT(sipif->ipif_multicast_up == 0); - ASSERT(dipif->ipif_multicast_up == 0); - ASSERT(sipif->ipif_joined_allhosts == 0); - ASSERT(dipif->ipif_joined_allhosts == 0); - - dipif->ipif_mtu = sipif->ipif_mtu; + dipif->ipif_flags = sipif->ipif_flags; dipif->ipif_metric = sipif->ipif_metric; dipif->ipif_zoneid = sipif->ipif_zoneid; dipif->ipif_v6subnet = sipif->ipif_v6subnet; dipif->ipif_v6lcl_addr = sipif->ipif_v6lcl_addr; - dipif->ipif_v6src_addr = sipif->ipif_v6src_addr; dipif->ipif_v6net_mask = sipif->ipif_v6net_mask; dipif->ipif_v6brd_addr = sipif->ipif_v6brd_addr; dipif->ipif_v6pp_dst_addr = sipif->ipif_v6pp_dst_addr; /* - * While dipif is down right now, it might've been up before. Since - * it's changing identity, its packet counters need to be reset. 
- */ - dipif->ipif_ib_pkt_count = 0; - dipif->ipif_ob_pkt_count = 0; - dipif->ipif_fo_pkt_count = 0; - - /* * As per the comment atop the function, we assume that these sipif * fields will be changed before sipif is unlocked. */ dipif->ipif_seqid = sipif->ipif_seqid; - dipif->ipif_saved_ire_mp = sipif->ipif_saved_ire_mp; - dipif->ipif_saved_ire_cnt = sipif->ipif_saved_ire_cnt; dipif->ipif_state_flags = sipif->ipif_state_flags; } @@ -12951,13 +11354,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) * Grab all of the locks that protect the ipif in a defined order. */ GRAB_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); - if (sipif > dipif) { - mutex_enter(&sipif->ipif_saved_ire_lock); - mutex_enter(&dipif->ipif_saved_ire_lock); - } else { - mutex_enter(&dipif->ipif_saved_ire_lock); - mutex_enter(&sipif->ipif_saved_ire_lock); - } ipif_clone(sipif, dipif); if (virgipif != NULL) { @@ -12965,8 +11361,6 @@ ipif_transfer(ipif_t *sipif, ipif_t *dipif, ipif_t *virgipif) mi_free(virgipif); } - mutex_exit(&sipif->ipif_saved_ire_lock); - mutex_exit(&dipif->ipif_saved_ire_lock); RELEASE_ILL_LOCKS(sipif->ipif_ill, dipif->ipif_ill); /* @@ -13115,10 +11509,7 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, */ ipif->ipif_zoneid = ill->ill_zoneid; - mutex_init(&ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); - ipif->ipif_refcnt = 0; - ipif->ipif_saved_ire_cnt = 0; if (insert) { if (ipif_insert(ipif, ire_type != IRE_LOOPBACK) != 0) { @@ -13171,8 +11562,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6lcl_addr); IN6_IPADDR_TO_V4MAPPED(inaddr_any, - &ipif->ipif_v6src_addr); - IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6subnet); IN6_IPADDR_TO_V4MAPPED(inaddr_any, &ipif->ipif_v6net_mask); @@ -13189,8 +11578,6 @@ ipif_allocate(ill_t *ill, int id, uint_t ire_type, boolean_t initialize, if (!initialize) goto out; - ipif->ipif_mtu = ill->ill_max_mtu; - /* * NOTE: 
The IPMP meta-interface is special-cased because it starts * with no underlying interfaces (and thus an unknown broadcast @@ -13236,207 +11623,47 @@ out: } /* - * If appropriate, send a message up to the resolver delete the entry - * for the address of this interface which is going out of business. - * (Always called as writer). - * - * NOTE : We need to check for NULL mps as some of the fields are - * initialized only for some interface types. See ipif_resolver_up() - * for details. + * Remove the neighbor cache entries associated with this logical + * interface. */ -void -ipif_resolver_down(ipif_t *ipif) +int +ipif_arp_down(ipif_t *ipif) { - mblk_t *mp; ill_t *ill = ipif->ipif_ill; + int err = 0; - ip1dbg(("ipif_resolver_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) - return; - - /* Delete the mapping for the local address */ - mp = ipif->ipif_arp_del_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ipif->ipif_arp_del_mp = NULL; - } - - /* - * Make IPMP aware of the deleted data address. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_down", + ill_t *, ill, ipif_t *, ipif); + ipif_nce_down(ipif); /* * If this is the last ipif that is going down and there are no * duplicate addresses we may yet attempt to re-probe, then we need to * clean up ARP completely. */ - if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { + if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && + !ill->ill_logical_down && ill->ill_net_type == IRE_IF_RESOLVER) { /* * If this was the last ipif on an IPMP interface, purge any - * IPMP ARP entries associated with it. + * static ARP entries associated with it. 
*/ if (IS_IPMP(ill)) ipmp_illgrp_refresh_arpent(ill->ill_grp); - /* Send up AR_INTERFACE_DOWN message */ - mp = ill->ill_arp_down_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, - ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_down_mp = NULL; - } - - /* Tell ARP to delete the multicast mappings */ - mp = ill->ill_arp_del_mapping_mp; - if (mp != NULL) { - ip1dbg(("ipif_resolver_down: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, - ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_del_mapping_mp = NULL; - } + /* UNBIND, DETACH */ + err = arp_ll_down(ill); } -} - -/* - * Set up the multicast mappings for `ipif' in ARP. If `arp_add_mapping_mp' - * is non-NULL, then upon success it will contain an mblk that can be passed - * to ARP to create the mapping. Otherwise, if it's NULL, upon success ARP - * will have already been notified to create the mapping. Returns zero on - * success, -1 upon failure. - */ -int -ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) -{ - mblk_t *del_mp = NULL; - mblk_t *add_mp = NULL; - mblk_t *mp; - ill_t *ill = ipif->ipif_ill; - phyint_t *phyi = ill->ill_phyint; - ipaddr_t addr, mask, extract_mask = 0; - arma_t *arma; - uint8_t *maddr, *bphys_addr; - uint32_t hw_start; - dl_unitdata_req_t *dlur; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ipif->ipif_flags & IPIF_POINTOPOINT) - return (0); - - /* - * IPMP meta-interfaces don't have any inherent multicast mappings, - * and instead use the ones on the underlying interfaces. - */ - if (IS_IPMP(ill)) - return (0); - - /* - * Delete the existing mapping from ARP. Normally, ipif_down() -> - * ipif_resolver_down() will send this up to ARP, but it may be that - * we are enabling PHYI_MULTI_BCAST via ip_rput_dlpi_writer(). 
- */ - mp = ill->ill_arp_del_mapping_mp; - if (mp != NULL) { - ip1dbg(("ipif_arp_setup_multicast: arp cmd %x for %s:%u\n", - *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, mp); - ill->ill_arp_del_mapping_mp = NULL; - } - - if (arp_add_mapping_mp != NULL) - *arp_add_mapping_mp = NULL; - - /* - * Check that the address is not to long for the constant - * length reserved in the template arma_t. - */ - if (ill->ill_phys_addr_length > IP_MAX_HW_LEN) - return (-1); - - /* Add mapping mblk */ - addr = (ipaddr_t)htonl(INADDR_UNSPEC_GROUP); - mask = (ipaddr_t)htonl(IN_CLASSD_NET); - add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_arma_multi_template, - (caddr_t)&addr); - if (add_mp == NULL) - return (-1); - arma = (arma_t *)add_mp->b_rptr; - maddr = (uint8_t *)arma + arma->arma_hw_addr_offset; - bcopy(&mask, (char *)arma + arma->arma_proto_mask_offset, IP_ADDR_LEN); - arma->arma_hw_addr_length = ill->ill_phys_addr_length; - /* - * Determine the broadcast address. - */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - if (ill->ill_sap_length < 0) - bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; - else - bphys_addr = (uchar_t *)dlur + - dlur->dl_dest_addr_offset + ill->ill_sap_length; - /* - * Check PHYI_MULTI_BCAST and length of physical - * address to determine if we use the mapping or the - * broadcast address. - */ - if (!(phyi->phyint_flags & PHYI_MULTI_BCAST)) - if (!MEDIA_V4MINFO(ill->ill_media, ill->ill_phys_addr_length, - bphys_addr, maddr, &hw_start, &extract_mask)) - phyi->phyint_flags |= PHYI_MULTI_BCAST; - - if ((phyi->phyint_flags & PHYI_MULTI_BCAST) || - (ill->ill_flags & ILLF_MULTICAST)) { - /* Make sure this will not match the "exact" entry. 
*/ - addr = (ipaddr_t)htonl(INADDR_ALLHOSTS_GROUP); - del_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ared_template, - (caddr_t)&addr); - if (del_mp == NULL) { - freemsg(add_mp); - return (-1); - } - bcopy(&extract_mask, (char *)arma + - arma->arma_proto_extract_mask_offset, IP_ADDR_LEN); - if (phyi->phyint_flags & PHYI_MULTI_BCAST) { - /* Use link-layer broadcast address for MULTI_BCAST */ - bcopy(bphys_addr, maddr, ill->ill_phys_addr_length); - ip2dbg(("ipif_arp_setup_multicast: adding" - " MULTI_BCAST ARP setup for %s\n", ill->ill_name)); - } else { - arma->arma_hw_mapping_start = hw_start; - ip2dbg(("ipif_arp_setup_multicast: adding multicast" - " ARP setup for %s\n", ill->ill_name)); - } - } else { - freemsg(add_mp); - ASSERT(del_mp == NULL); - /* It is neither MULTICAST nor MULTI_BCAST */ - return (0); - } - ASSERT(add_mp != NULL && del_mp != NULL); - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = del_mp; - if (arp_add_mapping_mp != NULL) { - /* The caller just wants the mblks allocated */ - *arp_add_mapping_mp = add_mp; - } else { - /* The caller wants us to send it to arp */ - putnext(ill->ill_rq, add_mp); - } - return (0); + return (err); } /* * Get the resolver set up for a new IP address. (Always called as writer.) - * Called both for IPv4 and IPv6 interfaces, though it only sets up the - * resolver for v6 if it's an ILLF_XRESOLV interface. Honors ILLF_NOARP. + * Called both for IPv4 and IPv6 interfaces, though it only does some + * basic DAD related initialization for IPv6. Honors ILLF_NOARP. 
* * The enumerated value res_act tunes the behavior: * * Res_act_initial: set up all the resolver structures for a new @@ -13451,17 +11678,9 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) int ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { - mblk_t *arp_up_mp = NULL; - mblk_t *arp_down_mp = NULL; - mblk_t *arp_add_mp = NULL; - mblk_t *arp_del_mp = NULL; - mblk_t *arp_add_mapping_mp = NULL; - mblk_t *arp_del_mapping_mp = NULL; - ill_t *ill = ipif->ipif_ill; - int err = ENOMEM; - boolean_t added_ipif = B_FALSE; - boolean_t publish; - boolean_t was_dup; + ill_t *ill = ipif->ipif_ill; + int err; + boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); @@ -13490,231 +11709,55 @@ ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) return (0); } /* NDP will set the ipif_addr_ready flag when it's ready */ - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + if (ill->ill_isv6) return (0); - if (ill->ill_isv6) { - /* - * External resolver for IPv6 - */ - ASSERT(res_act == Res_act_initial); - publish = !IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr); - } else { - /* - * IPv4 arp case. If the ARP stream has already started - * closing, fail this request for ARP bringup. Else - * record the fact that an ARP bringup is pending. - */ - mutex_enter(&ill->ill_lock); - if (ill->ill_arp_closing) { - mutex_exit(&ill->ill_lock); - err = EINVAL; - goto failed; - } else { - if (ill->ill_ipif_up_count == 0 && - ill->ill_ipif_dup_count == 0 && !was_dup) - ill->ill_arp_bringup_pending = 1; - mutex_exit(&ill->ill_lock); - } - publish = (ipif->ipif_lcl_addr != INADDR_ANY); - } - - if (IS_IPMP(ill) && publish) { - /* - * If we're here via ipif_up(), then the ipif won't be bound - * yet -- add it to the group, which will bind it if possible. - * (We would add it in ipif_up(), but deleting on failure - * there is gruesome.) 
If we're here via ipmp_ill_bind_ipif(), - * then the ipif has already been added to the group and we - * just need to use the binding. - */ - if (ipmp_ipif_bound_ill(ipif) == NULL) { - if (ipmp_illgrp_add_ipif(ill->ill_grp, ipif) == NULL) { - /* - * We couldn't bind the ipif to an ill yet, - * so we have nothing to publish. - */ - publish = B_FALSE; - } - added_ipif = B_TRUE; - } - } - - /* - * Add an entry for the local address in ARP only if it - * is not UNNUMBERED and it is suitable for publishing. - */ - if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && publish) { - if (res_act == Res_act_defend) { - arp_add_mp = ipif_area_alloc(ipif, ACE_F_DEFEND); - if (arp_add_mp == NULL) - goto failed; - /* - * If we're just defending our address now, then - * there's no need to set up ARP multicast mappings. - * The publish command is enough. - */ - goto done; - } - - /* - * Allocate an ARP add message and an ARP delete message (the - * latter is saved for use when the address goes down). - */ - if ((arp_add_mp = ipif_area_alloc(ipif, 0)) == NULL) - goto failed; - - if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) - goto failed; - - if (res_act != Res_act_initial) - goto arp_setup_multicast; - } else { - if (res_act != Res_act_initial) - goto done; - } - /* - * Need to bring up ARP or setup multicast mapping only - * when the first interface is coming UP. - */ - if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0 || was_dup) - goto done; - - /* - * Allocate an ARP down message (to be saved) and an ARP up message. - */ - arp_down_mp = ill_arp_alloc(ill, (uchar_t *)&ip_ard_template, 0); - if (arp_down_mp == NULL) - goto failed; - - arp_up_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aru_template, 0); - if (arp_up_mp == NULL) - goto failed; - - if (ipif->ipif_flags & IPIF_POINTOPOINT) - goto done; - -arp_setup_multicast: - /* - * Setup the multicast mappings. This function initializes - * ill_arp_del_mapping_mp also. 
This does not need to be done for - * IPv6, or for the IPMP interface (since it has no link-layer). - */ - if (!ill->ill_isv6 && !IS_IPMP(ill)) { - err = ipif_arp_setup_multicast(ipif, &arp_add_mapping_mp); - if (err != 0) - goto failed; - ASSERT(ill->ill_arp_del_mapping_mp != NULL); - ASSERT(arp_add_mapping_mp != NULL); - } -done: - if (arp_up_mp != NULL) { - ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, arp_up_mp); - arp_up_mp = NULL; - } - if (arp_add_mp != NULL) { - ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - /* - * If it's an extended ARP implementation, then we'll wait to - * hear that DAD has finished before using the interface. - */ - if (!ill->ill_arp_extend) - ipif->ipif_addr_ready = 1; - putnext(ill->ill_rq, arp_add_mp); - arp_add_mp = NULL; - } else { - ipif->ipif_addr_ready = 1; - } - if (arp_add_mapping_mp != NULL) { - ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", - ill->ill_name, ipif->ipif_id)); - putnext(ill->ill_rq, arp_add_mapping_mp); - arp_add_mapping_mp = NULL; - } - - if (res_act == Res_act_initial) { - if (ill->ill_flags & ILLF_NOARP) - err = ill_arp_off(ill); - else - err = ill_arp_on(ill); - if (err != 0) { - ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", - err)); - goto failed; - } - } - - if (arp_del_mp != NULL) { - ASSERT(ipif->ipif_arp_del_mp == NULL); - ipif->ipif_arp_del_mp = arp_del_mp; - } - if (arp_down_mp != NULL) { - ASSERT(ill->ill_arp_down_mp == NULL); - ill->ill_arp_down_mp = arp_down_mp; - } - if (arp_del_mapping_mp != NULL) { - ASSERT(ill->ill_arp_del_mapping_mp == NULL); - ill->ill_arp_del_mapping_mp = arp_del_mapping_mp; - } - - return ((ill->ill_ipif_up_count != 0 || was_dup || - ill->ill_ipif_dup_count != 0) ? 
0 : EINPROGRESS); -failed: - ip1dbg(("ipif_resolver_up: FAILED\n")); - if (added_ipif) - ipmp_illgrp_del_ipif(ill->ill_grp, ipif); - freemsg(arp_add_mp); - freemsg(arp_del_mp); - freemsg(arp_add_mapping_mp); - freemsg(arp_up_mp); - freemsg(arp_down_mp); - ill->ill_arp_bringup_pending = 0; + err = ipif_arp_up(ipif, res_act, was_dup); return (err); } /* - * This routine restarts IPv4 duplicate address detection (DAD) when a link has - * just gone back up. + * This routine restarts IPv4/IPv6 duplicate address detection (DAD) + * when a link has just gone back up. */ static void -ipif_arp_start_dad(ipif_t *ipif) +ipif_nce_start_dad(ipif_t *ipif) { + ncec_t *ncec; ill_t *ill = ipif->ipif_ill; - mblk_t *arp_add_mp; + boolean_t isv6 = ill->ill_isv6; - /* ACE_F_UNVERIFIED restarts DAD */ - if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || - (ipif->ipif_flags & IPIF_UNNUMBERED) || - ipif->ipif_lcl_addr == INADDR_ANY || - (arp_add_mp = ipif_area_alloc(ipif, ACE_F_UNVERIFIED)) == NULL) { - /* - * If we can't contact ARP for some reason, that's not really a - * problem. Just send out the routing socket notification that - * DAD completion would have done, and continue. - */ - ipif_mask_reply(ipif); - ipif_up_notify(ipif); - ipif->ipif_addr_ready = 1; - return; - } + if (isv6) { + ncec = ncec_lookup_illgrp_v6(ipif->ipif_ill, + &ipif->ipif_v6lcl_addr); + } else { + ipaddr_t v4addr; - putnext(ill->ill_rq, arp_add_mp); -} + if (ill->ill_net_type != IRE_IF_RESOLVER || + (ipif->ipif_flags & IPIF_UNNUMBERED) || + ipif->ipif_lcl_addr == INADDR_ANY) { + /* + * If we can't contact ARP for some reason, + * that's not really a problem. Just send + * out the routing socket notification that + * DAD completion would have done, and continue. 
+ */ + ipif_mask_reply(ipif); + ipif_up_notify(ipif); + ipif->ipif_addr_ready = 1; + return; + } -static void -ipif_ndp_start_dad(ipif_t *ipif) -{ - nce_t *nce; + IN6_V4MAPPED_TO_IPADDR(&ipif->ipif_v6lcl_addr, v4addr); + ncec = ncec_lookup_illgrp_v4(ipif->ipif_ill, &v4addr); + } - nce = ndp_lookup_v6(ipif->ipif_ill, B_TRUE, &ipif->ipif_v6lcl_addr, - B_FALSE); - if (nce == NULL) + if (ncec == NULL) { + ip1dbg(("couldn't find ncec for ipif %p leaving !ready\n", + (void *)ipif)); return; - - if (!ndp_restart_dad(nce)) { + } + if (!nce_restart_dad(ncec)) { /* * If we can't restart DAD for some reason, that's not really a * problem. Just send out the routing socket notification that @@ -13723,7 +11766,7 @@ ipif_ndp_start_dad(ipif_t *ipif) ipif_up_notify(ipif); ipif->ipif_addr_ready = 1; } - NCE_REFRELE(nce); + ncec_refrele(ncec); } /* @@ -13749,30 +11792,21 @@ ill_restart_dad(ill_t *ill, boolean_t went_up) * If layer two doesn't support duplicate address detection, then just * send the routing socket message now and be done with it. */ - if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || - (!ill->ill_isv6 && !ill->ill_arp_extend)) { + if (!ill->ill_isv6 && arp_no_defense) { ip_rts_ifmsg(ill->ill_ipif, RTSQ_DEFAULT); return; } for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (went_up) { + if (ipif->ipif_flags & IPIF_UP) { - if (ill->ill_isv6) - ipif_ndp_start_dad(ipif); - else - ipif_arp_start_dad(ipif); - } else if (ill->ill_isv6 && - (ipif->ipif_flags & IPIF_DUPLICATE)) { + ipif_nce_start_dad(ipif); + } else if (ipif->ipif_flags & IPIF_DUPLICATE) { /* - * For IPv4, the ARP module itself will - * automatically start the DAD process when it - * sees DL_NOTE_LINK_UP. We respond to the - * AR_CN_READY at the completion of that task. - * For IPv6, we must kick off the bring-up - * process now. + * kick off the bring-up process now. 
*/ - ndp_do_recovery(ipif); + ipif_do_recovery(ipif); } else { /* * Unfortunately, the first ipif is "special" @@ -13822,7 +11856,7 @@ ipsq_delete(ipsq_t *ipsq) static int ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) { - int err; + int err = 0; ipif_t *ipif; if (ill == NULL) @@ -13841,9 +11875,6 @@ ill_up_ipifs_on_ill(ill_t *ill, queue_t *q, mblk_t *mp) } } } - mutex_enter(&ill->ill_lock); - ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&ill->ill_lock); ill->ill_up_ipifs = B_FALSE; return (0); } @@ -13859,6 +11890,15 @@ ill_up_ipifs(ill_t *ill, queue_t *q, mblk_t *mp) ASSERT(IAM_WRITER_ILL(ill)); + if (ill->ill_replumbing) { + ill->ill_replumbing = 0; + /* + * Send down REPLUMB_DONE notification followed by the + * BIND_REQ on the arp stream. + */ + if (!ill->ill_isv6) + arp_send_replumb_conf(ill); + } err = ill_up_ipifs_on_ill(ill->ill_phyint->phyint_illv4, q, mp); if (err != 0) return (err); @@ -13887,16 +11927,10 @@ ill_down_ipifs(ill_t *ill, boolean_t logical) if (ipif->ipif_flags & IPIF_UP) ipif->ipif_was_up = B_TRUE; - /* - * Need to re-create net/subnet bcast ires if - * they are dependent on ipif. - */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); if (logical) { (void) ipif_logical_down(ipif, NULL, NULL); ipif_non_duplicate(ipif); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); } else { (void) ipif_down(ipif, NULL, NULL); } @@ -13904,29 +11938,18 @@ ill_down_ipifs(ill_t *ill, boolean_t logical) } /* - * Redo source address selection. This is called when a - * non-NOLOCAL/DEPRECATED/ANYCAST ipif comes up. + * Redo source address selection. This makes IXAF_VERIFY_SOURCE take + * a look again at valid source addresses. + * This should be called each time after the set of source addresses has been + * changed. 
*/ void -ill_update_source_selection(ill_t *ill) +ip_update_source_selection(ip_stack_t *ipst) { - ipif_t *ipif; - - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * Underlying interfaces are only used for test traffic and thus - * should always send with their (deprecated) source addresses. - */ - if (IS_UNDER_IPMP(ill)) - return; - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ill->ill_isv6) - ipif_recreate_interface_routes_v6(NULL, ipif); - else - ipif_recreate_interface_routes(NULL, ipif); - } + /* We skip past SRC_GENERATION_VERIFY */ + if (atomic_add_32_nv(&ipst->ips_src_generation, 1) == + SRC_GENERATION_VERIFY) + atomic_add_32(&ipst->ips_src_generation, 1); } /* @@ -14154,6 +12177,8 @@ ip_sioctl_groupinfo(ipif_t *dummy_ipif, sin_t *sin, queue_t *q, mblk_t *mp, static void ill_dl_down(ill_t *ill) { + DTRACE_PROBE2(ill__downup, char *, "ill_dl_down", ill_t *, ill); + /* * The ill is down; unbind but stay attached since we're still * associated with a PPA. If we have negotiated DLPI capabilites @@ -14167,6 +12192,13 @@ ill_dl_down(ill_t *ill) ip1dbg(("ill_dl_down(%s)\n", ill->ill_name)); + if (!ill->ill_replumbing) { + /* Free all ilms for this ill */ + update_conn_ill(ill, ill->ill_ipst); + } else { + ill_leave_multicast(ill); + } + ill->ill_unbind_mp = NULL; if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", @@ -14191,23 +12223,13 @@ ill_dl_down(ill_t *ill) ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); } - - /* - * Toss all of our multicast memberships. We could keep them, but - * then we'd have to do bookkeeping of any joins and leaves performed - * by the application while the the interface is down (we can't just - * issue them because arp cannot currently process AR_ENTRY_SQUERY's - * on a downed interface). 
- */ - ill_leave_multicast(ill); - mutex_enter(&ill->ill_lock); ill->ill_dl_up = 0; ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); mutex_exit(&ill->ill_lock); } -static void +void ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) { union DL_primitives *dlp; @@ -14249,6 +12271,8 @@ ill_dlpi_dispatch(ill_t *ill, mblk_t *mp) } mutex_exit(&ill->ill_lock); + DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_dispatch", + char *, dl_primstr(prim), ill_t *, ill); putnext(ill->ill_wq, mp); /* @@ -14301,8 +12325,9 @@ ill_dlpi_send(ill_t *ill, mblk_t *mp) while (*mpp != NULL) mpp = &((*mpp)->b_next); - ip1dbg(("ill_dlpi_send: deferring request for %s\n", - ill->ill_name)); + ip1dbg(("ill_dlpi_send: deferring request for %s " + "while %s pending\n", ill->ill_name, + dl_primstr(ill->ill_dlpi_pending))); *mpp = mp; mutex_exit(&ill->ill_lock); @@ -14437,51 +12462,237 @@ ill_dlpi_done(ill_t *ill, t_uscalar_t prim) ill_dlpi_dispatch(ill, mp); } +/* + * Queue a (multicast) DLPI control message to be sent to the driver by + * later calling ill_dlpi_send_queued. + * We queue them while holding a lock (ill_mcast_lock) to ensure that they + * are sent in order i.e., prevent a DL_DISABMULTI_REQ and DL_ENABMULTI_REQ + * for the same group to race. + * We send DLPI control messages in order using ill_lock. + * For IPMP we should be called on the cast_ill. + */ void -conn_delete_ire(conn_t *connp, caddr_t arg) +ill_dlpi_queue(ill_t *ill, mblk_t *mp) { - ipif_t *ipif = (ipif_t *)arg; - ire_t *ire; + mblk_t **mpp; - /* - * Look at the cached ires on conns which has pointers to ipifs. - * We just call ire_refrele which clears up the reference - * to ire. Called when a conn closes. Also called from ipif_free - * to cleanup indirect references to the stale ipif via the cached ire. 
- */ - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && (ipif == NULL || ire->ire_ipif == ipif)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - return; + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + + mutex_enter(&ill->ill_lock); + /* Must queue message. Tail insertion */ + mpp = &ill->ill_dlpi_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&ill->ill_lock); +} + +/* + * Send the messages that were queued. Make sure there is only + * one outstanding message. ip_rput_dlpi_writer calls ill_dlpi_done() + * when an ACK or a NAK is received to process the next queued message. + * For IPMP we are called on the upper ill, but when send what is queued + * on the cast_ill. + */ +void +ill_dlpi_send_queued(ill_t *ill) +{ + mblk_t *mp; + union DL_primitives *dlp; + t_uscalar_t prim; + ill_t *release_ill = NULL; + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* Avoid ever sending anything down to the ipmpstub */ + return; + } + ill = release_ill; } - mutex_exit(&connp->conn_lock); + mutex_enter(&ill->ill_lock); + while ((mp = ill->ill_dlpi_deferred) != NULL) { + if (ill->ill_dlpi_pending != DL_PRIM_INVAL) { + /* Can't send. Somebody else will send it */ + mutex_exit(&ill->ill_lock); + goto done; + } + ill->ill_dlpi_deferred = mp->b_next; + mp->b_next = NULL; + if (!ill->ill_dl_up) { + /* + * Nobody there. All multicast addresses will be + * re-joined when we get the DL_BIND_ACK bringing the + * interface up. 
+ */ + freemsg(mp); + continue; + } + dlp = (union DL_primitives *)mp->b_rptr; + prim = dlp->dl_primitive; + + if (!(ill->ill_state_flags & ILL_CONDEMNED) || + (prim == DL_UNBIND_REQ)) { + ill->ill_dlpi_pending = prim; + } + mutex_exit(&ill->ill_lock); + DTRACE_PROBE3(ill__dlpi, char *, "ill_dlpi_send_queued", + char *, dl_primstr(prim), ill_t *, ill); + putnext(ill->ill_wq, mp); + mutex_enter(&ill->ill_lock); + } + mutex_exit(&ill->ill_lock); +done: + if (release_ill != NULL) + ill_refrele(release_ill); } /* - * Some operations (e.g., ipif_down()) conditionally delete a number - * of IREs. Those IREs may have been previously cached in the conn structure. - * This ipcl_walk() walker function releases all references to such IREs based - * on the condemned flag. + * Queue an IP (IGMP/MLD) message to be sent by IP from + * ill_mcast_send_queued + * We queue them while holding a lock (ill_mcast_lock) to ensure that they + * are sent in order i.e., prevent a IGMP leave and IGMP join for the same + * group to race. + * We send them in order using ill_lock. + * For IPMP we are called on the upper ill, but we queue on the cast_ill. */ -/* ARGSUSED */ void -conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) +ill_mcast_queue(ill_t *ill, mblk_t *mp) { - ire_t *ire; + mblk_t **mpp; + ill_t *release_ill = NULL; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && (ire->ire_marks & IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - return; + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. 
*/ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* Discard instead of queuing for the ipmp interface */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no cast_ill", + mp, ill); + freemsg(mp); + return; + } + ill = release_ill; } - mutex_exit(&connp->conn_lock); + + mutex_enter(&ill->ill_lock); + /* Must queue message. Tail insertion */ + mpp = &ill->ill_mcast_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&ill->ill_lock); + if (release_ill != NULL) + ill_refrele(release_ill); +} + +/* + * Send the IP packets that were queued by ill_mcast_queue. + * These are IGMP/MLD packets. + * + * For IPMP we are called on the upper ill, but when send what is queued + * on the cast_ill. + * + * Request loopback of the report if we are acting as a multicast + * router, so that the process-level routing demon can hear it. + * This will run multiple times for the same group if there are members + * on the same group for multiple ipif's on the same ill. The + * igmp_input/mld_input code will suppress this due to the loopback thus we + * always loopback membership report. + * + * We also need to make sure that this does not get load balanced + * by IPMP. We do this by passing an ill to ip_output_simple. + */ +void +ill_mcast_send_queued(ill_t *ill) +{ + mblk_t *mp; + ip_xmit_attr_t ixas; + ill_t *release_ill = NULL; + + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * We should have no messages on the ipmp interface + * but no point in trying to send them. + */ + return; + } + ill = release_ill; + } + bzero(&ixas, sizeof (ixas)); + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + /* + * Here we set ixa_ifindex. 
If IPMP it will be the lower ill which + * makes ip_select_route pick the IRE_MULTICAST for the cast_ill. + * That is necessary to handle IGMP/MLD snooping switches. + */ + ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; + ixas.ixa_ipst = ill->ill_ipst; + + mutex_enter(&ill->ill_lock); + while ((mp = ill->ill_mcast_deferred) != NULL) { + ill->ill_mcast_deferred = mp->b_next; + mp->b_next = NULL; + if (!ill->ill_dl_up) { + /* + * Nobody there. Just drop the ip packets. + * IGMP/MLD will resend later, if this is a replumb. + */ + freemsg(mp); + continue; + } + mutex_enter(&ill->ill_phyint->phyint_lock); + if (IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { + /* + * When the ill is getting deactivated, we only want to + * send the DLPI messages, so drop IGMP/MLD packets. + * DLPI messages are handled by ill_dlpi_send_queued() + */ + mutex_exit(&ill->ill_phyint->phyint_lock); + freemsg(mp); + continue; + } + mutex_exit(&ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_lock); + + /* Check whether we are sending IPv4 or IPv6. */ + if (ill->ill_isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ixas.ixa_multicast_ttl = ip6h->ip6_hops; + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixas.ixa_multicast_ttl = ipha->ipha_ttl; + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_flags &= ~IXAF_SET_ULP_CKSUM; + } + + ixas.ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_SOURCE; + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + + mutex_enter(&ill->ill_lock); + } + mutex_exit(&ill->ill_lock); + +done: + if (release_ill != NULL) + ill_refrele(release_ill); } /* @@ -14494,7 +12705,7 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * that both Solaris and 4.3 BSD have exhibited this behaviour for a long * time. We go thru the cleanup in order to remove these routes. * b. The bringup of the interface could fail in ill_dl_up i.e. we get - * DL_ERROR_ACK in response to the the DL_BIND request. 
The interface is + * DL_ERROR_ACK in response to the DL_BIND request. The interface is * down, but we need to cleanup i.e. do ill_dl_down and * ip_rput_dlpi_writer (DL_ERROR_ACK) -> ipif_down. * @@ -14504,12 +12715,11 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * * The following members in ipif_t track references to the ipif. * int ipif_refcnt; Active reference count - * uint_t ipif_ire_cnt; Number of ire's referencing this ipif - * uint_t ipif_ilm_cnt; Number of ilms's references this ipif. * * The following members in ill_t track references to the ill. * int ill_refcnt; active refcnt * uint_t ill_ire_cnt; Number of ires referencing ill + * uint_t ill_ncec_cnt; Number of ncecs referencing ill * uint_t ill_nce_cnt; Number of nces referencing ill * uint_t ill_ilm_cnt; Number of ilms referencing ill * @@ -14525,21 +12735,25 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * references to the ipif / ill. Pointers from other structures do not * count towards this reference count. * - * ipif_ire_cnt/ill_ire_cnt is the number of ire's - * associated with the ipif/ill. This is incremented whenever a new - * ire is created referencing the ipif/ill. This is done atomically inside - * ire_add_v[46] where the ire is actually added to the ire hash table. - * The count is decremented in ire_inactive where the ire is destroyed. + * ill_ire_cnt is the number of ire's associated with the + * ill. This is incremented whenever a new ire is created referencing the + * ill. This is done atomically inside ire_add_v[46] where the ire is + * actually added to the ire hash table. The count is decremented in + * ire_inactive where the ire is destroyed. * - * nce's reference ill's thru nce_ill and the count of nce's associated with - * an ill is recorded in ill_nce_cnt. This is incremented atomically in + * ill_ncec_cnt is the number of ncec's referencing the ill thru ncec_ill. 
+ * This is incremented atomically in * ndp_add_v4()/ndp_add_v6() where the nce is actually added to the - * table. Similarly it is decremented in ndp_inactive() where the nce + * table. Similarly it is decremented in ncec_inactive() where the ncec + * is destroyed. + * + * ill_nce_cnt is the number of nce's referencing the ill thru nce_ill. This is + * incremented atomically in nce_add() where the nce is actually added to the + * ill_nce. Similarly it is decremented in nce_inactive() where the nce * is destroyed. * - * ilm's reference to the ipif (for IPv4 ilm's) or the ill (for IPv6 ilm's) - * is incremented in ilm_add_v6() and decremented before the ilm is freed - * in ilm_walker_cleanup() or ilm_delete(). + * ill_ilm_cnt is the ilm's reference to the ill. It is incremented in + * ilm_add() and decremented before the ilm is freed in ilm_delete(). * * Flow of ioctls involving interface down/up * @@ -14555,50 +12769,22 @@ conn_cleanup_stale_ire(conn_t *connp, caddr_t arg) * to the above. All the *tail functions are called after the refcounts have * dropped to the appropriate values. * - * The mechanism to quiesce an ipif is as follows. - * - * Mark the ipif as IPIF_CHANGING. No more lookups will be allowed - * on the ipif. Callers either pass a flag requesting wait or the lookup - * functions will return NULL. - * - * Delete all ires referencing this ipif + * SIOC ioctls during the IPIF_CHANGING interval. * - * Any thread attempting to do an ipif_refhold on an ipif that has been - * obtained thru a cached pointer will first make sure that - * the ipif can be refheld using the macro IPIF_CAN_LOOKUP and only then - * increment the refcount. - * - * The above guarantees that the ipif refcount will eventually come down to - * zero and the ipif will quiesce, once all threads that currently hold a - * reference to the ipif refrelease the ipif. 
The ipif is quiescent after the - * ipif_refcount has dropped to zero and all ire's associated with this ipif - * have also been ire_inactive'd. i.e. when ipif_{ire, ill}_cnt and - * ipif_refcnt both drop to zero. See also: comments above IPIF_DOWN_OK() - * in ip.h - * - * Lookups during the IPIF_CHANGING/ILL_CHANGING interval. - * - * Threads trying to lookup an ipif or ill can pass a flag requesting - * wait and restart if the ipif / ill cannot be looked up currently. - * For eg. bind, and route operations (Eg. route add / delete) cannot return - * failure if the ipif is currently undergoing an exclusive operation, and - * hence pass the flag. The mblk is then enqueued in the ipsq and the operation - * is restarted by ipsq_exit() when the current exclusive operation completes. - * The lookup and enqueue is atomic using the ill_lock and ipsq_lock. The + * Threads handling SIOC set ioctls serialize on the squeue, but this + * is not done for SIOC get ioctls. Since a set ioctl can cause several + * steps of internal changes to the state, some of which are visible in + * ipif_flags (such as IFF_UP being cleared and later set), and we want + * the set ioctl to be atomic related to the get ioctls, the SIOC get code + * will wait and restart ioctls if IPIF_CHANGING is set. The mblk is then + * enqueued in the ipsq and the operation is restarted by ipsq_exit() when + * the current exclusive operation completes. The IPIF_CHANGING check + * and enqueue is atomic using the ill_lock and ipsq_lock. The * lookup is done holding the ill_lock. Hence the ill/ipif state flags can't * change while the ill_lock is held. Before dropping the ill_lock we acquire * the ipsq_lock and call ipsq_enq. This ensures that ipsq_exit can't finish - * until we release the ipsq_lock, even though the the ill/ipif state flags + * until we release the ipsq_lock, even though the ill/ipif state flags * can change after we drop the ill_lock. 
- * - * An attempt to send out a packet using an ipif that is currently - * IPIF_CHANGING will fail. No attempt is made in this case to enqueue this - * operation and restart it later when the exclusive condition on the ipif ends. - * This is an example of not passing the wait flag to the lookup functions. For - * example an attempt to refhold and use conn->conn_multicast_ipif and send - * out a multicast packet on that ipif will fail while the ipif is - * IPIF_CHANGING. An attempt to create an IRE_CACHE using an ipif that is - * currently IPIF_CHANGING will also fail. */ int ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) @@ -14613,6 +12799,9 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) ip1dbg(("ipif_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_down", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_flags & IPIF_UP) { mutex_enter(&ill->ill_lock); ipif->ipif_flags &= ~IPIF_UP; @@ -14649,15 +12838,12 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } } - /* - * Delete all IRE's pointing at this ipif or its source address. - */ - if (ipif->ipif_isv6) { - ire_walk_v6(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, - ipst); - } else { - ire_walk_v4(ipif_down_delete_ire, (char *)ipif, ALL_ZONES, - ipst); + if (ipif_was_up) { + /* only delete if we'd added ire's before */ + if (ipif->ipif_isv6) + ipif_delete_ires_v6(ipif); + else + ipif_delete_ires_v4(ipif); } if (ipif_was_up && ill->ill_ipif_up_count == 0) { @@ -14672,30 +12858,28 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * Cleaning up the conn_ire_cache or conns must be done only after the - * ires have been deleted above. Otherwise a thread could end up - * caching an ire in a conn after we have finished the cleanup of the - * conn. The caching is done after making sure that the ire is not yet - * condemned. Also documented in the block comment above ip_output + * neighbor-discovery or arp entries for this interface. 
The ipif + * has to be quiesced, so we walk all the nce's and delete those + * that point at the ipif->ipif_ill. At the same time, we also + * update IPMP so that ipifs for data addresses are unbound. We dont + * call ipif_arp_down to DL_UNBIND the arp stream itself here, but defer + * that for ipif_down_tail() */ - ipcl_walk(conn_cleanup_stale_ire, NULL, ipst); - /* Also, delete the ires cached in SCTP */ - sctp_ire_cache_flush(ipif); + ipif_nce_down(ipif); /* - * Update any other ipifs which have used "our" local address as - * a source address. This entails removing and recreating IRE_INTERFACE - * entries for such ipifs. + * If this is the last ipif on the ill, we also need to remove + * any IREs with ire_ill set. Otherwise ipif_is_quiescent() will + * never succeed. */ - if (ipif->ipif_isv6) - ipif_update_other_ipifs_v6(ipif); - else - ipif_update_other_ipifs(ipif); + if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) + ire_walk_ill(0, 0, ill_downi, ill, ill); /* - * neighbor-discovery or arp entries for this interface. + * Walk all CONNs that can have a reference on an ire for this + * ipif (we actually walk all that now have stale references). */ - ipif_ndp_down(ipif); + ipcl_walk(conn_ixa_cleanup, (void *)B_TRUE, ipst); /* * If mp is NULL the caller will wait for the appropriate refcnt. @@ -14748,10 +12932,14 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINPROGRESS); } -void +int ipif_down_tail(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; + int err = 0; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_down_tail", + ill_t *, ill, ipif_t *, ipif); /* * Skip any loopback interface (null wq). @@ -14766,15 +12954,14 @@ ipif_down_tail(ipif_t *ipif) ill->ill_dl_up) { ill_dl_down(ill); } - ill->ill_logical_down = 0; + if (!ipif->ipif_isv6) + err = ipif_arp_down(ipif); - /* - * Has to be after removing the routes in ipif_down_delete_ire. 
- */ - ipif_resolver_down(ipif); + ill->ill_logical_down = 0; ip_rts_ifmsg(ipif, RTSQ_DEFAULT); ip_rts_newaddrmsg(RTM_DELETE, 0, ipif, RTSQ_DEFAULT); + return (err); } /* @@ -14785,6 +12972,9 @@ ipif_down_tail(ipif_t *ipif) static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) { + DTRACE_PROBE3(ipif__downup, char *, "ipif_logical_down", + ill_t *, ipif->ipif_ill, ipif_t *, ipif); + /* * The ill_logical_down flag is a transient flag. It is set here * and is cleared once the down has completed in ipif_down_tail. @@ -14799,152 +12989,6 @@ ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * This is called when the SIOCSLIFUSESRC ioctl is processed in IP. - * If the usesrc client ILL is already part of a usesrc group or not, - * in either case a ire_stq with the matching usesrc client ILL will - * locate the IRE's that need to be deleted. We want IREs to be created - * with the new source address. - */ -static void -ipif_delete_cache_ire(ire_t *ire, char *ill_arg) -{ - ill_t *ucill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ucill)); - - if (ire->ire_stq == NULL) - return; - - if ((ire->ire_type == IRE_CACHE) && - ((ill_t *)ire->ire_stq->q_ptr == ucill)) - ire_delete(ire); -} - -/* - * ire_walk routine to delete every IRE dependent on the interface - * address that is going down. (Always called as writer.) - * Works for both v4 and v6. - * In addition for checking for ire_ipif matches it also checks for - * IRE_CACHE entries which have the same source address as the - * disappearing ipif since ipif_select_source might have picked - * that source. Note that ipif_down/ipif_update_other_ipifs takes - * care of any IRE_INTERFACE with the disappearing source address. - */ -static void -ipif_down_delete_ire(ire_t *ire, char *ipif_arg) -{ - ipif_t *ipif = (ipif_t *)ipif_arg; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ire->ire_ipif == NULL) - return; - - if (ire->ire_ipif != ipif) { - /* - * Look for a matching source address. 
- */ - if (ire->ire_type != IRE_CACHE) - return; - if (ipif->ipif_flags & IPIF_NOLOCAL) - return; - - if (ire->ire_ipversion == IPV4_VERSION) { - if (ire->ire_src_addr != ipif->ipif_src_addr) - return; - } else { - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6lcl_addr)) - return; - } - ire_delete(ire); - return; - } - /* - * ire_delete() will do an ire_flush_cache which will delete - * all ire_ipif matches - */ - ire_delete(ire); -} - -/* - * ire_walk_ill function for deleting all IRE_CACHE entries for an ill when - * 1) an ipif (on that ill) changes the IPIF_DEPRECATED flags, or - * 2) when an interface is brought up or down (on that ill). - * This ensures that the IRE_CACHE entries don't retain stale source - * address selection results. - */ -void -ill_ipif_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_CACHE); - - /* - * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches - * ill, but we only want to delete the IRE if ire_ipif matches. - */ - ASSERT(ire->ire_ipif != NULL); - if (ill == ire->ire_ipif->ipif_ill) - ire_delete(ire); -} - -/* - * Delete all the IREs whose ire_stq's reference `ill_arg'. IPMP uses this - * instead of ill_ipif_cache_delete() because ire_ipif->ipif_ill references - * the IPMP ill. - */ -void -ill_stq_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_CACHE); - - /* - * We are called for IRE_CACHEs whose ire_stq or ire_ipif matches - * ill, but we only want to delete the IRE if ire_stq matches. - */ - if (ire->ire_stq->q_ptr == ill_arg) - ire_delete(ire); -} - -/* - * Delete all the IREs whose ire_stq's reference any ill in the same IPMP - * group as `ill_arg'. Used by ipmp_ill_deactivate() to flush all IRE_CACHE - * entries for the illgrp. 
- */ -void -ill_grp_cache_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - - if (ire->ire_type == IRE_CACHE && - IS_IN_SAME_ILLGRP((ill_t *)ire->ire_stq->q_ptr, ill)) { - ire_delete(ire); - } -} - -/* - * Delete all broadcast IREs with a source address on `ill_arg'. - */ -static void -ill_broadcast_delete(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ire->ire_type == IRE_BROADCAST); - - if (ire->ire_ipif->ipif_ill == ill) - ire_delete(ire); -} - -/* * Initiate deallocate of an IPIF. Always called as writer. Called by * ill_delete or ip_sioctl_removeif. */ @@ -14959,16 +13003,6 @@ ipif_free(ipif_t *ipif) (void) untimeout(ipif->ipif_recovery_id); ipif->ipif_recovery_id = 0; - /* Remove conn references */ - reset_conn_ipif(ipif); - - /* - * Make sure we have valid net and subnet broadcast ire's for the - * other ipif's which share them with this ipif. - */ - if (!ipif->ipif_isv6) - ipif_check_bcast_ires(ipif); - /* * Take down the interface. We can be called either from ill_delete * or from ip_sioctl_removeif. @@ -14996,27 +13030,15 @@ ipif_free(ipif_t *ipif) static void ipif_free_tail(ipif_t *ipif) { - mblk_t *mp; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * Free state for addition IRE_IF_[NO]RESOLVER ire's. - */ - mutex_enter(&ipif->ipif_saved_ire_lock); - mp = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = NULL; - mutex_exit(&ipif->ipif_saved_ire_lock); - freemsg(mp); - - /* * Need to hold both ill_g_lock and ill_lock while * inserting or removing an ipif from the linked list * of ipifs hanging off the ill. 
*/ rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); - ASSERT(ilm_walk_ipif(ipif) == 0); - #ifdef DEBUG ipif_trace_cleanup(ipif); #endif @@ -15028,10 +13050,9 @@ ipif_free_tail(ipif_t *ipif) ipif_remove(ipif); rw_exit(&ipst->ips_ill_g_lock); - mutex_destroy(&ipif->ipif_saved_ire_lock); - ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); ASSERT(ipif->ipif_recovery_id == 0); + ASSERT(ipif->ipif_ire_local == NULL); /* Free the memory. */ mi_free(ipif); @@ -15064,6 +13085,23 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len) } /* + * Sets `buf' to an ill name. + */ +void +ill_get_name(const ill_t *ill, char *buf, int len) +{ + char *name; + size_t name_len; + + name = ill->ill_name; + name_len = ill->ill_name_length; + len -= 1; + buf[len] = '\0'; + len = MIN(len, name_len); + bcopy(name, buf, len); +} + +/* * Find an IPIF based on the name passed in. Names can be of the form <phys> * (e.g., le0) or <phys>:<#> (e.g., le0:1). When there is no colon, the * implied unit id is zero. <phys> must correspond to the name of an ILL. 
@@ -15071,8 +13109,7 @@ ipif_get_name(const ipif_t *ipif, char *buf, int len) */ static ipif_t * ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, - boolean_t *exists, boolean_t isv6, zoneid_t zoneid, queue_t *q, - mblk_t *mp, ipsq_func_t func, int *error, ip_stack_t *ipst) + boolean_t *exists, boolean_t isv6, zoneid_t zoneid, ip_stack_t *ipst) { char *cp; char *endp; @@ -15081,10 +13118,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ipif_t *ipif; uint_t ire_type; boolean_t did_alloc = B_FALSE; - ipsq_t *ipsq; - - if (error != NULL) - *error = 0; /* * If the caller wants to us to create the ipif, make sure we have a @@ -15093,8 +13126,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ASSERT(!do_alloc || zoneid != ALL_ZONES); if (namelen == 0) { - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -15121,8 +13152,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * is zero, fail. */ if (&cp[2] < endp && cp[1] == '0') { - if (error != NULL) - *error = EINVAL; return (NULL); } } @@ -15140,7 +13169,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * ill_lookup_on_name will clear it. */ ill = ill_lookup_on_name(name, do_alloc, isv6, - q, mp, func, error, &did_alloc, ipst); + &did_alloc, ipst); if (cp != endp) *cp = IPIF_SEPARATOR_CHAR; if (ill == NULL) @@ -15153,13 +13182,10 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, cp++; if (ddi_strtol(cp, NULL, 0, &id) != 0) { ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } } - GRAB_CONN_LOCK(q); mutex_enter(&ill->ill_lock); /* Now see if there is an IPIF with this unit number. 
*/ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { @@ -15168,16 +13194,9 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, zoneid != ipif->ipif_zoneid && ipif->ipif_zoneid != ALL_ZONES) { mutex_exit(&ill->ill_lock); - RELEASE_CONN_LOCK(q); ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } - /* - * The block comment at the start of ipif_down - * explains the use of the macros used below - */ if (IPIF_CAN_LOOKUP(ipif)) { ipif_refhold_locked(ipif); mutex_exit(&ill->ill_lock); @@ -15189,32 +13208,15 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * ipif_ill_refrele_tail which can end up * in trying to acquire any lock. */ - RELEASE_CONN_LOCK(q); ill_refrele(ill); return (ipif); - } else if (IPIF_CAN_WAIT(ipif, q)) { - ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ill->ill_lock); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - RELEASE_CONN_LOCK(q); - ill_refrele(ill); - if (error != NULL) - *error = EINPROGRESS; - return (NULL); } } } - RELEASE_CONN_LOCK(q); if (!do_alloc) { mutex_exit(&ill->ill_lock); ill_refrele(ill); - if (error != NULL) - *error = ENXIO; return (NULL); } @@ -15236,8 +13238,6 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, ipif = ipif_allocate(ill, id, ire_type, B_TRUE, B_TRUE); if (ipif != NULL) ipif_refhold_locked(ipif); - else if (error != NULL) - *error = ENOMEM; mutex_exit(&ill->ill_lock); ill_refrele(ill); return (ipif); @@ -15258,6 +13258,7 @@ ipif_mask_reply(ipif_t *ipif) ipha_t *ipha; mblk_t *mp; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + ip_xmit_attr_t ixas; #define REPLY_LEN (sizeof (icmp_ipha) + sizeof (icmph_t) + IP_ADDR_LEN) @@ -15269,6 +13270,9 @@ ipif_mask_reply(ipif_t *ipif) /* ICMP mask reply is not for a loopback interface */ ASSERT(ipif->ipif_ill->ill_wq != NULL); + if 
(ipif->ipif_lcl_addr == INADDR_ANY) + return; + mp = allocb(REPLY_LEN, BPRI_HI); if (mp == NULL) return; @@ -15278,7 +13282,7 @@ ipif_mask_reply(ipif_t *ipif) bzero(ipha, REPLY_LEN); *ipha = icmp_ipha; ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; - ipha->ipha_src = ipif->ipif_src_addr; + ipha->ipha_src = ipif->ipif_lcl_addr; ipha->ipha_dst = ipif->ipif_brd_addr; ipha->ipha_length = htons(REPLY_LEN); ipha->ipha_ident = 0; @@ -15288,64 +13292,19 @@ ipif_mask_reply(ipif_t *ipif) bcopy(&ipif->ipif_net_mask, &icmph[1], IP_ADDR_LEN); icmph->icmph_checksum = IP_CSUM(mp, sizeof (ipha_t), 0); - put(ipif->ipif_wq, mp); - + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_flags |= IXAF_SET_SOURCE; + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ipst; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); #undef REPLY_LEN } /* - * When the mtu in the ipif changes, we call this routine through ire_walk - * to update all the relevant IREs. - * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. - */ -static void -ipif_mtu_change(ire_t *ire, char *ipif_arg) -{ - ipif_t *ipif = (ipif_t *)ipif_arg; - - if (ire->ire_stq == NULL || ire->ire_ipif != ipif) - return; - - mutex_enter(&ire->ire_lock); - if (ire->ire_marks & IRE_MARK_PMTU) { - /* Avoid increasing the PMTU */ - ire->ire_max_frag = MIN(ipif->ipif_mtu, ire->ire_max_frag); - if (ire->ire_max_frag == ipif->ipif_mtu) - ire->ire_marks &= ~IRE_MARK_PMTU; - } else { - ire->ire_max_frag = MIN(ipif->ipif_mtu, IP_MAXPACKET); - } - mutex_exit(&ire->ire_lock); -} - -/* - * When the mtu in the ill changes, we call this routine through ire_walk - * to update all the relevant IREs. - * Skip IRE_LOCAL and "loopback" IRE_BROADCAST by checking ire_stq. 
- */ -void -ill_mtu_change(ire_t *ire, char *ill_arg) -{ - ill_t *ill = (ill_t *)ill_arg; - - if (ire->ire_stq == NULL || ire->ire_ipif->ipif_ill != ill) - return; - - mutex_enter(&ire->ire_lock); - if (ire->ire_marks & IRE_MARK_PMTU) { - /* Avoid increasing the PMTU */ - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, - ire->ire_max_frag); - if (ire->ire_max_frag == ire->ire_ipif->ipif_mtu) { - ire->ire_marks &= ~IRE_MARK_PMTU; - } - } else { - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, IP_MAXPACKET); - } - mutex_exit(&ire->ire_lock); -} - -/* * Join the ipif specific multicast groups. * Must be called after a mapping has been set up in the resolver. (Always * called as writer.) @@ -15355,13 +13314,15 @@ ipif_multicast_up(ipif_t *ipif) { int err; ill_t *ill; + ilm_t *ilm; ASSERT(IAM_WRITER_IPIF(ipif)); ill = ipif->ipif_ill; ip1dbg(("ipif_multicast_up\n")); - if (!(ill->ill_flags & ILLF_MULTICAST) || ipif->ipif_multicast_up) + if (!(ill->ill_flags & ILLF_MULTICAST) || + ipif->ipif_allhosts_ilm != NULL) return; if (ipif->ipif_isv6) { @@ -15380,228 +13341,147 @@ ipif_multicast_up(ipif_t *ipif) * underlying IPMP interfaces since they should be invisible. */ if (!IS_UNDER_IPMP(ill)) { - err = ip_addmulti_v6(&v6allmc, ill, ipif->ipif_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { + ilm = ip_addmulti(&v6allmc, ill, ipif->ipif_zoneid, + &err); + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: " "all_hosts_mcast failed %d\n", err)); return; } - ipif->ipif_joined_allhosts = 1; + ipif->ipif_allhosts_ilm = ilm; } /* - * Enable multicast for the solicited node multicast address + * Enable multicast for the solicited node multicast address. + * If IPMP we need to put the membership on the upper ill. 
*/ if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - err = ip_addmulti_v6(&v6solmc, ill, ipif->ipif_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err != 0) { + ill_t *mcast_ill = NULL; + boolean_t need_refrele; + + if (IS_UNDER_IPMP(ill) && + (mcast_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { + need_refrele = B_TRUE; + } else { + mcast_ill = ill; + need_refrele = B_FALSE; + } + + ilm = ip_addmulti(&v6solmc, mcast_ill, + ipif->ipif_zoneid, &err); + if (need_refrele) + ill_refrele(mcast_ill); + + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: solicited MC" " failed %d\n", err)); - if (ipif->ipif_joined_allhosts) { - (void) ip_delmulti_v6(&v6allmc, ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - ipif->ipif_joined_allhosts = 0; + if ((ilm = ipif->ipif_allhosts_ilm) != NULL) { + ipif->ipif_allhosts_ilm = NULL; + (void) ip_delmulti(ilm); } return; } + ipif->ipif_solmulti_ilm = ilm; } } else { + in6_addr_t v6group; + if (ipif->ipif_lcl_addr == INADDR_ANY || IS_UNDER_IPMP(ill)) return; /* Join the all hosts multicast address */ ip1dbg(("ipif_multicast_up - addmulti\n")); - err = ip_addmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - if (err) { + IN6_IPADDR_TO_V4MAPPED(htonl(INADDR_ALLHOSTS_GROUP), &v6group); + + ilm = ip_addmulti(&v6group, ill, ipif->ipif_zoneid, &err); + if (ilm == NULL) { + ASSERT(err != 0); ip0dbg(("ipif_multicast_up: failed %d\n", err)); return; } + ipif->ipif_allhosts_ilm = ilm; } - ipif->ipif_multicast_up = 1; } /* * Blow away any multicast groups that we joined in ipif_multicast_up(). - * (Explicit memberships are blown away in ill_leave_multicast() when the - * ill is brought down.) + * (ilms from explicit memberships are handled in conn_update_ill.) 
*/ void ipif_multicast_down(ipif_t *ipif) { - int err; - ASSERT(IAM_WRITER_IPIF(ipif)); ip1dbg(("ipif_multicast_down\n")); - if (!ipif->ipif_multicast_up) - return; - - ip1dbg(("ipif_multicast_down - delmulti\n")); - - if (!ipif->ipif_isv6) { - err = ip_delmulti(htonl(INADDR_ALLHOSTS_GROUP), ipif, B_TRUE, - B_TRUE); - if (err != 0) - ip0dbg(("ipif_multicast_down: failed %d\n", err)); - - ipif->ipif_multicast_up = 0; - return; - } - /* - * Leave the all-hosts multicast address. - */ - if (ipif->ipif_joined_allhosts) { - err = ip_delmulti_v6(&ipv6_all_hosts_mcast, ipif->ipif_ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: all_hosts_mcast " - "failed %d\n", err)); - } - ipif->ipif_joined_allhosts = 0; + if (ipif->ipif_allhosts_ilm != NULL) { + (void) ip_delmulti(ipif->ipif_allhosts_ilm); + ipif->ipif_allhosts_ilm = NULL; } - - /* - * Disable multicast for the solicited node multicast address - */ - if (!(ipif->ipif_flags & IPIF_NOLOCAL)) { - in6_addr_t ipv6_multi = ipv6_solicited_node_mcast; - - ipv6_multi.s6_addr32[3] |= - ipif->ipif_v6lcl_addr.s6_addr32[3]; - - err = ip_delmulti_v6(&ipv6_multi, ipif->ipif_ill, - ipif->ipif_zoneid, B_TRUE, B_TRUE); - if (err != 0) { - ip0dbg(("ipif_multicast_down: sol MC failed %d\n", - err)); - } + if (ipif->ipif_solmulti_ilm != NULL) { + (void) ip_delmulti(ipif->ipif_solmulti_ilm); + ipif->ipif_solmulti_ilm = NULL; } - - ipif->ipif_multicast_up = 0; } /* * Used when an interface comes up to recreate any extra routes on this * interface. 
*/ -static ire_t ** -ipif_recover_ire(ipif_t *ipif) +int +ill_recover_saved_ire(ill_t *ill) { - mblk_t *mp; - ire_t **ipif_saved_irep; - ire_t **irep; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ip1dbg(("ipif_recover_ire(%s:%u)", ipif->ipif_ill->ill_name, - ipif->ipif_id)); + mblk_t *mp; + ip_stack_t *ipst = ill->ill_ipst; - mutex_enter(&ipif->ipif_saved_ire_lock); - ipif_saved_irep = (ire_t **)kmem_zalloc(sizeof (ire_t *) * - ipif->ipif_saved_ire_cnt, KM_NOSLEEP); - if (ipif_saved_irep == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - return (NULL); - } + ip1dbg(("ill_recover_saved_ire(%s)", ill->ill_name)); - irep = ipif_saved_irep; - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { - ire_t *ire; - queue_t *rfq; - queue_t *stq; + mutex_enter(&ill->ill_saved_ire_lock); + for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { + ire_t *ire, *nire; ifrt_t *ifrt; - uchar_t *src_addr; - uchar_t *gateway_addr; - ushort_t type; - /* - * When the ire was initially created and then added in - * ip_rt_add(), it was created either using ipif->ipif_net_type - * in the case of a traditional interface route, or as one of - * the IRE_OFFSUBNET types (with the exception of - * IRE_HOST types ire which is created by icmp_redirect() and - * which we don't need to save or recover). In the case where - * ipif->ipif_net_type was IRE_LOOPBACK, ip_rt_add() will update - * the ire_type to IRE_IF_NORESOLVER before calling ire_add() - * to satisfy software like GateD and Sun Cluster which creates - * routes using the the loopback interface's address as a - * gateway. - * - * As ifrt->ifrt_type reflects the already updated ire_type, - * ire_create() will be called in the same way here as - * in ip_rt_add(), namely using ipif->ipif_net_type when - * the route looks like a traditional interface route (where - * ifrt->ifrt_type & IRE_INTERFACE is true) and otherwise using - * the saved ifrt->ifrt_type. 
This means that in the case where - * ipif->ipif_net_type is IRE_LOOPBACK, the ire created by - * ire_create() will be an IRE_LOOPBACK, it will then be turned - * into an IRE_IF_NORESOLVER and then added by ire_add(). - */ ifrt = (ifrt_t *)mp->b_rptr; - ASSERT(ifrt->ifrt_type != IRE_CACHE); - if (ifrt->ifrt_type & IRE_INTERFACE) { - rfq = NULL; - stq = (ipif->ipif_net_type == IRE_IF_RESOLVER) - ? ipif->ipif_rq : ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr - : (uint8_t *)&ipif->ipif_src_addr; - gateway_addr = NULL; - type = ipif->ipif_net_type; - } else if (ifrt->ifrt_type & IRE_BROADCAST) { - /* Recover multiroute broadcast IRE. */ - rfq = ipif->ipif_rq; - stq = ipif->ipif_wq; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr - : (uint8_t *)&ipif->ipif_src_addr; - gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; - type = ifrt->ifrt_type; - } else { - rfq = NULL; - stq = NULL; - src_addr = (ifrt->ifrt_flags & RTF_SETSRC) - ? (uint8_t *)&ifrt->ifrt_src_addr : NULL; - gateway_addr = (uint8_t *)&ifrt->ifrt_gateway_addr; - type = ifrt->ifrt_type; - } - /* * Create a copy of the IRE with the saved address and netmask. 
*/ - ip1dbg(("ipif_recover_ire: creating IRE %s (%d) for " - "0x%x/0x%x\n", - ip_nv_lookup(ire_nv_tbl, ifrt->ifrt_type), ifrt->ifrt_type, - ntohl(ifrt->ifrt_addr), - ntohl(ifrt->ifrt_mask))); - ire = ire_create( - (uint8_t *)&ifrt->ifrt_addr, - (uint8_t *)&ifrt->ifrt_mask, - src_addr, - gateway_addr, - &ifrt->ifrt_max_frag, - NULL, - rfq, - stq, - type, - ipif, - 0, - 0, - 0, - ifrt->ifrt_flags, - &ifrt->ifrt_iulp_info, - NULL, - NULL, - ipst); - + if (ill->ill_isv6) { + ire = ire_create_v6( + &ifrt->ifrt_v6addr, + &ifrt->ifrt_v6mask, + &ifrt->ifrt_v6gateway_addr, + ifrt->ifrt_type, + ill, + ifrt->ifrt_zoneid, + ifrt->ifrt_flags, + NULL, + ipst); + } else { + ire = ire_create( + (uint8_t *)&ifrt->ifrt_addr, + (uint8_t *)&ifrt->ifrt_mask, + (uint8_t *)&ifrt->ifrt_gateway_addr, + ifrt->ifrt_type, + ill, + ifrt->ifrt_zoneid, + ifrt->ifrt_flags, + NULL, + ipst); + } if (ire == NULL) { - mutex_exit(&ipif->ipif_saved_ire_lock); - kmem_free(ipif_saved_irep, - ipif->ipif_saved_ire_cnt * sizeof (ire_t *)); - return (NULL); + mutex_exit(&ill->ill_saved_ire_lock); + return (ENOMEM); + } + + if (ifrt->ifrt_flags & RTF_SETSRC) { + if (ill->ill_isv6) { + ire->ire_setsrc_addr_v6 = + ifrt->ifrt_v6setsrc_addr; + } else { + ire->ire_setsrc_addr = ifrt->ifrt_setsrc_addr; + } } /* @@ -15611,23 +13491,37 @@ ipif_recover_ire(ipif_t *ipif) * set up prefixes with the RTF_REJECT flag set (for example, * when generating aggregate routes.) * - * If the IRE type (as defined by ipif->ipif_net_type) is + * If the IRE type (as defined by ill->ill_net_type) is * IRE_LOOPBACK, then we map the request into a * IRE_IF_NORESOLVER. 
*/ - if (ipif->ipif_net_type == IRE_LOOPBACK) + if (ill->ill_net_type == IRE_LOOPBACK) ire->ire_type = IRE_IF_NORESOLVER; + /* * ire held by ire_add, will be refreled' towards the * the end of ipif_up_done */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - *irep = ire; - irep++; - ip1dbg(("ipif_recover_ire: added ire %p\n", (void *)ire)); + nire = ire_add(ire); + /* + * Check if it was a duplicate entry. This handles + * the case of two racing route adds for the same route + */ + if (nire == NULL) { + ip1dbg(("ill_recover_saved_ire: FAILED\n")); + } else if (nire != ire) { + ip1dbg(("ill_recover_saved_ire: duplicate ire %p\n", + (void *)nire)); + ire_delete(nire); + } else { + ip1dbg(("ill_recover_saved_ire: added ire %p\n", + (void *)nire)); + } + if (nire != NULL) + ire_refrele(nire); } - mutex_exit(&ipif->ipif_saved_ire_lock); - return (ipif_saved_irep); + mutex_exit(&ill->ill_saved_ire_lock); + return (0); } /* @@ -15766,6 +13660,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) ASSERT(IAM_WRITER_IPIF(ipif)); ip1dbg(("ipif_up(%s:%u)\n", ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up", + ill_t *, ill, ipif_t *, ipif); /* Shouldn't get here if it is already up. */ if (ipif->ipif_flags & IPIF_UP) @@ -15786,7 +13682,7 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) /* * The ipif being brought up should be quiesced. If it's not, * something has gone amiss and we need to bail out. (If it's - * quiesced, we know it will remain so via IPIF_CHANGING.) + * quiesced, we know it will remain so via IPIF_CONDEMNED.) */ mutex_enter(&ill->ill_lock); if (!ipif_is_quiescent(ipif)) { @@ -15868,8 +13764,8 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) /* * If the ipif being brought up was on slot zero, then we * first need to bring up the placeholder we stuck there. 
In - * ip_rput_dlpi_writer(), ip_arp_done(), or the recursive call - * to ipif_up() itself, if we successfully bring up the + * ip_rput_dlpi_writer(), arp_bringup_done(), or the recursive + * call to ipif_up() itself, if we successfully bring up the * placeholder, we'll check ill_move_ipif and bring it up too. */ if (ipif_orig_id == 0) { @@ -15907,13 +13803,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* - * ipif_resolver_up may end up sending an - * AR_INTERFACE_UP message to ARP, which would, in - * turn send a DLPI message to the driver. ioctls are + * ipif_resolver_up may end up needeing to bind/attach + * the ARP stream, which in turn necessitates a + * DLPI message exchange with the driver. ioctls are * serialized and so we cannot send more than one * interface up message at a time. If ipif_resolver_up - * does send an interface up message to ARP, we get - * EINPROGRESS and we will complete in ip_arp_done. + * does need to wait for the DLPI handshake for the ARP stream, + * we get EINPROGRESS and we will complete in arp_bringup_done. */ ASSERT(connp != NULL || !CONN_Q(q)); @@ -15928,18 +13824,12 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINTR); /* - * Crank up the resolver. For IPv6, this cranks up the - * external resolver if one is configured, but even if an - * external resolver isn't configured, it must be called to - * reset DAD state. For IPv6, if an external resolver is not - * being used, ipif_resolver_up() will never return - * EINPROGRESS, so we can always call ipif_ndp_up() here. - * Note that if an external resolver is being used, there's no - * need to call ipif_ndp_up() since it will do nothing. + * Crank up IPv6 neighbor discovery. Unlike ARP, this should + * complete when ipif_ndp_up returns. 
*/ err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { - /* We will complete it in ip_arp_done() */ + /* We will complete it in arp_bringup_done() */ return (err); } @@ -15958,9 +13848,13 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) */ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_addr_ready = 1; + err = ill_add_ires(ill); + /* allocation failure? */ + if (err != 0) + return (err); } - err = isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif); + err = (isv6 ? ipif_up_done_v6(ipif) : ipif_up_done(ipif)); if (err == 0 && ill->ill_move_ipif != NULL) { ipif = ill->ill_move_ipif; ill->ill_move_ipif = NULL; @@ -15970,6 +13864,53 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) } /* + * Add any IREs tied to the ill. For now this is just an IRE_MULTICAST. + * The identical set of IREs need to be removed in ill_delete_ires(). + */ +int +ill_add_ires(ill_t *ill) +{ + ire_t *ire; + in6_addr_t dummy6 = {(uint32_t)V6_MCAST, 0, 0, 1}; + in_addr_t dummy4 = htonl(INADDR_ALLHOSTS_GROUP); + + if (ill->ill_ire_multicast != NULL) + return (0); + + /* + * provide some dummy ire_addr for creating the ire. + */ + if (ill->ill_isv6) { + ire = ire_create_v6(&dummy6, 0, 0, IRE_MULTICAST, ill, + ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); + } else { + ire = ire_create((uchar_t *)&dummy4, 0, 0, IRE_MULTICAST, ill, + ALL_ZONES, RTF_UP, NULL, ill->ill_ipst); + } + if (ire == NULL) + return (ENOMEM); + + ill->ill_ire_multicast = ire; + return (0); +} + +void +ill_delete_ires(ill_t *ill) +{ + if (ill->ill_ire_multicast != NULL) { + /* + * BIND/ATTACH completed; Release the ref for ill_ire_multicast + * which was taken without any th_tracing enabled. + * We also mark it as condemned (note that it was never added) + * so that caching conn's can move off of it. + */ + ire_make_condemned(ill->ill_ire_multicast); + ire_refrele_notr(ill->ill_ire_multicast); + ill->ill_ire_multicast = NULL; + } +} + +/* * Perform a bind for the physical device. 
* When the routine returns EINPROGRESS then mp has been consumed and * the ioctl will be acked from ip_rput_dlpi. @@ -15978,30 +13919,26 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) { - areq_t *areq; - mblk_t *areq_mp = NULL; mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; conn_t *connp; boolean_t success; - uint16_t sap_addr; + int err; + + DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(mp != NULL); - /* Create a resolver cookie for ARP */ - if (!ill->ill_isv6 && ill->ill_net_type == IRE_IF_RESOLVER) { - areq_mp = ill_arp_alloc(ill, (uchar_t *)&ip_areq_template, 0); - if (areq_mp == NULL) - return (ENOMEM); + /* + * Make sure we have an IRE_MULTICAST in case we immediately + * start receiving packets. + */ + err = ill_add_ires(ill); + if (err != 0) + goto bad; - freemsg(ill->ill_resolver_mp); - ill->ill_resolver_mp = areq_mp; - areq = (areq_t *)areq_mp->b_rptr; - sap_addr = ill->ill_sap; - bcopy(&sap_addr, areq->areq_sap, sizeof (sap_addr)); - } bind_mp = ip_dlpi_alloc(sizeof (dl_bind_req_t) + sizeof (long), DL_BIND_REQ); if (bind_mp == NULL) @@ -16067,46 +14004,39 @@ bad: return (ENOMEM); } +/* Add room for tcp+ip headers */ uint_t ip_loopback_mtuplus = IP_LOOPBACK_MTU + IP_SIMPLE_HDR_LENGTH + 20; /* * DLPI and ARP is up. - * Create all the IREs associated with an interface bring up multicast. + * Create all the IREs associated with an interface. Bring up multicast. * Set the interface flag and finish other initialization - * that potentially had to be differed to after DL_BIND_ACK. + * that potentially had to be deferred to after DL_BIND_ACK. 
*/ int ipif_up_done(ipif_t *ipif) { - ire_t *ire_array[20]; - ire_t **irep = ire_array; - ire_t **irep1; - ipaddr_t net_mask = 0; - ipaddr_t subnet_mask, route_mask; - ill_t *ill = ipif->ipif_ill; - queue_t *stq; - ipif_t *src_ipif; - ipif_t *tmp_ipif; - boolean_t flush_ire_cache = B_TRUE; - int err = 0; - ire_t **ipif_saved_irep = NULL; - int ipif_saved_ire_cnt; - int cnt; - boolean_t src_ipif_held = B_FALSE; + ill_t *ill = ipif->ipif_ill; + int err = 0; boolean_t loopback = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + boolean_t update_src_selection = B_TRUE; + ipif_t *tmp_ipif; ip1dbg(("ipif_up_done(%s:%u)\n", ipif->ipif_ill->ill_name, ipif->ipif_id)); + DTRACE_PROBE3(ipif__downup, char *, "ipif_up_done", + ill_t *, ill, ipif_t *, ipif); + /* Check if this is a loopback interface */ if (ipif->ipif_ill->ill_wq == NULL) loopback = B_TRUE; ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + /* * If all other interfaces for this ill are down or DEPRECATED, - * or otherwise unsuitable for source address selection, remove - * any IRE_CACHE entries for this ill to make sure source + * or otherwise unsuitable for source address selection, + * reset the src generation numbers to make sure source * address selection gets to take this new ipif into account. * No need to hold ill_lock while traversing the ipif list since * we are writer @@ -16119,31 +14049,16 @@ ipif_up_done(ipif_t *ipif) (tmp_ipif == ipif)) continue; /* first useable pre-existing interface */ - flush_ire_cache = B_FALSE; + update_src_selection = B_FALSE; break; } - if (flush_ire_cache) - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, ill_ipif_cache_delete, (char *)ill, ill); + if (update_src_selection) + ip_update_source_selection(ill->ill_ipst); - /* - * Figure out which way the send-to queue should go. Only - * IRE_IF_RESOLVER or IRE_IF_NORESOLVER or IRE_LOOPBACK - * should show up here. 
- */ - switch (ill->ill_net_type) { - case IRE_IF_RESOLVER: - stq = ill->ill_rq; - break; - case IRE_IF_NORESOLVER: - case IRE_LOOPBACK: - stq = ill->ill_wq; - break; - default: - return (EINVAL); - } + if (IS_LOOPBACK(ill) || ill->ill_net_type == IRE_IF_NORESOLVER) { + nce_t *loop_nce = NULL; + uint16_t flags = (NCE_F_MYADDR | NCE_F_AUTHORITY | NCE_F_NONUD); - if (IS_LOOPBACK(ill)) { /* * lo0:1 and subsequent ipifs were marked IRE_LOCAL in * ipif_lookup_on_name(), but in the case of zones we can have @@ -16155,29 +14070,130 @@ ipif_up_done(ipif_t *ipif) ipif->ipif_ire_type = IRE_LOOPBACK; else ipif->ipif_ire_type = IRE_LOCAL; + if (ill->ill_net_type != IRE_LOOPBACK) + flags |= NCE_F_PUBLISH; + + /* add unicast nce for the local addr */ + err = nce_lookup_then_add_v4(ill, NULL, + ill->ill_phys_addr_length, &ipif->ipif_lcl_addr, flags, + ND_REACHABLE, &loop_nce); + /* A shared-IP zone sees EEXIST for lo0:N */ + if (err == 0 || err == EEXIST) { + ipif->ipif_added_nce = 1; + loop_nce->nce_ipif_cnt++; + nce_refrele(loop_nce); + err = 0; + } else { + ASSERT(loop_nce == NULL); + return (err); + } } - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST) || - ((ipif->ipif_flags & IPIF_DEPRECATED) && - !(ipif->ipif_flags & IPIF_NOFAILOVER))) { + /* Create all the IREs associated with this interface */ + err = ipif_add_ires_v4(ipif, loopback); + if (err != 0) { /* - * Can't use our source address. Select a different - * source address for the IRE_INTERFACE and IRE_LOCAL + * see comments about return value from + * ip_addr_availability_check() in ipif_add_ires_v4(). */ - src_ipif = ipif_select_source(ipif->ipif_ill, - ipif->ipif_subnet, ipif->ipif_zoneid); - if (src_ipif == NULL) - src_ipif = ipif; /* Last resort */ - else - src_ipif_held = B_TRUE; - } else { - src_ipif = ipif; + if (err != EADDRINUSE) { + (void) ipif_arp_down(ipif); + } else { + /* + * Make IPMP aware of the deleted ipif so that + * the needed ipmp cleanup (e.g., of ipif_bound_ill) + * can be completed. 
Note that we do not want to + * destroy the nce that was created on the ipmp_ill + * for the active copy of the duplicate address in + * use. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + err = EADDRNOTAVAIL; + } + return (err); } - /* Create all the IREs associated with this interface */ + if (ill->ill_ipif_up_count == 1 && !loopback) { + /* Recover any additional IREs entries for this ill */ + (void) ill_recover_saved_ire(ill); + } + + if (ill->ill_need_recover_multicast) { + /* + * Need to recover all multicast memberships in the driver. + * This had to be deferred until we had attached. The same + * code exists in ipif_up_done_v6() to recover IPv6 + * memberships. + * + * Note that it would be preferable to unconditionally do the + * ill_recover_multicast() in ill_dl_up(), but we cannot do + * that since ill_join_allmulti() depends on ill_dl_up being + * set, and it is not set until we receive a DL_BIND_ACK after + * having called ill_dl_up(). + */ + ill_recover_multicast(ill); + } + + if (ill->ill_ipif_up_count == 1) { + /* + * Since the interface is now up, it may now be active. + */ + if (IS_UNDER_IPMP(ill)) + ipmp_ill_refresh_active(ill); + + /* + * If this is an IPMP interface, we may now be able to + * establish ARP entries. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_refresh_arpent(ill->ill_grp); + } + + /* Join the allhosts multicast address */ + ipif_multicast_up(ipif); + + if (!loopback && !update_src_selection && + !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) + ip_update_source_selection(ill->ill_ipst); + + if (!loopback && ipif->ipif_addr_ready) { + /* Broadcast an address mask reply. */ + ipif_mask_reply(ipif); + } + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); + + /* + * This had to be deferred until we had bound. Tell routing sockets and + * others that this interface is up if it looks like the address has + * been validated. 
Otherwise, if it isn't ready yet, wait for + * duplicate address detection to do its thing. + */ + if (ipif->ipif_addr_ready) + ipif_up_notify(ipif); + return (0); +} + +/* + * Add the IREs associated with the ipif. + * Those MUST be explicitly removed in ipif_delete_ires_v4. + */ +static int +ipif_add_ires_v4(ipif_t *ipif, boolean_t loopback) +{ + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *ire_array[20]; + ire_t **irep = ire_array; + ire_t **irep1; + ipaddr_t net_mask = 0; + ipaddr_t subnet_mask, route_mask; + int err; + ire_t *ire_local = NULL; /* LOCAL or LOOPBACK */ + if ((ipif->ipif_lcl_addr != INADDR_ANY) && !(ipif->ipif_flags & IPIF_NOLOCAL)) { - /* * If we're on a labeled system then make sure that zone- * private addresses have proper remote host database entries. @@ -16191,38 +14207,34 @@ ipif_up_done(ipif_t *ipif) err = ip_srcid_insert(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); if (err != 0) { - ip0dbg(("ipif_up_done: srcid_insert %d\n", err)); + ip0dbg(("ipif_add_ires: srcid_insert %d\n", err)); return (err); } /* If the interface address is set, create the local IRE. */ - ip1dbg(("ipif_up_done: 0x%p creating IRE 0x%x for 0x%x\n", - (void *)ipif, - ipif->ipif_ire_type, - ntohl(ipif->ipif_lcl_addr))); - *irep++ = ire_create( + ire_local = ire_create( (uchar_t *)&ipif->ipif_lcl_addr, /* dest address */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* source address */ NULL, /* no gateway */ - &ip_loopback_mtuplus, /* max frag size */ - NULL, - ipif->ipif_rq, /* recv-from queue */ - NULL, /* no send-to queue */ ipif->ipif_ire_type, /* LOCAL or LOOPBACK */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? - RTF_PRIVATE : 0, - &ire_uinfo_null, - NULL, + ipif->ipif_ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? 
+ RTF_PRIVATE : 0) | RTF_KERNEL, NULL, ipst); + ip1dbg(("ipif_add_ires: 0x%p creating IRE %p type 0x%x" + " for 0x%x\n", (void *)ipif, (void *)ire_local, + ipif->ipif_ire_type, + ntohl(ipif->ipif_lcl_addr))); + if (ire_local == NULL) { + ip1dbg(("ipif_up_done: NULL ire_local\n")); + err = ENOMEM; + goto bad; + } } else { ip1dbg(( - "ipif_up_done: not creating IRE %d for 0x%x: flags 0x%x\n", + "ipif_add_ires: not creating IRE %d for 0x%x: flags 0x%x\n", ipif->ipif_ire_type, ntohl(ipif->ipif_lcl_addr), (uint_t)ipif->ipif_flags)); @@ -16249,7 +14261,7 @@ ipif_up_done(ipif_t *ipif) } /* Set up the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ - if (stq != NULL && !(ipif->ipif_flags & IPIF_NOXMIT) && + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && ipif->ipif_subnet != INADDR_ANY) { /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ @@ -16259,7 +14271,7 @@ ipif_up_done(ipif_t *ipif) route_mask = subnet_mask; } - ip1dbg(("ipif_up_done: ipif 0x%p ill 0x%p " + ip1dbg(("ipif_add_ires: ipif 0x%p ill 0x%p " "creating if IRE ill_net_type 0x%x for 0x%x\n", (void *)ipif, (void *)ill, ill->ill_net_type, @@ -16267,20 +14279,12 @@ ipif_up_done(ipif_t *ipif) *irep++ = ire_create( (uchar_t *)&ipif->ipif_subnet, /* dest address */ (uchar_t *)&route_mask, /* mask */ - (uchar_t *)&src_ipif->ipif_src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, - NULL, /* no recv queue */ - stq, /* send-to queue */ + (uchar_t *)&ipif->ipif_lcl_addr, /* gateway */ ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - 0, - 0, - 0, - (ipif->ipif_flags & IPIF_PRIVATE) ? RTF_PRIVATE: 0, - &ire_uinfo_null, - NULL, + ill, + ipif->ipif_zoneid, + ((ipif->ipif_flags & IPIF_PRIVATE) ? + RTF_PRIVATE: 0) | RTF_KERNEL, NULL, ipst); } @@ -16288,11 +14292,10 @@ ipif_up_done(ipif_t *ipif) /* * Create any necessary broadcast IREs. 
*/ - if (ipif->ipif_flags & IPIF_BROADCAST) + if ((ipif->ipif_flags & IPIF_BROADCAST) && + !(ipif->ipif_flags & IPIF_NOXMIT)) irep = ipif_create_bcast_ires(ipif, irep); - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); - /* If an earlier ire_create failed, get out now */ for (irep1 = irep; irep1 > ire_array; ) { irep1--; @@ -16324,14 +14327,9 @@ ipif_up_done(ipif_t *ipif) * ipif. So we don't want to delete it (otherwise the other ipif * would be unable to send packets). * ip_addr_availability_check() identifies this case for us and - * returns EADDRINUSE; we need to turn it into EADDRNOTAVAIL + * returns EADDRINUSE; Caller should turn it into EADDRNOTAVAIL * which is the expected error code. */ - if (err == EADDRINUSE) { - freemsg(ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - err = EADDRNOTAVAIL; - } ill->ill_ipif_up_count--; ipif->ipif_flags &= ~IPIF_UP; goto bad; @@ -16341,19 +14339,33 @@ ipif_up_done(ipif_t *ipif) * Add in all newly created IREs. ire_create_bcast() has * already checked for duplicates of the IRE_BROADCAST type. */ + if (ire_local != NULL) { + ire_local = ire_add(ire_local); +#ifdef DEBUG + if (ire_local != NULL) { + ire_refhold_notr(ire_local); + ire_refrele(ire_local); + } +#endif + } + + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if (ire_local != NULL) + ipif->ipif_ire_local = ire_local; + rw_exit(&ipst->ips_ill_g_lock); + ire_local = NULL; + for (irep1 = irep; irep1 > ire_array; ) { irep1--; - ASSERT(!MUTEX_HELD(&((*irep1)->ire_ipif->ipif_ill->ill_lock))); - /* - * refheld by ire_add. refele towards the end of the func - */ - (void) ire_add(irep1, NULL, NULL, NULL, B_FALSE); + ASSERT(!MUTEX_HELD(&((*irep1)->ire_ill->ill_lock))); + /* refheld by ire_add. 
*/ + *irep1 = ire_add(*irep1); + if (*irep1 != NULL) { + ire_refrele(*irep1); + *irep1 = NULL; + } } - /* Recover any additional IRE_IF_[NO]RESOLVER entries for this ipif */ - ipif_saved_ire_cnt = ipif->ipif_saved_ire_cnt; - ipif_saved_irep = ipif_recover_ire(ipif); - if (!loopback) { /* * If the broadcast address has been set, make sure it makes @@ -16364,9 +14376,9 @@ ipif_up_done(ipif_t *ipif) (ipif->ipif_flags & IPIF_BROADCAST)) { ire_t *ire; - ire = ire_ctable_lookup(ipif->ipif_brd_addr, 0, - IRE_BROADCAST, ipif, ALL_ZONES, - NULL, (MATCH_IRE_TYPE | MATCH_IRE_ILL), ipst); + ire = ire_ftable_lookup_v4(ipif->ipif_brd_addr, 0, 0, + IRE_BROADCAST, ipif->ipif_ill, ALL_ZONES, NULL, + (MATCH_IRE_TYPE | MATCH_IRE_ILL), 0, ipst, NULL); if (ire == NULL) { /* @@ -16383,176 +14395,113 @@ ipif_up_done(ipif_t *ipif) } } - - if (ill->ill_need_recover_multicast) { - /* - * Need to recover all multicast memberships in the driver. - * This had to be deferred until we had attached. The same - * code exists in ipif_up_done_v6() to recover IPv6 - * memberships. - * - * Note that it would be preferable to unconditionally do the - * ill_recover_multicast() in ill_dl_up(), but we cannot do - * that since ill_join_allmulti() depends on ill_dl_up being - * set, and it is not set until we receive a DL_BIND_ACK after - * having called ill_dl_up(). - */ - ill_recover_multicast(ill); - } - - if (ill->ill_ipif_up_count == 1) { - /* - * Since the interface is now up, it may now be active. - */ - if (IS_UNDER_IPMP(ill)) - ipmp_ill_refresh_active(ill); - - /* - * If this is an IPMP interface, we may now be able to - * establish ARP entries. - */ - if (IS_IPMP(ill)) - ipmp_illgrp_refresh_arpent(ill->ill_grp); - } - - /* Join the allhosts multicast address */ - ipif_multicast_up(ipif); - - /* - * See if anybody else would benefit from our new ipif. 
- */ - if (!loopback && - !(ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - ill_update_source_selection(ill); - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - cnt = ipif_saved_ire_cnt; - for (irep1 = ipif_saved_irep; cnt > 0; irep1++, cnt--) { - if (*irep1 != NULL) { - /* was held in ire_add */ - ire_refrele(*irep1); - } - } - - if (!loopback && ipif->ipif_addr_ready) { - /* Broadcast an address mask reply. */ - ipif_mask_reply(ipif); - } - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - if (src_ipif_held) - ipif_refrele(src_ipif); - - /* - * This had to be deferred until we had bound. Tell routing sockets and - * others that this interface is up if it looks like the address has - * been validated. Otherwise, if it isn't ready yet, wait for - * duplicate address detection to do its thing. - */ - if (ipif->ipif_addr_ready) - ipif_up_notify(ipif); return (0); bad: - ip1dbg(("ipif_up_done: FAILED \n")); - + ip1dbg(("ipif_add_ires: FAILED \n")); + if (ire_local != NULL) + ire_delete(ire_local); while (irep > ire_array) { irep--; - if (*irep != NULL) + if (*irep != NULL) { ire_delete(*irep); + } } (void) ip_srcid_remove(&ipif->ipif_v6lcl_addr, ipif->ipif_zoneid, ipst); - if (ipif_saved_irep != NULL) { - kmem_free(ipif_saved_irep, - ipif_saved_ire_cnt * sizeof (ire_t *)); - } - if (src_ipif_held) - ipif_refrele(src_ipif); - - ipif_resolver_down(ipif); return (err); } -/* - * Turn off the ARP with the ILLF_NOARP flag. 
- */ -static int -ill_arp_off(ill_t *ill) +/* Remove all the IREs created by ipif_add_ires_v4 */ +void +ipif_delete_ires_v4(ipif_t *ipif) { - mblk_t *arp_off_mp = NULL; - mblk_t *arp_on_mp = NULL; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; + ipaddr_t net_mask = 0; + ipaddr_t subnet_mask, route_mask; + int match_args; + ire_t *ire; + boolean_t loopback; - ip1dbg(("ill_arp_off(%s)\n", ill->ill_name)); + /* Check if this is a loopback interface */ + loopback = (ipif->ipif_ill->ill_wq == NULL); - ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_MASK | + MATCH_IRE_ZONEONLY; - /* - * If the on message is still around we've already done - * an arp_off without doing an arp_on thus there is no - * work needed. - */ - if (ill->ill_arp_on_mp != NULL) - return (0); + rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); + if ((ire = ipif->ipif_ire_local) != NULL) { + ipif->ipif_ire_local = NULL; + rw_exit(&ipst->ips_ill_g_lock); + /* + * Move count to ipif so we don't loose the count due to + * a down/up dance. 
+ */ + atomic_add_32(&ipif->ipif_ib_pkt_count, ire->ire_ib_pkt_count); - /* - * Allocate an ARP on message (to be saved) and an ARP off message - */ - arp_off_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aroff_template, 0); - if (!arp_off_mp) - return (ENOMEM); + ire_delete(ire); + ire_refrele_notr(ire); + } else { + rw_exit(&ipst->ips_ill_g_lock); + } + + match_args |= MATCH_IRE_GW; - arp_on_mp = ill_arp_alloc(ill, (uchar_t *)&ip_aron_template, 0); - if (!arp_on_mp) - goto failed; + if ((ipif->ipif_lcl_addr != INADDR_ANY) && + !(ipif->ipif_flags & IPIF_NOLOCAL)) { + net_mask = ip_net_mask(ipif->ipif_lcl_addr); + } else { + net_mask = htonl(IN_CLASSA_NET); /* fallback */ + } - ASSERT(ill->ill_arp_on_mp == NULL); - ill->ill_arp_on_mp = arp_on_mp; + subnet_mask = ipif->ipif_net_mask; - /* Send an AR_INTERFACE_OFF request */ - putnext(ill->ill_rq, arp_off_mp); - return (0); -failed: + /* + * If mask was not specified, use natural netmask of + * interface address. Also, store this mask back into the + * ipif struct. + */ + if (subnet_mask == 0) + subnet_mask = net_mask; - if (arp_off_mp) - freemsg(arp_off_mp); - return (ENOMEM); -} + /* Delete the IRE_IF_RESOLVER or IRE_IF_NORESOLVER, as appropriate. */ + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; -/* - * Turn on ARP by turning off the ILLF_NOARP flag. 
- */ -static int -ill_arp_on(ill_t *ill) -{ - mblk_t *mp; + if (!loopback && !(ipif->ipif_flags & IPIF_NOXMIT) && + ipif->ipif_subnet != INADDR_ANY) { + /* ipif_subnet is ipif_pp_dst_addr for pt-pt */ - ip1dbg(("ipif_arp_on(%s)\n", ill->ill_name)); + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + route_mask = IP_HOST_MASK; + } else { + route_mask = subnet_mask; + } - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); + ire = ire_ftable_lookup_v4( + ipif->ipif_subnet, /* dest address */ + route_mask, /* mask */ + ipif->ipif_lcl_addr, /* gateway */ + ill->ill_net_type, /* IF_[NO]RESOLVER */ + ill, + ipif->ipif_zoneid, + NULL, + match_args, + 0, + ipst, + NULL); + ASSERT(ire != NULL); + ire_delete(ire); + ire_refrele(ire); + } - ASSERT(IAM_WRITER_ILL(ill)); /* - * Send an AR_INTERFACE_ON request if we have already done - * an arp_off (which allocated the message). + * Create any necessary broadcast IREs. */ - if (ill->ill_arp_on_mp != NULL) { - mp = ill->ill_arp_on_mp; - ill->ill_arp_on_mp = NULL; - putnext(ill->ill_rq, mp); - } - return (0); + if ((ipif->ipif_flags & IPIF_BROADCAST) && + !(ipif->ipif_flags & IPIF_NOXMIT)) + ipif_delete_bcast_ires(ipif); } /* @@ -16561,49 +14510,72 @@ ill_arp_on(ill_t *ill) * this selection is done regardless of the destination. 
*/ boolean_t -ipif_usesrc_avail(ill_t *ill, zoneid_t zoneid) +ipif_zone_avail(uint_t ifindex, boolean_t isv6, zoneid_t zoneid, + ip_stack_t *ipst) { - uint_t ifindex; - ipif_t *ipif = NULL; - ill_t *uill; - boolean_t isv6; - ip_stack_t *ipst = ill->ill_ipst; + ipif_t *ipif = NULL; + ill_t *uill; - ASSERT(ill != NULL); + ASSERT(ifindex != 0); - isv6 = ill->ill_isv6; - ifindex = ill->ill_usesrc_ifindex; - if (ifindex != 0) { - uill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, - NULL, ipst); - if (uill == NULL) - return (B_FALSE); - mutex_enter(&uill->ill_lock); - for (ipif = uill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) - continue; - if (!(ipif->ipif_flags & IPIF_UP)) - continue; - if (ipif->ipif_zoneid != zoneid) - continue; - if ((isv6 && - IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) || - (ipif->ipif_lcl_addr == INADDR_ANY)) - continue; - mutex_exit(&uill->ill_lock); - ill_refrele(uill); - return (B_TRUE); - } + uill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (uill == NULL) + return (B_FALSE); + + mutex_enter(&uill->ill_lock); + for (ipif = uill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (IPIF_IS_CONDEMNED(ipif)) + continue; + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP)) + continue; + if (ipif->ipif_zoneid != zoneid) + continue; + if (isv6 ? IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : + ipif->ipif_lcl_addr == INADDR_ANY) + continue; mutex_exit(&uill->ill_lock); ill_refrele(uill); + return (B_TRUE); } + mutex_exit(&uill->ill_lock); + ill_refrele(uill); return (B_FALSE); } /* + * Find an ipif with a good local address on the ill+zoneid. 
+ */ +ipif_t * +ipif_good_addr(ill_t *ill, zoneid_t zoneid) +{ + ipif_t *ipif; + + mutex_enter(&ill->ill_lock); + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (IPIF_IS_CONDEMNED(ipif)) + continue; + if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) + continue; + if (!(ipif->ipif_flags & IPIF_UP)) + continue; + if (ipif->ipif_zoneid != zoneid && + ipif->ipif_zoneid != ALL_ZONES && zoneid != ALL_ZONES) + continue; + if (ill->ill_isv6 ? + IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr) : + ipif->ipif_lcl_addr == INADDR_ANY) + continue; + ipif_refhold_locked(ipif); + mutex_exit(&ill->ill_lock); + return (ipif); + } + mutex_exit(&ill->ill_lock); + return (NULL); +} + +/* * IP source address type, sorted from worst to best. For a given type, * always prefer IP addresses on the same subnet. All-zones addresses are * suboptimal because they pose problems with unlabeled destinations. @@ -16615,7 +14587,8 @@ typedef enum { IPIF_DIFFNET_ALLZONES, /* allzones and different subnet */ IPIF_SAMENET_ALLZONES, /* allzones and same subnet */ IPIF_DIFFNET, /* normal and different subnet */ - IPIF_SAMENET /* normal and same subnet */ + IPIF_SAMENET, /* normal and same subnet */ + IPIF_LOCALADDR /* local loopback */ } ipif_type_t; /* @@ -16629,7 +14602,8 @@ typedef enum { * This only occurs when there is no valid source address for the ill. 
*/ ipif_t * -ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) +ipif_select_source_v4(ill_t *ill, ipaddr_t dst, zoneid_t zoneid, + boolean_t allow_usesrc, boolean_t *notreadyp) { ill_t *usill = NULL; ill_t *ipmp_ill = NULL; @@ -16639,9 +14613,9 @@ ipif_select_source(ill_t *ill, ipaddr_t dst, zoneid_t zoneid) ip_stack_t *ipst = ill->ill_ipst; boolean_t samenet; - if (ill->ill_usesrc_ifindex != 0) { + if (ill->ill_usesrc_ifindex != 0 && allow_usesrc) { usill = ill_lookup_on_ifindex(ill->ill_usesrc_ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); + B_FALSE, ipst); if (usill != NULL) ill = usill; /* Select source from usesrc ILL */ else @@ -16705,14 +14679,22 @@ retry: if ((next_ipif = ipif->ipif_next) == NULL) next_ipif = ill->ill_ipif; - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; /* Always skip NOLOCAL and ANYCAST interfaces */ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) continue; - if (!(ipif->ipif_flags & IPIF_UP) || !ipif->ipif_addr_ready) + if (!(ipif->ipif_flags & IPIF_UP)) continue; - if (ipif->ipif_zoneid != zoneid && + + if (!ipif->ipif_addr_ready) { + if (notreadyp != NULL) + *notreadyp = B_TRUE; + continue; + } + + if (zoneid != ALL_ZONES && + ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; @@ -16749,7 +14731,9 @@ retry: samenet = ((ipif->ipif_net_mask & dst) == ipif->ipif_subnet); - if (ipif->ipif_flags & IPIF_DEPRECATED) { + if (ipif->ipif_lcl_addr == dst) { + type = IPIF_LOCALADDR; + } else if (ipif->ipif_flags & IPIF_DEPRECATED) { type = samenet ? 
IPIF_SAMENET_DEPRECATED : IPIF_DIFFNET_DEPRECATED; } else if (ipif->ipif_zoneid == ALL_ZONES) { @@ -16762,14 +14746,14 @@ retry: if (type > best_type) { best_type = type; best_ipif = ipif; - if (best_type == IPIF_SAMENET) + if (best_type == IPIF_LOCALADDR) break; /* can't get better */ } } while ((ipif = next_ipif) != start_ipif); if ((ipif = best_ipif) != NULL) { mutex_enter(&ipif->ipif_ill->ill_lock); - if (!IPIF_CAN_LOOKUP(ipif)) { + if (IPIF_IS_CONDEMNED(ipif)) { mutex_exit(&ipif->ipif_ill->ill_lock); goto retry; } @@ -16783,7 +14767,7 @@ retry: */ if (IS_IPMP(ill) && ipif != NULL) { next_ipif = ipif->ipif_next; - if (next_ipif != NULL && IPIF_CAN_LOOKUP(next_ipif)) + if (next_ipif != NULL && !IPIF_IS_CONDEMNED(next_ipif)) ill->ill_src_ipif = next_ipif; else ill->ill_src_ipif = NULL; @@ -16803,14 +14787,14 @@ retry: if (ipif == NULL) { char buf1[INET6_ADDRSTRLEN]; - ip1dbg(("ipif_select_source(%s, %s) -> NULL\n", + ip1dbg(("ipif_select_source_v4(%s, %s) -> NULL\n", ill->ill_name, inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)))); } else { char buf1[INET6_ADDRSTRLEN]; char buf2[INET6_ADDRSTRLEN]; - ip1dbg(("ipif_select_source(%s, %s) -> %s\n", + ip1dbg(("ipif_select_source_v4(%s, %s) -> %s\n", ipif->ipif_ill->ill_name, inet_ntop(AF_INET, &dst, buf1, sizeof (buf1)), inet_ntop(AF_INET, &ipif->ipif_lcl_addr, @@ -16821,172 +14805,80 @@ retry: } /* - * If old_ipif is not NULL, see if ipif was derived from old - * ipif and if so, recreate the interface route by re-doing - * source address selection. This happens when ipif_down -> - * ipif_update_other_ipifs calls us. + * Pick a source address based on the destination ill and an optional setsrc + * address. + * The result is stored in srcp. If generation is set, then put the source + * generation number there before we look for the source address (to avoid + * missing changes in the set of source addresses. + * If flagsp is set, then us it to pass back ipif_flags. 
* - * If old_ipif is NULL, just redo the source address selection - * if needed. This happens when ipif_up_done calls us. + * If the caller wants to cache the returned source address and detect when + * that might be stale, the caller should pass in a generation argument, + * which the caller can later compare against ips_src_generation + * + * The precedence order for selecting an IPv4 source address is: + * - RTF_SETSRC on the offlink ire always wins. + * - If usrsrc is set, swap the ill to be the usesrc one. + * - If IPMP is used on the ill, select a random address from the most + * preferred ones below: + * 1. If onlink destination, same subnet and not deprecated, not ALL_ZONES + * 2. Not deprecated, not ALL_ZONES + * 3. If onlink destination, same subnet and not deprecated, ALL_ZONES + * 4. Not deprecated, ALL_ZONES + * 5. If onlink destination, same subnet and deprecated + * 6. Deprecated. + * + * We have lower preference for ALL_ZONES IP addresses, + * as they pose problems with unlabeled destinations. + * + * Note that when multiple IP addresses match e.g., #1 we pick + * the first one if IPMP is not in use. With IPMP we randomize. */ -static void -ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif) +int +ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, + ipaddr_t multicast_ifaddr, + zoneid_t zoneid, ip_stack_t *ipst, ipaddr_t *srcp, + uint32_t *generation, uint64_t *flagsp) { - ire_t *ire; - ire_t *ipif_ire; - queue_t *stq; - ipif_t *nipif; - ill_t *ill; - boolean_t need_rele = B_FALSE; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ASSERT(old_ipif == NULL || IAM_WRITER_IPIF(old_ipif)); - ASSERT(IAM_WRITER_IPIF(ipif)); + ipif_t *ipif; + boolean_t notready = B_FALSE; /* Set if !ipif_addr_ready found */ - ill = ipif->ipif_ill; - if (!(ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST|IPIF_DEPRECATED))) { - /* - * Can't possibly have borrowed the source - * from old_ipif. 
- */ - return; - } + if (flagsp != NULL) + *flagsp = 0; /* - * Is there any work to be done? No work if the address - * is INADDR_ANY, loopback or NOLOCAL or ANYCAST ( - * ipif_select_source() does not borrow addresses from - * NOLOCAL and ANYCAST interfaces). + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. */ - if ((old_ipif != NULL) && - ((old_ipif->ipif_lcl_addr == INADDR_ANY) || - (old_ipif->ipif_ill->ill_wq == NULL) || - (old_ipif->ipif_flags & - (IPIF_NOLOCAL|IPIF_ANYCAST)))) { - return; + if (generation != NULL) { + *generation = ipst->ips_src_generation; } - /* - * Perform the same checks as when creating the - * IRE_INTERFACE in ipif_up_done. - */ - if (!(ipif->ipif_flags & IPIF_UP)) - return; - - if ((ipif->ipif_flags & IPIF_NOXMIT) || - (ipif->ipif_subnet == INADDR_ANY)) - return; - - ipif_ire = ipif_to_ire(ipif); - if (ipif_ire == NULL) - return; - - /* - * We know that ipif uses some other source for its - * IRE_INTERFACE. Is it using the source of this - * old_ipif? - */ - if (old_ipif != NULL && - old_ipif->ipif_lcl_addr != ipif_ire->ire_src_addr) { - ire_refrele(ipif_ire); - return; - } - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ipif_recreate_interface_routes: deleting IRE for" - " src %s\n", AF_INET, &ipif_ire->ire_src_addr); - } - - stq = ipif_ire->ire_stq; - - /* - * Can't use our source address. Select a different - * source address for the IRE_INTERFACE. 
- */ - nipif = ipif_select_source(ill, ipif->ipif_subnet, ipif->ipif_zoneid); - if (nipif == NULL) { - /* Last resort - all ipif's have IPIF_NOLOCAL */ - nipif = ipif; - } else { - need_rele = B_TRUE; + if (CLASSD(dst) && multicast_ifaddr != INADDR_ANY) { + *srcp = multicast_ifaddr; + return (0); } - ire = ire_create( - (uchar_t *)&ipif->ipif_subnet, /* dest pref */ - (uchar_t *)&ipif->ipif_net_mask, /* mask */ - (uchar_t *)&nipif->ipif_src_addr, /* src addr */ - NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - NULL, /* no recv from queue */ - stq, /* send-to queue */ - ill->ill_net_type, /* IF_[NO]RESOLVER */ - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - if (ire != NULL) { - ire_t *ret_ire; - int error; - - /* - * We don't need ipif_ire anymore. We need to delete - * before we add so that ire_add does not detect - * duplicates. - */ - ire_delete(ipif_ire); - ret_ire = ire; - error = ire_add(&ret_ire, NULL, NULL, NULL, B_FALSE); - ASSERT(error == 0); - ASSERT(ire == ret_ire); - /* Held in ire_add */ - ire_refrele(ret_ire); + /* Was RTF_SETSRC set on the first IRE in the recursive lookup? */ + if (setsrc != INADDR_ANY) { + *srcp = setsrc; + return (0); } - /* - * Either we are falling through from above or could not - * allocate a replacement. - */ - ire_refrele(ipif_ire); - if (need_rele) - ipif_refrele(nipif); -} - -/* - * This old_ipif is going away. - * - * Determine if any other ipif's are using our address as - * ipif_lcl_addr (due to those being IPIF_NOLOCAL, IPIF_ANYCAST, or - * IPIF_DEPRECATED). - * Find the IRE_INTERFACE for such ipifs and recreate them - * to use an different source address following the rules in - * ipif_up_done. 
- */ -static void -ipif_update_other_ipifs(ipif_t *old_ipif) -{ - ipif_t *ipif; - ill_t *ill; - char buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_IPIF(old_ipif)); - - ill = old_ipif->ipif_ill; - - ip1dbg(("ipif_update_other_ipifs(%s, %s)\n", ill->ill_name, - inet_ntop(AF_INET, &old_ipif->ipif_lcl_addr, buf, sizeof (buf)))); - - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (ipif == old_ipif) - continue; - ipif_recreate_interface_routes(old_ipif, ipif); + ipif = ipif_select_source_v4(ill, dst, zoneid, B_TRUE, ¬ready); + if (ipif == NULL) { + if (notready) + return (ENETDOWN); + else + return (EADDRNOTAVAIL); } + *srcp = ipif->ipif_lcl_addr; + if (flagsp != NULL) + *flagsp = ipif->ipif_flags; + ipif_refrele(ipif); + return (0); } /* ARGSUSED */ @@ -17049,51 +14941,12 @@ ip_sioctl_sifname(ipif_t *dummy_ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, } /* - * Refresh all IRE_BROADCAST entries associated with `ill' to ensure the - * minimum (but complete) set exist. This is necessary when adding or - * removing an interface to/from an IPMP group, since interfaces in an - * IPMP group use the IRE_BROADCAST entries for the IPMP group (whenever - * its test address subnets overlap with IPMP data addresses). It's also - * used to refresh the IRE_BROADCAST entries associated with the IPMP - * interface when the nominated broadcast interface changes. - */ -void -ill_refresh_bcast(ill_t *ill) -{ - ire_t *ire_array[12]; /* max ipif_create_bcast_ires() can create */ - ire_t **irep; - ipif_t *ipif; - - ASSERT(!ill->ill_isv6); - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * Remove any old broadcast IREs. - */ - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_BROADCAST, - ill_broadcast_delete, ill, ill); - - /* - * Create new ones for any ipifs that are up and broadcast-capable. 
- */ - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if ((ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST)) != - (IPIF_UP|IPIF_BROADCAST)) - continue; - - irep = ipif_create_bcast_ires(ipif, ire_array); - while (irep-- > ire_array) { - (void) ire_add(irep, NULL, NULL, NULL, B_FALSE); - if (*irep != NULL) - ire_refrele(*irep); - } - } -} - -/* * Create any IRE_BROADCAST entries for `ipif', and store those entries in - * `irep'. Returns a pointer to the next free `irep' entry (just like - * ire_check_and_create_bcast()). + * `irep'. Returns a pointer to the next free `irep' entry + * A mirror exists in ipif_delete_bcast_ires(). + * + * The management of any "extra" or seemingly duplicate IRE_BROADCASTs is + * done in ire_add. */ static ire_t ** ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) @@ -17101,18 +14954,20 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) ipaddr_t addr; ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); ipaddr_t subnetmask = ipif->ipif_net_mask; - int flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; + ill_t *ill = ipif->ipif_ill; + zoneid_t zoneid = ipif->ipif_zoneid; ip1dbg(("ipif_create_bcast_ires: creating broadcast IREs\n")); ASSERT(ipif->ipif_flags & IPIF_BROADCAST); + ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); if (ipif->ipif_lcl_addr == INADDR_ANY || (ipif->ipif_flags & IPIF_NOLOCAL)) netmask = htonl(IN_CLASSA_NET); /* fallback */ - irep = ire_check_and_create_bcast(ipif, 0, irep, flags); - irep = ire_check_and_create_bcast(ipif, INADDR_BROADCAST, irep, flags); + irep = ire_create_bcast(ill, 0, zoneid, irep); + irep = ire_create_bcast(ill, INADDR_BROADCAST, zoneid, irep); /* * For backward compatibility, we create net broadcast IREs based on @@ -17125,9 +14980,8 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) */ if (netmask < subnetmask) { addr = netmask & ipif->ipif_subnet; - irep = ire_check_and_create_bcast(ipif, addr, irep, flags); - irep = ire_check_and_create_bcast(ipif, ~netmask | addr, irep, - flags); + 
irep = ire_create_bcast(ill, addr, zoneid, irep); + irep = ire_create_bcast(ill, ~netmask | addr, zoneid, irep); } /* @@ -17138,282 +14992,73 @@ ipif_create_bcast_ires(ipif_t *ipif, ire_t **irep) */ if (subnetmask != 0xFFFFFFFF) { addr = ipif->ipif_subnet; - irep = ire_check_and_create_bcast(ipif, addr, irep, flags); - irep = ire_check_and_create_bcast(ipif, ~subnetmask | addr, - irep, flags); + irep = ire_create_bcast(ill, addr, zoneid, irep); + irep = ire_create_bcast(ill, ~subnetmask | addr, zoneid, irep); } return (irep); } /* - * Broadcast IRE info structure used in the functions below. Since we - * allocate BCAST_COUNT of them on the stack, keep the bit layout compact. - */ -typedef struct bcast_ireinfo { - uchar_t bi_type; /* BCAST_* value from below */ - uchar_t bi_willdie:1, /* will this IRE be going away? */ - bi_needrep:1, /* do we need to replace it? */ - bi_haverep:1, /* have we replaced it? */ - bi_pad:5; - ipaddr_t bi_addr; /* IRE address */ - ipif_t *bi_backup; /* last-ditch ipif to replace it on */ -} bcast_ireinfo_t; - -enum { BCAST_ALLONES, BCAST_ALLZEROES, BCAST_NET, BCAST_SUBNET, BCAST_COUNT }; - -/* - * Check if `ipif' needs the dying broadcast IRE described by `bireinfop', and - * return B_TRUE if it should immediately be used to recreate the IRE. 
- */ -static boolean_t -ipif_consider_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop) -{ - ipaddr_t addr; - - ASSERT(!bireinfop->bi_haverep && bireinfop->bi_willdie); - - switch (bireinfop->bi_type) { - case BCAST_NET: - addr = ipif->ipif_subnet & ip_net_mask(ipif->ipif_subnet); - if (addr != bireinfop->bi_addr) - return (B_FALSE); - break; - case BCAST_SUBNET: - if (ipif->ipif_subnet != bireinfop->bi_addr) - return (B_FALSE); - break; - } - - bireinfop->bi_needrep = 1; - if (ipif->ipif_flags & (IPIF_DEPRECATED|IPIF_NOLOCAL|IPIF_ANYCAST)) { - if (bireinfop->bi_backup == NULL) - bireinfop->bi_backup = ipif; - return (B_FALSE); - } - return (B_TRUE); -} - -/* - * Create the broadcast IREs described by `bireinfop' on `ipif', and return - * them ala ire_check_and_create_bcast(). - */ -static ire_t ** -ipif_create_bcast(ipif_t *ipif, bcast_ireinfo_t *bireinfop, ire_t **irep) -{ - ipaddr_t mask, addr; - - ASSERT(!bireinfop->bi_haverep && bireinfop->bi_needrep); - - addr = bireinfop->bi_addr; - irep = ire_create_bcast(ipif, addr, irep); - - switch (bireinfop->bi_type) { - case BCAST_NET: - mask = ip_net_mask(ipif->ipif_subnet); - irep = ire_create_bcast(ipif, addr | ~mask, irep); - break; - case BCAST_SUBNET: - mask = ipif->ipif_net_mask; - irep = ire_create_bcast(ipif, addr | ~mask, irep); - break; - } - - bireinfop->bi_haverep = 1; - return (irep); -} - -/* - * Walk through all of the ipifs on `ill' that will be affected by `test_ipif' - * going away, and determine if any of the broadcast IREs (named by `bireinfop') - * that are going away are still needed. If so, have ipif_create_bcast() - * recreate them (except for the deprecated case, as explained below). 
- */ -static ire_t ** -ill_create_bcast(ill_t *ill, ipif_t *test_ipif, bcast_ireinfo_t *bireinfo, - ire_t **irep) -{ - int i; - ipif_t *ipif; - - ASSERT(!ill->ill_isv6); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - /* - * Skip this ipif if it's (a) the one being taken down, (b) - * not in the same zone, or (c) has no valid local address. - */ - if (ipif == test_ipif || - ipif->ipif_zoneid != test_ipif->ipif_zoneid || - ipif->ipif_subnet == 0 || - (ipif->ipif_flags & (IPIF_UP|IPIF_BROADCAST|IPIF_NOXMIT)) != - (IPIF_UP|IPIF_BROADCAST)) - continue; - - /* - * For each dying IRE that hasn't yet been replaced, see if - * `ipif' needs it and whether the IRE should be recreated on - * `ipif'. If `ipif' is deprecated, ipif_consider_bcast() - * will return B_FALSE even if `ipif' needs the IRE on the - * hopes that we'll later find a needy non-deprecated ipif. - * However, the ipif is recorded in bi_backup for possible - * subsequent use by ipif_check_bcast_ires(). - */ - for (i = 0; i < BCAST_COUNT; i++) { - if (!bireinfo[i].bi_willdie || bireinfo[i].bi_haverep) - continue; - if (!ipif_consider_bcast(ipif, &bireinfo[i])) - continue; - irep = ipif_create_bcast(ipif, &bireinfo[i], irep); - } - - /* - * If we've replaced all of the broadcast IREs that are going - * to be taken down, we know we're done. - */ - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_willdie && !bireinfo[i].bi_haverep) - break; - } - if (i == BCAST_COUNT) - break; - } - return (irep); -} - -/* - * Check if `test_ipif' (which is going away) is associated with any existing - * broadcast IREs, and whether any other ipifs (e.g., on the same ill) were - * using those broadcast IREs. If so, recreate the broadcast IREs on one or - * more of those other ipifs. (The old IREs will be deleted in ipif_down().) - * - * This is necessary because broadcast IREs are shared. 
In particular, a - * given ill has one set of all-zeroes and all-ones broadcast IREs (for every - * zone), plus one set of all-subnet-ones, all-subnet-zeroes, all-net-ones, - * and all-net-zeroes for every net/subnet (and every zone) it has IPIF_UP - * ipifs on. Thus, if there are two IPIF_UP ipifs on the same subnet with the - * same zone, they will share the same set of broadcast IREs. - * - * Note: the upper bound of 12 IREs comes from the worst case of replacing all - * six pairs (loopback and non-loopback) of broadcast IREs (all-zeroes, - * all-ones, subnet-zeroes, subnet-ones, net-zeroes, and net-ones). + * Mirror of ipif_create_bcast_ires() */ static void -ipif_check_bcast_ires(ipif_t *test_ipif) +ipif_delete_bcast_ires(ipif_t *ipif) { - ill_t *ill = test_ipif->ipif_ill; - ire_t *ire, *ire_array[12]; /* see note above */ - ire_t **irep1, **irep = &ire_array[0]; - uint_t i, willdie; - ipaddr_t mask = ip_net_mask(test_ipif->ipif_subnet); - bcast_ireinfo_t bireinfo[BCAST_COUNT]; - - ASSERT(!test_ipif->ipif_isv6); - ASSERT(IAM_WRITER_IPIF(test_ipif)); - - /* - * No broadcast IREs for the LOOPBACK interface - * or others such as point to point and IPIF_NOXMIT. - */ - if (!(test_ipif->ipif_flags & IPIF_BROADCAST) || - (test_ipif->ipif_flags & IPIF_NOXMIT)) - return; - - bzero(bireinfo, sizeof (bireinfo)); - bireinfo[0].bi_type = BCAST_ALLZEROES; - bireinfo[0].bi_addr = 0; - - bireinfo[1].bi_type = BCAST_ALLONES; - bireinfo[1].bi_addr = INADDR_BROADCAST; - - bireinfo[2].bi_type = BCAST_NET; - bireinfo[2].bi_addr = test_ipif->ipif_subnet & mask; - - if (test_ipif->ipif_net_mask != 0) - mask = test_ipif->ipif_net_mask; - bireinfo[3].bi_type = BCAST_SUBNET; - bireinfo[3].bi_addr = test_ipif->ipif_subnet & mask; - - /* - * Figure out what (if any) broadcast IREs will die as a result of - * `test_ipif' going away. If none will die, we're done. 
- */ - for (i = 0, willdie = 0; i < BCAST_COUNT; i++) { - ire = ire_ctable_lookup(bireinfo[i].bi_addr, 0, IRE_BROADCAST, - test_ipif, ALL_ZONES, NULL, - (MATCH_IRE_TYPE | MATCH_IRE_IPIF), ill->ill_ipst); - if (ire != NULL) { - willdie++; - bireinfo[i].bi_willdie = 1; - ire_refrele(ire); - } - } - - if (willdie == 0) - return; - - /* - * Walk through all the ipifs that will be affected by the dying IREs, - * and recreate the IREs as necessary. Note that all interfaces in an - * IPMP illgrp share the same broadcast IREs, and thus the entire - * illgrp must be walked, starting with the IPMP meta-interface (so - * that broadcast IREs end up on it whenever possible). - */ - if (IS_UNDER_IPMP(ill)) - ill = ipmp_illgrp_ipmp_ill(ill->ill_grp); - - irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); + ipaddr_t addr; + ipaddr_t netmask = ip_net_mask(ipif->ipif_lcl_addr); + ipaddr_t subnetmask = ipif->ipif_net_mask; + ill_t *ill = ipif->ipif_ill; + zoneid_t zoneid = ipif->ipif_zoneid; + ire_t *ire; - if (IS_IPMP(ill) || IS_UNDER_IPMP(ill)) { - ipmp_illgrp_t *illg = ill->ill_grp; + ASSERT(ipif->ipif_flags & IPIF_BROADCAST); + ASSERT(!(ipif->ipif_flags & IPIF_NOXMIT)); - ill = list_head(&illg->ig_if); - for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_willdie && - !bireinfo[i].bi_haverep) - break; - } - if (i == BCAST_COUNT) - break; + if (ipif->ipif_lcl_addr == INADDR_ANY || + (ipif->ipif_flags & IPIF_NOLOCAL)) + netmask = htonl(IN_CLASSA_NET); /* fallback */ - irep = ill_create_bcast(ill, test_ipif, bireinfo, irep); - } - } + ire = ire_lookup_bcast(ill, 0, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, INADDR_BROADCAST, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); /* - * Scan through the set of broadcast IREs and see if there are any - * that we need to replace that have not yet been replaced. 
If so, - * replace them using the appropriate backup ipif. + * For backward compatibility, we create net broadcast IREs based on + * the old "IP address class system", since some old machines only + * respond to these class derived net broadcast. However, we must not + * create these net broadcast IREs if the subnetmask is shorter than + * the IP address class based derived netmask. Otherwise, we may + * create a net broadcast address which is the same as an IP address + * on the subnet -- and then TCP will refuse to talk to that address. */ - for (i = 0; i < BCAST_COUNT; i++) { - if (bireinfo[i].bi_needrep && !bireinfo[i].bi_haverep) - irep = ipif_create_bcast(bireinfo[i].bi_backup, - &bireinfo[i], irep); + if (netmask < subnetmask) { + addr = netmask & ipif->ipif_subnet; + ire = ire_lookup_bcast(ill, addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, ~netmask | addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); } /* - * If we can't create all of them, don't add any of them. (Code in - * ip_wput_ire() and ire_to_ill() assumes that we always have a - * non-loopback copy and loopback copy for a given address.) + * Don't create IRE_BROADCAST IREs for the interface if the subnetmask + * is 0xFFFFFFFF, as an IRE_LOCAL for that interface is already + * created. Creating these broadcast IREs will only create confusion + * as `addr' will be the same as the IP address. 
*/ - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (*irep1 == NULL) { - ip0dbg(("ipif_check_bcast_ires: can't create " - "IRE_BROADCAST, memory allocation failure\n")); - while (irep > ire_array) { - irep--; - if (*irep != NULL) - ire_delete(*irep); - } - return; - } - } - - for (irep1 = irep; irep1 > ire_array; ) { - irep1--; - if (ire_add(irep1, NULL, NULL, NULL, B_FALSE) == 0) - ire_refrele(*irep1); /* Held in ire_add */ + if (subnetmask != 0xFFFFFFFF) { + addr = ipif->ipif_subnet; + ire = ire_lookup_bcast(ill, addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); + ire = ire_lookup_bcast(ill, ~subnetmask | addr, zoneid); + ASSERT(ire != NULL); + ire_delete(ire); ire_refrele(ire); } } @@ -17423,7 +15068,7 @@ ipif_check_bcast_ires(ipif_t *test_ipif) * Set IFF_IPV* and ill_isv6 prior to doing the lookup * since ipif_lookup_on_name uses the _isv6 flags when matching. * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. + * ipx_pending_mp and the ioctl will complete in ip_rput. * * Can operate on either a module or a driver queue. * Returns an error if not a module queue. @@ -17485,7 +15130,7 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * We start off as IFF_IPV4 in ipif_allocate and become * IFF_IPV4 or IFF_IPV6 here depending on lifr_flags value. * The only flags that we read from user space are IFF_IPV4, - * IFF_IPV6, IFF_XRESOLV and IFF_BROADCAST. + * IFF_IPV6, and IFF_BROADCAST. * * This ill has not been inserted into the global list. 
* So we are still single threaded and don't need any lock @@ -17502,22 +15147,13 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } new_flags = - lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_XRESOLV|IFF_BROADCAST); + lifr->lifr_flags & (IFF_IPV6|IFF_IPV4|IFF_BROADCAST); if ((new_flags ^ (IFF_IPV6|IFF_IPV4)) == 0) { ip1dbg(("ip_sioctl_slifname: flags must be exactly one of " "IFF_IPV4 or IFF_IPV6\n")); return (EINVAL); } - /* - * Only allow the IFF_XRESOLV flag to be set on IPv6 interfaces. - */ - if ((new_flags & IFF_XRESOLV) && !(new_flags & IFF_IPV6) && - !(ipif->ipif_isv6)) { - ip1dbg(("ip_sioctl_slifname: XRESOLV only allowed on " - "IPv6 interface\n")); - return (EINVAL); - } /* * We always start off as IPv4, so only need to check for IPv6. @@ -17532,11 +15168,6 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, else ipif->ipif_flags &= ~IPIF_BROADCAST; - if ((new_flags & IFF_XRESOLV) != 0) - ill->ill_flags |= ILLF_XRESOLV; - else - ill->ill_flags &= ~ILLF_XRESOLV; - /* We started off as V4. 
*/ if (ill->ill_flags & ILLF_IPV6) { ill->ill_phyint->phyint_illv6 = ill; @@ -17566,23 +15197,17 @@ ip_sioctl_slifname_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, */ ipif_t * ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, - queue_t *q, mblk_t *mp, ipsq_func_t func, int *err, ip_stack_t *ipst) + ip_stack_t *ipst) { ill_t *ill; ipif_t *ipif = NULL; - ASSERT((q == NULL && mp == NULL && func == NULL && err == NULL) || - (q != NULL && mp != NULL && func != NULL && err != NULL)); - - if (err != NULL) - *err = 0; - - ill = ill_lookup_on_ifindex(index, isv6, q, mp, func, err, ipst); + ill = ill_lookup_on_ifindex(index, isv6, ipst); if (ill != NULL) { mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (IPIF_CAN_LOOKUP(ipif) && (zoneid == ALL_ZONES || + if (!IPIF_IS_CONDEMNED(ipif) && (zoneid == ALL_ZONES || zoneid == ipif->ipif_zoneid || ipif->ipif_zoneid == ALL_ZONES)) { ipif_refhold_locked(ipif); @@ -17591,8 +15216,6 @@ ipif_lookup_on_ifindex(uint_t index, boolean_t isv6, zoneid_t zoneid, } mutex_exit(&ill->ill_lock); ill_refrele(ill); - if (ipif == NULL && err != NULL) - *err = ENXIO; } return (ipif); } @@ -17673,6 +15296,8 @@ ip_sioctl_slifindex(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ILL_OTHER(ill)) ip_rts_ifmsg(ILL_OTHER(ill)->ill_ipif, RTSQ_DEFAULT); + /* Perhaps ilgs should use this ill */ + update_conn_ill(NULL, ill->ill_ipst); return (0); } @@ -17764,7 +15389,7 @@ ip_sioctl_slifzone(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, err = ipif_logical_down(ipif, q, mp); if (err == EINPROGRESS) return (err); - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); need_up = B_TRUE; } @@ -17801,6 +15426,9 @@ ip_sioctl_slifzone_tail(ipif_t *ipif, zoneid_t zoneid, /* Update sctp list */ sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); + /* The default multicast interface might have changed */ + ire_increment_multicast_generation(ipst, ipif->ipif_ill->ill_isv6); + if (need_up) { 
/* * Now bring the interface back up. If this @@ -17825,7 +15453,6 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, zone_t *zptr; zone_status_t status; - ASSERT(ipif->ipif_id != 0); ASSERT(ipip->ipi_cmd_type == LIF_CMD); if ((zoneid = lifr->lifr_zoneid) == ALL_ZONES) zoneid = GLOBAL_ZONEID; @@ -17863,7 +15490,7 @@ ip_sioctl_slifzone_restart(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, return (EINVAL); } - ipif_down_tail(ipif); + (void) ipif_down_tail(ipif); return (ip_sioctl_slifzone_tail(ipif, lifr->lifr_zoneid, q, mp, B_TRUE)); @@ -17943,6 +15570,16 @@ ill_prev_usesrc(ill_t *uill) * Release all members of the usesrc group. This routine is called * from ill_delete when the interface being unplumbed is the * group head. + * + * This silently clears the usesrc that ifconfig setup. + * An alternative would be to keep that ifindex, and drop packets on the floor + * since no source address can be selected. + * Even if we keep the current semantics, don't need a lock and a linked list. + * Can walk all the ills checking if they have a ill_usesrc_ifindex matching + * the one that is being removed. Issue is how we return the usesrc users + * (SIOCGLIFSRCOF). We want to be able to find the ills which have an + * ill_usesrc_ifindex matching a target ill. We could also do that with an + * ill walk, but the walker would need to insert in the ioctl response. 
*/ static void ill_disband_usesrc_group(ill_t *uill) @@ -18023,8 +15660,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ip_ioctl_cmd_t *ipip, void *ifreq) { struct lifreq *lifr = (struct lifreq *)ifreq; - boolean_t isv6 = B_FALSE, reset_flg = B_FALSE, - ill_flag_changed = B_FALSE; + boolean_t isv6 = B_FALSE, reset_flg = B_FALSE; ill_t *usesrc_ill, *usesrc_cli_ill = ipif->ipif_ill; int err = 0, ret; uint_t ifindex; @@ -18035,7 +15671,7 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ASSERT(q->q_next == NULL); ASSERT(CONN_Q(q)); - isv6 = (Q_TO_CONN(q))->conn_af_isv6; + isv6 = (Q_TO_CONN(q))->conn_family == AF_INET6; ifindex = lifr->lifr_index; if (ifindex == 0) { @@ -18048,10 +15684,9 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, reset_flg = B_TRUE; } - usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, q, mp, - ip_process_ioctl, &err, ipst); + usesrc_ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); if (usesrc_ill == NULL) { - return (err); + return (ENXIO); } ipsq = ipsq_try_enter(NULL, usesrc_ill, q, mp, ip_process_ioctl, @@ -18101,31 +15736,6 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, usesrc_ill->ill_isv6)); /* - * The next step ensures that no new ires will be created referencing - * the client ill, until the ILL_CHANGING flag is cleared. Then - * we go through an ire walk deleting all ire caches that reference - * the client ill. New ires referencing the client ill that are added - * to the ire table before the ILL_CHANGING flag is set, will be - * cleaned up by the ire walk below. Attempt to add new ires referencing - * the client ill while the ILL_CHANGING flag is set will be failed - * during the ire_add in ire_atomic_start. ire_atomic_start atomically - * checks (under the ill_g_usesrc_lock) that the ire being added - * is not stale, i.e the ire_stq and ire_ipif are consistent and - * belong to the same usesrc group. 
- */ - mutex_enter(&usesrc_cli_ill->ill_lock); - usesrc_cli_ill->ill_state_flags |= ILL_CHANGING; - mutex_exit(&usesrc_cli_ill->ill_lock); - ill_flag_changed = B_TRUE; - - if (ipif->ipif_isv6) - ire_walk_v6(ipif_delete_cache_ire, (char *)usesrc_cli_ill, - ALL_ZONES, ipst); - else - ire_walk_v4(ipif_delete_cache_ire, (char *)usesrc_cli_ill, - ALL_ZONES, ipst); - - /* * ill_g_usesrc_lock global lock protects the ill_usesrc_grp_next * and the ill_usesrc_ifindex fields */ @@ -18169,15 +15779,14 @@ ip_sioctl_slifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, rw_exit(&ipst->ips_ill_g_usesrc_lock); done: - if (ill_flag_changed) { - mutex_enter(&usesrc_cli_ill->ill_lock); - usesrc_cli_ill->ill_state_flags &= ~ILL_CHANGING; - mutex_exit(&usesrc_cli_ill->ill_lock); - } if (ipsq != NULL) ipsq_exit(ipsq); /* The refrele on the lifr_name ipif is done by ip_process_ioctl */ ill_refrele(usesrc_ill); + + /* Let conn_ixa caching know that source address selection changed */ + ip_update_source_selection(ipst); + return (err); } @@ -18384,7 +15993,6 @@ ill_phyint_reinit(ill_t *ill) * Now that the phyint's ifindex has been assigned, complete the * remaining */ - ill->ill_ip_mib->ipIfStatsIfIndex = ill->ill_phyint->phyint_ifindex; if (ill->ill_isv6) { ill->ill_icmp6_mib->ipv6IfIcmpIfIndex = @@ -18449,6 +16057,8 @@ ip_ifname_notify(ill_t *ill, queue_t *q) lifr->lifr_ppa = ill->ill_ppa; lifr->lifr_flags = (ill->ill_flags & (ILLF_IPV4|ILLF_IPV6)); + DTRACE_PROBE3(ill__dlpi, char *, "ip_ifname_notify", + char *, "SIOCSLIFNAME", ill_t *, ill); putnext(q, mp1); } @@ -18503,23 +16113,6 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) */ err = ill_dl_phys(ill, ipif, mp, q); - /* - * If there is no IRE expiration timer running, get one started. - * igmp and mld timers will be triggered by the first multicast - */ - if (ipst->ips_ip_ire_expire_id == 0) { - /* - * acquire the lock and check again. 
- */ - mutex_enter(&ipst->ips_ip_trash_timer_lock); - if (ipst->ips_ip_ire_expire_id == 0) { - ipst->ips_ip_ire_expire_id = timeout( - ip_trash_timer_expire, ipst, - MSEC_TO_TICK(ipst->ips_ip_timer_interval)); - } - mutex_exit(&ipst->ips_ip_trash_timer_lock); - } - if (ill->ill_isv6) { mutex_enter(&ipst->ips_mld_slowtimeout_lock); if (ipst->ips_mld_slowtimeout_id == 0) { @@ -18545,7 +16138,7 @@ ipif_set_values_tail(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) * Common routine for ppa and ifname setting. Should be called exclusive. * * Returns EINPROGRESS when mp has been consumed by queueing it on - * ill_pending_mp and the ioctl will complete in ip_rput. + * ipx_pending_mp and the ioctl will complete in ip_rput. * * NOTE : If ppa is UNIT_MAX, we assign the next valid ppa and return * the new name and new ppa in lifr_name and lifr_ppa respectively. @@ -18576,6 +16169,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) ASSERT((mi_strlen(interf_name) + 1) <= LIFNAMSIZ); ASSERT(ill->ill_ppa == UINT_MAX); + ill->ill_defend_start = ill->ill_defend_count = 0; /* The ppa is sent down by ifconfig or is chosen */ if ((ppa_ptr = ill_get_ppa_ptr(interf_name)) == NULL) { return (EINVAL); @@ -18630,18 +16224,18 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) if (ill->ill_flags & ILLF_IPV6) { ill->ill_isv6 = B_TRUE; + ill_set_inputfn(ill); if (ill->ill_rq != NULL) { ill->ill_rq->q_qinfo = &iprinitv6; - ill->ill_wq->q_qinfo = &ipwinitv6; } /* Keep the !IN6_IS_ADDR_V4MAPPED assertions happy */ ipif->ipif_v6lcl_addr = ipv6_all_zeros; - ipif->ipif_v6src_addr = ipv6_all_zeros; ipif->ipif_v6subnet = ipv6_all_zeros; ipif->ipif_v6net_mask = ipv6_all_zeros; ipif->ipif_v6brd_addr = ipv6_all_zeros; ipif->ipif_v6pp_dst_addr = ipv6_all_zeros; + ill->ill_reachable_retrans_time = ND_RETRANS_TIMER; /* * point-to-point or Non-mulicast capable * interfaces won't do NUD unless explicitly @@ -18670,8 +16264,9 @@ 
ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) ill->ill_flags |= ILLF_ROUTER; } else if (ill->ill_flags & ILLF_IPV4) { ill->ill_isv6 = B_FALSE; + ill_set_inputfn(ill); + ill->ill_reachable_retrans_time = ARP_RETRANS_TIMER; IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6lcl_addr); - IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6src_addr); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6subnet); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6net_mask); IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &ipif->ipif_v6brd_addr); @@ -18783,6 +16378,7 @@ ipif_set_values(queue_t *q, mblk_t *mp, char *interf_name, uint_t *new_ppa_ptr) * restore previous values */ ill->ill_isv6 = B_FALSE; + ill_set_inputfn(ill); } return (error); } @@ -18810,95 +16406,11 @@ ipif_init(ip_stack_t *ipst) } /* - * Lookup the ipif corresponding to the onlink destination address. For - * point-to-point interfaces, it matches with remote endpoint destination - * address. For point-to-multipoint interfaces it only tries to match the - * destination with the interface's subnet address. The longest, most specific - * match is found to take care of such rare network configurations like - - * le0: 129.146.1.1/16 - * le1: 129.146.2.2/24 - * - * This is used by SO_DONTROUTE and IP_NEXTHOP. Since neither of those are - * supported on underlying interfaces in an IPMP group, underlying interfaces - * are ignored when looking up a match. (If we didn't ignore them, we'd - * risk using a test address as a source for outgoing traffic.) 
- */ -ipif_t * -ipif_lookup_onlink_addr(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) -{ - ipif_t *ipif, *best_ipif; - ill_t *ill; - ill_walk_context_t ctx; - - ASSERT(zoneid != ALL_ZONES); - best_ipif = NULL; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_V4(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - if (IS_UNDER_IPMP(ill)) - continue; - mutex_enter(&ill->ill_lock); - for (ipif = ill->ill_ipif; ipif != NULL; - ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) - continue; - if (ipif->ipif_zoneid != zoneid && - ipif->ipif_zoneid != ALL_ZONES) - continue; - /* - * Point-to-point case. Look for exact match with - * destination address. - */ - if (ipif->ipif_flags & IPIF_POINTOPOINT) { - if (ipif->ipif_pp_dst_addr == addr) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - if (best_ipif != NULL) - ipif_refrele(best_ipif); - return (ipif); - } - } else if (ipif->ipif_subnet == (addr & - ipif->ipif_net_mask)) { - /* - * Point-to-multipoint case. Looping through to - * find the most specific match. If there are - * multiple best match ipif's then prefer ipif's - * that are UP. If there is only one best match - * ipif and it is DOWN we must still return it. - */ - if ((best_ipif == NULL) || - (ipif->ipif_net_mask > - best_ipif->ipif_net_mask) || - ((ipif->ipif_net_mask == - best_ipif->ipif_net_mask) && - ((ipif->ipif_flags & IPIF_UP) && - (!(best_ipif->ipif_flags & IPIF_UP))))) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - rw_exit(&ipst->ips_ill_g_lock); - if (best_ipif != NULL) - ipif_refrele(best_ipif); - best_ipif = ipif; - rw_enter(&ipst->ips_ill_g_lock, - RW_READER); - mutex_enter(&ill->ill_lock); - } - } - } - mutex_exit(&ill->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - return (best_ipif); -} - -/* * Save enough information so that we can recreate the IRE if * the interface goes down and then up. 
*/ -static void -ipif_save_ire(ipif_t *ipif, ire_t *ire) +void +ill_save_ire(ill_t *ill, ire_t *ire) { mblk_t *save_mp; @@ -18910,115 +16422,148 @@ ipif_save_ire(ipif_t *ipif, ire_t *ire) ifrt = (ifrt_t *)save_mp->b_rptr; bzero(ifrt, sizeof (ifrt_t)); ifrt->ifrt_type = ire->ire_type; - ifrt->ifrt_addr = ire->ire_addr; - ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; - ifrt->ifrt_src_addr = ire->ire_src_addr; - ifrt->ifrt_mask = ire->ire_mask; + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(!ill->ill_isv6); + ifrt->ifrt_addr = ire->ire_addr; + ifrt->ifrt_gateway_addr = ire->ire_gateway_addr; + ifrt->ifrt_setsrc_addr = ire->ire_setsrc_addr; + ifrt->ifrt_mask = ire->ire_mask; + } else { + ASSERT(ill->ill_isv6); + ifrt->ifrt_v6addr = ire->ire_addr_v6; + /* ire_gateway_addr_v6 can change due to RTM_CHANGE */ + mutex_enter(&ire->ire_lock); + ifrt->ifrt_v6gateway_addr = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + ifrt->ifrt_v6setsrc_addr = ire->ire_setsrc_addr_v6; + ifrt->ifrt_v6mask = ire->ire_mask_v6; + } ifrt->ifrt_flags = ire->ire_flags; - ifrt->ifrt_max_frag = ire->ire_max_frag; - mutex_enter(&ipif->ipif_saved_ire_lock); - save_mp->b_cont = ipif->ipif_saved_ire_mp; - ipif->ipif_saved_ire_mp = save_mp; - ipif->ipif_saved_ire_cnt++; - mutex_exit(&ipif->ipif_saved_ire_lock); + ifrt->ifrt_zoneid = ire->ire_zoneid; + mutex_enter(&ill->ill_saved_ire_lock); + save_mp->b_cont = ill->ill_saved_ire_mp; + ill->ill_saved_ire_mp = save_mp; + ill->ill_saved_ire_cnt++; + mutex_exit(&ill->ill_saved_ire_lock); } } -static void -ipif_remove_ire(ipif_t *ipif, ire_t *ire) +/* + * Remove one entry from ill_saved_ire_mp. 
+ */ +void +ill_remove_saved_ire(ill_t *ill, ire_t *ire) { mblk_t **mpp; mblk_t *mp; ifrt_t *ifrt; - /* Remove from ipif_saved_ire_mp list if it is there */ - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mpp = &ipif->ipif_saved_ire_mp; *mpp != NULL; + /* Remove from ill_saved_ire_mp list if it is there */ + mutex_enter(&ill->ill_saved_ire_lock); + for (mpp = &ill->ill_saved_ire_mp; *mpp != NULL; mpp = &(*mpp)->b_cont) { + in6_addr_t gw_addr_v6; + /* - * On a given ipif, the triple of address, gateway and - * mask is unique for each saved IRE (in the case of - * ordinary interface routes, the gateway address is - * all-zeroes). + * On a given ill, the tuple of address, gateway, mask, + * ire_type, and zoneid is unique for each saved IRE. */ mp = *mpp; ifrt = (ifrt_t *)mp->b_rptr; - if (ifrt->ifrt_addr == ire->ire_addr && + /* ire_gateway_addr_v6 can change - need lock */ + mutex_enter(&ire->ire_lock); + gw_addr_v6 = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + + if (ifrt->ifrt_zoneid != ire->ire_zoneid || + ifrt->ifrt_type != ire->ire_type) + continue; + + if (ill->ill_isv6 ? + (IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, + &ire->ire_addr_v6) && + IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, + &gw_addr_v6) && + IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6mask, + &ire->ire_mask_v6)) : + (ifrt->ifrt_addr == ire->ire_addr && ifrt->ifrt_gateway_addr == ire->ire_gateway_addr && - ifrt->ifrt_mask == ire->ire_mask) { + ifrt->ifrt_mask == ire->ire_mask)) { *mpp = mp->b_cont; - ipif->ipif_saved_ire_cnt--; + ill->ill_saved_ire_cnt--; freeb(mp); break; } } - mutex_exit(&ipif->ipif_saved_ire_lock); + mutex_exit(&ill->ill_saved_ire_lock); } /* * IP multirouting broadcast routes handling * Append CGTP broadcast IREs to regular ones created * at ifconfig time. + * The usage is a route add <cgtp_bc> <nic_bc> -multirt i.e., both + * the destination and the gateway are broadcast addresses. 
+ * The caller has verified that the destination is an IRE_BROADCAST and that + * RTF_MULTIRT was set. Here if the gateway is a broadcast address, then + * we create a MULTIRT IRE_BROADCAST. + * Note that the IRE_HOST created by ire_rt_add doesn't get found by anything + * since the IRE_BROADCAST takes precedence; ire_add_v4 does head insertion. */ static void -ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) +ip_cgtp_bcast_add(ire_t *ire, ip_stack_t *ipst) { ire_t *ire_prim; ASSERT(ire != NULL); - ASSERT(ire_dst != NULL); - ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, + IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, ipst, + NULL); if (ire_prim != NULL) { /* * We are in the special case of broadcasts for * CGTP. We add an IRE_BROADCAST that holds * the RTF_MULTIRT flag, the destination - * address of ire_dst and the low level + * address and the low level * info of ire_prim. In other words, CGTP * broadcast is added to the redundant ipif. 
*/ - ipif_t *ipif_prim; + ill_t *ill_prim; ire_t *bcast_ire; - ipif_prim = ire_prim->ire_ipif; + ill_prim = ire_prim->ire_ill; - ip2dbg(("ip_cgtp_filter_bcast_add: " - "ire_dst %p, ire_prim %p, ipif_prim %p\n", - (void *)ire_dst, (void *)ire_prim, - (void *)ipif_prim)); + ip2dbg(("ip_cgtp_filter_bcast_add: ire_prim %p, ill_prim %p\n", + (void *)ire_prim, (void *)ill_prim)); bcast_ire = ire_create( (uchar_t *)&ire->ire_addr, (uchar_t *)&ip_g_all_ones, - (uchar_t *)&ire_dst->ire_src_addr, (uchar_t *)&ire->ire_gateway_addr, - &ipif_prim->ipif_mtu, - NULL, - ipif_prim->ipif_rq, - ipif_prim->ipif_wq, IRE_BROADCAST, - ipif_prim, - 0, - 0, - 0, - ire->ire_flags, - &ire_uinfo_null, - NULL, + ill_prim, + GLOBAL_ZONEID, /* CGTP is only for the global zone */ + ire->ire_flags | RTF_KERNEL, NULL, ipst); + /* + * Here we assume that ire_add does head insertion so that + * the added IRE_BROADCAST comes before the existing IRE_HOST. + */ if (bcast_ire != NULL) { - - if (ire_add(&bcast_ire, NULL, NULL, NULL, - B_FALSE) == 0) { + if (ire->ire_flags & RTF_SETSRC) { + bcast_ire->ire_setsrc_addr = + ire->ire_setsrc_addr; + } + bcast_ire = ire_add(bcast_ire); + if (bcast_ire != NULL) { ip2dbg(("ip_cgtp_filter_bcast_add: " "added bcast_ire %p\n", (void *)bcast_ire)); - ipif_save_ire(bcast_ire->ire_ipif, - bcast_ire); + ill_save_ire(ill_prim, bcast_ire); ire_refrele(bcast_ire); } } @@ -19028,430 +16573,52 @@ ip_cgtp_bcast_add(ire_t *ire, ire_t *ire_dst, ip_stack_t *ipst) /* * IP multirouting broadcast routes handling - * Remove the broadcast ire + * Remove the broadcast ire. + * The usage is a route delete <cgtp_bc> <nic_bc> -multirt i.e., both + * the destination and the gateway are broadcast addresses. + * The caller has only verified that RTF_MULTIRT was set. We check + * that the destination is broadcast and that the gateway is a broadcast + * address, and if so delete the IRE added by ip_cgtp_bcast_add(). 
*/ static void ip_cgtp_bcast_delete(ire_t *ire, ip_stack_t *ipst) { - ire_t *ire_dst; - ASSERT(ire != NULL); - ire_dst = ire_ctable_lookup(ire->ire_addr, 0, IRE_BROADCAST, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (ire_dst != NULL) { + + if (ip_type_v4(ire->ire_addr, ipst) == IRE_BROADCAST) { ire_t *ire_prim; - ire_prim = ire_ctable_lookup(ire->ire_gateway_addr, 0, - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); + ire_prim = ire_ftable_lookup_v4(ire->ire_gateway_addr, 0, 0, + IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, 0, + ipst, NULL); if (ire_prim != NULL) { - ipif_t *ipif_prim; + ill_t *ill_prim; ire_t *bcast_ire; - ipif_prim = ire_prim->ire_ipif; + ill_prim = ire_prim->ire_ill; ip2dbg(("ip_cgtp_filter_bcast_delete: " - "ire_dst %p, ire_prim %p, ipif_prim %p\n", - (void *)ire_dst, (void *)ire_prim, - (void *)ipif_prim)); - - bcast_ire = ire_ctable_lookup(ire->ire_addr, - ire->ire_gateway_addr, - IRE_BROADCAST, - ipif_prim, ALL_ZONES, - NULL, - MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_IPIF | - MATCH_IRE_MASK, ipst); + "ire_prim %p, ill_prim %p\n", + (void *)ire_prim, (void *)ill_prim)); + + bcast_ire = ire_ftable_lookup_v4(ire->ire_addr, 0, + ire->ire_gateway_addr, IRE_BROADCAST, + ill_prim, ALL_ZONES, NULL, + MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_ILL | + MATCH_IRE_MASK, 0, ipst, NULL); if (bcast_ire != NULL) { ip2dbg(("ip_cgtp_filter_bcast_delete: " "looked up bcast_ire %p\n", (void *)bcast_ire)); - ipif_remove_ire(bcast_ire->ire_ipif, + ill_remove_saved_ire(bcast_ire->ire_ill, bcast_ire); ire_delete(bcast_ire); ire_refrele(bcast_ire); } ire_refrele(ire_prim); } - ire_refrele(ire_dst); - } -} - -/* - * IPsec hardware acceleration capabilities related functions. - */ - -/* - * Free a per-ill IPsec capabilities structure. 
- */ -static void -ill_ipsec_capab_free(ill_ipsec_capab_t *capab) -{ - if (capab->auth_hw_algs != NULL) - kmem_free(capab->auth_hw_algs, capab->algs_size); - if (capab->encr_hw_algs != NULL) - kmem_free(capab->encr_hw_algs, capab->algs_size); - if (capab->encr_algparm != NULL) - kmem_free(capab->encr_algparm, capab->encr_algparm_size); - kmem_free(capab, sizeof (ill_ipsec_capab_t)); -} - -/* - * Allocate a new per-ill IPsec capabilities structure. This structure - * is specific to an IPsec protocol (AH or ESP). It is implemented as - * an array which specifies, for each algorithm, whether this algorithm - * is supported by the ill or not. - */ -static ill_ipsec_capab_t * -ill_ipsec_capab_alloc(void) -{ - ill_ipsec_capab_t *capab; - uint_t nelems; - - capab = kmem_zalloc(sizeof (ill_ipsec_capab_t), KM_NOSLEEP); - if (capab == NULL) - return (NULL); - - /* we need one bit per algorithm */ - nelems = MAX_IPSEC_ALGS / BITS(ipsec_capab_elem_t); - capab->algs_size = nelems * sizeof (ipsec_capab_elem_t); - - /* allocate memory to store algorithm flags */ - capab->encr_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); - if (capab->encr_hw_algs == NULL) - goto nomem; - capab->auth_hw_algs = kmem_zalloc(capab->algs_size, KM_NOSLEEP); - if (capab->auth_hw_algs == NULL) - goto nomem; - /* - * Leave encr_algparm NULL for now since we won't need it half - * the time - */ - return (capab); - -nomem: - ill_ipsec_capab_free(capab); - return (NULL); -} - -/* - * Resize capability array. Since we're exclusive, this is OK. 
- */ -static boolean_t -ill_ipsec_capab_resize_algparm(ill_ipsec_capab_t *capab, int algid) -{ - ipsec_capab_algparm_t *nalp, *oalp; - uint32_t olen, nlen; - - oalp = capab->encr_algparm; - olen = capab->encr_algparm_size; - - if (oalp != NULL) { - if (algid < capab->encr_algparm_end) - return (B_TRUE); - } - - nlen = (algid + 1) * sizeof (*nalp); - nalp = kmem_zalloc(nlen, KM_NOSLEEP); - if (nalp == NULL) - return (B_FALSE); - - if (oalp != NULL) { - bcopy(oalp, nalp, olen); - kmem_free(oalp, olen); - } - capab->encr_algparm = nalp; - capab->encr_algparm_size = nlen; - capab->encr_algparm_end = algid + 1; - - return (B_TRUE); -} - -/* - * Compare the capabilities of the specified ill with the protocol - * and algorithms specified by the SA passed as argument. - * If they match, returns B_TRUE, B_FALSE if they do not match. - * - * The ill can be passed as a pointer to it, or by specifying its index - * and whether it is an IPv6 ill (ill_index and ill_isv6 arguments). - * - * Called by ipsec_out_is_accelerated() do decide whether an outbound - * packet is eligible for hardware acceleration, and by - * ill_ipsec_capab_send_all() to decide whether a SA must be sent down - * to a particular ill. - */ -boolean_t -ipsec_capab_match(ill_t *ill, uint_t ill_index, boolean_t ill_isv6, - ipsa_t *sa, netstack_t *ns) -{ - boolean_t sa_isv6; - uint_t algid; - struct ill_ipsec_capab_s *cpp; - boolean_t need_refrele = B_FALSE; - ip_stack_t *ipst = ns->netstack_ip; - - if (ill == NULL) { - ill = ill_lookup_on_ifindex(ill_index, ill_isv6, NULL, - NULL, NULL, NULL, ipst); - if (ill == NULL) { - ip0dbg(("ipsec_capab_match: ill doesn't exist\n")); - return (B_FALSE); - } - need_refrele = B_TRUE; - } - - /* - * Use the address length specified by the SA to determine - * if it corresponds to a IPv6 address, and fail the matching - * if the isv6 flag passed as argument does not match. - * Note: this check is used for SADB capability checking before - * sending SA information to an ill. 
- */ - sa_isv6 = (sa->ipsa_addrfam == AF_INET6); - if (sa_isv6 != ill_isv6) - /* protocol mismatch */ - goto done; - - /* - * Check if the ill supports the protocol, algorithm(s) and - * key size(s) specified by the SA, and get the pointers to - * the algorithms supported by the ill. - */ - switch (sa->ipsa_type) { - - case SADB_SATYPE_ESP: - if (!(ill->ill_capabilities & ILL_CAPAB_ESP)) - /* ill does not support ESP acceleration */ - goto done; - cpp = ill->ill_ipsec_capab_esp; - algid = sa->ipsa_auth_alg; - if (!IPSEC_ALG_IS_ENABLED(algid, cpp->auth_hw_algs)) - goto done; - algid = sa->ipsa_encr_alg; - if (!IPSEC_ALG_IS_ENABLED(algid, cpp->encr_hw_algs)) - goto done; - if (algid < cpp->encr_algparm_end) { - ipsec_capab_algparm_t *alp = &cpp->encr_algparm[algid]; - if (sa->ipsa_encrkeybits < alp->minkeylen) - goto done; - if (sa->ipsa_encrkeybits > alp->maxkeylen) - goto done; - } - break; - - case SADB_SATYPE_AH: - if (!(ill->ill_capabilities & ILL_CAPAB_AH)) - /* ill does not support AH acceleration */ - goto done; - if (!IPSEC_ALG_IS_ENABLED(sa->ipsa_auth_alg, - ill->ill_ipsec_capab_ah->auth_hw_algs)) - goto done; - break; } - - if (need_refrele) - ill_refrele(ill); - return (B_TRUE); -done: - if (need_refrele) - ill_refrele(ill); - return (B_FALSE); -} - -/* - * Add a new ill to the list of IPsec capable ills. - * Called from ill_capability_ipsec_ack() when an ACK was received - * indicating that IPsec hardware processing was enabled for an ill. - * - * ill must point to the ill for which acceleration was enabled. - * dl_cap must be set to DL_CAPAB_IPSEC_AH or DL_CAPAB_IPSEC_ESP. 
- */ -static void -ill_ipsec_capab_add(ill_t *ill, uint_t dl_cap, boolean_t sadb_resync) -{ - ipsec_capab_ill_t **ills, *cur_ill, *new_ill; - uint_t sa_type; - uint_t ipproto; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT((dl_cap == DL_CAPAB_IPSEC_AH) || - (dl_cap == DL_CAPAB_IPSEC_ESP)); - - switch (dl_cap) { - case DL_CAPAB_IPSEC_AH: - sa_type = SADB_SATYPE_AH; - ills = &ipst->ips_ipsec_capab_ills_ah; - ipproto = IPPROTO_AH; - break; - case DL_CAPAB_IPSEC_ESP: - sa_type = SADB_SATYPE_ESP; - ills = &ipst->ips_ipsec_capab_ills_esp; - ipproto = IPPROTO_ESP; - break; - } - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); - - /* - * Add ill index to list of hardware accelerators. If - * already in list, do nothing. - */ - for (cur_ill = *ills; cur_ill != NULL && - (cur_ill->ill_index != ill->ill_phyint->phyint_ifindex || - cur_ill->ill_isv6 != ill->ill_isv6); cur_ill = cur_ill->next) - ; - - if (cur_ill == NULL) { - /* if this is a new entry for this ill */ - new_ill = kmem_zalloc(sizeof (ipsec_capab_ill_t), KM_NOSLEEP); - if (new_ill == NULL) { - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - - new_ill->ill_index = ill->ill_phyint->phyint_ifindex; - new_ill->ill_isv6 = ill->ill_isv6; - new_ill->next = *ills; - *ills = new_ill; - } else if (!sadb_resync) { - /* not resync'ing SADB and an entry exists for this ill */ - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - - if (ipst->ips_ipcl_proto_fanout_v6[ipproto].connf_head != NULL) - /* - * IPsec module for protocol loaded, initiate dump - * of the SADB to this ill. - */ - sadb_ill_download(ill, sa_type); -} - -/* - * Remove an ill from the list of IPsec capable ills. - */ -static void -ill_ipsec_capab_delete(ill_t *ill, uint_t dl_cap) -{ - ipsec_capab_ill_t **ills, *cur_ill, *prev_ill; - ip_stack_t *ipst = ill->ill_ipst; - - ASSERT(dl_cap == DL_CAPAB_IPSEC_AH || - dl_cap == DL_CAPAB_IPSEC_ESP); - - ills = (dl_cap == DL_CAPAB_IPSEC_AH) ? 
&ipst->ips_ipsec_capab_ills_ah : - &ipst->ips_ipsec_capab_ills_esp; - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_WRITER); - - prev_ill = NULL; - for (cur_ill = *ills; cur_ill != NULL && (cur_ill->ill_index != - ill->ill_phyint->phyint_ifindex || cur_ill->ill_isv6 != - ill->ill_isv6); prev_ill = cur_ill, cur_ill = cur_ill->next) - ; - if (cur_ill == NULL) { - /* entry not found */ - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - return; - } - if (prev_ill == NULL) { - /* entry at front of list */ - *ills = NULL; - } else { - prev_ill->next = cur_ill->next; - } - kmem_free(cur_ill, sizeof (ipsec_capab_ill_t)); - rw_exit(&ipst->ips_ipsec_capab_ills_lock); -} - -/* - * Called by SADB to send a DL_CONTROL_REQ message to every ill - * supporting the specified IPsec protocol acceleration. - * sa_type must be SADB_SATYPE_AH or SADB_SATYPE_ESP. - * We free the mblk and, if sa is non-null, release the held referece. - */ -void -ill_ipsec_capab_send_all(uint_t sa_type, mblk_t *mp, ipsa_t *sa, - netstack_t *ns) -{ - ipsec_capab_ill_t *ici, *cur_ici; - ill_t *ill; - mblk_t *nmp, *mp_ship_list = NULL, *next_mp; - ip_stack_t *ipst = ns->netstack_ip; - - ici = (sa_type == SADB_SATYPE_AH) ? ipst->ips_ipsec_capab_ills_ah : - ipst->ips_ipsec_capab_ills_esp; - - rw_enter(&ipst->ips_ipsec_capab_ills_lock, RW_READER); - - for (cur_ici = ici; cur_ici != NULL; cur_ici = cur_ici->next) { - ill = ill_lookup_on_ifindex(cur_ici->ill_index, - cur_ici->ill_isv6, NULL, NULL, NULL, NULL, ipst); - - /* - * Handle the case where the ill goes away while the SADB is - * attempting to send messages. If it's going away, it's - * nuking its shadow SADB, so we don't care.. - */ - - if (ill == NULL) - continue; - - if (sa != NULL) { - /* - * Make sure capabilities match before - * sending SA to ill. 
- */ - if (!ipsec_capab_match(ill, cur_ici->ill_index, - cur_ici->ill_isv6, sa, ipst->ips_netstack)) { - ill_refrele(ill); - continue; - } - - mutex_enter(&sa->ipsa_lock); - sa->ipsa_flags |= IPSA_F_HW; - mutex_exit(&sa->ipsa_lock); - } - - /* - * Copy template message, and add it to the front - * of the mblk ship list. We want to avoid holding - * the ipsec_capab_ills_lock while sending the - * message to the ills. - * - * The b_next and b_prev are temporarily used - * to build a list of mblks to be sent down, and to - * save the ill to which they must be sent. - */ - nmp = copymsg(mp); - if (nmp == NULL) { - ill_refrele(ill); - continue; - } - ASSERT(nmp->b_next == NULL && nmp->b_prev == NULL); - nmp->b_next = mp_ship_list; - mp_ship_list = nmp; - nmp->b_prev = (mblk_t *)ill; - } - - rw_exit(&ipst->ips_ipsec_capab_ills_lock); - - for (nmp = mp_ship_list; nmp != NULL; nmp = next_mp) { - /* restore the mblk to a sane state */ - next_mp = nmp->b_next; - nmp->b_next = NULL; - ill = (ill_t *)nmp->b_prev; - nmp->b_prev = NULL; - - ill_dlpi_send(ill, nmp); - ill_refrele(ill); - } - - if (sa != NULL) - IPSA_REFRELE(sa); - freemsg(mp); } /* @@ -19531,71 +16698,79 @@ ip_ipmp_v6intfid(ill_t *ill, in6_addr_t *v6addr) addr[0] &= ~0x2; /* set local bit */ } -/* ARGSUSED */ -static boolean_t -ip_ether_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) +/* + * Map the multicast in6_addr_t in m_ip6addr to the physaddr for ethernet. + */ +static void +ip_ether_v6_mapping(ill_t *ill, uchar_t *m_ip6addr, uchar_t *m_physaddr) { - /* - * Multicast address mappings used over Ethernet/802.X. - * This address is used as a base for mappings. - */ - static uint8_t ipv6_g_phys_multi_addr[] = {0x33, 0x33, 0x00, - 0x00, 0x00, 0x00}; + phyint_t *phyi = ill->ill_phyint; /* - * Extract low order 32 bits from IPv6 multicast address. - * Or that into the link layer address, starting from the - * second byte. 
+ * Check PHYI_MULTI_BCAST and length of physical + * address to determine if we use the mapping or the + * broadcast address. */ - *hw_start = 2; - v6_extract_mask->s6_addr32[0] = 0; - v6_extract_mask->s6_addr32[1] = 0; - v6_extract_mask->s6_addr32[2] = 0; - v6_extract_mask->s6_addr32[3] = 0xffffffffU; - bcopy(ipv6_g_phys_multi_addr, maddr, lla_length); - return (B_TRUE); + if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || + ill->ill_phys_addr_length != ETHERADDRL) { + ip_mbcast_mapping(ill, m_ip6addr, m_physaddr); + return; + } + m_physaddr[0] = 0x33; + m_physaddr[1] = 0x33; + m_physaddr[2] = m_ip6addr[12]; + m_physaddr[3] = m_ip6addr[13]; + m_physaddr[4] = m_ip6addr[14]; + m_physaddr[5] = m_ip6addr[15]; } /* - * Indicate by return value whether multicast is supported. If not, - * this code should not touch/change any parameters. + * Map the multicast ipaddr_t in m_ipaddr to the physaddr for ethernet. */ -/* ARGSUSED */ -static boolean_t -ip_ether_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_ether_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { + phyint_t *phyi = ill->ill_phyint; + /* - * Multicast address mappings used over Ethernet/802.X. - * This address is used as a base for mappings. + * Check PHYI_MULTI_BCAST and length of physical + * address to determine if we use the mapping or the + * broadcast address. 
*/ - static uint8_t ip_g_phys_multi_addr[] = { 0x01, 0x00, 0x5e, - 0x00, 0x00, 0x00 }; - - if (phys_length != ETHERADDRL) - return (B_FALSE); - - *extract_mask = htonl(0x007fffff); - *hw_start = 2; - bcopy(ip_g_phys_multi_addr, maddr, ETHERADDRL); - return (B_TRUE); + if ((phyi->phyint_flags & PHYI_MULTI_BCAST) != 0 || + ill->ill_phys_addr_length != ETHERADDRL) { + ip_mbcast_mapping(ill, m_ipaddr, m_physaddr); + return; + } + m_physaddr[0] = 0x01; + m_physaddr[1] = 0x00; + m_physaddr[2] = 0x5e; + m_physaddr[3] = m_ipaddr[1] & 0x7f; + m_physaddr[4] = m_ipaddr[2]; + m_physaddr[5] = m_ipaddr[3]; } /* ARGSUSED */ -static boolean_t -ip_nodef_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_mbcast_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - return (B_FALSE); -} + /* + * for the MULTI_BCAST case and other cases when we want to + * use the link-layer broadcast address for multicast. + */ + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; -/* ARGSUSED */ -static boolean_t -ip_nodef_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) -{ - return (B_FALSE); + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + + dlur->dl_dest_addr_offset + ill->ill_sap_length; + } + + bcopy(bphys_addr, m_physaddr, ill->ill_phys_addr_length); } /* @@ -19624,6 +16799,7 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) } /* + * Map the multicast ipaddr_t in m_ipaddr to the physaddr for InfiniBand. * Note on mapping from multicast IP addresses to IPoIB multicast link * addresses. IPoIB multicast link addresses are based on IBA link addresses. * The format of an IPoIB multicast address is: @@ -19637,72 +16813,70 @@ ip_ib_v6intfid(ill_t *ill, in6_addr_t *v6addr) * network interface. 
They can be ascertained from the broadcast address. * The Sign. part is the signature, and is 401B for IPv4 and 601B for IPv6. */ - -static boolean_t -ip_ib_v6mapinfo(uint_t lla_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, in6_addr_t *v6_extract_mask) +static void +ip_ib_v4_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - /* - * Base IPoIB IPv6 multicast address used for mappings. - * Does not contain the IBA scope/Pkey values. - */ - static uint8_t ipv6_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, - 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, + static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, + 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; + + bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); /* - * Extract low order 80 bits from IPv6 multicast address. - * Or that into the link layer address, starting from the - * sixth byte. + * RFC 4391: IPv4 MGID is 28-bit long. */ - *hw_start = 6; - bcopy(ipv6_g_phys_ibmulti_addr, maddr, lla_length); + m_physaddr[16] = m_ipaddr[0] & 0x0f; + m_physaddr[17] = m_ipaddr[1]; + m_physaddr[18] = m_ipaddr[2]; + m_physaddr[19] = m_ipaddr[3]; + + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + } /* * Now fill in the IBA scope/Pkey values from the broadcast address. 
*/ - *(maddr + 5) = *(bphys_addr + 5); - *(maddr + 8) = *(bphys_addr + 8); - *(maddr + 9) = *(bphys_addr + 9); - - v6_extract_mask->s6_addr32[0] = 0; - v6_extract_mask->s6_addr32[1] = htonl(0x0000ffff); - v6_extract_mask->s6_addr32[2] = 0xffffffffU; - v6_extract_mask->s6_addr32[3] = 0xffffffffU; - return (B_TRUE); + m_physaddr[5] = bphys_addr[5]; + m_physaddr[8] = bphys_addr[8]; + m_physaddr[9] = bphys_addr[9]; } -static boolean_t -ip_ib_v4mapinfo(uint_t phys_length, uint8_t *bphys_addr, uint8_t *maddr, - uint32_t *hw_start, ipaddr_t *extract_mask) +static void +ip_ib_v6_mapping(ill_t *ill, uchar_t *m_ipaddr, uchar_t *m_physaddr) { - /* - * Base IPoIB IPv4 multicast address used for mappings. - * Does not contain the IBA scope/Pkey values. - */ static uint8_t ipv4_g_phys_ibmulti_addr[] = { 0x00, 0xff, 0xff, 0xff, - 0xff, 0x10, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00, + 0xff, 0x10, 0x60, 0x1b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; + uint8_t *bphys_addr; + dl_unitdata_req_t *dlur; - if (phys_length != sizeof (ipv4_g_phys_ibmulti_addr)) - return (B_FALSE); + bcopy(ipv4_g_phys_ibmulti_addr, m_physaddr, ill->ill_phys_addr_length); /* - * Extract low order 28 bits from IPv4 multicast address. - * Or that into the link layer address, starting from the - * sixteenth byte. + * RFC 4391: IPv4 MGID is 80-bit long. */ - *extract_mask = htonl(0x0fffffff); - *hw_start = 16; - bcopy(ipv4_g_phys_ibmulti_addr, maddr, phys_length); + bcopy(&m_ipaddr[6], &m_physaddr[10], 10); + dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; + if (ill->ill_sap_length < 0) { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset; + } else { + bphys_addr = (uchar_t *)dlur + dlur->dl_dest_addr_offset + + ill->ill_sap_length; + } /* * Now fill in the IBA scope/Pkey values from the broadcast address. 
*/ - *(maddr + 5) = *(bphys_addr + 5); - *(maddr + 8) = *(bphys_addr + 8); - *(maddr + 9) = *(bphys_addr + 9); - return (B_TRUE); + m_physaddr[5] = bphys_addr[5]; + m_physaddr[8] = bphys_addr[8]; + m_physaddr[9] = bphys_addr[9]; } /* @@ -19758,56 +16932,34 @@ ip_ipv4_v6destintfid(ill_t *ill, in6_addr_t *v6addr) } /* - * Returns B_TRUE if an ipif is present in the given zone, matching some flags - * (typically IPIF_UP). If ipifp is non-null, the held ipif is returned there. - * This works for both IPv4 and IPv6; if the passed-in ill is v6, the ipif with - * the link-local address is preferred. + * Lookup an ill and verify that the zoneid has an ipif on that ill. + * Returns an held ill, or NULL. */ -boolean_t -ipif_lookup_zoneid(ill_t *ill, zoneid_t zoneid, int flags, ipif_t **ipifp) +ill_t * +ill_lookup_on_ifindex_zoneid(uint_t index, zoneid_t zoneid, boolean_t isv6, + ip_stack_t *ipst) { + ill_t *ill; ipif_t *ipif; - ipif_t *maybe_ipif = NULL; - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - if (ipifp != NULL) - *ipifp = NULL; - return (B_FALSE); - } + ill = ill_lookup_on_ifindex(index, isv6, ipst); + if (ill == NULL) + return (NULL); + mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { - if (!IPIF_CAN_LOOKUP(ipif)) + if (IPIF_IS_CONDEMNED(ipif)) continue; if (zoneid != ALL_ZONES && ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) continue; - if ((ipif->ipif_flags & flags) != flags) - continue; - if (ipifp == NULL) { - mutex_exit(&ill->ill_lock); - ASSERT(maybe_ipif == NULL); - return (B_TRUE); - } - if (!ill->ill_isv6 || - IN6_IS_ADDR_LINKLOCAL(&ipif->ipif_v6src_addr)) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - *ipifp = ipif; - return (B_TRUE); - } - if (maybe_ipif == NULL) - maybe_ipif = ipif; - } - if (ipifp != NULL) { - if (maybe_ipif != NULL) - ipif_refhold_locked(maybe_ipif); - *ipifp = maybe_ipif; + 
mutex_exit(&ill->ill_lock); + return (ill); } mutex_exit(&ill->ill_lock); - return (maybe_ipif != NULL); + ill_refrele(ill); + return (NULL); } /* @@ -19822,8 +16974,7 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); if (ill == NULL) return (NULL); @@ -19849,19 +17000,52 @@ ipif_getby_indexes(uint_t ifindex, uint_t lifidx, boolean_t isv6, } /* - * Flush the fastpath by deleting any nce's that are waiting for the fastpath, - * There is one exceptions IRE_BROADCAST are difficult to recreate, - * so instead we just nuke their nce_fp_mp's; see ndp_fastpath_flush() - * for details. + * Set ill_inputfn based on the current know state. + * This needs to be called when any of the factors taken into + * account changes. */ void -ill_fastpath_flush(ill_t *ill) +ill_set_inputfn(ill_t *ill) { - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ill->ill_ipst; - nce_fastpath_list_dispatch(ill, NULL, NULL); - ndp_walk_common((ill->ill_isv6 ? ipst->ips_ndp6 : ipst->ips_ndp4), - ill, (pfi_t)ndp_fastpath_flush, NULL, B_TRUE); + if (ill->ill_isv6) { + if (is_system_labeled()) + ill->ill_inputfn = ill_input_full_v6; + else + ill->ill_inputfn = ill_input_short_v6; + } else { + if (is_system_labeled()) + ill->ill_inputfn = ill_input_full_v4; + else if (ill->ill_dhcpinit != 0) + ill->ill_inputfn = ill_input_full_v4; + else if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head + != NULL) + ill->ill_inputfn = ill_input_full_v4; + else if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) + ill->ill_inputfn = ill_input_full_v4; + else + ill->ill_inputfn = ill_input_short_v4; + } +} + +/* + * Re-evaluate ill_inputfn for all the IPv4 ills. + * Used when RSVP and CGTP comes and goes. 
+ */ +void +ill_set_inputfn_all(ip_stack_t *ipst) +{ + ill_walk_context_t ctx; + ill_t *ill; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = ILL_START_WALK_V4(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) + ill_set_inputfn(ill); + + rw_exit(&ipst->ips_ill_g_lock); } /* @@ -19897,6 +17081,10 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) } ipsq_current_start(ipsq, ill->ill_ipif, 0); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); /* * If we can quiesce the ill, then set the address. If not, then @@ -19923,8 +17111,8 @@ ill_set_phys_addr(ill_t *ill, mblk_t *mp) * are passed (linked by b_cont), since we sometimes need to save two distinct * copies in the ill_t, and our context doesn't permit sleeping or allocation * failure (we'll free the other copy if it's not needed). Since the ill_t - * is quiesced, we know any stale IREs with the old address information have - * already been removed, so we don't need to call ill_fastpath_flush(). + * is quiesced, we know any stale nce's with the old address information have + * already been removed, so we don't need to call nce_flush(). 
*/ /* ARGSUSED */ static void @@ -19934,6 +17122,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) mblk_t *addrmp2 = unlinkb(addrmp); dl_notify_ind_t *dlindp = (dl_notify_ind_t *)addrmp->b_rptr; uint_t addrlen, addroff; + int status; ASSERT(IAM_WRITER_IPSQ(ipsq)); @@ -19962,7 +17151,7 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) ill->ill_phys_addr = addrmp->b_rptr + addroff; ill->ill_phys_addr_mp = addrmp; ill->ill_phys_addr_length = addrlen; - if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + if (ill->ill_isv6) ill_set_ndmp(ill, addrmp2, addroff, addrlen); else freemsg(addrmp2); @@ -19978,10 +17167,15 @@ ill_set_phys_addr_tail(ipsq_t *ipsq, queue_t *q, mblk_t *addrmp, void *dummy) /* * If there are ipifs to bring up, ill_up_ipifs() will return * EINPROGRESS, and ipsq_current_finish() will be called by - * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is + * ip_rput_dlpi_writer() or arp_bringup_done() when the last ipif is * brought up. */ - if (ill_up_ipifs(ill, q, addrmp) != EINPROGRESS) + status = ill_up_ipifs(ill, q, addrmp); + mutex_enter(&ill->ill_lock); + if (ill->ill_dl_up) + ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; + mutex_exit(&ill->ill_lock); + if (status != EINPROGRESS) ipsq_current_finish(ipsq); } @@ -20009,6 +17203,11 @@ ill_replumb(ill_t *ill, mblk_t *mp) ipsq_current_start(ipsq, ill->ill_ipif, 0); + mutex_enter(&ill->ill_lock); + ill->ill_state_flags |= ILL_DOWN_IN_PROGRESS; + /* no more nce addition allowed */ + mutex_exit(&ill->ill_lock); + /* * If we can quiesce the ill, then continue. If not, then * ill_replumb_tail() will be called from ipif_ill_refrele_tail(). 
@@ -20034,14 +17233,32 @@ static void ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) { ill_t *ill = q->q_ptr; + int err; + conn_t *connp = NULL; ASSERT(IAM_WRITER_IPSQ(ipsq)); - - ill_down_ipifs_tail(ill); - freemsg(ill->ill_replumb_mp); ill->ill_replumb_mp = copyb(mp); + if (ill->ill_replumb_mp == NULL) { + /* out of memory */ + ipsq_current_finish(ipsq); + return; + } + + mutex_enter(&ill->ill_lock); + ill->ill_up_ipifs = ipsq_pending_mp_add(NULL, ill->ill_ipif, + ill->ill_rq, ill->ill_replumb_mp, 0); + mutex_exit(&ill->ill_lock); + + if (!ill->ill_up_ipifs) { + /* already closing */ + ipsq_current_finish(ipsq); + return; + } + ill->ill_replumbing = 1; + err = ill_down_ipifs_tail(ill); + /* * Successfully quiesced and brought down the interface, now we send * the DL_NOTE_REPLUMB_DONE message down to the driver. Reuse the @@ -20055,15 +17272,23 @@ ill_replumb_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy) ill_dlpi_send(ill, mp); /* - * If there are ipifs to bring up, ill_up_ipifs() will return - * EINPROGRESS, and ipsq_current_finish() will be called by - * ip_rput_dlpi_writer() or ip_arp_done() when the last ipif is - * brought up. + * For IPv4, we would usually get EINPROGRESS because the ETHERTYPE_ARP + * streams have to be unbound. When all the DLPI exchanges are done, + * ipsq_current_finish() will be called by arp_bringup_done(). The + * remainder of ipif bringup via ill_up_ipifs() will also be done in + * arp_bringup_done(). 
*/ - if (ill->ill_replumb_mp == NULL || - ill_up_ipifs(ill, q, ill->ill_replumb_mp) != EINPROGRESS) { - ipsq_current_finish(ipsq); + ASSERT(ill->ill_replumb_mp != NULL); + if (err == EINPROGRESS) + return; + else + ill->ill_replumb_mp = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(connp == NULL); + if (err == 0 && ill->ill_replumb_mp != NULL && + ill_up_ipifs(ill, q, ill->ill_replumb_mp) == EINPROGRESS) { + return; } + ipsq_current_finish(ipsq); } /* @@ -20342,6 +17567,338 @@ fail: "information for %s (ENOMEM)\n", str, ill->ill_name)); } +static int +ipif_arp_up_done_tail(ipif_t *ipif, enum ip_resolver_action res_act) +{ + int err = 0; + const in_addr_t *addr = NULL; + nce_t *nce = NULL; + ill_t *ill = ipif->ipif_ill; + ill_t *bound_ill; + boolean_t added_ipif = B_FALSE; + uint16_t state; + uint16_t flags; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up_done_tail", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_lcl_addr != INADDR_ANY) { + addr = &ipif->ipif_lcl_addr; + } + + if ((ipif->ipif_flags & IPIF_UNNUMBERED) || addr == NULL) { + if (res_act != Res_act_initial) + return (EINVAL); + } + + if (addr != NULL) { + ipmp_illgrp_t *illg = ill->ill_grp; + + /* add unicast nce for the local addr */ + + if (IS_IPMP(ill)) { + /* + * If we're here via ipif_up(), then the ipif + * won't be bound yet -- add it to the group, + * which will bind it if possible. (We would + * add it in ipif_up(), but deleting on failure + * there is gruesome.) If we're here via + * ipmp_ill_bind_ipif(), then the ipif has + * already been added to the group and we + * just need to use the binding. + */ + if ((bound_ill = ipmp_ipif_bound_ill(ipif)) == NULL) { + bound_ill = ipmp_illgrp_add_ipif(illg, ipif); + if (bound_ill == NULL) { + /* + * We couldn't bind the ipif to an ill + * yet, so we have nothing to publish. + * Mark the address as ready and return. 
+ */ + ipif->ipif_addr_ready = 1; + return (0); + } + added_ipif = B_TRUE; + } + } else { + bound_ill = ill; + } + + flags = (NCE_F_MYADDR | NCE_F_PUBLISH | NCE_F_AUTHORITY | + NCE_F_NONUD); + /* + * If this is an initial bring-up (or the ipif was never + * completely brought up), do DAD. Otherwise, we're here + * because IPMP has rebound an address to this ill: send + * unsolicited advertisements (ARP announcements) to + * inform others. + */ + if (res_act == Res_act_initial || !ipif->ipif_addr_ready) { + state = ND_UNCHANGED; /* compute in nce_add_common() */ + } else { + state = ND_REACHABLE; + flags |= NCE_F_UNSOL_ADV; + } + +retry: + err = nce_lookup_then_add_v4(ill, + bound_ill->ill_phys_addr, bound_ill->ill_phys_addr_length, + addr, flags, state, &nce); + + /* + * note that we may encounter EEXIST if we are moving + * the nce as a result of a rebind operation. + */ + switch (err) { + case 0: + ipif->ipif_added_nce = 1; + nce->nce_ipif_cnt++; + break; + case EEXIST: + ip1dbg(("ipif_arp_up: NCE already exists for %s\n", + ill->ill_name)); + if (!NCE_MYADDR(nce->nce_common)) { + /* + * A leftover nce from before this address + * existed + */ + ncec_delete(nce->nce_common); + nce_refrele(nce); + nce = NULL; + goto retry; + } + if ((ipif->ipif_flags & IPIF_POINTOPOINT) == 0) { + nce_refrele(nce); + nce = NULL; + ip1dbg(("ipif_arp_up: NCE already exists " + "for %s:%u\n", ill->ill_name, + ipif->ipif_id)); + goto arp_up_done; + } + /* + * Duplicate local addresses are permissible for + * IPIF_POINTOPOINT interfaces which will get marked + * IPIF_UNNUMBERED later in + * ip_addr_availability_check(). + * + * The nce_ipif_cnt field tracks the number of + * ipifs that have nce_addr as their local address. 
+ */ + ipif->ipif_addr_ready = 1; + ipif->ipif_added_nce = 1; + nce->nce_ipif_cnt++; + err = 0; + break; + default: + ASSERT(nce == NULL); + goto arp_up_done; + } + if (arp_no_defense) { + if ((ipif->ipif_flags & IPIF_UP) && + !ipif->ipif_addr_ready) + ipif_up_notify(ipif); + ipif->ipif_addr_ready = 1; + } + } else { + /* zero address. nothing to publish */ + ipif->ipif_addr_ready = 1; + } + if (nce != NULL) + nce_refrele(nce); +arp_up_done: + if (added_ipif && err != 0) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + return (err); +} + +int +ipif_arp_up(ipif_t *ipif, enum ip_resolver_action res_act, boolean_t was_dup) +{ + int err = 0; + ill_t *ill = ipif->ipif_ill; + boolean_t first_interface, wait_for_dlpi = B_FALSE; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_arp_up", + ill_t *, ill, ipif_t *, ipif); + + /* + * need to bring up ARP or setup mcast mapping only + * when the first interface is coming UP. + */ + first_interface = (ill->ill_ipif_up_count == 0 && + ill->ill_ipif_dup_count == 0 && !was_dup); + + if (res_act == Res_act_initial && first_interface) { + /* + * Send ATTACH + BIND + */ + err = arp_ll_up(ill); + if (err != EINPROGRESS && err != 0) + return (err); + + /* + * Add NCE for local address. Start DAD. + * we'll wait to hear that DAD has finished + * before using the interface. + */ + if (err == EINPROGRESS) + wait_for_dlpi = B_TRUE; + } + + if (!wait_for_dlpi) + (void) ipif_arp_up_done_tail(ipif, res_act); + + return (!wait_for_dlpi ? 0 : EINPROGRESS); +} + +/* + * Finish processing of "arp_up" after all the DLPI message + * exchanges have completed between arp and the driver. 
+ */ +void +arp_bringup_done(ill_t *ill, int err) +{ + mblk_t *mp1; + ipif_t *ipif; + conn_t *connp = NULL; + ipsq_t *ipsq; + queue_t *q; + + ip1dbg(("arp_bringup_done(%s)\n", ill->ill_name)); + + ASSERT(IAM_WRITER_ILL(ill)); + + ipsq = ill->ill_phyint->phyint_ipsq; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); + if (mp1 == NULL) /* bringup was aborted by the user */ + return; + + /* + * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * must have an associated conn_t. Otherwise, we're bringing this + * interface back up as part of handling an asynchronous event (e.g., + * physical address change). + */ + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + ASSERT(connp != NULL); + q = CONNP_TO_WQ(connp); + } else { + ASSERT(connp == NULL); + q = ill->ill_rq; + } + if (err == 0) { + if (ipif->ipif_isv6) { + if ((err = ipif_up_done_v6(ipif)) != 0) + ip0dbg(("arp_bringup_done: init failed\n")); + } else { + err = ipif_arp_up_done_tail(ipif, Res_act_initial); + if (err != 0 || (err = ipif_up_done(ipif)) != 0) + ip0dbg(("arp_bringup_done: init failed\n")); + } + } else { + ip0dbg(("arp_bringup_done: DL_BIND_REQ failed\n")); + } + + if ((err == 0) && (ill->ill_up_ipifs)) { + err = ill_up_ipifs(ill, q, mp1); + if (err == EINPROGRESS) + return; + } + + /* + * If we have a moved ipif to bring up, and everything has succeeded + * to this point, bring it up on the IPMP ill. Otherwise, leave it + * down -- the admin can try to bring it up by hand if need be. + */ + if (ill->ill_move_ipif != NULL) { + ipif = ill->ill_move_ipif; + ip1dbg(("bringing up ipif %p on ill %s\n", (void *)ipif, + ipif->ipif_ill->ill_name)); + ill->ill_move_ipif = NULL; + if (err == 0) { + err = ipif_up(ipif, q, mp1); + if (err == EINPROGRESS) + return; + } + } + + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. 
+ * Otherwise, the operation will be stuck forever in the ipsq. + */ + ASSERT(err != EINPROGRESS); + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + DTRACE_PROBE4(ipif__ioctl, char *, "arp_bringup_done finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, + ill_t *, ill, ipif_t *, ipif); + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + } else { + ipsq_current_finish(ipsq); + } +} + +/* + * Finish processing of arp replumb after all the DLPI message + * exchanges have completed between arp and the driver. + */ +void +arp_replumb_done(ill_t *ill, int err) +{ + mblk_t *mp1; + ipif_t *ipif; + conn_t *connp = NULL; + ipsq_t *ipsq; + queue_t *q; + + ASSERT(IAM_WRITER_ILL(ill)); + + ipsq = ill->ill_phyint->phyint_ipsq; + ipif = ipsq->ipsq_xop->ipx_pending_ipif; + mp1 = ipsq_pending_mp_get(ipsq, &connp); + ASSERT(!((mp1 != NULL) ^ (ipif != NULL))); + if (mp1 == NULL) { + ip0dbg(("arp_replumb_done: bringup aborted ioctl %x\n", + ipsq->ipsq_xop->ipx_current_ioctl)); + /* bringup was aborted by the user */ + return; + } + /* + * If an IOCTL is waiting on this (ipsq_current_ioctl != 0), then we + * must have an associated conn_t. Otherwise, we're bringing this + * interface back up as part of handling an asynchronous event (e.g., + * physical address change). + */ + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + ASSERT(connp != NULL); + q = CONNP_TO_WQ(connp); + } else { + ASSERT(connp == NULL); + q = ill->ill_rq; + } + if ((err == 0) && (ill->ill_up_ipifs)) { + err = ill_up_ipifs(ill, q, mp1); + if (err == EINPROGRESS) + return; + } + /* + * The operation must complete without EINPROGRESS since + * ipsq_pending_mp_get() has removed the mblk from ipsq_pending_mp. + * Otherwise, the operation will be stuck forever in the ipsq. 
+ */ + ASSERT(err != EINPROGRESS); + if (ipsq->ipsq_xop->ipx_current_ioctl != 0) { + DTRACE_PROBE4(ipif__ioctl, char *, + "arp_replumb_done finish", + int, ipsq->ipsq_xop->ipx_current_ioctl, + ill_t *, ill, ipif_t *, ipif); + ip_ioctl_finish(q, mp1, err, NO_COPYOUT, ipsq); + } else { + ipsq_current_finish(ipsq); + } +} + void ipif_up_notify(ipif_t *ipif) { @@ -20610,3 +18167,48 @@ ip_sioctl_ilb_cmd(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, done: return (ret); } + +/* Remove all cache entries for this logical interface */ +void +ipif_nce_down(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + nce_t *nce; + + DTRACE_PROBE3(ipif__downup, char *, "ipif_nce_down", + ill_t *, ill, ipif_t *, ipif); + if (ipif->ipif_added_nce) { + if (ipif->ipif_isv6) + nce = nce_lookup_v6(ill, &ipif->ipif_v6lcl_addr); + else + nce = nce_lookup_v4(ill, &ipif->ipif_lcl_addr); + if (nce != NULL) { + if (--nce->nce_ipif_cnt == 0) + ncec_delete(nce->nce_common); + ipif->ipif_added_nce = 0; + nce_refrele(nce); + } else { + /* + * nce may already be NULL because it was already + * flushed, e.g., due to a call to nce_flush + */ + ipif->ipif_added_nce = 0; + } + } + /* + * Make IPMP aware of the deleted data address. + */ + if (IS_IPMP(ill)) + ipmp_illgrp_del_ipif(ill->ill_grp, ipif); + + /* + * Remove all other nces dependent on this ill when the last ipif + * is going away. + */ + if (ill->ill_ipif_up_count == 0) { + ncec_walk(ill, (pfi_t)ncec_delete_per_ill, + (uchar_t *)ill, ill->ill_ipst); + if (IS_UNDER_IPMP(ill)) + nce_flush(ill, B_TRUE); + } +} diff --git a/usr/src/uts/common/inet/ip/ip_input.c b/usr/src/uts/common/inet/ip/ip_input.c new file mode 100644 index 0000000000..d47670f85d --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_input.c @@ -0,0 +1,3095 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/dlpi.h> +#include <sys/stropts.h> +#include <sys/sysmacros.h> +#include <sys/strsubr.h> +#include <sys/strlog.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/xti_inet.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kobj.h> +#include <sys/modctl.h> +#include <sys/atomic.h> +#include <sys/policy.h> +#include <sys/priv.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/vtrace.h> +#include <sys/isa_defs.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include 
<inet/ip6_asp.h> +#include <inet/optcom.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <inet/ilb_ip.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <sys/ethernet.h> +#include <net/if_types.h> +#include <sys/cpuvar.h> + +#include <ipp/ipp.h> +#include <ipp/ipp_impl.h> +#include <ipp/ipgpc/ipgpc.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#include <rpc/pmap_prot.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static void ip_input_local_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); + +static void ip_input_broadcast_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); +static void ip_input_multicast_v4(ire_t *, mblk_t *, ipha_t *, + ip_recv_attr_t *); + +#pragma inline(ip_input_common_v4, ip_input_local_v4, ip_forward_xmit_v4) + +/* + * Direct read side procedure capable of dealing with chains. GLDv3 based + * drivers call this function directly with mblk chains while STREAMS + * read side procedure ip_rput() calls this for single packet with ip_ring + * set to NULL to process one packet at a time. + * + * The ill will always be valid if this function is called directly from + * the driver. + * + * If ip_input() is called from GLDv3: + * + * - This must be a non-VLAN IP stream. + * - 'mp' is either an untagged or a special priority-tagged packet. 
+ * - Any VLAN tag that was in the MAC header has been stripped. + * + * If the IP header in packet is not 32-bit aligned, every message in the + * chain will be aligned before further operations. This is required on SPARC + * platform. + */ +void +ip_input(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip) +{ + (void) ip_input_common_v4(ill, ip_ring, mp_chain, mhip, NULL, NULL, + NULL); +} + +/* + * ip_accept_tcp() - This function is called by the squeue when it retrieves + * a chain of packets in the poll mode. The packets have gone through the + * data link processing but not IP processing. For performance and latency + * reasons, the squeue wants to process the chain in line instead of feeding + * it back via ip_input path. + * + * We set up the ip_recv_attr_t with IRAF_TARGET_SQP to that ip_fanout_v4 + * will pass back any TCP packets matching the target sqp to + * ip_input_common_v4 using ira_target_sqp_mp. Other packets are handled by + * ip_input_v4 and ip_fanout_v4 as normal. + * The TCP packets that match the target squeue are returned to the caller + * as a b_next chain after each packet has been prepend with an mblk + * from ip_recv_attr_to_mblk. + */ +mblk_t * +ip_accept_tcp(ill_t *ill, ill_rx_ring_t *ip_ring, squeue_t *target_sqp, + mblk_t *mp_chain, mblk_t **last, uint_t *cnt) +{ + return (ip_input_common_v4(ill, ip_ring, mp_chain, NULL, target_sqp, + last, cnt)); +} + +/* + * Used by ip_input and ip_accept_tcp + * The last three arguments are only used by ip_accept_tcp, and mhip is + * only used by ip_input. 
+ */ +mblk_t * +ip_input_common_v4(ill_t *ill, ill_rx_ring_t *ip_ring, mblk_t *mp_chain, + struct mac_header_info_s *mhip, squeue_t *target_sqp, + mblk_t **last, uint_t *cnt) +{ + mblk_t *mp; + ipha_t *ipha; + ip_recv_attr_t iras; /* Receive attributes */ + rtc_t rtc; + iaflags_t chain_flags = 0; /* Fixed for chain */ + mblk_t *ahead = NULL; /* Accepted head */ + mblk_t *atail = NULL; /* Accepted tail */ + uint_t acnt = 0; /* Accepted count */ + + ASSERT(mp_chain != NULL); + ASSERT(ill != NULL); + + /* These ones do not change as we loop over packets */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_sqp = NULL; + iras.ira_ring = ip_ring; + /* For ECMP and outbound transmit ring selection */ + iras.ira_xmit_hint = ILL_RING_TO_XMIT_HINT(ip_ring); + + iras.ira_target_sqp = target_sqp; + iras.ira_target_sqp_mp = NULL; + if (target_sqp != NULL) + chain_flags |= IRAF_TARGET_SQP; + + /* + * We try to have a mhip pointer when possible, but + * it might be NULL in some cases. In those cases we + * have to assume unicast. + */ + iras.ira_mhip = mhip; + iras.ira_flags = 0; + if (mhip != NULL) { + switch (mhip->mhi_dsttype) { + case MAC_ADDRTYPE_MULTICAST : + chain_flags |= IRAF_L2DST_MULTICAST; + break; + case MAC_ADDRTYPE_BROADCAST : + chain_flags |= IRAF_L2DST_BROADCAST; + break; + } + } + + /* + * Initialize the one-element route cache. + * + * We do ire caching from one iteration to + * another. In the event the packet chain contains + * all packets from the same dst, this caching saves + * an ire_route_recursive for each of the succeeding + * packets in a packet chain. + */ + rtc.rtc_ire = NULL; + rtc.rtc_ipaddr = INADDR_ANY; + + /* Loop over b_next */ + for (mp = mp_chain; mp != NULL; mp = mp_chain) { + mp_chain = mp->b_next; + mp->b_next = NULL; + + ASSERT(DB_TYPE(mp) == M_DATA); + + + /* + * if db_ref > 1 then copymsg and free original. 
Packet + * may be changed and we do not want the other entity + * who has a reference to this message to trip over the + * changes. This is a blind change because trying to + * catch all places that might change the packet is too + * difficult. + * + * This corresponds to the fast path case, where we have + * a chain of M_DATA mblks. We check the db_ref count + * of only the 1st data block in the mblk chain. There + * doesn't seem to be a reason why a device driver would + * send up data with varying db_ref counts in the mblk + * chain. In any case the Fast path is a private + * interface, and our drivers don't do such a thing. + * Given the above assumption, there is no need to walk + * down the entire mblk chain (which could have a + * potential performance problem) + * + * The "(DB_REF(mp) > 1)" check was moved from ip_rput() + * to here because of exclusive ip stacks and vnics. + * Packets transmitted from exclusive stack over vnic + * can have db_ref > 1 and when it gets looped back to + * another vnic in a different zone, you have ip_input() + * getting dblks with db_ref > 1. So if someone + * complains of TCP performance under this scenario, + * take a serious look here on the impact of copymsg(). + */ + if (DB_REF(mp) > 1) { + if ((mp = ip_fix_dbref(mp, &iras)) == NULL) { + /* mhip might point into 1st packet in chain */ + iras.ira_mhip = NULL; + continue; + } + } + + /* + * IP header ptr not aligned? 
+ * OR IP header not complete in first mblk + */ + ipha = (ipha_t *)mp->b_rptr; + if (!OK_32PTR(ipha) || MBLKL(mp) < IP_SIMPLE_HDR_LENGTH) { + mp = ip_check_and_align_header(mp, IP_SIMPLE_HDR_LENGTH, + &iras); + if (mp == NULL) { + /* mhip might point into 1st packet in chain */ + iras.ira_mhip = NULL; + continue; + } + ipha = (ipha_t *)mp->b_rptr; + } + + /* Protect against a mix of Ethertypes and IP versions */ + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInHdrErrors); + ip_drop_input("ipIfStatsInHdrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Check for Martian addrs; we have to explicitly + * test for for zero dst since this is also used as + * an indication that the rtc is not used. + */ + if (ipha->ipha_dst == INADDR_ANY) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + continue; + } + + /* + * Keep L2SRC from a previous packet in chain since mhip + * might point into an earlier packet in the chain. + * Keep IRAF_VERIFIED_SRC to avoid redoing broadcast + * source check in forwarding path. + */ + chain_flags |= (iras.ira_flags & + (IRAF_L2SRC_SET|IRAF_VERIFIED_SRC)); + + iras.ira_flags = IRAF_IS_IPV4 | IRAF_VERIFY_IP_CKSUM | + IRAF_VERIFY_ULP_CKSUM | chain_flags; + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; /* Default for forwarding */ + + /* + * We must count all incoming packets, even if they end + * up being dropped later on. Defer counting bytes until + * we have the whole IP header in first mblk. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + iras.ira_pktlen = ntohs(ipha->ipha_length); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, + iras.ira_pktlen); + + /* + * Call one of: + * ill_input_full_v4 + * ill_input_short_v4 + * The former is used in unusual cases. See ill_set_inputfn(). + */ + (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); + + /* Any references to clean up? No hold on ira_ill */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); + + if (iras.ira_target_sqp_mp != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + + /* Found one packet to accept */ + mp = iras.ira_target_sqp_mp; + iras.ira_target_sqp_mp = NULL; + ASSERT(ip_recv_attr_is_mblk(mp)); + + if (atail != NULL) + atail->b_next = mp; + else + ahead = mp; + atail = mp; + acnt++; + mp = NULL; + } + /* mhip might point into 1st packet in the chain. */ + iras.ira_mhip = NULL; + } + /* Any remaining references to the route cache? 
*/ + if (rtc.rtc_ire != NULL) { + ASSERT(rtc.rtc_ipaddr != INADDR_ANY); + ire_refrele(rtc.rtc_ire); + } + + if (ahead != NULL) { + /* Better be called from ip_accept_tcp */ + ASSERT(target_sqp != NULL); + *last = atail; + *cnt = acnt; + return (ahead); + } + + return (NULL); +} + +/* + * This input function is used when + * - is_system_labeled() + * - CGTP filtering + * - DHCP unicast before we have an IP address configured + * - there is an listener for IPPROTO_RSVP + */ +void +ill_input_full_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + int cgtp_flt_pkt; + + ASSERT(ira->ira_tsl == NULL); + + /* + * Attach any necessary label information to + * this packet + */ + if (is_system_labeled()) { + ira->ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. + */ + if (!tsol_get_pkt_label(mp, IPV4_VERSION, ira)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* Note that ira_tsl can be NULL here. */ + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * Invoke the CGTP (multirouting) filtering module to process + * the incoming packet. Packets identified as duplicates + * must be discarded. Filtering is active only if the + * the ip_cgtp_filter ndd variable is non-zero. + */ + cgtp_flt_pkt = CGTP_IP_PKT_NOT_CGTP; + if (ipst->ips_ip_cgtp_filter && + ipst->ips_ip_cgtp_filter_ops != NULL) { + netstackid_t stackid; + + stackid = ipst->ips_netstack->netstack_stackid; + /* + * CGTP and IPMP are mutually exclusive so + * phyint_ifindex is fine here. 
+ */ + cgtp_flt_pkt = + ipst->ips_ip_cgtp_filter_ops->cfo_filter(stackid, + ill->ill_phyint->phyint_ifindex, mp); + if (cgtp_flt_pkt == CGTP_IP_PKT_DUPLICATE) { + ip_drop_input("CGTP_IP_PKT_DUPLICATE", mp, ill); + freemsg(mp); + return; + } + } + + /* + * Brutal hack for DHCPv4 unicast: RFC2131 allows a DHCP + * server to unicast DHCP packets to a DHCP client using the + * IP address it is offering to the client. This can be + * disabled through the "broadcast bit", but not all DHCP + * servers honor that bit. Therefore, to interoperate with as + * many DHCP servers as possible, the DHCP client allows the + * server to unicast, but we treat those packets as broadcast + * here. Note that we don't rewrite the packet itself since + * (a) that would mess up the checksums and (b) the DHCP + * client conn is bound to INADDR_ANY so ip_fanout_udp() will + * hand it the packet regardless. + */ + if (ill->ill_dhcpinit != 0 && + ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION && + ipha->ipha_protocol == IPPROTO_UDP) { + udpha_t *udpha; + + ipha = ip_pullup(mp, sizeof (ipha_t) + sizeof (udpha_t), ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - dhcp", mp, ill); + freemsg(mp); + return; + } + /* Reload since pullupmsg() can change b_rptr. */ + udpha = (udpha_t *)&ipha[1]; + + if (ntohs(udpha->uha_dst_port) == IPPORT_BOOTPC) { + DTRACE_PROBE2(ip4__dhcpinit__pkt, ill_t *, ill, + mblk_t *, mp); + /* + * This assumes that we deliver to all conns for + * multicast and broadcast packets. + */ + nexthop = INADDR_BROADCAST; + ira->ira_flags |= IRAF_DHCP_UNICAST; + } + } + + /* + * If rsvpd is running, let RSVP daemon handle its processing + * and forwarding of RSVP multicast/unicast packets. + * If rsvpd is not running but mrouted is running, RSVP + * multicast packets are forwarded as multicast traffic + * and RSVP unicast packets are forwarded by unicast router. 
+ * If neither rsvpd nor mrouted is running, RSVP multicast + * packets are not forwarded, but the unicast packets are + * forwarded like unicast traffic. + */ + if (ipha->ipha_protocol == IPPROTO_RSVP && + ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { + /* RSVP packet and rsvpd running. Treat as ours */ + ip2dbg(("ip_input: RSVP for us: 0x%x\n", ntohl(nexthop))); + /* + * We use a multicast address to get the packet to + * ire_recv_multicast_v4. There will not be a membership + * check since we set IRAF_RSVP + */ + nexthop = htonl(INADDR_UNSPEC_GROUP); + ira->ira_flags |= IRAF_RSVP; + } + + ill_input_short_v4(mp, ipha, &nexthop, ira, rtc); +} + +/* + * This is the tail-end of the full receive side packet handling. + * It can be used directly when the configuration is simple. + */ +void +ill_input_short_v4(mblk_t *mp, void *iph_arg, void *nexthop_arg, + ip_recv_attr_t *ira, rtc_t *rtc) +{ + ire_t *ire; + uint_t opt_len; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint_t pkt_len; + ssize_t len; + ipha_t *ipha = (ipha_t *)iph_arg; + ipaddr_t nexthop = *(ipaddr_t *)nexthop_arg; + ilb_stack_t *ilbs = ipst->ips_netstack->netstack_ilb; +#define rptr ((uchar_t *)ipha) + + ASSERT(DB_TYPE(mp) == M_DATA); + + /* + * The following test for loopback is faster than + * IP_LOOPBACK_ADDR(), because it avoids any bitwise + * operations. 
+ * Note that these addresses are always in network byte order + */ + if (((*(uchar_t *)&ipha->ipha_dst) == 127) || + ((*(uchar_t *)&ipha->ipha_src) == 127)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + + len = mp->b_wptr - rptr; + pkt_len = ira->ira_pktlen; + + /* multiple mblk or too short */ + len -= pkt_len; + if (len != 0) { + mp = ip_check_length(mp, rptr, len, pkt_len, + IP_SIMPLE_HDR_LENGTH, ira); + if (mp == NULL) + return; + ipha = (ipha_t *)mp->b_rptr; + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 0); + + /* + * The event for packets being received from a 'physical' + * interface is placed after validation of the source and/or + * destination address as being local so that packets can be + * redirected to loopback addresses using ipnat. + */ + DTRACE_PROBE4(ip4__physical__in__start, + ill_t *, ill, ill_t *, NULL, + ipha_t *, ipha, mblk_t *, mp); + + if (HOOKS4_INTERESTED_PHYSICAL_IN(ipst)) { + int ll_multicast = 0; + int error; + ipaddr_t orig_dst = ipha->ipha_dst; + + if (ira->ira_flags & IRAF_L2DST_MULTICAST) + ll_multicast = HPE_MULTICAST; + else if (ira->ira_flags & IRAF_L2DST_BROADCAST) + ll_multicast = HPE_BROADCAST; + + FW_HOOKS(ipst->ips_ip4_physical_in_event, + ipst->ips_ipv4firewall_physical_in, + ill, NULL, ipha, mp, mp, ll_multicast, ipst, error); + + DTRACE_PROBE1(ip4__physical__in__end, mblk_t *, mp); + + if (mp == NULL) + return; + /* The length could have changed */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + pkt_len = ira->ira_pktlen; + + /* + * In case the destination changed we override any previous + * change to nexthop. 
+ */ + if (orig_dst != ipha->ipha_dst) + nexthop = ipha->ipha_dst; + if (nexthop == INADDR_ANY) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + return; + } + } + + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t dzone; + + /* + * On the inbound path the src zone will be unknown as + * this packet has come from the wire. + */ + dzone = ip_get_zoneid_v4(nexthop, mp, ira, ALL_ZONES); + ipobs_hook(mp, IPOBS_HOOK_INBOUND, ALL_ZONES, dzone, ill, ipst); + } + + /* + * If there is a good HW IP header checksum we clear the need + * look at the IP header checksum. + */ + if ((DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && + ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { + /* Header checksum was ok. Clear the flag */ + DB_CKSUMFLAGS(mp) &= ~HCK_IPV4_HDRCKSUM; + ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; + } + + /* + * Here we check to see if we machine is setup as + * L3 loadbalancer and if the incoming packet is for a VIP + * + * Check the following: + * - there is at least a rule + * - protocol of the packet is supported + */ + if (ilb_has_rules(ilbs) && ILB_SUPP_L4(ipha->ipha_protocol)) { + ipaddr_t lb_dst; + int lb_ret; + + /* For convenience, we pull up the mblk. */ + if (mp->b_cont != NULL) { + if (pullupmsg(mp, -1) == 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", + mp, ill); + freemsg(mp); + return; + } + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * We just drop all fragments going to any VIP, at + * least for now.... 
+ */ + if (ntohs(ipha->ipha_fragment_offset_and_flags) & + (IPH_MF | IPH_OFFSET)) { + if (!ilb_rule_match_vip_v4(ilbs, nexthop, NULL)) { + goto after_ilb; + } + + ILB_KSTAT_UPDATE(ilbs, ip_frag_in, 1); + ILB_KSTAT_UPDATE(ilbs, ip_frag_dropped, 1); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB fragment", mp, ill); + freemsg(mp); + return; + } + lb_ret = ilb_check_v4(ilbs, ill, mp, ipha, ipha->ipha_protocol, + (uint8_t *)ipha + IPH_HDR_LENGTH(ipha), &lb_dst); + + if (lb_ret == ILB_DROPPED) { + /* Is this the right counter to increase? */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ILB_DROPPED", mp, ill); + freemsg(mp); + return; + } + if (lb_ret == ILB_BALANCED) { + /* Set the dst to that of the chosen server */ + nexthop = lb_dst; + DB_CKSUMFLAGS(mp) = 0; + } + } + +after_ilb: + opt_len = ipha->ipha_version_and_hdr_length - IP_SIMPLE_HDR_VERSION; + ira->ira_ip_hdr_length = IP_SIMPLE_HDR_LENGTH; + if (opt_len != 0) { + int error = 0; + + ira->ira_ip_hdr_length += (opt_len << 2); + ira->ira_flags |= IRAF_IPV4_OPTIONS; + + /* IP Options present! Validate the length. */ + mp = ip_check_optlen(mp, ipha, opt_len, pkt_len, ira); + if (mp == NULL) + return; + + /* Might have changed */ + ipha = (ipha_t *)mp->b_rptr; + + /* Verify IP header checksum before parsing the options */ + if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && + ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + ira->ira_flags &= ~IRAF_VERIFY_IP_CKSUM; + + /* + * Go off to ip_input_options which returns the next hop + * destination address, which may have been affected + * by source routing. + */ + IP_STAT(ipst, ip_opt); + + nexthop = ip_input_options(ipha, nexthop, mp, ira, &error); + if (error != 0) { + /* + * An ICMP error has been sent and the packet has + * been dropped. 
+ */ + return; + } + } + /* Can not use route cache with TX since the labels can differ */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (CLASSD(nexthop)) { + ire = ire_multicast(ill); + } else { + /* Match destination and label */ + ire = ire_route_recursive_v4(nexthop, 0, NULL, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), + ira->ira_xmit_hint, ipst, NULL, NULL, NULL); + } + /* Update the route cache so we do the ire_refrele */ + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ipaddr = nexthop; + } else if (nexthop == rtc->rtc_ipaddr) { + /* Use the route cache */ + ASSERT(rtc->rtc_ire != NULL); + ire = rtc->rtc_ire; + } else { + /* Update the route cache */ + if (CLASSD(nexthop)) { + ire = ire_multicast(ill); + } else { + /* Just match the destination */ + ire = ire_route_recursive_dstonly_v4(nexthop, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, + ipst); + } + ASSERT(ire != NULL); + if (rtc->rtc_ire != NULL) + ire_refrele(rtc->rtc_ire); + rtc->rtc_ire = ire; + rtc->rtc_ipaddr = nexthop; + } + + ire->ire_ib_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_recv_local_v4 - for IRE_LOCAL + * ire_recv_loopback_v4 - for IRE_LOOPBACK + * ire_recv_multirt_v4 - if RTF_MULTIRT + * ire_recv_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE + * ire_recv_multicast_v4 - for IRE_MULTICAST + * ire_recv_broadcast_v4 - for IRE_BROADCAST + * ire_recv_noaccept_v4 - for ire_noaccept ones + * ire_recv_forward_v4 - for the rest. 
+ */ + (*ire->ire_recvfn)(ire, mp, ipha, ira); +} +#undef rptr + +/* + * ire_recvfn for IREs that need forwarding + */ +void +ire_recv_forward_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + ill_t *dst_ill; + nce_t *nce; + ipaddr_t src = ipha->ipha_src; + uint32_t added_tx_len; + uint32_t mtu, iremtu; + + if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("l2 multicast not forwarded", mp, ill); + freemsg(mp); + return; + } + + if (!(ill->ill_flags & ILLF_ROUTER) && !ip_source_routed(ipha, ipst)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + return; + } + + /* + * Either ire_nce_capable or ire_dep_parent would be set for the IRE + * when it is found by ire_route_recursive, but that some other thread + * could have changed the routes with the effect of clearing + * ire_dep_parent. In that case we'd end up dropping the packet, or + * finding a new nce below. + * Get, allocate, or update the nce. + * We get a refhold on ire_nce_cache as a result of this to avoid races + * where ire_nce_cache is deleted. + * + * This ensures that we don't forward if the interface is down since + * ipif_down removes all the nces. 
+ */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL) { + /* Not yet set up - try to set one up */ + mutex_exit(&ire->ire_lock); + (void) ire_revalidate_nce(ire); + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce == NULL) { + mutex_exit(&ire->ire_lock); + /* The ire_dep_parent chain went bad, or no memory */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("No ire_dep_parent", mp, ill); + freemsg(mp); + return; + } + } + nce_refhold(nce); + mutex_exit(&ire->ire_lock); + + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_FALSE); + nce_refrele(nce); + if (nce1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("No nce", mp, ill); + freemsg(mp); + return; + } + nce = nce1; + } + dst_ill = nce->nce_ill; + + /* + * Unless we are forwarding, drop the packet. + * We have to let source routed packets through if they go out + * the same interface i.e., they are 'ping -l' packets. + */ + if (!(dst_ill->ill_flags & ILLF_ROUTER) && + !(ip_source_routed(ipha, ipst) && dst_ill == ill)) { + if (ip_source_routed(ipha, ipst)) { + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + nce_refrele(nce); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + + if (ire->ire_zoneid != GLOBAL_ZONEID && ire->ire_zoneid != ALL_ZONES) { + ipaddr_t dst = ipha->ipha_dst; + + ire->ire_ib_pkt_count--; + /* + * Should only use IREs that are visible from the + * global zone for forwarding. + * Take a source route into account the same way as ip_input + * did. 
+ */ + if (ira->ira_flags & IRAF_IPV4_OPTIONS) { + int error = 0; + + dst = ip_input_options(ipha, dst, mp, ira, &error); + ASSERT(error == 0); /* ip_input checked */ + } + ire = ire_route_recursive_v4(dst, 0, NULL, GLOBAL_ZONEID, + ira->ira_tsl, MATCH_IRE_SECATTR, + (ill->ill_flags & ILLF_ROUTER), ira->ira_xmit_hint, ipst, + NULL, NULL, NULL); + ire->ire_ib_pkt_count++; + (*ire->ire_recvfn)(ire, mp, ipha, ira); + ire_refrele(ire); + nce_refrele(nce); + return; + } + + /* + * ipIfStatsHCInForwDatagrams should only be increment if there + * will be an attempt to forward the packet, which is why we + * increment after the above condition has been checked. + */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + + /* Initiate Read side IPPF processing */ + if (IPP_ENABLED(IPP_FWD_IN, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_FWD_IN, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred " + "during IPPF processing\n")); + nce_refrele(nce); + return; + } + } + + DTRACE_PROBE4(ip4__forwarding__start, + ill_t *, ill, ill_t *, dst_ill, ipha_t *, ipha, mblk_t *, mp); + + if (HOOKS4_INTERESTED_FORWARDING(ipst)) { + int error; + + FW_HOOKS(ipst->ips_ip4_forwarding_event, + ipst->ips_ipv4firewall_forwarding, + ill, dst_ill, ipha, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip4__forwarding__end, mblk_t *, mp); + + if (mp == NULL) { + nce_refrele(nce); + return; + } + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_input. + */ + + /* Might have changed */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + } + + /* Packet is being forwarded. Turning off hwcksum flag. 
*/ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Martian Address Filtering [RFC 1812, Section 5.3.7] + * The loopback address check for both src and dst has already + * been checked in ip_input + * In the future one can envision adding RPF checks using number 3. + * If we already checked the same source address we can skip this. + */ + if (!(ira->ira_flags & IRAF_VERIFIED_SRC) || + src != ira->ira_verified_src) { + switch (ipst->ips_src_check) { + case 0: + break; + case 2: + if (ip_type_v4(src, ipst) == IRE_BROADCAST) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + /* FALLTHRU */ + + case 1: + if (CLASSD(src)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsForwProhibits); + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInAddrErrors); + ip_drop_input("ipIfStatsInAddrErrors", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + break; + } + /* Remember for next packet */ + ira->ira_flags |= IRAF_VERIFIED_SRC; + ira->ira_verified_src = src; + } + + /* + * Check if packet is going out the same link on which it arrived. + * Means we might need to send a redirect. + */ + if (IS_ON_SAME_LAN(dst_ill, ill) && ipst->ips_ip_g_send_redirects) { + ip_send_potential_redirect_v4(mp, ipha, ire, ira); + } + + added_tx_len = 0; + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + mblk_t *mp1; + uint32_t old_pkt_len = ira->ira_pktlen; + + /* + * Check if it can be forwarded and add/remove + * CIPSO options as needed. + */ + if ((mp1 = tsol_ip_forward(ire, mp, ira)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("tsol_ip_forward", mp, ill); + freemsg(mp); + nce_refrele(nce); + return; + } + /* + * Size may have changed. Remember amount added in case + * IP needs to send an ICMP too big. 
+ */ + mp = mp1; + ipha = (ipha_t *)mp->b_rptr; + ira->ira_pktlen = ntohs(ipha->ipha_length); + ira->ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + if (ira->ira_pktlen > old_pkt_len) + added_tx_len = ira->ira_pktlen - old_pkt_len; + + /* Options can have been added or removed */ + if (ira->ira_ip_hdr_length != IP_SIMPLE_HDR_LENGTH) + ira->ira_flags |= IRAF_IPV4_OPTIONS; + else + ira->ira_flags &= ~IRAF_IPV4_OPTIONS; + } + + mtu = dst_ill->ill_mtu; + if ((iremtu = ire->ire_metrics.iulp_mtu) != 0 && iremtu < mtu) + mtu = iremtu; + ip_forward_xmit_v4(nce, ill, mp, ipha, ira, mtu, added_tx_len); + nce_refrele(nce); +} + +/* + * Used for sending out unicast and multicast packets that are + * forwarded. + */ +void +ip_forward_xmit_v4(nce_t *nce, ill_t *ill, mblk_t *mp, ipha_t *ipha, + ip_recv_attr_t *ira, uint32_t mtu, uint32_t added_tx_len) +{ + ill_t *dst_ill = nce->nce_ill; + uint32_t pkt_len; + uint32_t sum; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + iaflags_t ixaflags; + + if (ipha->ipha_ttl <= 1) { + /* Perhaps the checksum was bad */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ICMP_TTL_EXCEEDED", mp, ill); + icmp_time_exceeded(mp, ICMP_TTL_EXCEEDED, ira); + return; + } + ipha->ipha_ttl--; + /* Adjust the checksum to reflect the ttl decrement. 
*/ + sum = (int)ipha->ipha_hdr_checksum + IP_HDR_CSUM_TTL_ADJUST; + ipha->ipha_hdr_checksum = (uint16_t)(sum + (sum >> 16)); + + /* Check if there are options to update */ + if (iraflags & IRAF_IPV4_OPTIONS) { + ASSERT(ipha->ipha_version_and_hdr_length != + IP_SIMPLE_HDR_VERSION); + ASSERT(!(iraflags & IRAF_VERIFY_IP_CKSUM)); + + if (!ip_forward_options(mp, ipha, dst_ill, ira)) { + /* ipIfStatsForwProhibits and ip_drop_input done */ + return; + } + + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + + /* Initiate Write side IPPF processing before any fragmentation */ + if (IPP_ENABLED(IPP_FWD_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_FWD_OUT, mp, dst_ill, dst_ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + ip2dbg(("ire_recv_forward_v4: pkt dropped/deferred" \ + " during IPPF processing\n")); + return; + } + } + + pkt_len = ira->ira_pktlen; + + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsHCOutForwDatagrams); + + ixaflags = IXAF_IS_IPV4 | IXAF_NO_DEV_FLOW_CTL; + + if (pkt_len > mtu) { + /* + * It needs fragging on its way out. If we haven't + * verified the header checksum yet we do it now since + * are going to put a surely good checksum in the + * outgoing header, we have to make sure that it + * was good coming in. + */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + if (ipha->ipha_fragment_offset_and_flags & IPH_DF_HTONS) { + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutFragFails); + ip_drop_output("ipIfStatsOutFragFails", mp, dst_ill); + if (iraflags & IRAF_SYSTEM_LABELED) { + /* + * Remove any CIPSO option added by + * tsol_ip_forward, and make sure we report + * a path MTU so that there + * is room to add such a CIPSO option for future + * packets. 
+ */ + mtu = tsol_pmtu_adjust(mp, mtu, added_tx_len, + AF_INET); + } + + icmp_frag_needed(mp, mtu, ira); + return; + } + + (void) ip_fragment_v4(mp, nce, ixaflags, pkt_len, mtu, + ira->ira_xmit_hint, GLOBAL_ZONEID, 0, ip_xmit, NULL); + return; + } + + ASSERT(pkt_len == ntohs(((ipha_t *)mp->b_rptr)->ipha_length)); + if (iraflags & IRAF_LOOPBACK_COPY) { + /* + * IXAF_NO_LOOP_ZONEID is not set hence 7th arg + * is don't care + */ + (void) ip_postfrag_loopcheck(mp, nce, + ixaflags | IXAF_LOOPBACK_COPY, + pkt_len, ira->ira_xmit_hint, GLOBAL_ZONEID, 0, NULL); + } else { + (void) ip_xmit(mp, nce, ixaflags, pkt_len, ira->ira_xmit_hint, + GLOBAL_ZONEID, 0, NULL); + } +} + +/* + * ire_recvfn for RTF_REJECT and RTF_BLACKHOLE routes, including IRE_NOROUTE, + * which is what ire_route_recursive returns when there is no matching ire. + * Send ICMP unreachable unless blackhole. + */ +void +ire_recv_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + + /* Would we have forwarded this packet if we had a route? */ + if (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("l2 multicast not forwarded", mp, ill); + freemsg(mp); + return; + } + + if (!(ill->ill_flags & ILLF_ROUTER)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsForwProhibits", mp, ill); + freemsg(mp); + return; + } + /* + * If we had a route this could have been forwarded. Count as such. + * + * ipIfStatsHCInForwDatagrams should only be increment if there + * will be an attempt to forward the packet, which is why we + * increment after the above condition has been checked. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInForwDatagrams); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes); + + ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, RTA_DST, + ipst); + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_input("ipIfStatsInNoRoutes RTF_BLACKHOLE", mp, ill); + freemsg(mp); + } else { + ip_drop_input("ipIfStatsInNoRoutes RTF_REJECT", mp, ill); + + if (ip_source_routed(ipha, ipst)) { + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, ira); + } else { + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, ira); + } + } +} + +/* + * ire_recvfn for IRE_LOCALs marked with ire_noaccept. Such IREs are used for + * VRRP when in noaccept mode. + * We silently drop the packet. ARP handles packets even if noaccept is set. + */ +/* ARGSUSED */ +void +ire_recv_noaccept_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - noaccept", mp, ill); + freemsg(mp); +} + +/* + * ire_recvfn for IRE_BROADCAST. + */ +void +ire_recv_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ill_t *dst_ill = ire->ire_ill; + ip_stack_t *ipst = ill->ill_ipst; + ire_t *alt_ire; + nce_t *nce; + ipaddr_t ipha_dst; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInBcastPkts); + + /* Tag for higher-level protocols */ + ira->ira_flags |= IRAF_BROADCAST; + + /* + * Whether local or directed broadcast forwarding: don't allow + * for TCP. + */ + if (ipha->ipha_protocol == IPPROTO_TCP) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + + /* + * So that we don't end up with dups, only one ill an IPMP group is + * nominated to receive broadcast traffic. + * If we have no cast_ill we are liberal and accept everything. 
+ */ + if (IS_UNDER_IPMP(ill)) { + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + /* No MIB since this is normal operation */ + ip_drop_input("not nom_cast", mp, ill); + freemsg(mp); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + + ira->ira_ruifindex = ill_get_upper_ifindex(ill); + } + + /* + * After reassembly and IPsec we will need to duplicate the + * broadcast packet for all matching zones on the ill. + */ + ira->ira_zoneid = ALL_ZONES; + + /* + * Check for directed broadcast i.e. ire->ire_ill is different than + * the incoming ill. + * The same broadcast address can be assigned to multiple interfaces + * so have to check explicitly for that case by looking up the alt_ire + */ + if (dst_ill == ill && !(ire->ire_flags & RTF_MULTIRT)) { + /* Reassemble on the ill on which the packet arrived */ + ip_input_local_v4(ire, mp, ipha, ira); + /* Restore */ + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + return; + } + + /* Is there an IRE_BROADCAST on the incoming ill? */ + ipha_dst = ((ira->ira_flags & IRAF_DHCP_UNICAST) ? INADDR_BROADCAST : + ipha->ipha_dst); + alt_ire = ire_ftable_lookup_v4(ipha_dst, 0, 0, IRE_BROADCAST, ill, + ALL_ZONES, ira->ira_tsl, + MATCH_IRE_TYPE|MATCH_IRE_ILL|MATCH_IRE_SECATTR, 0, ipst, NULL); + if (alt_ire != NULL) { + /* Not a directed broadcast */ + /* + * In the special case of multirouted broadcast + * packets, we unconditionally need to "gateway" + * them to the appropriate interface here so that reassembly + * works. We know that the IRE_BROADCAST on cgtp0 doesn't + * have RTF_MULTIRT set so we look for such an IRE in the + * bucket. 
+ */ + if (alt_ire->ire_flags & RTF_MULTIRT) { + irb_t *irb; + ire_t *ire1; + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; + ire1 = ire1->ire_next) { + if (IRE_IS_CONDEMNED(ire1)) + continue; + if (!(ire1->ire_type & IRE_BROADCAST) || + (ire1->ire_flags & RTF_MULTIRT)) + continue; + ill = ire1->ire_ill; + ill_refhold(ill); + break; + } + irb_refrele(irb); + if (ire1 != NULL) { + ill_t *orig_ill = ira->ira_ill; + + ire_refrele(alt_ire); + /* Reassemble on the new ill */ + ira->ira_ill = ill; + ip_input_local_v4(ire, mp, ipha, ira); + ill_refrele(ill); + /* Restore */ + ira->ira_ill = orig_ill; + ira->ira_ruifindex = + orig_ill->ill_phyint->phyint_ifindex; + return; + } + } + ire_refrele(alt_ire); + /* Reassemble on the ill on which the packet arrived */ + ip_input_local_v4(ire, mp, ipha, ira); + goto done; + } + + /* + * This is a directed broadcast + * + * If directed broadcast is allowed, then forward the packet out + * the destination interface with IXAF_LOOPBACK_COPY set. That will + * result in ip_input() receiving a copy of the packet on the + * appropriate ill. (We could optimize this to avoid the extra trip + * via ip_input(), but since directed broadcasts are normally disabled + * it doesn't make sense to optimize it.) + */ + if (!ipst->ips_ip_g_forward_directed_bcast || + (ira->ira_flags & (IRAF_L2DST_MULTICAST|IRAF_L2DST_BROADCAST))) { + ip_drop_input("directed broadcast not allowed", mp, ill); + freemsg(mp); + goto done; + } + if ((ira->ira_flags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + goto done; + } + + /* + * Clear the indication that this may have hardware + * checksum as we are not using it for forwarding. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * Adjust ttl to 2 (1+1 - the forward engine will decrement it by one. 
+ */ + ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl + 1; + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + + /* + * We use ip_forward_xmit to do any fragmentation. + * and loopback copy on the outbound interface. + * + * Make it so that IXAF_LOOPBACK_COPY to be set on transmit side. + */ + ira->ira_flags |= IRAF_LOOPBACK_COPY; + + nce = arp_nce_init(dst_ill, ipha->ipha_dst, IRE_BROADCAST); + if (nce == NULL) { + BUMP_MIB(dst_ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, dst_ill); + freemsg(mp); + goto done; + } + + ip_forward_xmit_v4(nce, ill, mp, ipha, ira, dst_ill->ill_mtu, 0); + nce_refrele(nce); +done: + /* Restore */ + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; +} + +/* + * ire_recvfn for IRE_MULTICAST. + */ +void +ire_recv_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_recv_attr_t *ira) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + + ASSERT(ire->ire_ill == ira->ira_ill); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastPkts); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInMcastOctets, ira->ira_pktlen); + + /* RSVP hook */ + if (ira->ira_flags & IRAF_RSVP) + goto forus; + + /* Tag for higher-level protocols */ + ira->ira_flags |= IRAF_MULTICAST; + + /* + * So that we don't end up with dups, only one ill an IPMP group is + * nominated to receive multicast traffic. + * If we have no cast_ill we are liberal and accept everything. 
+ */ + if (IS_UNDER_IPMP(ill)) { + ip_stack_t *ipst = ill->ill_ipst; + + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + ip_drop_input("not on cast ill", mp, ill); + freemsg(mp); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + /* + * We switch to the upper ill so that mrouter and hasmembers + * can operate on upper here and in ip_input_multicast. + */ + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill != NULL) { + ASSERT(ill != ira->ira_ill); + ASSERT(ire->ire_ill == ira->ira_ill); + ira->ira_ill = ill; + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + } else { + ill = ira->ira_ill; + } + } + + /* + * Check if we are a multicast router - send ip_mforward a copy of + * the packet. + * Due to mroute_decap tunnels we consider forwarding packets even if + * mrouted has not joined the allmulti group on this interface. + */ + if (ipst->ips_ip_g_mrouter) { + int retval; + + /* + * Clear the indication that this may have hardware + * checksum as we are not using it for forwarding. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* + * ip_mforward helps us make these distinctions: If received + * on tunnel and not IGMP, then drop. + * If IGMP packet, then don't check membership + * If received on a phyint and IGMP or PIM, then + * don't check membership + */ + retval = ip_mforward(mp, ira); + /* ip_mforward updates mib variables if needed */ + + switch (retval) { + case 0: + /* + * pkt is okay and arrived on phyint. + * + * If we are running as a multicast router + * we need to see all IGMP and/or PIM packets. 
 */
			if ((ipha->ipha_protocol == IPPROTO_IGMP) ||
			    (ipha->ipha_protocol == IPPROTO_PIM)) {
				goto forus;
			}
			break;
		case -1:
			/* pkt is mal-formed, toss it */
			freemsg(mp);
			goto done;
		case 1:
			/*
			 * pkt is okay and arrived on a tunnel
			 *
			 * If we are running a multicast router
			 * we need to see all igmp packets.
			 */
			if (ipha->ipha_protocol == IPPROTO_IGMP) {
				goto forus;
			}
			ip_drop_input("Multicast on tunnel ignored", mp, ill);
			freemsg(mp);
			goto done;
		}
	}

	/*
	 * Check if we have members on this ill. This is not necessary for
	 * correctness because even if the NIC/GLD had a leaky filter, we
	 * filter before passing to each conn_t.
	 */
	if (!ill_hasmembers_v4(ill, ipha->ipha_dst)) {
		/*
		 * Nobody interested
		 *
		 * This might just be caused by the fact that
		 * multiple IP Multicast addresses map to the same
		 * link layer multicast - no need to increment counter!
		 */
		ip_drop_input("Multicast with no members", mp, ill);
		freemsg(mp);
		goto done;
	}
forus:
	ip2dbg(("ire_recv_multicast_v4: multicast for us: 0x%x\n",
	    ntohl(ipha->ipha_dst)));

	/*
	 * After reassembly and IPsec we will need to duplicate the
	 * multicast packet for all matching zones on the ill.
	 */
	ira->ira_zoneid = ALL_ZONES;

	/* Reassemble on the ill on which the packet arrived */
	ip_input_local_v4(ire, mp, ipha, ira);
done:
	/*
	 * If we switched to the IPMP upper ill above, drop that hold and
	 * restore ira_ill/ira_ruifindex to the ill the packet arrived on.
	 */
	if (ill != ire->ire_ill) {
		ill_refrele(ill);
		ira->ira_ill = ire->ire_ill;
		ira->ira_ruifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
	}
}

/*
 * ire_recvfn for IRE_OFFLINK with RTF_MULTIRT.
 * Drop packets since we don't forward out multirt routes.
 * Counted against ipIfStatsInNoRoutes since the route cannot be used.
 */
/* ARGSUSED */
void
ire_recv_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ill_t	*ill = ira->ira_ill;

	BUMP_MIB(ill->ill_ip_mib, ipIfStatsInNoRoutes);
	ip_drop_input("Not forwarding out MULTIRT", mp, ill);
	freemsg(mp);
}

/*
 * ire_recvfn for IRE_LOOPBACK. This is only used when a FW_HOOK
 * has rewritten the packet to have a loopback destination address (We
 * filter out packet with a loopback destination from arriving over the wire).
 * We don't know what zone to use, thus we always use the GLOBAL_ZONEID.
 */
void
ire_recv_loopback_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ipha_t	*ipha = (ipha_t *)iph_arg;
	ill_t	*ill = ira->ira_ill;
	ill_t	*ire_ill = ire->ire_ill;

	ira->ira_zoneid = GLOBAL_ZONEID;

	/* Switch to the lo0 ill for further processing */
	if (ire_ill != ill) {
		/*
		 * Update ira_ill to be the ILL on which the IP address
		 * is hosted.
		 * No need to hold the ill since we have a hold on the ire
		 */
		ASSERT(ira->ira_ill == ira->ira_rill);
		ira->ira_ill = ire_ill;

		ip_input_local_v4(ire, mp, ipha, ira);

		/* Restore */
		ASSERT(ira->ira_ill == ire_ill);
		ira->ira_ill = ill;
		return;

	}
	ip_input_local_v4(ire, mp, ipha, ira);
}

/*
 * ire_recvfn for IRE_LOCAL.
 */
void
ire_recv_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, ip_recv_attr_t *ira)
{
	ipha_t	*ipha = (ipha_t *)iph_arg;
	ill_t	*ill = ira->ira_ill;
	ill_t	*ire_ill = ire->ire_ill;

	/* Make a note for DAD that this address is in use */
	ire->ire_last_used_time = lbolt;

	/* Only target the IRE_LOCAL with the right zoneid. */
	ira->ira_zoneid = ire->ire_zoneid;

	/*
	 * If the packet arrived on the wrong ill, we check that
	 * this is ok.
	 * If it is, then we ensure that we do the reassembly on
	 * the ill on which the address is hosted. We keep ira_rill as
	 * the one on which the packet arrived, so that IP_PKTINFO and
	 * friends can report this.
+ */ + if (ire_ill != ill) { + ire_t *new_ire; + + new_ire = ip_check_multihome(&ipha->ipha_dst, ire, ill); + if (new_ire == NULL) { + /* Drop packet */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsForwProhibits); + ip_drop_input("ipIfStatsInForwProhibits", mp, ill); + freemsg(mp); + return; + } + /* + * Update ira_ill to be the ILL on which the IP address + * is hosted. No need to hold the ill since we have a + * hold on the ire. Note that we do the switch even if + * new_ire == ire (for IPMP, ire would be the one corresponding + * to the IPMP ill). + */ + ASSERT(ira->ira_ill == ira->ira_rill); + ira->ira_ill = new_ire->ire_ill; + + /* ira_ruifindex tracks the upper for ira_rill */ + if (IS_UNDER_IPMP(ill)) + ira->ira_ruifindex = ill_get_upper_ifindex(ill); + + ip_input_local_v4(new_ire, mp, ipha, ira); + + /* Restore */ + ASSERT(ira->ira_ill == new_ire->ire_ill); + ira->ira_ill = ill; + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + + if (new_ire != ire) + ire_refrele(new_ire); + return; + } + + ip_input_local_v4(ire, mp, ipha, ira); +} + +/* + * Common function for packets arriving for the host. Handles + * checksum verification, reassembly checks, etc. + */ +static void +ip_input_local_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + + /* + * Verify IP header checksum. If the packet was AH or ESP then + * this flag has already been cleared. Likewise if the packet + * had a hardware checksum. + */ + if ((iraflags & IRAF_VERIFY_IP_CKSUM) && ip_csum_hdr(ipha)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInCksumErrs); + ip_drop_input("ipIfStatsInCksumErrs", mp, ill); + freemsg(mp); + return; + } + + if (iraflags & IRAF_IPV4_OPTIONS) { + if (!ip_input_local_options(mp, ipha, ira)) { + /* Error has been sent and mp consumed */ + return; + } + } + + /* + * Is packet part of fragmented IP packet? 
+ * We compare against defined values in network byte order + */ + if (ipha->ipha_fragment_offset_and_flags & + (IPH_MF_HTONS | IPH_OFFSET_HTONS)) { + /* + * Make sure we have ira_l2src before we loose the original + * mblk + */ + if (!(ira->ira_flags & IRAF_L2SRC_SET)) + ip_setl2src(mp, ira, ira->ira_rill); + + mp = ip_input_fragment(mp, ipha, ira); + if (mp == NULL) + return; + /* Completed reassembly */ + ipha = (ipha_t *)mp->b_rptr; + } + + /* + * For broadcast and multicast we need some extra work before + * we call ip_fanout_v4(), since in the case of shared-IP zones + * we need to pretend that a packet arrived for each zoneid. + */ + if (iraflags & IRAF_MULTIBROADCAST) { + if (iraflags & IRAF_BROADCAST) + ip_input_broadcast_v4(ire, mp, ipha, ira); + else + ip_input_multicast_v4(ire, mp, ipha, ira); + return; + } + ip_fanout_v4(mp, ipha, ira); +} + + +/* + * Handle multiple zones which match the same broadcast address + * and ill by delivering a packet to each of them. + * Walk the bucket and look for different ire_zoneid but otherwise + * the same IRE (same ill/addr/mask/type). + * Note that ire_add() tracks IREs that are identical in all + * fields (addr/mask/type/gw/ill/zoneid) within a single IRE by + * increasing ire_identical_cnt. Thus we don't need to be concerned + * about those. + */ +static void +ip_input_broadcast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + irb_t *irb; + ire_t *ire1; + mblk_t *mp1; + ipha_t *ipha1; + + irb = ire->ire_bucket; + + /* + * If we don't have more than one shared-IP zone, or if + * there can't be more than one IRE_BROADCAST for this + * IP address, then just set the zoneid and proceed. 
+ */ + if (ns->netstack_numzones == 1 || irb->irb_ire_cnt == 1) { + ira->ira_zoneid = ire->ire_zoneid; + + ip_fanout_v4(mp, ipha, ira); + return; + } + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* We do the main IRE after the end of the loop */ + if (ire1 == ire) + continue; + + /* + * Only IREs for the same IP address should be in the same + * bucket. + * But could have IRE_HOSTs in the case of CGTP. + */ + ASSERT(ire1->ire_addr == ire->ire_addr); + if (!(ire1->ire_type & IRE_BROADCAST)) + continue; + + if (IRE_IS_CONDEMNED(ire1)) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver to one zone */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + continue; + } + ira->ira_zoneid = ire1->ire_zoneid; + ipha1 = (ipha_t *)mp1->b_rptr; + ip_fanout_v4(mp1, ipha1, ira); + } + irb_refrele(irb); + /* Do the main ire */ + ira->ira_zoneid = ire->ire_zoneid; + ip_fanout_v4(mp, ipha, ira); +} + +/* + * Handle multiple zones which want to receive the same multicast packets + * on this ill by delivering a packet to each of them. + * + * Note that for packets delivered to transports we could instead do this + * as part of the fanout code, but since we need to handle icmp_inbound + * it is simpler to have multicast work the same as broadcast. + * + * The ip_fanout matching for multicast matches based on ilm independent of + * zoneid since the zoneid restriction is applied when joining a multicast + * group. 
+ */ +/* ARGSUSED */ +static void +ip_input_multicast_v4(ire_t *ire, mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + zoneid_t zoneid; + mblk_t *mp1; + ipha_t *ipha1; + + /* ire_recv_multicast has switched to the upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + + /* + * If we don't have more than one shared-IP zone, or if + * there are no members in anything but the global zone, + * then just set the zoneid and proceed. + */ + if (ns->netstack_numzones == 1 || + !ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, + GLOBAL_ZONEID)) { + ira->ira_zoneid = GLOBAL_ZONEID; + + /* If sender didn't want this zone to receive it, drop */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) { + ip_drop_input("Multicast but wrong zoneid", mp, ill); + freemsg(mp); + return; + } + ip_fanout_v4(mp, ipha, ira); + return; + } + + /* + * Here we loop over all zoneids that have members in the group + * and deliver a packet to ip_fanout for each zoneid. + * + * First find any members in the lowest numeric zoneid by looking for + * first zoneid larger than -1 (ALL_ZONES). + * We terminate the loop when we receive -1 (ALL_ZONES). + */ + zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, ALL_ZONES); + for (; zoneid != ALL_ZONES; + zoneid = ill_hasmembers_nextzone_v4(ill, ipha->ipha_dst, zoneid)) { + /* + * Avoid an extra copymsg/freemsg by skipping global zone here + * and doing that at the end. 
+ */ + if (zoneid == GLOBAL_ZONEID) + continue; + + ira->ira_zoneid = zoneid; + + /* If sender didn't want this zone to receive it, skip */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver to one zone */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + continue; + } + ipha1 = (ipha_t *)mp1->b_rptr; + ip_fanout_v4(mp1, ipha1, ira); + } + + /* Do the main ire */ + ira->ira_zoneid = GLOBAL_ZONEID; + /* If sender didn't want this zone to receive it, drop */ + if ((iraflags & IRAF_NO_LOOP_ZONEID_SET) && + ira->ira_no_loop_zoneid == ira->ira_zoneid) { + ip_drop_input("Multicast but wrong zoneid", mp, ill); + freemsg(mp); + } else { + ip_fanout_v4(mp, ipha, ira); + } +} + + +/* + * Determine the zoneid and IRAF_TX_* flags if trusted extensions + * is in use. Updates ira_zoneid and ira_flags as a result. + */ +static void +ip_fanout_tx_v4(mblk_t *mp, ipha_t *ipha, uint8_t protocol, + uint_t ip_hdr_length, ip_recv_attr_t *ira) +{ + uint16_t *up; + uint16_t lport; + zoneid_t zoneid; + + ASSERT(ira->ira_flags & IRAF_SYSTEM_LABELED); + + /* + * If the packet is unlabeled we might allow read-down + * for MAC_EXEMPT. Below we clear this if it is a multi-level + * port (MLP). + * Note that ira_tsl can be NULL here. + */ + if (ira->ira_tsl != NULL && ira->ira_tsl->tsl_flags & TSLF_UNLABELED) + ira->ira_flags |= IRAF_TX_MAC_EXEMPTABLE; + + if (ira->ira_zoneid != ALL_ZONES) + return; + + ira->ira_flags |= IRAF_TX_SHARED_ADDR; + + up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); + switch (protocol) { + case IPPROTO_TCP: + case IPPROTO_SCTP: + case IPPROTO_UDP: + /* Caller ensures this */ + ASSERT(((uchar_t *)ipha) + ip_hdr_length +4 <= mp->b_wptr); + + /* + * Only these transports support MLP. + * We know their destination port numbers is in + * the same place in the header. 
 */
		lport = up[1];

		/*
		 * No need to handle exclusive-stack zones
		 * since ALL_ZONES only applies to the shared IP instance.
		 */
		zoneid = tsol_mlp_findzone(protocol, lport);
		/*
		 * If no shared MLP is found, tsol_mlp_findzone returns
		 * ALL_ZONES. In that case, we assume it's SLP, and
		 * search for the zone based on the packet label.
		 *
		 * If there is such a zone, we prefer to find a
		 * connection in it. Otherwise, we look for a
		 * MAC-exempt connection in any zone whose label
		 * dominates the default label on the packet.
		 */
		if (zoneid == ALL_ZONES)
			zoneid = tsol_attr_to_zoneid(ira);
		else
			ira->ira_flags &= ~IRAF_TX_MAC_EXEMPTABLE;
		break;
	default:
		/* Handle shared address for other protocols */
		zoneid = tsol_attr_to_zoneid(ira);
		break;
	}
	ira->ira_zoneid = zoneid;
}

/*
 * Increment checksum failure statistics.
 *
 * protocol	- ULP whose checksum failed (TCP, UDP, or ICMP)
 * hck_flags	- hardware-checksum flags from the mblk; selects which of the
 *		  full-hw / partial-hw / software failure counters to bump
 * ill		- interface against which the MIB counters are charged
 */
static void
ip_input_cksum_err_v4(uint8_t protocol, uint16_t hck_flags, ill_t *ill)
{
	ip_stack_t	*ipst = ill->ill_ipst;

	switch (protocol) {
	case IPPROTO_TCP:
		BUMP_MIB(ill->ill_ip_mib, tcpIfStatsInErrs);

		if (hck_flags & HCK_FULLCKSUM)
			IP_STAT(ipst, ip_tcp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP_STAT(ipst, ip_tcp_in_part_hw_cksum_err);
		else
			IP_STAT(ipst, ip_tcp_in_sw_cksum_err);
		break;
	case IPPROTO_UDP:
		BUMP_MIB(ill->ill_ip_mib, udpIfStatsInCksumErrs);
		if (hck_flags & HCK_FULLCKSUM)
			IP_STAT(ipst, ip_udp_in_full_hw_cksum_err);
		else if (hck_flags & HCK_PARTIALCKSUM)
			IP_STAT(ipst, ip_udp_in_part_hw_cksum_err);
		else
			IP_STAT(ipst, ip_udp_in_sw_cksum_err);
		break;
	case IPPROTO_ICMP:
		BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs);
		break;
	default:
		/* Callers only pass TCP/UDP/ICMP; anything else is a bug. */
		ASSERT(0);
		break;
	}
}

/* Calculate the IPv4 pseudo-header checksum */
uint32_t
ip_input_cksum_pseudo_v4(ipha_t *ipha, ip_recv_attr_t *ira)
{
	uint_t		ulp_len;
	uint32_t	cksum;
	uint8_t		protocol = ira->ira_protocol;
	uint16_t	ip_hdr_length = ira->ira_ip_hdr_length;

#define	iphs
((uint16_t *)ipha) + + switch (protocol) { + case IPPROTO_TCP: + ulp_len = ira->ira_pktlen - ip_hdr_length; + + /* Protocol and length */ + cksum = htons(ulp_len) + IP_TCP_CSUM_COMP; + /* IP addresses */ + cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); + + /* Protocol and length */ + cksum = udpha->uha_length + IP_UDP_CSUM_COMP; + /* IP addresses */ + cksum += iphs[6] + iphs[7] + iphs[8] + iphs[9]; + break; + } + + default: + cksum = 0; + break; + } +#undef iphs + return (cksum); +} + + +/* + * Software verification of the ULP checksums. + * Returns B_TRUE if ok. + * Increments statistics of failed. + */ +static boolean_t +ip_input_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ip_stack_t *ipst = ira->ira_ill->ill_ipst; + uint32_t cksum; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + IP_STAT(ipst, ip_in_sw_cksum); + + ASSERT(protocol == IPPROTO_TCP || protocol == IPPROTO_UDP); + + cksum = ip_input_cksum_pseudo_v4(ipha, ira); + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + if (cksum == 0) + return (B_TRUE); + + ip_input_cksum_err_v4(protocol, 0, ira->ira_ill); + return (B_FALSE); +} + +/* There are drivers that can't do partial checksum with IP options */ +int eri_cksum_workaround = 1; + +/* + * Verify the ULP checksums. + * Returns B_TRUE if ok, or if the ULP doesn't have a well-defined checksum + * algorithm. + * Increments statistics if failed. 
+ */ +static boolean_t +ip_input_cksum_v4(iaflags_t iraflags, mblk_t *mp, ipha_t *ipha, + ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_rill; + uint16_t hck_flags; + uint32_t cksum; + mblk_t *mp1; + int32_t len; + uint8_t protocol = ira->ira_protocol; + uint16_t ip_hdr_length = ira->ira_ip_hdr_length; + + + switch (protocol) { + case IPPROTO_TCP: + break; + + case IPPROTO_UDP: { + udpha_t *udpha; + + udpha = (udpha_t *)((uchar_t *)ipha + ip_hdr_length); + if (udpha->uha_checksum == 0) { + /* Packet doesn't have a UDP checksum */ + return (B_TRUE); + } + break; + } + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + uint32_t pktsum; + + sctph = (sctp_hdr_t *)((uchar_t *)ipha + ip_hdr_length); +#ifdef DEBUG + if (skip_sctp_cksum) + return (B_TRUE); +#endif + pktsum = sctph->sh_chksum; + sctph->sh_chksum = 0; + cksum = sctp_cksum(mp, ip_hdr_length); + sctph->sh_chksum = pktsum; + if (cksum == pktsum) + return (B_TRUE); + + /* + * Defer until later whether a bad checksum is ok + * in order to allow RAW sockets to use Adler checksum + * with SCTP. + */ + ira->ira_flags |= IRAF_SCTP_CSUM_ERR; + return (B_TRUE); + } + + default: + /* No ULP checksum to verify. */ + return (B_TRUE); + } + /* + * Revert to software checksum calculation if the interface + * isn't capable of checksum offload. + * We clear DB_CKSUMFLAGS when going through IPsec in ip_fanout. + * Note: IRAF_NO_HW_CKSUM is not currently used. + */ + ASSERT(!IS_IPMP(ill)); + if ((iraflags & IRAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_input_sw_cksum_v4(mp, ipha, ira)); + } + + /* + * We apply this for all ULP protocols. Does the HW know to + * not set the flags for SCTP and other protocols. + */ + + hck_flags = DB_CKSUMFLAGS(mp); + + if (hck_flags & HCK_FULLCKSUM) { + /* + * Full checksum has been computed by the hardware + * and has been attached. 
If the driver wants us to + * verify the correctness of the attached value, in + * order to protect against faulty hardware, compare + * it against -0 (0xFFFF) to see if it's valid. + */ + if (hck_flags & HCK_FULLCKSUM_OK) + return (B_TRUE); + + cksum = DB_CKSUM16(mp); + if (cksum == 0xFFFF) + return (B_TRUE); + ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + + mp1 = mp->b_cont; + if ((hck_flags & HCK_PARTIALCKSUM) && + (mp1 == NULL || mp1->b_cont == NULL) && + ip_hdr_length >= DB_CKSUMSTART(mp) && + (!eri_cksum_workaround || ip_hdr_length == IP_SIMPLE_HDR_LENGTH) && + ((len = ip_hdr_length - DB_CKSUMSTART(mp)) & 1) == 0) { + uint32_t adj; + uchar_t *cksum_start; + + cksum = ip_input_cksum_pseudo_v4(ipha, ira); + + cksum_start = ((uchar_t *)ipha + DB_CKSUMSTART(mp)); + + /* + * Partial checksum has been calculated by hardware + * and attached to the packet; in addition, any + * prepended extraneous data is even byte aligned, + * and there are at most two mblks associated with + * the packet. If any such data exists, we adjust + * the checksum; also take care any postpended data. + */ + IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, len, adj); + /* + * One's complement subtract extraneous checksum + */ + cksum += DB_CKSUM16(mp); + if (adj >= cksum) + cksum = ~(adj - cksum) & 0xFFFF; + else + cksum -= adj; + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + cksum = (cksum & 0xFFFF) + ((int)cksum >> 16); + if (!(~cksum & 0xFFFF)) + return (B_TRUE); + + ip_input_cksum_err_v4(protocol, hck_flags, ira->ira_ill); + return (B_FALSE); + } + return (ip_input_sw_cksum_v4(mp, ipha, ira)); +} + + +/* + * Handle fanout of received packets. + * Unicast packets that are looped back (from ire_send_local_v4) and packets + * from the wire are differentiated by checking IRAF_VERIFY_ULP_CKSUM. + * + * IPQoS Notes + * Before sending it to the client, invoke IPPF processing. 
Policy processing + * takes place only if the callout_position, IPP_LOCAL_IN, is enabled. + */ +void +ip_fanout_v4(mblk_t *mp, ipha_t *ipha, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + iaflags_t iraflags = ira->ira_flags; + ip_stack_t *ipst = ill->ill_ipst; + uint8_t protocol = ipha->ipha_protocol; + conn_t *connp; +#define rptr ((uchar_t *)ipha) + uint_t ip_hdr_length; + uint_t min_ulp_header_length; + int offset; + ssize_t len; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == ntohs(ipha->ipha_length)); + + ip_hdr_length = ira->ira_ip_hdr_length; + ira->ira_protocol = protocol; + + /* + * Time for IPP once we've done reassembly and IPsec. + * We skip this for loopback packets since we don't do IPQoS + * on loopback. + */ + if (IPP_ENABLED(IPP_LOCAL_IN, ipst) && + !(iraflags & IRAF_LOOPBACK) && + (protocol != IPPROTO_ESP || protocol != IPPROTO_AH)) { + /* + * Use the interface on which the packet arrived - not where + * the IP address is hosted. + */ + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_IN, mp, rill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return; + } + } + + /* Determine the minimum required size of the upper-layer header */ + /* Need to do this for at least the set of ULPs that TX handles. 
*/ + switch (protocol) { + case IPPROTO_TCP: + min_ulp_header_length = TCP_MIN_HEADER_LENGTH; + break; + case IPPROTO_SCTP: + min_ulp_header_length = SCTP_COMMON_HDR_LENGTH; + break; + case IPPROTO_UDP: + min_ulp_header_length = UDPH_SIZE; + break; + case IPPROTO_ICMP: + min_ulp_header_length = ICMPH_SIZE; + break; + default: + min_ulp_header_length = 0; + break; + } + /* Make sure we have the min ULP header length */ + len = mp->b_wptr - rptr; + if (len < ip_hdr_length + min_ulp_header_length) { + if (ira->ira_pktlen < ip_hdr_length + min_ulp_header_length) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", mp, ill); + freemsg(mp); + return; + } + IP_STAT(ipst, ip_recv_pullup); + ipha = ip_pullup(mp, ip_hdr_length + min_ulp_header_length, + ira); + if (ipha == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + + /* + * If trusted extensions then determine the zoneid and TX specific + * ira_flags. + */ + if (iraflags & IRAF_SYSTEM_LABELED) { + /* This can update ira->ira_flags and ira->ira_zoneid */ + ip_fanout_tx_v4(mp, ipha, protocol, ip_hdr_length, ira); + iraflags = ira->ira_flags; + } + + + /* Verify ULP checksum. Handles TCP, UDP, and SCTP */ + if (iraflags & IRAF_VERIFY_ULP_CKSUM) { + if (!ip_input_cksum_v4(iraflags, mp, ipha, ira)) { + /* Bad checksum. Stats are already incremented */ + ip_drop_input("Bad ULP checksum", mp, ill); + freemsg(mp); + return; + } + /* IRAF_SCTP_CSUM_ERR could have been set */ + iraflags = ira->ira_flags; + } + switch (protocol) { + case IPPROTO_TCP: + /* For TCP, discard broadcast and multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* First mblk contains IP+TCP headers per above check */ + ASSERT(len >= ip_hdr_length + TCP_MIN_HEADER_LENGTH); + + /* TCP options present? */ + offset = ((uchar_t *)ipha)[ip_hdr_length + 12] >> 4; + if (offset != 5) { + if (offset < 5) + goto discard; + + /* + * There must be TCP options. 
+ * Make sure we can grab them. + */ + offset <<= 2; + offset += ip_hdr_length; + if (len < offset) { + if (ira->ira_pktlen < offset) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input( + "ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + IP_STAT(ipst, ip_recv_pullup); + ipha = ip_pullup(mp, offset, ira); + if (ipha == NULL) + goto discard; + len = mp->b_wptr - rptr; + } + } + + /* + * Pass up a squeue hint to tcp. + * If ira_sqp is already set (this is loopback) we leave it + * alone. + */ + if (ira->ira_sqp == NULL) { + ira->ira_sqp = ip_squeue_get(ira->ira_ring); + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v4(mp, IPPROTO_TCP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + + /* Send the TH_RST */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + tcp_xmit_listeners_reset(mp, ira, ipst, NULL); + return; + } + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + if (!IPCL_IS_TCP(connp)) { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + + /* + * We do different processing whether called from + * ip_accept_tcp and we match the target, don't match + * the target, and when we are called by 
ip_input. + */ + if (iraflags & IRAF_TARGET_SQP) { + if (ira->ira_target_sqp == connp->conn_sqp) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", + mp, ill); + freemsg(mp); + CONN_DEC_REF(connp); + } else { + SET_SQUEUE(attrmp, connp->conn_recv, + connp); + attrmp->b_cont = mp; + ASSERT(ira->ira_target_sqp_mp == NULL); + ira->ira_target_sqp_mp = attrmp; + /* + * Conn ref release when drained from + * the squeue. + */ + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, SQ_FILL, + SQTAG_IP_TCP_INPUT); + } + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, + connp, ira, ip_squeue_flag, SQTAG_IP_TCP_INPUT); + } + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + + case IPPROTO_SCTP: { + sctp_hdr_t *sctph; + in6_addr_t map_src, map_dst; + uint32_t ports; /* Source and destination ports */ + sctp_stack_t *sctps = ipst->ips_netstack->netstack_sctp; + + /* For SCTP, discard broadcast and multicast packets. */ + if (iraflags & IRAF_MULTIBROADCAST) + goto discard; + + /* + * Since there is no SCTP h/w cksum support yet, just + * clear the flag. + */ + DB_CKSUMFLAGS(mp) = 0; + + /* Length ensured above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + SCTP_COMMON_HDR_LENGTH); + sctph = (sctp_hdr_t *)(rptr + ip_hdr_length); + + /* get the ports */ + ports = *(uint32_t *)&sctph->sh_sport; + + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &map_dst); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); + if (iraflags & IRAF_SCTP_CSUM_ERR) { + /* + * No potential sctp checksum errors go to the Sun + * sctp stack however they might be Adler-32 summed + * packets a userland stack bound to a raw IP socket + * could reasonably use. Note though that Adler-32 is + * a long deprecated algorithm and customer sctp + * networks should eventually migrate to CRC-32 at + * which time this facility should be removed. 
+ */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + connp = sctp_fanout(&map_src, &map_dst, ports, ira, mp, sctps); + if (connp == NULL) { + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + /* Check for raw socket or OOTB handling */ + ip_fanout_sctp_raw(mp, ipha, NULL, ports, ira); + return; + } + + /* Found a client; up it goes */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + sctp_input(connp, ipha, NULL, mp, ira); + /* sctp_input does a rele of the sctp_t */ + return; + } + + case IPPROTO_UDP: + /* First mblk contains IP+UDP headers as checked above */ + ASSERT(MBLKL(mp) >= ip_hdr_length + UDPH_SIZE); + + if (iraflags & IRAF_MULTIBROADCAST) { + uint16_t *up; /* Pointer to ports in ULP header */ + + up = (uint16_t *)((uchar_t *)ipha + ip_hdr_length); + ip_fanout_udp_multi_v4(mp, ipha, up[1], up[0], ira); + return; + } + + /* Look for AF_INET or AF_INET6 that matches */ + connp = ipcl_classify_v4(mp, IPPROTO_UDP, ip_hdr_length, + ira, ipst); + if (connp == NULL) { + no_udp_match: + if (ipst->ips_ipcl_proto_fanout_v4[IPPROTO_UDP]. + connf_head != NULL) { + ASSERT(ira->ira_protocol == IPPROTO_UDP); + ip_fanout_proto_v4(mp, ipha, ira); + } else { + ip_fanout_send_icmp_v4(mp, + ICMP_DEST_UNREACHABLE, + ICMP_PORT_UNREACHABLE, ira); + } + return; + + } + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + CONN_DEC_REF(connp); + goto no_udp_match; + } + if (IPCL_IS_NONSTR(connp) ? 
connp->conn_flow_cntrld : + !canputnext(connp->conn_rq)) { + CONN_DEC_REF(connp); + BUMP_MIB(ill->ill_ip_mib, udpIfStatsInOverflows); + ip_drop_input("udpIfStatsInOverflows", mp, ill); + freemsg(mp); + return; + } + if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || + (iraflags & IRAF_IPSEC_SECURE)) { + mp = ipsec_check_inbound_policy(mp, connp, + ipha, NULL, ira); + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); + CONN_DEC_REF(connp); + return; + } + } + /* + * Remove 0-spi if it's 0, or move everything behind + * the UDP header over it and forward to ESP via + * ip_fanout_v4(). + */ + if (connp->conn_udp->udp_nat_t_endpoint) { + if (iraflags & IRAF_IPSEC_SECURE) { + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_esp_nat_t_ipsec), + &ipss->ipsec_dropper); + CONN_DEC_REF(connp); + return; + } + + mp = zero_spi_check(mp, ira); + if (mp == NULL) { + /* + * Packet was consumed - probably sent to + * ip_fanout_v4. + */ + CONN_DEC_REF(connp); + return; + } + /* Else continue like a normal UDP packet. */ + ipha = (ipha_t *)mp->b_rptr; + protocol = ipha->ipha_protocol; + ira->ira_protocol = protocol; + } + /* Found a client; up it goes */ + IP_STAT(ipst, ip_udp_fannorm); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + default: + break; + } + + /* + * Clear hardware checksumming flag as it is currently only + * used by TCP and UDP. + */ + DB_CKSUMFLAGS(mp) = 0; + + switch (protocol) { + case IPPROTO_ICMP: + /* + * We need to accomodate icmp messages coming in clear + * until we get everything secure from the wire. If + * icmp_accept_clear_messages is zero we check with + * the global policy and act accordingly. If it is + * non-zero, we accept the message without any checks. 
+ * But *this does not mean* that this will be delivered + * to RAW socket clients. By accepting we might send + * replies back, change our MTU value etc., + * but delivery to the ULP/clients depends on their + * policy dispositions. + */ + if (ipst->ips_icmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpInErrors); + ip_drop_input("tsol_can_accept_raw", mp, ill); + freemsg(mp); + return; + } + } + + /* + * ICMP header checksum, including checksum field, + * should be zero. + */ + if (IP_CSUM(mp, ip_hdr_length, 0)) { + BUMP_MIB(&ipst->ips_icmp_mib, icmpInCksumErrs); + ip_drop_input("icmpInCksumErrs", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = icmp_inbound_v4(mp, ira); + if (mp == NULL) { + /* No need to pass to RAW sockets */ + return; + } + break; + + case IPPROTO_IGMP: + /* + * If we are not willing to accept IGMP packets in clear, + * then check with global policy. 
+ */ + if (ipst->ips_igmp_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_can_accept_raw(mp, ira, B_TRUE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + /* + * Validate checksum + */ + if (IP_CSUM(mp, ip_hdr_length, 0)) { + ++ipst->ips_igmpstat.igps_rcv_badsum; + ip_drop_input("igps_rcv_badsum", mp, ill); + freemsg(mp); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + mp = igmp_input(mp, ira); + if (mp == NULL) { + /* Bad packet - discarded by igmp_input */ + return; + } + break; + case IPPROTO_PIM: + /* + * If we are not willing to accept PIM packets in clear, + * then check with global policy. + */ + if (ipst->ips_pim_accept_clear_messages == 0) { + mp = ipsec_check_global_policy(mp, NULL, + ipha, NULL, ira, ns); + if (mp == NULL) + return; + } + if ((ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_can_accept_raw(mp, ira, B_TRUE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + + /* Checksum is verified in pim_input */ + mp = pim_input(mp, ira); + if (mp == NULL) { + /* Bad packet - discarded by pim_input */ + return; + } + break; + case IPPROTO_AH: + case IPPROTO_ESP: { + /* + * Fast path for AH/ESP. 
+ */ + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + IP_STAT(ipst, ipsec_proto_ahesp); + + if (!ipsec_loaded(ipss)) { + ip_proto_not_sup(mp, ira); + return; + } + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + /* select inbound SA and have IPsec process the pkt */ + if (protocol == IPPROTO_ESP) { + esph_t *esph; + boolean_t esp_in_udp_sa; + boolean_t esp_in_udp_packet; + + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + ASSERT(ira->ira_ipsec_esp_sa->ipsa_input_func != NULL); + + esp_in_udp_sa = ((ira->ira_ipsec_esp_sa->ipsa_flags & + IPSA_F_NATT) != 0); + esp_in_udp_packet = + (ira->ira_flags & IRAF_ESP_UDP_PORTS) != 0; + + /* + * The following is a fancy, but quick, way of saying: + * ESP-in-UDP SA and Raw ESP packet --> drop + * OR + * ESP SA and ESP-in-UDP packet --> drop + */ + if (esp_in_udp_sa != esp_in_udp_packet) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_esp_no_sa), + &ipss->ipsec_dropper); + return; + } + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, + ira); + } else { + ah_t *ah; + + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + ASSERT(ira->ira_ipsec_ah_sa->ipsa_input_func != NULL); + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, + ira); + } + + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; + } + /* we're done with IPsec processing, send it up */ + ip_input_post_ipsec(mp, ira); + return; + } + case IPPROTO_ENCAP: { + ipha_t *inner_ipha; + + /* + * Handle self-encapsulated packets (IP-in-IP where + * the inner addresses == the outer addresses). 
+ */ + if ((uchar_t *)ipha + ip_hdr_length + sizeof (ipha_t) > + mp->b_wptr) { + if (ira->ira_pktlen < + ip_hdr_length + sizeof (ipha_t)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + ipha = ip_pullup(mp, (uchar_t *)ipha + ip_hdr_length + + sizeof (ipha_t) - mp->b_rptr, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + inner_ipha = (ipha_t *)((uchar_t *)ipha + ip_hdr_length); + /* + * Check the sanity of the inner IP header. + */ + if ((IPH_HDR_VERSION(inner_ipha) != IPV4_VERSION)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + if (IPH_HDR_LENGTH(inner_ipha) < sizeof (ipha_t)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + if (inner_ipha->ipha_src != ipha->ipha_src || + inner_ipha->ipha_dst != ipha->ipha_dst) { + /* We fallthru to iptun fanout below */ + goto iptun; + } + + /* + * Self-encapsulated tunnel packet. Remove + * the outer IP header and fanout again. + * We also need to make sure that the inner + * header is pulled up until options. 
+ */ + mp->b_rptr = (uchar_t *)inner_ipha; + ipha = inner_ipha; + ip_hdr_length = IPH_HDR_LENGTH(ipha); + if ((uchar_t *)ipha + ip_hdr_length > mp->b_wptr) { + if (ira->ira_pktlen < + (uchar_t *)ipha + ip_hdr_length - mp->b_rptr) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsInTruncatedPkts); + ip_drop_input("ipIfStatsInTruncatedPkts", + mp, ill); + freemsg(mp); + return; + } + ipha = ip_pullup(mp, + (uchar_t *)ipha + ip_hdr_length - mp->b_rptr, ira); + if (ipha == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + if (ip_hdr_length > sizeof (ipha_t)) { + /* We got options on the inner packet. */ + ipaddr_t dst = ipha->ipha_dst; + int error = 0; + + dst = ip_input_options(ipha, dst, mp, ira, &error); + if (error != 0) { + /* + * An ICMP error has been sent and the packet + * has been dropped. + */ + return; + } + if (dst != ipha->ipha_dst) { + /* + * Someone put a source-route in + * the inside header of a self- + * encapsulated packet. Drop it + * with extreme prejudice and let + * the sender know. + */ + ip_drop_input("ICMP_SOURCE_ROUTE_FAILED", + mp, ill); + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, + ira); + return; + } + } + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + /* + * This means that somebody is sending + * Self-encapsualted packets without AH/ESP. + * + * Send this packet to find a tunnel endpoint. + * if I can't find one, an ICMP + * PROTOCOL_UNREACHABLE will get sent. + */ + protocol = ipha->ipha_protocol; + ira->ira_protocol = protocol; + goto iptun; + } + + /* Update based on removed IP header */ + ira->ira_ip_hdr_length = ip_hdr_length; + ira->ira_pktlen = ntohs(ipha->ipha_length); + + if (ira->ira_flags & IRAF_IPSEC_DECAPS) { + /* + * This packet is self-encapsulated multiple + * times. We don't want to recurse infinitely. + * To keep it simple, drop the packet. 
+ */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ira->ira_flags |= IRAF_IPSEC_DECAPS; + + ip_input_post_ipsec(mp, ira); + return; + } + + iptun: /* IPPROTO_ENCAPS that is not self-encapsulated */ + case IPPROTO_IPV6: + /* iptun will verify trusted label */ + connp = ipcl_classify_v4(mp, protocol, ip_hdr_length, + ira, ipst); + if (connp != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInDelivers); + ira->ira_ill = ira->ira_rill = NULL; + (connp->conn_recv)(connp, mp, NULL, ira); + CONN_DEC_REF(connp); + ira->ira_ill = ill; + ira->ira_rill = rill; + return; + } + /* FALLTHRU */ + default: + /* + * On a labeled system, we have to check whether the zone + * itself is permitted to receive raw traffic. + */ + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (!tsol_can_accept_raw(mp, ira, B_FALSE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); + return; + } + } + break; + } + + /* + * The above input functions may have returned the pulled up message. + * So ipha need to be reinitialized. + */ + ipha = (ipha_t *)mp->b_rptr; + ira->ira_protocol = protocol = ipha->ipha_protocol; + if (ipst->ips_ipcl_proto_fanout_v4[protocol].connf_head == NULL) { + /* + * No user-level listener for these packets packets. + * Check for IPPROTO_ENCAP... + */ + if (protocol == IPPROTO_ENCAP && ipst->ips_ip_g_mrouter) { + /* + * Check policy here, + * THEN ship off to ip_mroute_decap(). + * + * BTW, If I match a configured IP-in-IP + * tunnel above, this path will not be reached, and + * ip_mroute_decap will never be called. + */ + mp = ipsec_check_global_policy(mp, connp, + ipha, NULL, ira, ns); + if (mp != NULL) { + ip_mroute_decap(mp, ira); + } /* Else we already freed everything! */ + } else { + ip_proto_not_sup(mp, ira); + } + return; + } + + /* + * Handle fanout to raw sockets. 
There + * can be more than one stream bound to a particular + * protocol. When this is the case, each one gets a copy + * of any incoming packets. + */ + ASSERT(ira->ira_protocol == ipha->ipha_protocol); + ip_fanout_proto_v4(mp, ipha, ira); + return; + +discard: + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); +#undef rptr +} diff --git a/usr/src/uts/common/inet/ip/ip_ire.c b/usr/src/uts/common/inet/ip/ip_ire.c index 63a6863844..be0017cb62 100644 --- a/usr/src/uts/common/inet/ip/ip_ire.c +++ b/usr/src/uts/common/inet/ip/ip_ire.c @@ -60,9 +60,6 @@ #include <inet/ip_rts.h> #include <inet/nd.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> #include <inet/tcp.h> #include <inet/ipclassifier.h> #include <sys/zone.h> @@ -73,6 +70,11 @@ struct kmem_cache *rt_entry_cache; +typedef struct nce_clookup_s { + ipaddr_t ncecl_addr; + boolean_t ncecl_found; +} nce_clookup_t; + /* * Synchronization notes: * @@ -80,17 +82,17 @@ struct kmem_cache *rt_entry_cache; * * ire_next/ire_ptpn * - * - bucket lock of the respective tables (cache or forwarding tables). + * - bucket lock of the forwarding table in which is ire stored. * - * ire_mp, ire_rfq, ire_stq, ire_u *except* ire_gateway_addr[v6], ire_mask, - * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, ire_ipif, - * ire_ihandle, ire_phandle, ire_nce, ire_bucket, ire_in_ill, ire_in_src_addr + * ire_ill, ire_u *except* ire_gateway_addr[v6], ire_mask, + * ire_type, ire_create_time, ire_masklen, ire_ipversion, ire_flags, + * ire_bucket * * - Set in ire_create_v4/v6 and never changes after that. Thus, * we don't need a lock whenever these fields are accessed. * * - ire_bucket and ire_masklen (also set in ire_create) is set in - * ire_add_v4/ire_add_v6 before inserting in the bucket and never + * ire_add before inserting in the bucket and never * changes after that. 
Thus we don't need a lock whenever these * fields are accessed. * @@ -102,7 +104,7 @@ struct kmem_cache *rt_entry_cache; * does not use any locks. ire_gateway_addr_v6 updates are not atomic * and hence any access to it uses ire_lock to get/set the right value. * - * ire_ident, ire_refcnt + * ire_refcnt, ire_identical_ref * * - Updated atomically using atomic_add_32 * @@ -111,40 +113,33 @@ struct kmem_cache *rt_entry_cache; * - Assumes that 32 bit writes are atomic. No locks. ire_lock is * used to serialize updates to ire_ssthresh, ire_rtt_sd, ire_rtt. * - * ire_max_frag, ire_frag_flag - * - * - ire_lock is used to set/read both of them together. - * - * ire_tire_mark + * ire_generation + * - Under ire_lock * - * - Set in ire_create and updated in ire_expire, which is called - * by only one function namely ip_trash_timer_expire. Thus only - * one function updates and examines the value. + * ire_nce_cache + * - Under ire_lock * - * ire_marks - * - bucket lock protects this. + * ire_dep_parent (To next IRE in recursive lookup chain) + * - Under ips_ire_dep_lock. Write held when modifying. Read held when + * walking. We also hold ire_lock when modifying to allow the data path + * to only acquire ire_lock. * - * ire_ll_hdr_length + * ire_dep_parent_generation (Generation number from ire_dep_parent) + * - Under ips_ire_dep_lock and/or ire_lock. (A read claim on the dep_lock + * and ire_lock held when modifying) * - * - Place holder for returning the information to the upper layers - * when IRE_DB_REQ comes down. - * - * - * ipv6_ire_default_count is protected by the bucket lock of - * ip_forwarding_table_v6[0][0]. - * - * ipv6_ire_default_index is not protected as it is just a hint - * at which default gateway to use. There is nothing - * wrong in using the same gateway for two different connections. + * ire_dep_children (From parent to first child) + * ire_dep_sib_next (linked list of siblings) + * ire_dep_sib_ptpn (linked list of siblings) + * - Under ips_ire_dep_lock. 
Write held when modifying. Read held when + * walking. * * As we always hold the bucket locks in all the places while accessing * the above values, it is natural to use them for protecting them. * - * We have a separate cache table and forwarding table for IPv4 and IPv6. - * Cache table (ip_cache_table/ip_cache_table_v6) is a pointer to an - * array of irb_t structures. The IPv6 forwarding table + * We have a forwarding table for IPv4 and IPv6. The IPv6 forwarding table * (ip_forwarding_table_v6) is an array of pointers to arrays of irb_t - * structure. ip_forwarding_table_v6 is allocated dynamically in + * structures. ip_forwarding_table_v6 is allocated dynamically in * ire_add_v6. ire_ft_init_lock is used to serialize multiple threads * initializing the same bucket. Once a bucket is initialized, it is never * de-alloacted. This assumption enables us to access @@ -158,39 +153,37 @@ struct kmem_cache *rt_entry_cache; * a bucket and the ires residing in the bucket have a back pointer to * the bucket structure. It also has a reference count for the number * of threads walking the bucket - irb_refcnt which is bumped up - * using the macro IRB_REFHOLD macro. The flags irb_flags can be - * set to IRE_MARK_CONDEMNED indicating that there are some ires - * in this bucket that are marked with IRE_MARK_CONDEMNED and the + * using the irb_refhold function. The flags irb_marks can be + * set to IRB_MARK_CONDEMNED indicating that there are some ires + * in this bucket that are IRE_IS_CONDEMNED and the * last thread to leave the bucket should delete the ires. Usually - * this is done by the IRB_REFRELE macro which is used to decrement + * this is done by the irb_refrele function which is used to decrement * the reference count on a bucket. See comments above irb_t structure * definition in ip.h for further details. 
* - * IRE_REFHOLD/IRE_REFRELE macros operate on the ire which increments/ + * The ire_refhold/ire_refrele functions operate on the ire which increments/ * decrements the reference count, ire_refcnt, atomically on the ire. - * ire_refcnt is modified only using this macro. Operations on the IRE + * ire_refcnt is modified only using those functions. Operations on the IRE * could be described as follows : * * CREATE an ire with reference count initialized to 1. * * ADDITION of an ire holds the bucket lock, checks for duplicates - * and then adds the ire. ire_add_v4/ire_add_v6 returns the ire after + * and then adds the ire. ire_add returns the ire after * bumping up once more i.e the reference count is 2. This is to avoid * an extra lookup in the functions calling ire_add which wants to * work with the ire after adding. * - * LOOKUP of an ire bumps up the reference count using IRE_REFHOLD - * macro. It is valid to bump up the referece count of the IRE, + * LOOKUP of an ire bumps up the reference count using ire_refhold + * function. It is valid to bump up the referece count of the IRE, * after the lookup has returned an ire. Following are the lookup * functions that return an HELD ire : * - * ire_lookup_local[_v6], ire_ctable_lookup[_v6], ire_ftable_lookup[_v6], - * ire_cache_lookup[_v6], ire_lookup_multi[_v6], ire_route_lookup[_v6], - * ipif_to_ire[_v6]. + * ire_ftable_lookup[_v6], ire_lookup_multi_ill[_v6] * * DELETION of an ire holds the bucket lock, removes it from the list * and then decrements the reference count for having removed from the list - * by using the IRE_REFRELE macro. If some other thread has looked up + * by using the ire_refrele function. If some other thread has looked up * the ire, the reference count would have been bumped up and hence * this ire will not be freed once deleted. It will be freed once the * reference count drops to zero. 
@@ -198,27 +191,12 @@ struct kmem_cache *rt_entry_cache; * Add and Delete acquires the bucket lock as RW_WRITER, while all the * lookups acquire the bucket lock as RW_READER. * - * NOTE : The only functions that does the IRE_REFRELE when an ire is - * passed as an argument are : - * - * 1) ip_wput_ire : This is because it IRE_REFHOLD/RELEs the - * broadcast ires it looks up internally within - * the function. Currently, for simplicity it does - * not differentiate the one that is passed in and - * the ones it looks up internally. It always - * IRE_REFRELEs. - * 2) ire_send - * ire_send_v6 : As ire_send calls ip_wput_ire and other functions - * that take ire as an argument, it has to selectively - * IRE_REFRELE the ire. To maintain symmetry, - * ire_send_v6 does the same. - * - * Otherwise, the general rule is to do the IRE_REFRELE in the function + * The general rule is to do the ire_refrele in the function * that is passing the ire as an argument. * * In trying to locate ires the following points are to be noted. * - * IRE_MARK_CONDEMNED signifies that the ire has been logically deleted and is + * IRE_IS_CONDEMNED signifies that the ire has been logically deleted and is * to be ignored when walking the ires using ire_next. * * Zones note: @@ -230,14 +208,6 @@ struct kmem_cache *rt_entry_cache; */ /* - * The minimum size of IRE cache table. It will be recalcuated in - * ip_ire_init(). - * Setable in /etc/system - */ -uint32_t ip_cache_table_size = IP_CACHE_TABLE_SIZE; -uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; - -/* * The size of the forwarding table. We will make sure that it is a * power of 2 in ip_ire_init(). * Setable in /etc/system @@ -245,313 +215,213 @@ uint32_t ip6_cache_table_size = IP6_CACHE_TABLE_SIZE; uint32_t ip6_ftable_hash_size = IP6_FTABLE_HASH_SIZE; struct kmem_cache *ire_cache; -static ire_t ire_null; - -/* - * The threshold number of IRE in a bucket when the IREs are - * cleaned up. 
This threshold is calculated later in ip_open() - * based on the speed of CPU and available memory. This default - * value is the maximum. - * - * We have two kinds of cached IRE, temporary and - * non-temporary. Temporary IREs are marked with - * IRE_MARK_TEMPORARY. They are IREs created for non - * TCP traffic and for forwarding purposes. All others - * are non-temporary IREs. We don't mark IRE created for - * TCP as temporary because TCP is stateful and there are - * info stored in the IRE which can be shared by other TCP - * connections to the same destination. For connected - * endpoint, we also don't want to mark the IRE used as - * temporary because the same IRE will be used frequently, - * otherwise, the app should not do a connect(). We change - * the marking at ip_bind_connected_*() if necessary. - * - * We want to keep the cache IRE hash bucket length reasonably - * short, otherwise IRE lookup functions will take "forever." - * We use the "crude" function that the IRE bucket - * length should be based on the CPU speed, which is 1 entry - * per x MHz, depending on the shift factor ip_ire_cpu_ratio - * (n). This means that with a 750MHz CPU, the max bucket - * length can be (750 >> n) entries. - * - * Note that this threshold is separate for temp and non-temp - * IREs. This means that the actual bucket length can be - * twice as that. And while we try to keep temporary IRE - * length at most at the threshold value, we do not attempt to - * make the length for non-temporary IREs fixed, for the - * reason stated above. Instead, we start trying to find - * "unused" non-temporary IREs when the bucket length reaches - * this threshold and clean them up. - * - * We also want to limit the amount of memory used by - * IREs. So if we are allowed to use ~3% of memory (M) - * for those IREs, each bucket should not have more than - * - * M / num of cache bucket / sizeof (ire_t) - * - * Again the above memory uses are separate for temp and - * non-temp cached IREs. 
- * - * We may also want the limit to be a function of the number - * of interfaces and number of CPUs. Doing the initialization - * in ip_open() means that every time an interface is plumbed, - * the max is re-calculated. Right now, we don't do anything - * different. In future, when we have more experience, we - * may want to change this behavior. - */ -uint32_t ip_ire_max_bucket_cnt = 10; /* Setable in /etc/system */ -uint32_t ip6_ire_max_bucket_cnt = 10; -uint32_t ip_ire_cleanup_cnt = 2; - -/* - * The minimum of the temporary IRE bucket count. We do not want - * the length of each bucket to be too short. This may hurt - * performance of some apps as the temporary IREs are removed too - * often. - */ -uint32_t ip_ire_min_bucket_cnt = 3; /* /etc/system - not used */ -uint32_t ip6_ire_min_bucket_cnt = 3; - -/* - * The ratio of memory consumed by IRE used for temporary to available - * memory. This is a shift factor, so 6 means the ratio 1 to 64. This - * value can be changed in /etc/system. 6 is a reasonable number. - */ -uint32_t ip_ire_mem_ratio = 6; /* /etc/system */ -/* The shift factor for CPU speed to calculate the max IRE bucket length. */ -uint32_t ip_ire_cpu_ratio = 7; /* /etc/system */ - -typedef struct nce_clookup_s { - ipaddr_t ncecl_addr; - boolean_t ncecl_found; -} nce_clookup_t; - -/* - * The maximum number of buckets in IRE cache table. In future, we may - * want to make it a dynamic hash table. For the moment, we fix the - * size and allocate the table in ip_ire_init() when IP is first loaded. - * We take into account the amount of memory a system has. - */ -#define IP_MAX_CACHE_TABLE_SIZE 4096 - -/* Setable in /etc/system */ -static uint32_t ip_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; -static uint32_t ip6_max_cache_table_size = IP_MAX_CACHE_TABLE_SIZE; +struct kmem_cache *ncec_cache; +struct kmem_cache *nce_cache; -/* Zero iulp_t for initialization. 
*/ -const iulp_t ire_uinfo_null = { 0 }; +static ire_t ire_null; -static int ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, - ipsq_func_t func, boolean_t); +static ire_t *ire_add_v4(ire_t *ire); static void ire_delete_v4(ire_t *ire); +static void ire_dep_invalidate_children(ire_t *child); static void ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, ip_stack_t *); static void ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, uchar_t vers, ill_t *ill); -static void ire_cache_cleanup(irb_t *irb, uint32_t threshold, - ire_t *ref_ire); -static void ip_nce_clookup_and_delete(nce_t *nce, void *arg); -static ire_t *ip4_ctable_lookup_impl(ire_ctable_args_t *margs); #ifdef DEBUG static void ire_trace_cleanup(const ire_t *); #endif /* - * To avoid bloating the code, we call this function instead of - * using the macro IRE_REFRELE. Use macro only in performance - * critical paths. - * - * Must not be called while holding any locks. Otherwise if this is - * the last reference to be released there is a chance of recursive mutex - * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying - * to restart an ioctl. The one exception is when the caller is sure that - * this is not the last reference to be released. Eg. if the caller is - * sure that the ire has not been deleted and won't be deleted. + * Following are the functions to increment/decrement the reference + * count of the IREs and IRBs (ire bucket). + * + * 1) We bump up the reference count of an IRE to make sure that + * it does not get deleted and freed while we are using it. + * Typically all the lookup functions hold the bucket lock, + * and look for the IRE. If it finds an IRE, it bumps up the + * reference count before dropping the lock. Sometimes we *may* want + * to bump up the reference count after we *looked* up i.e without + * holding the bucket lock. So, the ire_refhold function does not assert + * on the bucket lock being held. 
Any thread trying to delete from + * the hash bucket can still do so but cannot free the IRE if + * ire_refcnt is not 0. + * + * 2) We bump up the reference count on the bucket where the IRE resides + * (IRB), when we want to prevent the IREs getting deleted from a given + * hash bucket. This makes life easier for ire_walk type functions which + * wants to walk the IRE list, call a function, but needs to drop + * the bucket lock to prevent recursive rw_enters. While the + * lock is dropped, the list could be changed by other threads or + * the same thread could end up deleting the ire or the ire pointed by + * ire_next. ire_refholding the ire or ire_next is not sufficient as + * a delete will still remove the ire from the bucket while we have + * dropped the lock and hence the ire_next would be NULL. Thus, we + * need a mechanism to prevent deletions from a given bucket. + * + * To prevent deletions, we bump up the reference count on the + * bucket. If the bucket is held, ire_delete just marks both + * the ire and irb as CONDEMNED. When the + * reference count on the bucket drops to zero, all the CONDEMNED ires + * are deleted. We don't have to bump up the reference count on the + * bucket if we are walking the bucket and never have to drop the bucket + * lock. Note that irb_refhold does not prevent addition of new ires + * in the list. It is okay because addition of new ires will not cause + * ire_next to point to freed memory. We do irb_refhold only when + * all of the 3 conditions are true : + * + * 1) The code needs to walk the IRE bucket from start to end. + * 2) It may have to drop the bucket lock sometimes while doing (1) + * 3) It does not want any ires to be deleted meanwhile. + */ + +/* + * Bump up the reference count on the hash bucket - IRB to + * prevent ires from being deleted in this bucket. 
*/ void -ire_refrele(ire_t *ire) +irb_refhold(irb_t *irb) { - IRE_REFRELE(ire); + rw_enter(&irb->irb_lock, RW_WRITER); + irb->irb_refcnt++; + ASSERT(irb->irb_refcnt != 0); + rw_exit(&irb->irb_lock); } void -ire_refrele_notr(ire_t *ire) +irb_refhold_locked(irb_t *irb) { - IRE_REFRELE_NOTR(ire); + ASSERT(RW_WRITE_HELD(&irb->irb_lock)); + irb->irb_refcnt++; + ASSERT(irb->irb_refcnt != 0); } /* - * kmem_cache_alloc constructor for IRE in kma space. - * Note that when ire_mp is set the IRE is stored in that mblk and - * not in this cache. + * Note: when IRB_MARK_DYNAMIC is not set the irb_t + * is statically allocated, so that when the irb_refcnt goes to 0, + * we simply clean up the ire list and continue. */ -/* ARGSUSED */ -static int -ip_ire_constructor(void *buf, void *cdrarg, int kmflags) +void +irb_refrele(irb_t *irb) { - ire_t *ire = buf; + if (irb->irb_marks & IRB_MARK_DYNAMIC) { + irb_refrele_ftable(irb); + } else { + rw_enter(&irb->irb_lock, RW_WRITER); + ASSERT(irb->irb_refcnt != 0); + if (--irb->irb_refcnt == 0 && + (irb->irb_marks & IRB_MARK_CONDEMNED)) { + ire_t *ire_list; + + ire_list = ire_unlink(irb); + rw_exit(&irb->irb_lock); + ASSERT(ire_list != NULL); + ire_cleanup(ire_list); + } else { + rw_exit(&irb->irb_lock); + } + } +} - ire->ire_nce = NULL; - return (0); +/* + * Bump up the reference count on the IRE. We cannot assert that the + * bucket lock is being held as it is legal to bump up the reference + * count after the first lookup has returned the IRE without + * holding the lock. 
+ */ +void +ire_refhold(ire_t *ire) +{ + atomic_add_32(&(ire)->ire_refcnt, 1); + ASSERT((ire)->ire_refcnt != 0); +#ifdef DEBUG + ire_trace_ref(ire); +#endif } -/* ARGSUSED1 */ -static void -ip_ire_destructor(void *buf, void *cdrarg) +void +ire_refhold_notr(ire_t *ire) { - ire_t *ire = buf; + atomic_add_32(&(ire)->ire_refcnt, 1); + ASSERT((ire)->ire_refcnt != 0); +} - ASSERT(ire->ire_nce == NULL); +void +ire_refhold_locked(ire_t *ire) +{ +#ifdef DEBUG + ire_trace_ref(ire); +#endif + ire->ire_refcnt++; } /* - * This function is associated with the IP_IOC_IRE_ADVISE_NO_REPLY - * IOCTL. It is used by TCP (or other ULPs) to supply revised information - * for an existing CACHED IRE. + * Release a ref on an IRE. + * + * Must not be called while holding any locks. Otherwise if this is + * the last reference to be released there is a chance of recursive mutex + * panic due to ire_refrele -> ipif_ill_refrele_tail -> qwriter_ip trying + * to restart an ioctl. The one exception is when the caller is sure that + * this is not the last reference to be released. Eg. if the caller is + * sure that the ire has not been deleted and won't be deleted. + * + * In architectures e.g sun4u, where atomic_add_32_nv is just + * a cas, we need to maintain the right memory barrier semantics + * as that of mutex_exit i.e all the loads and stores should complete + * before the cas is executed. membar_exit() does that here. */ -/* ARGSUSED */ -int -ip_ire_advise(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +void +ire_refrele(ire_t *ire) { - uchar_t *addr_ucp; - ipic_t *ipic; - ire_t *ire; - ipaddr_t addr; - in6_addr_t v6addr; - irb_t *irb; - zoneid_t zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - ASSERT(q->q_next == NULL); - zoneid = Q_TO_CONN(q)->conn_zoneid; - - /* - * Check privilege using the ioctl credential; if it is NULL - * then this is a kernel message and therefor privileged. 
- */ - if (ioc_cr != NULL && secpolicy_ip_config(ioc_cr, B_FALSE) != 0) - return (EPERM); - - ipic = (ipic_t *)mp->b_rptr; - if (!(addr_ucp = mi_offset_param(mp, ipic->ipic_addr_offset, - ipic->ipic_addr_length))) { - return (EINVAL); - } - if (!OK_32PTR(addr_ucp)) - return (EINVAL); - switch (ipic->ipic_addr_length) { - case IP_ADDR_LEN: { - /* Extract the destination address. */ - addr = *(ipaddr_t *)addr_ucp; - /* Find the corresponding IRE. */ - ire = ire_cache_lookup(addr, zoneid, NULL, ipst); - break; - } - case IPV6_ADDR_LEN: { - /* Extract the destination address. */ - v6addr = *(in6_addr_t *)addr_ucp; - /* Find the corresponding IRE. */ - ire = ire_cache_lookup_v6(&v6addr, zoneid, NULL, ipst); - break; - } - default: - return (EINVAL); - } - - if (ire == NULL) - return (ENOENT); - /* - * Update the round trip time estimate and/or the max frag size - * and/or the slow start threshold. - * - * We serialize multiple advises using ire_lock. - */ - mutex_enter(&ire->ire_lock); - if (ipic->ipic_rtt) { - /* - * If there is no old cached values, initialize them - * conservatively. Set them to be (1.5 * new value). - */ - if (ire->ire_uinfo.iulp_rtt != 0) { - ire->ire_uinfo.iulp_rtt = (ire->ire_uinfo.iulp_rtt + - ipic->ipic_rtt) >> 1; - } else { - ire->ire_uinfo.iulp_rtt = ipic->ipic_rtt + - (ipic->ipic_rtt >> 1); - } - if (ire->ire_uinfo.iulp_rtt_sd != 0) { - ire->ire_uinfo.iulp_rtt_sd = - (ire->ire_uinfo.iulp_rtt_sd + - ipic->ipic_rtt_sd) >> 1; - } else { - ire->ire_uinfo.iulp_rtt_sd = ipic->ipic_rtt_sd + - (ipic->ipic_rtt_sd >> 1); - } - } - if (ipic->ipic_max_frag) - ire->ire_max_frag = MIN(ipic->ipic_max_frag, IP_MAXPACKET); - if (ipic->ipic_ssthresh != 0) { - if (ire->ire_uinfo.iulp_ssthresh != 0) - ire->ire_uinfo.iulp_ssthresh = - (ipic->ipic_ssthresh + - ire->ire_uinfo.iulp_ssthresh) >> 1; - else - ire->ire_uinfo.iulp_ssthresh = ipic->ipic_ssthresh; - } - /* - * Don't need the ire_lock below this. ire_type does not change - * after initialization. 
ire_marks is protected by irb_lock. - */ - mutex_exit(&ire->ire_lock); - - if (ipic->ipic_ire_marks != 0 && ire->ire_type == IRE_CACHE) { - /* - * Only increment the temporary IRE count if the original - * IRE is not already marked temporary. - */ - irb = ire->ire_bucket; - rw_enter(&irb->irb_lock, RW_WRITER); - if ((ipic->ipic_ire_marks & IRE_MARK_TEMPORARY) && - !(ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb->irb_tmp_ire_cnt++; - } - ire->ire_marks |= ipic->ipic_ire_marks; - rw_exit(&irb->irb_lock); - } +#ifdef DEBUG + ire_untrace_ref(ire); +#endif + ASSERT((ire)->ire_refcnt != 0); + membar_exit(); + if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) + ire_inactive(ire); +} - ire_refrele(ire); - return (0); +void +ire_refrele_notr(ire_t *ire) +{ + ASSERT((ire)->ire_refcnt != 0); + membar_exit(); + if (atomic_add_32_nv(&(ire)->ire_refcnt, -1) == 0) + ire_inactive(ire); } /* * This function is associated with the IP_IOC_IRE_DELETE[_NO_REPLY] - * IOCTL[s]. The NO_REPLY form is used by TCP to delete a route IRE - * for a host that is not responding. This will force an attempt to - * establish a new route, if available, and flush out the ARP entry so - * it will re-resolve. Management processes may want to use the - * version that generates a reply. - * - * This function does not support IPv6 since Neighbor Unreachability Detection - * means that negative advise like this is useless. + * IOCTL[s]. The NO_REPLY form is used by TCP to tell IP that it is + * having problems reaching a particular destination. + * This will make IP consider alternate routes (e.g., when there are + * muliple default routes), and it will also make IP discard any (potentially) + * stale redirect. + * Management processes may want to use the version that generates a reply. 
+ * + * With the use of NUD like behavior for IPv4/ARP in addition to IPv6 + * this function shouldn't be necessary for IP to recover from a bad redirect, + * a bad default router (when there are multiple default routers), or + * a stale ND/ARP entry. But we retain it in any case. + * For instance, this is helpful when TCP suspects a failure before NUD does. */ -/* ARGSUSED */ int ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) { uchar_t *addr_ucp; - ipaddr_t addr; + uint_t ipversion; + sin_t *sin; + sin6_t *sin6; + ipaddr_t v4addr; + in6_addr_t v6addr; ire_t *ire; ipid_t *ipid; - boolean_t routing_sock_info = B_FALSE; /* Sent info? */ zoneid_t zoneid; - ire_t *gire = NULL; - ill_t *ill; - mblk_t *arp_mp; ip_stack_t *ipst; ASSERT(q->q_next == NULL); - zoneid = Q_TO_CONN(q)->conn_zoneid; + zoneid = IPCL_ZONEID(Q_TO_CONN(q)); ipst = CONNQ_TO_IPST(q); /* @@ -563,948 +433,192 @@ ip_ire_delete(queue_t *q, mblk_t *mp, cred_t *ioc_cr) ipid = (ipid_t *)mp->b_rptr; - /* Only actions on IRE_CACHEs are acceptable at present. */ - if (ipid->ipid_ire_type != IRE_CACHE) - return (EINVAL); - addr_ucp = mi_offset_param(mp, ipid->ipid_addr_offset, ipid->ipid_addr_length); if (addr_ucp == NULL || !OK_32PTR(addr_ucp)) return (EINVAL); switch (ipid->ipid_addr_length) { - case IP_ADDR_LEN: - /* addr_ucp points at IP addr */ - break; - case sizeof (sin_t): { - sin_t *sin; + case sizeof (sin_t): /* * got complete (sockaddr) address - increment addr_ucp to point * at the ip_addr field. */ sin = (sin_t *)addr_ucp; addr_ucp = (uchar_t *)&sin->sin_addr.s_addr; + ipversion = IPV4_VERSION; + break; + case sizeof (sin6_t): + /* + * got complete (sockaddr) address - increment addr_ucp to point + * at the ip_addr field. + */ + sin6 = (sin6_t *)addr_ucp; + addr_ucp = (uchar_t *)&sin6->sin6_addr; + ipversion = IPV6_VERSION; break; - } default: return (EINVAL); } - /* Extract the destination address. */ - bcopy(addr_ucp, &addr, IP_ADDR_LEN); - - /* Try to find the CACHED IRE. 
*/ - ire = ire_cache_lookup(addr, zoneid, NULL, ipst); - - /* Nail it. */ - if (ire) { - /* Allow delete only on CACHE entries */ - if (ire->ire_type != IRE_CACHE) { - ire_refrele(ire); - return (EINVAL); - } - - /* - * Verify that the IRE has been around for a while. - * This is to protect against transport protocols - * that are too eager in sending delete messages. - */ - if (gethrestime_sec() < - ire->ire_create_time + ipst->ips_ip_ignore_delete_time) { - ire_refrele(ire); - return (EINVAL); - } - /* - * Now we have a potentially dead cache entry. We need - * to remove it. - * If this cache entry is generated from a - * default route (i.e., ire_cmask == 0), - * search the default list and mark it dead and some - * background process will try to activate it. - */ - if ((ire->ire_gateway_addr != 0) && (ire->ire_cmask == 0)) { - /* - * Make sure that we pick a different - * IRE_DEFAULT next time. - */ - ire_t *gw_ire; - irb_t *irb = NULL; - uint_t match_flags; - - match_flags = (MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE); - - gire = ire_ftable_lookup(ire->ire_addr, - ire->ire_cmask, 0, 0, - ire->ire_ipif, NULL, zoneid, 0, NULL, match_flags, - ipst); - - ip3dbg(("ire_ftable_lookup() returned gire %p\n", - (void *)gire)); - - if (gire != NULL) { - irb = gire->ire_bucket; - - /* - * We grab it as writer just to serialize - * multiple threads trying to bump up - * irb_rr_origin - */ - rw_enter(&irb->irb_lock, RW_WRITER); - if ((gw_ire = irb->irb_rr_origin) == NULL) { - rw_exit(&irb->irb_lock); - goto done; - } - - DTRACE_PROBE1(ip__ire__del__origin, - (ire_t *), gw_ire); - - /* Skip past the potentially bad gateway */ - if (ire->ire_gateway_addr == - gw_ire->ire_gateway_addr) { - ire_t *next = gw_ire->ire_next; - - DTRACE_PROBE2(ip__ire__del, - (ire_t *), gw_ire, (irb_t *), irb); - IRE_FIND_NEXT_ORIGIN(next); - irb->irb_rr_origin = next; - } - rw_exit(&irb->irb_lock); - } - } -done: - if (gire != NULL) - IRE_REFRELE(gire); - /* report the bad route to routing sockets */ 
- ip_rts_change(RTM_LOSING, ire->ire_addr, ire->ire_gateway_addr, - ire->ire_mask, ire->ire_src_addr, 0, 0, 0, - (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), ipst); - routing_sock_info = B_TRUE; + if (ipversion == IPV4_VERSION) { + /* Extract the destination address. */ + bcopy(addr_ucp, &v4addr, IP_ADDR_LEN); - /* - * TCP is really telling us to start over completely, and it - * expects that we'll resend the ARP query. Tell ARP to - * discard the entry, if this is a local destination. - * - * But, if the ARP entry is permanent then it shouldn't be - * deleted, so we set ARED_F_PRESERVE_PERM. - */ - ill = ire->ire_stq->q_ptr; - if (ire->ire_gateway_addr == 0 && - (arp_mp = ill_ared_alloc(ill, addr)) != NULL) { - ared_t *ared = (ared_t *)arp_mp->b_rptr; - - ASSERT(ared->ared_cmd == AR_ENTRY_DELETE); - ared->ared_flags |= ARED_F_PRESERVE_PERM; - putnext(ill->ill_rq, arp_mp); - } + ire = ire_ftable_lookup_v4(v4addr, 0, 0, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + /* Extract the destination address. */ + bcopy(addr_ucp, &v6addr, IPV6_ADDR_LEN); - ire_delete(ire); - ire_refrele(ire); + ire = ire_ftable_lookup_v6(&v6addr, NULL, NULL, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); } - /* - * Also look for an IRE_HOST type redirect ire and - * remove it if present. - */ - ire = ire_route_lookup(addr, 0, 0, IRE_HOST, NULL, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - /* Nail it. 
*/ if (ire != NULL) { - if (ire->ire_flags & RTF_DYNAMIC) { - if (!routing_sock_info) { - ip_rts_change(RTM_LOSING, ire->ire_addr, - ire->ire_gateway_addr, ire->ire_mask, - ire->ire_src_addr, 0, 0, 0, - (RTA_DST | RTA_GATEWAY | - RTA_NETMASK | RTA_IFA), - ipst); - } - ire_delete(ire); - } + if (ipversion == IPV4_VERSION) { + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + (Q_TO_CONN(q))->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); + } + (void) ire_no_good(ire); ire_refrele(ire); } return (0); } /* - * ip_ire_req is called by ip_wput when an IRE_DB_REQ_TYPE message is handed - * down from the Upper Level Protocol to request a copy of the IRE (to check - * its type or to extract information like round-trip time estimates or the - * MTU.) - * The address is assumed to be in the ire_addr field. If no IRE is found - * an IRE is returned with ire_type being zero. - * Note that the upper lavel protocol has to check for broadcast - * (IRE_BROADCAST) and multicast (CLASSD(addr)). - * If there is a b_cont the resulting IRE_DB_TYPE mblk is placed at the - * end of the returned message. - * - * TCP sends down a message of this type with a connection request packet - * chained on. UDP and ICMP send it down to verify that a route exists for - * the destination address when they get connected. - */ -void -ip_ire_req(queue_t *q, mblk_t *mp) -{ - ire_t *inire; - ire_t *ire; - mblk_t *mp1; - ire_t *sire = NULL; - zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; - ip_stack_t *ipst = CONNQ_TO_IPST(q); - - ASSERT(q->q_next == NULL); - - if ((mp->b_wptr - mp->b_rptr) < sizeof (ire_t) || - !OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } - inire = (ire_t *)mp->b_rptr; - /* - * Got it, now take our best shot at an IRE. 
- */ - if (inire->ire_ipversion == IPV6_VERSION) { - ire = ire_route_lookup_v6(&inire->ire_addr_v6, 0, 0, 0, - NULL, &sire, zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - } else { - ASSERT(inire->ire_ipversion == IPV4_VERSION); - ire = ire_route_lookup(inire->ire_addr, 0, 0, 0, - NULL, &sire, zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - } - - /* - * We prevent returning IRES with source address INADDR_ANY - * as these were temporarily created for sending packets - * from endpoints that have conn_unspec_src set. - */ - if (ire == NULL || - (ire->ire_ipversion == IPV4_VERSION && - ire->ire_src_addr == INADDR_ANY) || - (ire->ire_ipversion == IPV6_VERSION && - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6))) { - inire->ire_type = 0; - } else { - bcopy(ire, inire, sizeof (ire_t)); - /* Copy the route metrics from the parent. */ - if (sire != NULL) { - bcopy(&(sire->ire_uinfo), &(inire->ire_uinfo), - sizeof (iulp_t)); - } - - /* Pass the latest setting of the ip_path_mtu_discovery */ - inire->ire_frag_flag |= - (ipst->ips_ip_path_mtu_discovery) ? IPH_DF : 0; - } - if (ire != NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - mp->b_wptr = &mp->b_rptr[sizeof (ire_t)]; - mp->b_datap->db_type = IRE_DB_TYPE; - - /* Put the IRE_DB_TYPE mblk last in the chain */ - mp1 = mp->b_cont; - if (mp1 != NULL) { - mp->b_cont = NULL; - linkb(mp1, mp); - mp = mp1; - } - qreply(q, mp); -} - -/* - * Send a packet using the specified IRE. - * If ire_src_addr_v6 is all zero then discard the IRE after - * the packet has been sent. 
- */ -static void -ire_send(queue_t *q, mblk_t *pkt, ire_t *ire) -{ - mblk_t *ipsec_mp; - boolean_t is_secure; - uint_t ifindex; - ill_t *ill; - zoneid_t zoneid = ire->ire_zoneid; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ - ipsec_mp = pkt; - is_secure = (pkt->b_datap->db_type == M_CTL); - if (is_secure) { - ipsec_out_t *io; - - pkt = pkt->b_cont; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (io->ipsec_out_type == IPSEC_OUT) - zoneid = io->ipsec_out_zoneid; - } - - /* If the packet originated externally then */ - if (pkt->b_prev) { - ire_refrele(ire); - /* - * Extract the ifindex from b_prev (set in ip_rput_noire). - * Look up interface to see if it still exists (it could have - * been unplumbed by the time the reply came back from ARP) - */ - ifindex = (uint_t)(uintptr_t)pkt->b_prev; - ill = ill_lookup_on_ifindex(ifindex, B_FALSE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - pkt->b_prev = NULL; - pkt->b_next = NULL; - freemsg(ipsec_mp); - return; - } - q = ill->ill_rq; - pkt->b_prev = NULL; - /* - * This packet has not gone through IPSEC processing - * and hence we should not have any IPSEC message - * prepended. - */ - ASSERT(ipsec_mp == pkt); - put(q, pkt); - ill_refrele(ill); - } else if (pkt->b_next) { - /* Packets from multicast router */ - pkt->b_next = NULL; - /* - * We never get the IPSEC_OUT while forwarding the - * packet for multicast router. - */ - ASSERT(ipsec_mp == pkt); - ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, ipsec_mp, NULL); - ire_refrele(ire); - } else { - /* Locally originated packets */ - boolean_t delete_ire = B_FALSE; - ipha_t *ipha = (ipha_t *)pkt->b_rptr; - - /* - * If this IRE shouldn't be kept in the table (because its - * source address is unspecified), hold a reference to it so - * we can delete it even after e.g. ip_wput_ire() has dropped - * its reference. 
- */ - if (!(ire->ire_marks & IRE_MARK_NOADD) && - ire->ire_src_addr == INADDR_ANY) { - delete_ire = B_TRUE; - IRE_REFHOLD(ire); - } - - /* - * If we were resolving a router we can not use the - * routers IRE for sending the packet (since it would - * violate the uniqness of the IP idents) thus we - * make another pass through ip_wput to create the IRE_CACHE - * for the destination. - * When IRE_MARK_NOADD is set, ire_add() is not called. - * Thus ip_wput() will never find a ire and result in an - * infinite loop. Thus we check whether IRE_MARK_NOADD is - * is set. This also implies that IRE_MARK_NOADD can only be - * used to send packets to directly connected hosts. - */ - if (ipha->ipha_dst != ire->ire_addr && - !(ire->ire_marks & IRE_MARK_NOADD)) { - ire_refrele(ire); /* Held in ire_add */ - if (CONN_Q(q)) { - (void) ip_output(Q_TO_CONN(q), ipsec_mp, q, - IRE_SEND); - } else { - (void) ip_output((void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } else { - if (is_secure) { - ipsec_out_t *oi; - ipha_t *ipha; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (oi->ipsec_out_proc_begin) { - /* - * This is the case where - * ip_wput_ipsec_out could not find - * the IRE and recreated a new one. - * As ip_wput_ipsec_out does ire - * lookups, ire_refrele for the extra - * bump in ire_add. - */ - ire_refrele(ire); - ip_wput_ipsec_out(q, ipsec_mp, ipha, - NULL, NULL); - } else { - /* - * IRE_REFRELE will be done in - * ip_wput_ire. - */ - ip_wput_ire(q, ipsec_mp, ire, NULL, - IRE_SEND, zoneid); - } - } else { - /* - * IRE_REFRELE will be done in ip_wput_ire. - */ - ip_wput_ire(q, ipsec_mp, ire, NULL, - IRE_SEND, zoneid); - } - } - /* - * Special code to support sending a single packet with - * conn_unspec_src using an IRE which has no source address. - * The IRE is deleted here after sending the packet to avoid - * having other code trip on it. But before we delete the - * ire, somebody could have looked up this ire. 
- * We prevent returning/using this IRE by the upper layers - * by making checks to NULL source address in other places - * like e.g ip_ire_append, ip_ire_req and ip_bind_connected. - * Though this does not completely prevent other threads - * from using this ire, this should not cause any problems. - */ - if (delete_ire) { - ip1dbg(("ire_send: delete IRE\n")); - ire_delete(ire); - ire_refrele(ire); /* Held above */ - } - } -} - -/* - * Send a packet using the specified IRE. - * If ire_src_addr_v6 is all zero then discard the IRE after - * the packet has been sent. - */ -static void -ire_send_v6(queue_t *q, mblk_t *pkt, ire_t *ire) -{ - mblk_t *ipsec_mp; - boolean_t secure; - uint_t ifindex; - zoneid_t zoneid = ire->ire_zoneid; - ip_stack_t *ipst = ire->ire_ipst; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(!(ire->ire_type & IRE_LOCAL)); /* Has different ire_zoneid */ - if (pkt->b_datap->db_type == M_CTL) { - ipsec_out_t *io; - - ipsec_mp = pkt; - pkt = pkt->b_cont; - secure = B_TRUE; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (io->ipsec_out_type == IPSEC_OUT) - zoneid = io->ipsec_out_zoneid; - } else { - ipsec_mp = pkt; - secure = B_FALSE; - } - - /* If the packet originated externally then */ - if (pkt->b_prev) { - ill_t *ill; - /* - * Extract the ifindex from b_prev (set in ip_rput_data_v6). - * Look up interface to see if it still exists (it could have - * been unplumbed by the time the reply came back from the - * resolver). - */ - ifindex = (uint_t)(uintptr_t)pkt->b_prev; - ill = ill_lookup_on_ifindex(ifindex, B_TRUE, - NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - pkt->b_prev = NULL; - pkt->b_next = NULL; - freemsg(ipsec_mp); - ire_refrele(ire); /* Held in ire_add */ - return; - } - q = ill->ill_rq; - pkt->b_prev = NULL; - /* - * This packet has not gone through IPSEC processing - * and hence we should not have any IPSEC message - * prepended. 
- */ - ASSERT(ipsec_mp == pkt); - put(q, pkt); - ill_refrele(ill); - } else if (pkt->b_next) { - /* Packets from multicast router */ - pkt->b_next = NULL; - /* - * We never get the IPSEC_OUT while forwarding the - * packet for multicast router. - */ - ASSERT(ipsec_mp == pkt); - /* - * XXX TODO IPv6. - */ - freemsg(pkt); -#ifdef XXX - ip_rput_forward(ire, (ipha_t *)pkt->b_rptr, pkt, NULL); -#endif - } else { - if (secure) { - ipsec_out_t *oi; - ip6_t *ip6h; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ip6h = (ip6_t *)ipsec_mp->b_cont->b_rptr; - if (oi->ipsec_out_proc_begin) { - /* - * This is the case where - * ip_wput_ipsec_out could not find - * the IRE and recreated a new one. - */ - ip_wput_ipsec_out_v6(q, ipsec_mp, ip6h, - NULL, NULL); - } else { - if (CONN_Q(q)) { - (void) ip_output_v6(Q_TO_CONN(q), - ipsec_mp, q, IRE_SEND); - } else { - (void) ip_output_v6( - (void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } - } else { - /* - * Send packets through ip_output_v6 so that any - * ip6_info header can be processed again. - */ - if (CONN_Q(q)) { - (void) ip_output_v6(Q_TO_CONN(q), ipsec_mp, q, - IRE_SEND); - } else { - (void) ip_output_v6((void *)(uintptr_t)zoneid, - ipsec_mp, q, IRE_SEND); - } - } - /* - * Special code to support sending a single packet with - * conn_unspec_src using an IRE which has no source address. - * The IRE is deleted here after sending the packet to avoid - * having other code trip on it. But before we delete the - * ire, somebody could have looked up this ire. - * We prevent returning/using this IRE by the upper layers - * by making checks to NULL source address in other places - * like e.g ip_ire_append_v6, ip_ire_req and - * ip_bind_connected_v6. Though, this does not completely - * prevent other threads from using this ire, this should - * not cause any problems. 
- */ - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { - ip1dbg(("ire_send_v6: delete IRE\n")); - ire_delete(ire); - } - } - ire_refrele(ire); /* Held in ire_add */ -} - -/* - * Make sure that IRE bucket does not get too long. - * This can cause lock up because ire_cache_lookup() - * may take "forever" to finish. - * - * We only remove a maximum of cnt IREs each time. This - * should keep the bucket length approximately constant, - * depending on cnt. This should be enough to defend - * against DoS attack based on creating temporary IREs - * (for forwarding and non-TCP traffic). - * - * We also pass in the address of the newly created IRE - * as we do not want to remove this straight after adding - * it. New IREs are normally added at the tail of the - * bucket. This means that we are removing the "oldest" - * temporary IREs added. Only if there are IREs with - * the same ire_addr, do we not add it at the tail. Refer - * to ire_add_v*(). It should be OK for our purpose. - * - * For non-temporary cached IREs, we make sure that they - * have not been used for some time (defined below), they - * are non-local destinations, and there is no one using - * them at the moment (refcnt == 1). - * - * The above means that the IRE bucket length may become - * very long, consisting of mostly non-temporary IREs. - * This can happen when the hash function does a bad job - * so that most TCP connections cluster to a specific bucket. - * This "hopefully" should never happen. It can also - * happen if most TCP connections have very long lives. - * Even with the minimal hash table size of 256, there - * has to be a lot of such connections to make the bucket - * length unreasonably long. This should probably not - * happen either. The third can when this can happen is - * when the machine is under attack, such as SYN flooding. - * TCP should already have the proper mechanism to protect - * that. So we should be safe. 
- * - * This function is called by ire_add_then_send() after - * a new IRE is added and the packet is sent. - * - * The idle cutoff interval is set to 60s. It can be - * changed using /etc/system. - */ -uint32_t ire_idle_cutoff_interval = 60000; - -static void -ire_cache_cleanup(irb_t *irb, uint32_t threshold, ire_t *ref_ire) -{ - ire_t *ire; - clock_t cut_off = drv_usectohz(ire_idle_cutoff_interval * 1000); - int cnt = ip_ire_cleanup_cnt; - - /* - * Try to remove cnt temporary IREs first. - */ - for (ire = irb->irb_ire; cnt > 0 && ire != NULL; ire = ire->ire_next) { - if (ire == ref_ire) - continue; - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ASSERT(ire->ire_type == IRE_CACHE); - ire_delete(ire); - cnt--; - } - } - if (cnt == 0) - return; - - /* - * If we didn't satisfy our removal target from temporary IREs - * we see how many non-temporary IREs are currently in the bucket. - * If this quantity is above the threshold then we see if there are any - * candidates for removal. We are still limited to removing a maximum - * of cnt IREs. - */ - if ((irb->irb_ire_cnt - irb->irb_tmp_ire_cnt) > threshold) { - for (ire = irb->irb_ire; cnt > 0 && ire != NULL; - ire = ire->ire_next) { - if (ire == ref_ire) - continue; - if (ire->ire_type != IRE_CACHE) - continue; - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if ((ire->ire_refcnt == 1) && - (lbolt - ire->ire_last_used_time > cut_off)) { - ire_delete(ire); - cnt--; - } - } - } -} - -/* - * ire_add_then_send is called when a new IRE has been created in order to - * route an outgoing packet. Typically, it is called from ip_wput when - * a response comes back down from a resolver. We add the IRE, and then - * possibly run the packet through ip_wput or ip_rput, as appropriate. - * However, we do not add the newly created IRE in the cache when - * IRE_MARK_NOADD is set in the IRE. IRE_MARK_NOADD is set at - * ip_newroute_ipif(). 
The ires with IRE_MARK_NOADD are ire_refrele'd by - * ip_wput_ire() and get deleted. - * Multirouting support: the packet is silently discarded when the new IRE - * holds the RTF_MULTIRT flag, but is not the first IRE to be added with the - * RTF_MULTIRT flag for the same destination address. - * In this case, we just want to register this additional ire without - * sending the packet, as it has already been replicated through - * existing multirt routes in ip_wput(). - */ -void -ire_add_then_send(queue_t *q, ire_t *ire, mblk_t *mp) -{ - irb_t *irb; - boolean_t drop = B_FALSE; - boolean_t mctl_present; - mblk_t *first_mp = NULL; - mblk_t *data_mp = NULL; - ire_t *dst_ire; - ipha_t *ipha; - ip6_t *ip6h; - ip_stack_t *ipst = ire->ire_ipst; - int ire_limit; - - if (mp != NULL) { - /* - * We first have to retrieve the destination address carried - * by the packet. - * We can't rely on ire as it can be related to a gateway. - * The destination address will help in determining if - * other RTF_MULTIRT ires are already registered. - * - * We first need to know where we are going : v4 or V6. - * the ire version is enough, as there is no risk that - * we resolve an IPv6 address with an IPv4 ire - * or vice versa. - */ - EXTRACT_PKT_MP(mp, first_mp, mctl_present); - data_mp = mp; - mp = first_mp; - if (ire->ire_ipversion == IPV4_VERSION) { - ipha = (ipha_t *)data_mp->b_rptr; - dst_ire = ire_cache_lookup(ipha->ipha_dst, - ire->ire_zoneid, msg_getlabel(mp), ipst); - } else { - ASSERT(ire->ire_ipversion == IPV6_VERSION); - ip6h = (ip6_t *)data_mp->b_rptr; - dst_ire = ire_cache_lookup_v6(&ip6h->ip6_dst, - ire->ire_zoneid, msg_getlabel(mp), ipst); - } - if (dst_ire != NULL) { - if (dst_ire->ire_flags & RTF_MULTIRT) { - /* - * At least one resolved multirt route - * already exists for the destination, - * don't sent this packet: either drop it - * or complete the pending resolution, - * depending on the ire. 
- */ - drop = B_TRUE; - } - ip1dbg(("ire_add_then_send: dst_ire %p " - "[dst %08x, gw %08x], drop %d\n", - (void *)dst_ire, - (dst_ire->ire_ipversion == IPV4_VERSION) ? \ - ntohl(dst_ire->ire_addr) : \ - ntohl(V4_PART_OF_V6(dst_ire->ire_addr_v6)), - (dst_ire->ire_ipversion == IPV4_VERSION) ? \ - ntohl(dst_ire->ire_gateway_addr) : \ - ntohl(V4_PART_OF_V6( - dst_ire->ire_gateway_addr_v6)), - drop)); - ire_refrele(dst_ire); - } - } - - if (!(ire->ire_marks & IRE_MARK_NOADD)) { - /* Regular packets with cache bound ires are here. */ - (void) ire_add(&ire, NULL, NULL, NULL, B_FALSE); - - if (ire == NULL) { - mp->b_prev = NULL; - mp->b_next = NULL; - MULTIRT_DEBUG_UNTAG(mp); - freemsg(mp); - return; - } - if (mp == NULL) { - ire_refrele(ire); /* Held in ire_add_v4/v6 */ - return; - } - } - if (drop) { - /* - * If we're adding an RTF_MULTIRT ire, the resolution - * is over: we just drop the packet. - */ - if (ire->ire_flags & RTF_MULTIRT) { - data_mp->b_prev = NULL; - data_mp->b_next = NULL; - MULTIRT_DEBUG_UNTAG(mp); - freemsg(mp); - } else { - /* - * Otherwise, we're adding the ire to a gateway - * for a multirt route. - * Invoke ip_newroute() to complete the resolution - * of the route. We will then come back here and - * finally drop this packet in the above code. - */ - if (ire->ire_ipversion == IPV4_VERSION) { - /* - * TODO: in order for CGTP to work in non-global - * zones, ip_newroute() must create the IRE - * cache in the zone indicated by - * ire->ire_zoneid. - */ - ip_newroute(q, mp, ipha->ipha_dst, - (CONN_Q(q) ? Q_TO_CONN(q) : NULL), - ire->ire_zoneid, ipst); - } else { - int minlen = sizeof (ip6i_t) + IPV6_HDR_LEN; - - ASSERT(ire->ire_ipversion == IPV6_VERSION); - - /* - * If necessary, skip over the ip6i_t to find - * the header with the actual source address. 
- */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - if (MBLKL(data_mp) < minlen && - pullupmsg(data_mp, -1) == 0) { - ip1dbg(("ire_add_then_send: " - "cannot pullupmsg ip6i\n")); - if (mctl_present) - freeb(first_mp); - ire_refrele(ire); - return; - } - ASSERT(MBLKL(data_mp) >= IPV6_HDR_LEN); - ip6h = (ip6_t *)(data_mp->b_rptr + - sizeof (ip6i_t)); - } - ip_newroute_v6(q, mp, &ip6h->ip6_dst, - &ip6h->ip6_src, NULL, ire->ire_zoneid, - ipst); - } - } - - ire_refrele(ire); /* As done by ire_send(). */ - return; - } - /* - * Need to remember ire_bucket here as ire_send*() may delete - * the ire so we cannot reference it after that. - */ - irb = ire->ire_bucket; - if (ire->ire_ipversion == IPV4_VERSION) { - ire_send(q, mp, ire); - ire_limit = ip_ire_max_bucket_cnt; - } else { - ire_send_v6(q, mp, ire); - ire_limit = ip6_ire_max_bucket_cnt; - } - - /* - * irb is NULL if the IRE was not added to the hash. This happens - * when IRE_MARK_NOADD is set and when IREs are returned from - * ire_update_srcif_v4(). - */ - if (irb != NULL) { - IRB_REFHOLD(irb); - if (irb->irb_ire_cnt > ire_limit) - ire_cache_cleanup(irb, ire_limit, ire); - IRB_REFRELE(irb); - } -} - -/* * Initialize the ire that is specific to IPv4 part and call * ire_init_common to finish it. + * Returns zero or errno. */ -ire_t * -ire_init(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *src_addr, - uchar_t *gateway, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, - queue_t *stq, ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, - uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +int +ire_init_v4(ire_t *ire, uchar_t *addr, uchar_t *mask, uchar_t *gateway, + ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, + tsol_gc_t *gc, ip_stack_t *ipst) { - ASSERT(type != IRE_CACHE || stq != NULL); + int error; + /* * Reject IRE security attribute creation/initialization * if system is not running in Trusted mode. 
*/ - if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) - return (NULL); + if (gc != NULL && !is_system_labeled()) + return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_alloced); if (addr != NULL) bcopy(addr, &ire->ire_addr, IP_ADDR_LEN); - if (src_addr != NULL) - bcopy(src_addr, &ire->ire_src_addr, IP_ADDR_LEN); - if (mask != NULL) { - bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); - ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); - } - if (gateway != NULL) { + if (gateway != NULL) bcopy(gateway, &ire->ire_gateway_addr, IP_ADDR_LEN); + + /* Make sure we don't have stray values in some fields */ + switch (type) { + case IRE_LOOPBACK: + bcopy(&ire->ire_addr, &ire->ire_gateway_addr, IP_ADDR_LEN); + /* FALLTHRU */ + case IRE_HOST: + case IRE_BROADCAST: + case IRE_LOCAL: + case IRE_IF_CLONE: + ire->ire_mask = IP_HOST_MASK; + ire->ire_masklen = IPV4_ABITS; + break; + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + if (mask != NULL) { + bcopy(mask, &ire->ire_mask, IP_ADDR_LEN); + ire->ire_masklen = ip_mask_to_plen(ire->ire_mask); + } + break; + case IRE_MULTICAST: + case IRE_NOROUTE: + ASSERT(mask == NULL); + break; + default: + ASSERT(0); + return (EINVAL); } - if (type == IRE_CACHE) - ire->ire_cmask = cmask; + error = ire_init_common(ire, type, ill, zoneid, flags, IPV4_VERSION, + gc, ipst); + if (error != NULL) + return (error); - /* ire_init_common will free the mblks upon encountering any failure */ - if (!ire_init_common(ire, max_fragp, src_nce, rfq, stq, type, ipif, - phandle, ihandle, flags, IPV4_VERSION, ulp_info, gc, gcgrp, ipst)) - return (NULL); + /* Determine which function pointers to use */ + ire->ire_postfragfn = ip_xmit; /* Common case */ - return (ire); + switch (ire->ire_type) { + case IRE_LOCAL: + ire->ire_sendfn = ire_send_local_v4; + ire->ire_recvfn = ire_recv_local_v4; +#ifdef SO_VRRP + ASSERT(ire->ire_ill != NULL); + if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { + ire->ire_noaccept = 
B_TRUE; + ire->ire_recvfn = ire_recv_noaccept_v6; + } +#endif + break; + case IRE_LOOPBACK: + ire->ire_sendfn = ire_send_local_v4; + ire->ire_recvfn = ire_recv_loopback_v4; + break; + case IRE_BROADCAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_broadcast_v4; + ire->ire_recvfn = ire_recv_broadcast_v4; + break; + case IRE_MULTICAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_multicast_v4; + ire->ire_recvfn = ire_recv_multicast_v4; + break; + default: + /* + * For IRE_IF_ALL and IRE_OFFLINK we forward received + * packets by default. + */ + ire->ire_sendfn = ire_send_wire_v4; + ire->ire_recvfn = ire_recv_forward_v4; + break; + } + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire->ire_sendfn = ire_send_noroute_v4; + ire->ire_recvfn = ire_recv_noroute_v4; + } else if (ire->ire_flags & RTF_MULTIRT) { + ire->ire_postfragfn = ip_postfrag_multirt_v4; + ire->ire_sendfn = ire_send_multirt_v4; + /* Multirt receive of broadcast uses ire_recv_broadcast_v4 */ + if (ire->ire_type != IRE_BROADCAST) + ire->ire_recvfn = ire_recv_multirt_v4; + } + ire->ire_nce_capable = ire_determine_nce_capable(ire); + return (0); } /* - * Similar to ire_create except that it is called only when - * we want to allocate ire as an mblk e.g. we have an external - * resolver ARP. 
+ * Determine ire_nce_capable */ -ire_t * -ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, - uint_t max_frag, nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, uint32_t ihandle, - uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, - ip_stack_t *ipst) +boolean_t +ire_determine_nce_capable(ire_t *ire) { - ire_t *ire, *buf; - ire_t *ret_ire; - mblk_t *mp; - size_t bufsize; - frtn_t *frtnp; - ill_t *ill; + int max_masklen; - bufsize = sizeof (ire_t) + sizeof (frtn_t); - buf = kmem_alloc(bufsize, KM_NOSLEEP); - if (buf == NULL) { - ip1dbg(("ire_create_mp: alloc failed\n")); - return (NULL); - } - frtnp = (frtn_t *)(buf + 1); - frtnp->free_arg = (caddr_t)buf; - frtnp->free_func = ire_freemblk; + if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + (ire->ire_type & IRE_MULTICAST)) + return (B_TRUE); - /* - * Allocate the new IRE. The ire created will hold a ref on - * an nce_t after ire_nce_init, and this ref must either be - * (a) transferred to the ire_cache entry created when ire_add_v4 - * is called after successful arp resolution, or, - * (b) released, when arp resolution fails - * Case (b) is handled in ire_freemblk() which will be called - * when mp is freed as a result of failed arp. - */ - mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (mp == NULL) { - ip1dbg(("ire_create_mp: alloc failed\n")); - kmem_free(buf, bufsize); - return (NULL); - } - ire = (ire_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&ire[1]; + if (ire->ire_ipversion == IPV4_VERSION) + max_masklen = IPV4_ABITS; + else + max_masklen = IPV6_ABITS; - /* Start clean. 
*/ - *ire = ire_null; - ire->ire_mp = mp; - mp->b_datap->db_type = IRE_DB_TYPE; - ire->ire_marks |= IRE_MARK_UNCACHED; - - ret_ire = ire_init(ire, addr, mask, src_addr, gateway, NULL, src_nce, - rfq, stq, type, ipif, cmask, phandle, ihandle, flags, ulp_info, gc, - gcgrp, ipst); - - ill = (ill_t *)(stq->q_ptr); - if (ret_ire == NULL) { - /* ire_freemblk needs these set */ - ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; - ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ire->ire_ipst = ipst; - freeb(ire->ire_mp); - return (NULL); - } - ret_ire->ire_stq_ifindex = ill->ill_phyint->phyint_ifindex; - ret_ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ASSERT(ret_ire == ire); - ASSERT(ret_ire->ire_ipst == ipst); - /* - * ire_max_frag is normally zero here and is atomically set - * under the irebucket lock in ire_add_v[46] except for the - * case of IRE_MARK_NOADD. In that event the the ire_max_frag - * is non-zero here. - */ - ire->ire_max_frag = max_frag; - return (ire); + if ((ire->ire_type & IRE_ONLINK) && ire->ire_masklen == max_masklen) + return (B_TRUE); + return (B_FALSE); } /* @@ -1514,49 +628,43 @@ ire_create_mp(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, * by this function. 
*/ ire_t * -ire_create(uchar_t *addr, uchar_t *mask, uchar_t *src_addr, uchar_t *gateway, - uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, - ushort_t type, ipif_t *ipif, ipaddr_t cmask, uint32_t phandle, - uint32_t ihandle, uint32_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +ire_create(uchar_t *addr, uchar_t *mask, uchar_t *gateway, + ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, + ip_stack_t *ipst) { ire_t *ire; - ire_t *ret_ire; + int error; ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { - ip1dbg(("ire_create: alloc failed\n")); + DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; - ret_ire = ire_init(ire, addr, mask, src_addr, gateway, max_fragp, - src_nce, rfq, stq, type, ipif, cmask, phandle, ihandle, flags, - ulp_info, gc, gcgrp, ipst); - - if (ret_ire == NULL) { + error = ire_init_v4(ire, addr, mask, gateway, type, ill, zoneid, flags, + gc, ipst); + if (error != 0) { + DTRACE_PROBE2(ire__init, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } - ASSERT(ret_ire == ire); return (ire); } /* * Common to IPv4 and IPv6 + * Returns zero or errno. */ -boolean_t -ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, - queue_t *stq, ushort_t type, ipif_t *ipif, uint32_t phandle, - uint32_t ihandle, uint32_t flags, uchar_t ipversion, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) +int +ire_init_common(ire_t *ire, ushort_t type, ill_t *ill, zoneid_t zoneid, + uint_t flags, uchar_t ipversion, tsol_gc_t *gc, ip_stack_t *ipst) { - ire->ire_max_fragp = max_fragp; - ire->ire_frag_flag |= (ipst->ips_ip_path_mtu_discovery) ? 
IPH_DF : 0; + int error; #ifdef DEBUG - if (ipif != NULL) { - if (ipif->ipif_isv6) + if (ill != NULL) { + if (ill->ill_isv6) ASSERT(ipversion == IPV6_VERSION); else ASSERT(ipversion == IPV4_VERSION); @@ -1565,223 +673,73 @@ ire_init_common(ire_t *ire, uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, /* * Create/initialize IRE security attribute only in Trusted mode; - * if the passed in gc/gcgrp is non-NULL, we expect that the caller + * if the passed in gc is non-NULL, we expect that the caller * has held a reference to it and will release it when this routine * returns a failure, otherwise we own the reference. We do this * prior to initializing the rest IRE fields. - * - * Don't allocate ire_gw_secattr for the resolver case to prevent - * memory leak (in case of external resolution failure). We'll - * allocate it after a successful external resolution, in ire_add(). - * Note that ire->ire_mp != NULL here means this ire is headed - * to an external resolver. */ if (is_system_labeled()) { if ((type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | - IRE_INTERFACE)) != 0) { + IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE)) != 0) { /* release references on behalf of caller */ if (gc != NULL) GC_REFRELE(gc); - if (gcgrp != NULL) - GCGRP_REFRELE(gcgrp); - } else if ((ire->ire_mp == NULL) && - tsol_ire_init_gwattr(ire, ipversion, gc, gcgrp) != 0) { - return (B_FALSE); + } else { + error = tsol_ire_init_gwattr(ire, ipversion, gc); + if (error != 0) + return (error); } } - ire->ire_stq = stq; - ire->ire_rfq = rfq; ire->ire_type = type; ire->ire_flags = RTF_UP | flags; - ire->ire_ident = TICK_TO_MSEC(lbolt); - bcopy(ulp_info, &ire->ire_uinfo, sizeof (iulp_t)); - - ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; - ire->ire_last_used_time = lbolt; ire->ire_create_time = (uint32_t)gethrestime_sec(); + ire->ire_generation = IRE_GENERATION_INITIAL; /* - * If this IRE is an IRE_CACHE, inherit the handles from the - * parent IREs. 
For others in the forwarding table, assign appropriate - * new ones. + * The ill_ire_cnt isn't increased until + * the IRE is added to ensure that a walker will find + * all IREs that hold a reference on an ill. * - * The mutex protecting ire_handle is because ire_create is not always - * called as a writer. + * Note that ill_ire_multicast doesn't hold a ref on the ill since + * ire_add() is not called for the IRE_MULTICAST. */ - if (ire->ire_type & IRE_OFFSUBNET) { - mutex_enter(&ipst->ips_ire_handle_lock); - ire->ire_phandle = (uint32_t)ipst->ips_ire_handle++; - mutex_exit(&ipst->ips_ire_handle_lock); - } else if (ire->ire_type & IRE_INTERFACE) { - mutex_enter(&ipst->ips_ire_handle_lock); - ire->ire_ihandle = (uint32_t)ipst->ips_ire_handle++; - mutex_exit(&ipst->ips_ire_handle_lock); - } else if (ire->ire_type == IRE_CACHE) { - ire->ire_phandle = phandle; - ire->ire_ihandle = ihandle; - } - ire->ire_ipif = ipif; - if (ipif != NULL) { - ire->ire_ipif_seqid = ipif->ipif_seqid; - ire->ire_ipif_ifindex = - ipif->ipif_ill->ill_phyint->phyint_ifindex; - ire->ire_zoneid = ipif->ipif_zoneid; - } else { - ire->ire_zoneid = GLOBAL_ZONEID; - } + ire->ire_ill = ill; + ire->ire_zoneid = zoneid; ire->ire_ipversion = ipversion; + mutex_init(&ire->ire_lock, NULL, MUTEX_DEFAULT, NULL); - if (ipversion == IPV4_VERSION) { - /* - * IPv6 initializes the ire_nce in ire_add_v6, which expects - * to find the ire_nce to be null when it is called. - */ - if (ire_nce_init(ire, src_nce) != 0) { - /* some failure occurred. propagate error back */ - return (B_FALSE); - } - } ire->ire_refcnt = 1; + ire->ire_identical_ref = 1; /* Number of ire_delete's needed */ ire->ire_ipst = ipst; /* No netstack_hold */ ire->ire_trace_disable = B_FALSE; - return (B_TRUE); + return (0); } /* - * This routine is called repeatedly by ipif_up to create broadcast IREs. - * It is passed a pointer to a slot in an IRE pointer array into which to - * place the pointer to the new IRE, if indeed we create one. 
If the - * IRE corresponding to the address passed in would be a duplicate of an - * existing one, we don't create the new one. irep is incremented before - * return only if we do create a new IRE. (Always called as writer.) + * This creates an IRE_BROADCAST based on the arguments. + * A mirror is ire_lookup_bcast(). * - * Note that with the "match_flags" parameter, we can match on either - * a particular logical interface (MATCH_IRE_IPIF) or for all logical - * interfaces for a given physical interface (MATCH_IRE_ILL). Currently, - * we only create broadcast ire's on a per physical interface basis. If - * someone is going to be mucking with logical interfaces, it is important - * to call "ipif_check_bcast_ires()" to make sure that any change to a - * logical interface will not cause critical broadcast IRE's to be deleted. - */ -ire_t ** -ire_check_and_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep, - int match_flags) -{ - ire_t *ire; - uint64_t check_flags = IPIF_DEPRECATED | IPIF_NOLOCAL | IPIF_ANYCAST; - boolean_t prefer; - ill_t *ill = ipif->ipif_ill; - ip_stack_t *ipst = ill->ill_ipst; - - /* - * No broadcast IREs for the LOOPBACK interface - * or others such as point to point and IPIF_NOXMIT. - */ - if (!(ipif->ipif_flags & IPIF_BROADCAST) || - (ipif->ipif_flags & IPIF_NOXMIT)) - return (irep); - - /* - * If this new IRE would be a duplicate, only prefer it if one of - * the following is true: - * - * 1. The existing one has IPIF_DEPRECATED|IPIF_LOCAL|IPIF_ANYCAST - * set and the new one has all of those clear. - * - * 2. The existing one corresponds to an underlying ILL in an IPMP - * group and the new one corresponds to an IPMP group interface. 
- */ - if ((ire = ire_ctable_lookup(addr, 0, IRE_BROADCAST, ipif, - ipif->ipif_zoneid, NULL, match_flags, ipst)) != NULL) { - prefer = ((ire->ire_ipif->ipif_flags & check_flags) && - !(ipif->ipif_flags & check_flags)) || - (IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && IS_IPMP(ill)); - if (!prefer) { - ire_refrele(ire); - return (irep); - } - - /* - * Bcast ires exist in pairs. Both have to be deleted, - * Since we are exclusive we can make the above assertion. - * The 1st has to be refrele'd since it was ctable_lookup'd. - */ - ASSERT(IAM_WRITER_IPIF(ipif)); - ASSERT(ire->ire_next->ire_addr == ire->ire_addr); - ire_delete(ire->ire_next); - ire_delete(ire); - ire_refrele(ire); - } - return (ire_create_bcast(ipif, addr, irep)); -} - -uint_t ip_loopback_mtu = IP_LOOPBACK_MTU; - -/* - * This routine is called from ipif_check_bcast_ires and ire_check_bcast. - * It leaves all the verifying and deleting to those routines. So it always - * creates 2 bcast ires and chains them into the ire array passed in. + * Any supression of unneeded ones is done in ire_add_v4. + * We add one IRE_BROADCAST per address. ire_send_broadcast_v4() + * takes care of generating a loopback copy of the packet. */ ire_t ** -ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) +ire_create_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid, ire_t **irep) { - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); + ip_stack_t *ipst = ill->ill_ipst; - if (IS_IPMP(ill)) { - /* - * Broadcast IREs for the IPMP meta-interface use the - * nominated broadcast interface to send and receive packets. - * If there's no nominated interface, send the packets down to - * the IPMP stub driver, which will discard them. If the - * nominated broadcast interface changes, ill_refresh_bcast() - * will refresh the broadcast IREs. 
- */ - if ((ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - ill = ipif->ipif_ill; - } + ASSERT(IAM_WRITER_ILL(ill)); *irep++ = ire_create( (uchar_t *)&addr, /* dest addr */ (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, /* source addr */ NULL, /* no gateway */ - &ipif->ipif_mtu, /* max frag */ - NULL, /* no src nce */ - ill->ill_rq, /* recv-from queue */ - ill->ill_wq, /* send-to queue */ IRE_BROADCAST, - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, - NULL, - ipst); - - *irep++ = ire_create( - (uchar_t *)&addr, /* dest address */ - (uchar_t *)&ip_g_all_ones, /* mask */ - (uchar_t *)&ipif->ipif_src_addr, /* source address */ - NULL, /* no gateway */ - &ip_loopback_mtu, /* max frag size */ - NULL, /* no src_nce */ - ill->ill_rq, /* recv-from queue */ - NULL, /* no send-to queue */ - IRE_BROADCAST, /* Needed for fanout in wput */ - ipif, - 0, - 0, - 0, - 0, - &ire_uinfo_null, - NULL, + ill, + zoneid, + RTF_KERNEL, NULL, ipst); @@ -1789,174 +747,34 @@ ire_create_bcast(ipif_t *ipif, ipaddr_t addr, ire_t **irep) } /* - * ire_walk routine to delete or update any IRE_CACHE that might contain - * stale information. - * The flags state which entries to delete or update. - * Garbage collection is done separately using kmem alloc callbacks to - * ip_trash_ire_reclaim. - * Used for both IPv4 and IPv6. However, IPv6 only uses FLUSH_MTU_TIME - * since other stale information is cleaned up using NUD. 
- */ -void -ire_expire(ire_t *ire, char *arg) -{ - ire_expire_arg_t *ieap = (ire_expire_arg_t *)(uintptr_t)arg; - ill_t *stq_ill; - int flush_flags = ieap->iea_flush_flag; - ip_stack_t *ipst = ieap->iea_ipst; - - if ((flush_flags & FLUSH_REDIRECT_TIME) && - (ire->ire_flags & RTF_DYNAMIC)) { - /* Make sure we delete the corresponding IRE_CACHE */ - ip1dbg(("ire_expire: all redirects\n")); - ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); - ire_delete(ire); - atomic_dec_32(&ipst->ips_ip_redirect_cnt); - return; - } - if (ire->ire_type != IRE_CACHE) - return; - - if (flush_flags & FLUSH_ARP_TIME) { - /* - * Remove all IRE_CACHE except IPv4 multicast ires. These - * ires will be deleted by ip_trash_ire_reclaim_stack() - * when system runs low in memory. - * Verify that create time is more than ip_ire_arp_interval - * milliseconds ago. - */ - - if (!(ire->ire_ipversion == IPV4_VERSION && - CLASSD(ire->ire_addr)) && NCE_EXPIRED(ire->ire_nce, ipst)) { - ire_delete(ire); - return; - } - } - - if (ipst->ips_ip_path_mtu_discovery && (flush_flags & FLUSH_MTU_TIME) && - (ire->ire_ipif != NULL)) { - /* Increase pmtu if it is less than the interface mtu */ - mutex_enter(&ire->ire_lock); - /* - * If the ipif is a vni (whose mtu is 0, since it's virtual) - * get the mtu from the sending interfaces' ipif - */ - if (IS_VNI(ire->ire_ipif->ipif_ill)) { - stq_ill = ire->ire_stq->q_ptr; - ire->ire_max_frag = MIN(stq_ill->ill_ipif->ipif_mtu, - IP_MAXPACKET); - } else { - ire->ire_max_frag = MIN(ire->ire_ipif->ipif_mtu, - IP_MAXPACKET); - } - ire->ire_marks &= ~IRE_MARK_PMTU; - ire->ire_frag_flag |= IPH_DF; - mutex_exit(&ire->ire_lock); - } -} - -/* - * Return any local address. We use this to target ourselves - * when the src address was specified as 'default'. - * Preference for IRE_LOCAL entries. + * This looks up an IRE_BROADCAST based on the arguments. + * Mirrors ire_create_bcast(). 
*/ ire_t * -ire_lookup_local(zoneid_t zoneid, ip_stack_t *ipst) +ire_lookup_bcast(ill_t *ill, ipaddr_t addr, zoneid_t zoneid) { - ire_t *ire; - irb_t *irb; - ire_t *maybe = NULL; - int i; + ire_t *ire; + int match_args; - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if (irb->irb_ire == NULL) - continue; - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if ((ire->ire_marks & IRE_MARK_CONDEMNED) || - (ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES)) - continue; - switch (ire->ire_type) { - case IRE_LOOPBACK: - if (maybe == NULL) { - IRE_REFHOLD(ire); - maybe = ire; - } - break; - case IRE_LOCAL: - if (maybe != NULL) { - ire_refrele(maybe); - } - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - return (ire); - } - } - rw_exit(&irb->irb_lock); - } - return (maybe); -} + match_args = MATCH_IRE_TYPE | MATCH_IRE_ILL | MATCH_IRE_GW | + MATCH_IRE_MASK | MATCH_IRE_ZONEONLY; -/* - * If the specified IRE is associated with a particular ILL, return - * that ILL pointer (May be called as writer.). - * - * NOTE : This is not a generic function that can be used always. - * This function always returns the ill of the outgoing packets - * if this ire is used. - */ -ill_t * -ire_to_ill(const ire_t *ire) -{ - ill_t *ill = NULL; + if (IS_UNDER_IPMP(ill)) + match_args |= MATCH_IRE_TESTHIDDEN; - /* - * 1) For an IRE_CACHE, ire_ipif is the one where it obtained - * the source address from. ire_stq is the one where the - * packets will be sent out on. We return that here. - * - * 2) IRE_BROADCAST normally has a loopback and a non-loopback - * copy and they always exist next to each other with loopback - * copy being the first one. If we are called on the non-loopback - * copy, return the one pointed by ire_stq. If it was called on - * a loopback copy, we still return the one pointed by the next - * ire's ire_stq pointer i.e the one pointed by the non-loopback - * copy. 
We don't want use ire_ipif as it might represent the - * source address (if we borrow source addresses for - * IRE_BROADCASTS in the future). - * However if an interface is currently coming up, the above - * condition may not hold during that period since the ires - * are added one at a time. Thus one of the pair could have been - * added and the other not yet added. - * 3) For many other IREs (e.g., IRE_LOCAL), ire_rfq indicates the ill. - * 4) For all others return the ones pointed by ire_ipif->ipif_ill. - * That handles IRE_LOOPBACK. - */ - - if (ire->ire_type == IRE_CACHE) { - ill = (ill_t *)ire->ire_stq->q_ptr; - } else if (ire->ire_type == IRE_BROADCAST) { - if (ire->ire_stq != NULL) { - ill = (ill_t *)ire->ire_stq->q_ptr; - } else { - ire_t *ire_next; - - ire_next = ire->ire_next; - if (ire_next != NULL && - ire_next->ire_type == IRE_BROADCAST && - ire_next->ire_addr == ire->ire_addr && - ire_next->ire_ipif == ire->ire_ipif) { - ill = (ill_t *)ire_next->ire_stq->q_ptr; - } - } - } else if (ire->ire_rfq != NULL) { - ill = ire->ire_rfq->q_ptr; - } else if (ire->ire_ipif != NULL) { - ill = ire->ire_ipif->ipif_ill; - } - return (ill); + ire = ire_ftable_lookup_v4( + addr, /* dest addr */ + ip_g_all_ones, /* mask */ + 0, /* no gateway */ + IRE_BROADCAST, + ill, + zoneid, + NULL, + match_args, + 0, + ill->ill_ipst, + NULL); + return (ire); } /* Arrange to call the specified function for every IRE in the world. 
*/ @@ -1992,15 +810,13 @@ ire_walk_ipvers(pfv_t func, void *arg, uchar_t vers, zoneid_t zoneid, */ ire_walk_ill_tables(0, 0, func, arg, IP_MASK_TABLE_SIZE, 0, NULL, - ipst->ips_ip_cache_table_size, ipst->ips_ip_cache_table, NULL, zoneid, ipst); } if (vers != IPV4_VERSION) { ire_walk_ill_tables(0, 0, func, arg, IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, ipst->ips_ip_forwarding_table_v6, - ipst->ips_ip6_cache_table_size, - ipst->ips_ip_cache_table_v6, NULL, zoneid, ipst); + NULL, zoneid, ipst); } } @@ -2016,22 +832,6 @@ ire_walk_ill(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, ire_walk_ill_ipvers(match_flags, ire_type, func, arg, vers, ill); } -void -ire_walk_ill_v4(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, - ill_t *ill) -{ - ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV4_VERSION, - ill); -} - -void -ire_walk_ill_v6(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, - ill_t *ill) -{ - ire_walk_ill_ipvers(match_flags, ire_type, func, arg, IPV6_VERSION, - ill); -} - /* * Walk a particular ill and version. */ @@ -2043,137 +843,121 @@ ire_walk_ill_ipvers(uint_t match_flags, uint_t ire_type, pfv_t func, if (vers == IPV4_VERSION) { ire_walk_ill_tables(match_flags, ire_type, func, arg, - IP_MASK_TABLE_SIZE, 0, - NULL, ipst->ips_ip_cache_table_size, - ipst->ips_ip_cache_table, ill, ALL_ZONES, ipst); - } else if (vers == IPV6_VERSION) { + IP_MASK_TABLE_SIZE, + 0, NULL, + ill, ALL_ZONES, ipst); + } + if (vers != IPV4_VERSION) { ire_walk_ill_tables(match_flags, ire_type, func, arg, IP6_MASK_TABLE_SIZE, ipst->ips_ip6_ftable_hash_size, ipst->ips_ip_forwarding_table_v6, - ipst->ips_ip6_cache_table_size, - ipst->ips_ip_cache_table_v6, ill, ALL_ZONES, ipst); + ill, ALL_ZONES, ipst); } } +/* + * Do the specific matching of IREs to shared-IP zones. + * + * We have the same logic as in ire_match_args but implemented slightly + * differently. 
+ */ boolean_t ire_walk_ill_match(uint_t match_flags, uint_t ire_type, ire_t *ire, ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) { - ill_t *ire_stq_ill = NULL; - ill_t *ire_ipif_ill = NULL; + ill_t *dst_ill = NULL; ASSERT(match_flags != 0 || zoneid != ALL_ZONES); - /* - * MATCH_IRE_ILL: We match both on ill pointed by ire_stq and - * ire_ipif. Only in the case of IRE_CACHEs can ire_stq and - * ire_ipif be pointing to different ills. But we want to keep - * this function generic enough for future use. So, we always - * try to match on both. The only caller of this function - * ire_walk_ill_tables, will call "func" after we return from - * this function. We expect "func" to do the right filtering - * of ires in this case. - */ if (match_flags & MATCH_IRE_ILL) { - if (ire->ire_stq != NULL) - ire_stq_ill = ire->ire_stq->q_ptr; - if (ire->ire_ipif != NULL) - ire_ipif_ill = ire->ire_ipif->ipif_ill; + dst_ill = ire->ire_ill; } - if (zoneid != ALL_ZONES) { + if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && + ire->ire_zoneid != ALL_ZONES) { /* * We're walking the IREs for a specific zone. The only relevant * IREs are: * - all IREs with a matching ire_zoneid - * - all IRE_OFFSUBNETs as they're shared across all zones - * - IRE_INTERFACE IREs for interfaces with a usable source addr + * - IRE_IF_ALL IREs for interfaces with a usable source addr * with a matching zone - * - IRE_DEFAULTs with a gateway reachable from the zone - * We should really match on IRE_OFFSUBNETs and IRE_DEFAULTs - * using the same rule; but the above rules are consistent with - * the behavior of ire_ftable_lookup[_v6]() so that all the - * routes that can be matched during lookup are also matched - * here. + * - IRE_OFFLINK with a gateway reachable from the zone + * Note that ealier we only did the IRE_OFFLINK check for + * IRE_DEFAULT (and only when we had multiple IRE_DEFAULTs). 
*/ - if (zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { + dst_ill = ire->ire_ill; + + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - ire_stq_ill = (ill_t *)ire->ire_stq->q_ptr; - if (ire->ire_ipversion == IPV4_VERSION) { - if (!ipif_usesrc_avail(ire_stq_ill, - zoneid)) - /* No usable src addr in zone */ - return (B_FALSE); - } else if (ire_stq_ill->ill_usesrc_ifindex - != 0) { - /* - * For IPv6 use ipif_select_source_v6() - * so the right scope selection is done - */ - ipif_t *src_ipif; - src_ipif = - ipif_select_source_v6(ire_stq_ill, - &ire->ire_addr_v6, B_FALSE, - IPV6_PREFER_SRC_DEFAULT, - zoneid); - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - } else { - return (B_FALSE); - } - } else { - return (B_FALSE); - } + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); - } else if (!(ire->ire_type & IRE_OFFSUBNET)) { + /* + * If there is a usable source address in the + * zone, then it's ok to return an + * IRE_INTERFACE + */ + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + return (B_FALSE); + } + } + + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { + ipif_t *tipif; + + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; + tipif != NULL; tipif = tipif->ipif_next) { + if (!IPIF_IS_CONDEMNED(tipif) && + (tipif->ipif_flags & IPIF_UP) && + (tipif->ipif_zoneid == zoneid || + tipif->ipif_zoneid == ALL_ZONES)) + break; + } + mutex_exit(&dst_ill->ill_lock); + if (tipif == NULL) { return (B_FALSE); } } /* - * Match all default routes from the global zone, irrespective + * Match all offlink routes from the global zone, irrespective * of reachability. 
For a non-global zone only match those - * where ire_gateway_addr has a IRE_INTERFACE for the zoneid. + * where ire_gateway_addr has an IRE_INTERFACE for the zoneid. */ - if (ire->ire_type == IRE_DEFAULT && zoneid != GLOBAL_ZONEID) { - int ire_match_flags = 0; + if ((ire->ire_type & IRE_OFFLINK) && zoneid != GLOBAL_ZONEID && + zoneid != ALL_ZONES) { in6_addr_t gw_addr_v6; - ire_t *rire; - - ire_match_flags |= MATCH_IRE_TYPE; - if (ire->ire_ipif != NULL) - ire_match_flags |= MATCH_IRE_ILL; if (ire->ire_ipversion == IPV4_VERSION) { - rire = ire_route_lookup(ire->ire_gateway_addr, - 0, 0, IRE_INTERFACE, ire->ire_ipif, NULL, - zoneid, NULL, ire_match_flags, ipst); + if (!ire_gateway_ok_zone_v4( + ire->ire_gateway_addr, zoneid, + dst_ill, NULL, ipst, B_FALSE)) + return (B_FALSE); } else { ASSERT(ire->ire_ipversion == IPV6_VERSION); mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); - rire = ire_route_lookup_v6(&gw_addr_v6, - NULL, NULL, IRE_INTERFACE, ire->ire_ipif, - NULL, zoneid, NULL, ire_match_flags, ipst); - } - if (rire == NULL) { - return (B_FALSE); + + if (!ire_gateway_ok_zone_v6(&gw_addr_v6, zoneid, + dst_ill, NULL, ipst, B_FALSE)) + return (B_FALSE); } - ire_refrele(rire); } } if (((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & ire_type)) && ((!(match_flags & MATCH_IRE_ILL)) || - (ire_stq_ill == ill || ire_ipif_ill == ill || - ire_ipif_ill != NULL && IS_IN_SAME_ILLGRP(ire_ipif_ill, ill)))) { + (dst_ill == ill || + dst_ill != NULL && IS_IN_SAME_ILLGRP(dst_ill, ill)))) { return (B_TRUE); } return (B_FALSE); @@ -2197,8 +981,9 @@ rtfunc(struct radix_node *rn, void *arg) ret = ire_walk_ill_match(rtf->rt_match_flags, rtf->rt_ire_type, ire, rtf->rt_ill, rtf->rt_zoneid, rtf->rt_ipst); - } else + } else { ret = B_TRUE; + } if (ret) (*rtf->rt_func)(ire, rtf->rt_arg); } @@ -2206,12 +991,12 @@ rtfunc(struct radix_node *rn, void *arg) } /* - * Walk the ftable and the ctable entries that match the ill. 
+ * Walk the ftable entries that match the ill. */ void ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz, irb_t **ipftbl, - size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, zoneid_t zoneid, + ill_t *ill, zoneid_t zoneid, ip_stack_t *ipst) { irb_t *irb_ptr; @@ -2223,85 +1008,50 @@ ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL)); ASSERT(!(match_flags & MATCH_IRE_TYPE) || (ire_type != 0)); - /* - * Optimize by not looking at the forwarding table if there - * is a MATCH_IRE_TYPE specified with no IRE_FORWARDTABLE - * specified in ire_type. - */ - if (!(match_flags & MATCH_IRE_TYPE) || - ((ire_type & IRE_FORWARDTABLE) != 0)) { - /* knobs such that routine is called only for v6 case */ - if (ipftbl == ipst->ips_ip_forwarding_table_v6) { - for (i = (ftbl_sz - 1); i >= 0; i--) { - if ((irb_ptr = ipftbl[i]) == NULL) + + /* knobs such that routine is called only for v6 case */ + if (ipftbl == ipst->ips_ip_forwarding_table_v6) { + for (i = (ftbl_sz - 1); i >= 0; i--) { + if ((irb_ptr = ipftbl[i]) == NULL) + continue; + for (j = 0; j < htbl_sz; j++) { + irb = &irb_ptr[j]; + if (irb->irb_ire == NULL) continue; - for (j = 0; j < htbl_sz; j++) { - irb = &irb_ptr[j]; - if (irb->irb_ire == NULL) - continue; - - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (match_flags == 0 && - zoneid == ALL_ZONES) { - ret = B_TRUE; - } else { - ret = - ire_walk_ill_match( - match_flags, - ire_type, ire, ill, - zoneid, ipst); - } - if (ret) - (*func)(ire, arg); + + irb_refhold(irb); + for (ire = irb->irb_ire; ire != NULL; + ire = ire->ire_next) { + if (match_flags == 0 && + zoneid == ALL_ZONES) { + ret = B_TRUE; + } else { + ret = + ire_walk_ill_match( + match_flags, + ire_type, ire, ill, + zoneid, ipst); } - IRB_REFRELE(irb); + if (ret) + (*func)(ire, arg); } + irb_refrele(irb); } - } else { - (void) memset(&rtfarg, 0, 
sizeof (rtfarg)); - rtfarg.rt_func = func; - rtfarg.rt_arg = arg; - if (match_flags != 0) { - rtfarg.rt_match_flags = match_flags; - } - rtfarg.rt_ire_type = ire_type; - rtfarg.rt_ill = ill; - rtfarg.rt_zoneid = zoneid; - rtfarg.rt_ipst = ipst; /* No netstack_hold */ - (void) ipst->ips_ip_ftable->rnh_walktree_mt( - ipst->ips_ip_ftable, - rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } - } - - /* - * Optimize by not looking at the cache table if there - * is a MATCH_IRE_TYPE specified with no IRE_CACHETABLE - * specified in ire_type. - */ - if (!(match_flags & MATCH_IRE_TYPE) || - ((ire_type & IRE_CACHETABLE) != 0)) { - for (i = 0; i < ctbl_sz; i++) { - irb = &ipctbl[i]; - if (irb->irb_ire == NULL) - continue; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (match_flags == 0 && zoneid == ALL_ZONES) { - ret = B_TRUE; - } else { - ret = ire_walk_ill_match( - match_flags, ire_type, - ire, ill, zoneid, ipst); - } - if (ret) - (*func)(ire, arg); - } - IRB_REFRELE(irb); + } else { + (void) memset(&rtfarg, 0, sizeof (rtfarg)); + rtfarg.rt_func = func; + rtfarg.rt_arg = arg; + if (match_flags != 0) { + rtfarg.rt_match_flags = match_flags; } + rtfarg.rt_ire_type = ire_type; + rtfarg.rt_ill = ill; + rtfarg.rt_zoneid = zoneid; + rtfarg.rt_ipst = ipst; /* No netstack_hold */ + (void) ipst->ips_ip_ftable->rnh_walktree_mt( + ipst->ips_ip_ftable, + rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } } @@ -2323,557 +1073,178 @@ ip_mask_to_plen(ipaddr_t mask) ipaddr_t ip_plen_to_mask(uint_t masklen) { + if (masklen == 0) + return (0); + return (htonl(IP_HOST_MASK << (IP_ABITS - masklen))); } void ire_atomic_end(irb_t *irb_ptr, ire_t *ire) { - ill_t *stq_ill, *ipif_ill; - ip_stack_t *ipst = ire->ire_ipst; + ill_t *ill; - stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; - ipif_ill = ire->ire_ipif != NULL ? 
ire->ire_ipif->ipif_ill : NULL; - RELEASE_ILL_LOCKS(ipif_ill, stq_ill); + ill = ire->ire_ill; + if (ill != NULL) + mutex_exit(&ill->ill_lock); rw_exit(&irb_ptr->irb_lock); - rw_exit(&ipst->ips_ill_g_usesrc_lock); } /* - * ire_add_v[46] atomically make sure that the ipif or ill associated - * with the new ire being added is stable and not IPIF_CHANGING or ILL_CHANGING - * before adding the ire to the table. This ensures that we don't create - * new IRE_CACHEs with stale values for parameters that are passed to - * ire_create such as ire_max_frag. Note that ire_create() is passed a pointer - * to the ipif_mtu, and not the value. The actual value is derived from the - * parent ire or ipif under the bucket lock. + * ire_add_v[46] atomically make sure that the ill associated + * with the new ire is not going away i.e., we check ILL_CONDEMNED. */ int -ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, mblk_t *mp, - ipsq_func_t func) +ire_atomic_start(irb_t *irb_ptr, ire_t *ire) { - ill_t *stq_ill; - ill_t *ipif_ill; - int error = 0; - ill_t *ill = NULL; - ip_stack_t *ipst = ire->ire_ipst; + ill_t *ill; - stq_ill = ire->ire_stq != NULL ? ire->ire_stq->q_ptr : NULL; - ipif_ill = ire->ire_ipif != NULL ? ire->ire_ipif->ipif_ill : NULL; + ill = ire->ire_ill; - ASSERT((q != NULL && mp != NULL && func != NULL) || - (q == NULL && mp == NULL && func == NULL)); - rw_enter(&ipst->ips_ill_g_usesrc_lock, RW_READER); - GRAB_CONN_LOCK(q); rw_enter(&irb_ptr->irb_lock, RW_WRITER); - GRAB_ILL_LOCKS(ipif_ill, stq_ill); + if (ill != NULL) { + mutex_enter(&ill->ill_lock); - /* - * While the IRE is in the process of being added, a user may have - * invoked the ifconfig usesrc option on the stq_ill to make it a - * usesrc client ILL. Check for this possibility here, if it is true - * then we fail adding the IRE_CACHE. Another check is to make sure - * that an ipif_ill of an IRE_CACHE being added is not part of a usesrc - * group. 
The ill_g_usesrc_lock is released in ire_atomic_end - */ - if ((ire->ire_type & IRE_CACHE) && - (ire->ire_marks & IRE_MARK_USESRC_CHECK)) { - if (stq_ill->ill_usesrc_ifindex != 0) { - ASSERT(stq_ill->ill_usesrc_grp_next != NULL); - if ((ipif_ill->ill_phyint->phyint_ifindex != - stq_ill->ill_usesrc_ifindex) || - (ipif_ill->ill_usesrc_grp_next == NULL) || - (ipif_ill->ill_usesrc_ifindex != 0)) { - error = EINVAL; - goto done; - } - } else if (ipif_ill->ill_usesrc_grp_next != NULL) { - error = EINVAL; - goto done; + /* + * Don't allow IRE's to be created on dying ills. + */ + if (ill->ill_state_flags & ILL_CONDEMNED) { + ire_atomic_end(irb_ptr, ire); + return (ENXIO); } - } - /* - * Don't allow IRE's to be created on changing ill's. Also, since - * IPMP flags can be set on an ill without quiescing it, if we're not - * a writer on stq_ill, check that the flags still allow IRE creation. - */ - if ((stq_ill != NULL) && !IAM_WRITER_ILL(stq_ill)) { - if (stq_ill->ill_state_flags & ILL_CHANGING) { - ill = stq_ill; - error = EAGAIN; - } else if (IS_UNDER_IPMP(stq_ill)) { - mutex_enter(&stq_ill->ill_phyint->phyint_lock); - if (!ipmp_ill_is_active(stq_ill) && - !(ire->ire_marks & IRE_MARK_TESTHIDDEN)) { + if (IS_UNDER_IPMP(ill)) { + int error = 0; + mutex_enter(&ill->ill_phyint->phyint_lock); + if (!ipmp_ill_is_active(ill) && + IRE_HIDDEN_TYPE(ire->ire_type) && + !ire->ire_testhidden) { error = EINVAL; } - mutex_exit(&stq_ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_phyint->phyint_lock); + if (error != 0) { + ire_atomic_end(irb_ptr, ire); + return (error); + } } - if (error != 0) - goto done; - } - if ((ipif_ill != NULL) && !IAM_WRITER_ILL(ipif_ill) && - (ipif_ill->ill_state_flags & ILL_CHANGING)) { - ill = ipif_ill; - error = EAGAIN; - goto done; } - - if ((ire->ire_ipif != NULL) && !IAM_WRITER_IPIF(ire->ire_ipif) && - (ire->ire_ipif->ipif_state_flags & IPIF_CHANGING)) { - ill = ire->ire_ipif->ipif_ill; - ASSERT(ill != NULL); - error = EAGAIN; - goto done; - } - 
-done: - if (error == EAGAIN && ILL_CAN_WAIT(ill, q)) { - ipsq_t *ipsq = ill->ill_phyint->phyint_ipsq; - mutex_enter(&ipsq->ipsq_lock); - mutex_enter(&ipsq->ipsq_xop->ipx_lock); - ire_atomic_end(irb_ptr, ire); - ipsq_enq(ipsq, q, mp, func, NEW_OP, ill); - mutex_exit(&ipsq->ipsq_xop->ipx_lock); - mutex_exit(&ipsq->ipsq_lock); - error = EINPROGRESS; - } else if (error != 0) { - ire_atomic_end(irb_ptr, ire); - } - - RELEASE_CONN_LOCK(q); - return (error); + return (0); } /* - * Add a fully initialized IRE to an appropriate table based on - * ire_type. - * - * allow_unresolved == B_FALSE indicates a legacy code-path call - * that has prohibited the addition of incomplete ire's. If this - * parameter is set, and we find an nce that is in a state other - * than ND_REACHABLE, we fail the add. Note that nce_state could be - * something other than ND_REACHABLE if the nce had just expired and - * the ire_create preceding the ire_add added a new ND_INITIAL nce. + * Add a fully initialized IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. 
*/ -int -ire_add(ire_t **irep, queue_t *q, mblk_t *mp, ipsq_func_t func, - boolean_t allow_unresolved) +ire_t * +ire_add(ire_t *ire) { - ire_t *ire1; - ill_t *stq_ill = NULL; - ill_t *ill; - ipif_t *ipif = NULL; - ill_walk_context_t ctx; - ire_t *ire = *irep; - int error; - boolean_t ire_is_mblk = B_FALSE; - tsol_gcgrp_t *gcgrp = NULL; - tsol_gcgrp_addr_t ga; - ip_stack_t *ipst = ire->ire_ipst; - - /* get ready for the day when original ire is not created as mblk */ - if (ire->ire_mp != NULL) { - ire_is_mblk = B_TRUE; - /* Copy the ire to a kmem_alloc'ed area */ - ire1 = kmem_cache_alloc(ire_cache, KM_NOSLEEP); - if (ire1 == NULL) { - ip1dbg(("ire_add: alloc failed\n")); - ire_delete(ire); - *irep = NULL; - return (ENOMEM); - } - ire->ire_marks &= ~IRE_MARK_UNCACHED; - *ire1 = *ire; - ire1->ire_mp = NULL; - ire1->ire_stq_ifindex = 0; - freeb(ire->ire_mp); - ire = ire1; - } - if (ire->ire_stq != NULL) - stq_ill = ire->ire_stq->q_ptr; - - if (stq_ill != NULL && ire->ire_type == IRE_CACHE && - stq_ill->ill_net_type == IRE_IF_RESOLVER) { - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - ill = ILL_START_WALK_ALL(&ctx, ipst); - for (; ill != NULL; ill = ill_next(&ctx, ill)) { - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - continue; - } - /* - * We need to make sure that the ipif is a valid one - * before adding the IRE_CACHE. This happens only - * with IRE_CACHE when there is an external resolver. - * - * We can unplumb a logical interface while the - * packet is waiting in ARP with the IRE. Then, - * later on when we feed the IRE back, the ipif - * has to be re-checked. This can't happen with - * NDP currently, as we never queue the IRE with - * the packet. We always try to recreate the IRE - * when the resolution is completed. But, we do - * it for IPv6 also here so that in future if - * we have external resolvers, it will work without - * any change. 
- */ - ipif = ipif_lookup_seqid(ill, ire->ire_ipif_seqid); - if (ipif != NULL) { - ipif_refhold_locked(ipif); - mutex_exit(&ill->ill_lock); - break; - } - mutex_exit(&ill->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - if (ipif == NULL || - (ipif->ipif_isv6 && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) && - !IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6src_addr)) || - (!ipif->ipif_isv6 && - ire->ire_src_addr != ipif->ipif_src_addr) || - ire->ire_zoneid != ipif->ipif_zoneid) { - if (ipif != NULL) - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (EINVAL); - } - - ASSERT(ill != NULL); - + if (IRE_HIDDEN_TYPE(ire->ire_type) && + ire->ire_ill != NULL && IS_UNDER_IPMP(ire->ire_ill)) { /* - * Since we didn't attach label security attributes to the - * ire for the resolver case, we need to add it now. (only - * for v4 resolver and v6 xresolv case). + * IREs hosted on interfaces that are under IPMP + * should be hidden so that applications don't + * accidentally end up sending packets with test + * addresses as their source addresses, or + * sending out interfaces that are e.g. IFF_INACTIVE. + * Hide them here. */ - if (is_system_labeled() && ire_is_mblk) { - if (ire->ire_ipversion == IPV4_VERSION) { - ga.ga_af = AF_INET; - IN6_IPADDR_TO_V4MAPPED(ire->ire_gateway_addr != - INADDR_ANY ? ire->ire_gateway_addr : - ire->ire_addr, &ga.ga_addr); - } else { - ga.ga_af = AF_INET6; - ga.ga_addr = IN6_IS_ADDR_UNSPECIFIED( - &ire->ire_gateway_addr_v6) ? 
- ire->ire_addr_v6 : - ire->ire_gateway_addr_v6; - } - gcgrp = gcgrp_lookup(&ga, B_FALSE); - error = tsol_ire_init_gwattr(ire, ire->ire_ipversion, - NULL, gcgrp); - if (error != 0) { - if (gcgrp != NULL) { - GCGRP_REFRELE(gcgrp); - gcgrp = NULL; - } - ipif_refrele(ipif); - ire->ire_ipif = NULL; - ire_delete(ire); - *irep = NULL; - return (error); - } - } + ire->ire_testhidden = B_TRUE; } - /* - * In case ire was changed - */ - *irep = ire; if (ire->ire_ipversion == IPV6_VERSION) - error = ire_add_v6(irep, q, mp, func); + return (ire_add_v6(ire)); else - error = ire_add_v4(irep, q, mp, func, allow_unresolved); - if (ipif != NULL) - ipif_refrele(ipif); - return (error); + return (ire_add_v4(ire)); } /* - * Add an initialized IRE to an appropriate table based on ire_type. - * - * The forward table contains IRE_PREFIX/IRE_HOST and - * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. - * - * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK - * and IRE_CACHE. - * - * NOTE : This function is called as writer though not required - * by this function. + * Add a fully initialized IPv4 IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. 
*/ -static int -ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, - boolean_t allow_unresolved) +static ire_t * +ire_add_v4(ire_t *ire) { ire_t *ire1; irb_t *irb_ptr; ire_t **irep; - int flags; - ire_t *pire = NULL; - ill_t *stq_ill; - ire_t *ire = *ire_p; + int match_flags; int error; - boolean_t need_refrele = B_FALSE; - nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; - uint_t marks = 0; - /* - * IREs with source addresses hosted on interfaces that are under IPMP - * should be hidden so that applications don't accidentally end up - * sending packets with test addresses as their source addresses, or - * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. - */ - if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill)) - marks |= IRE_MARK_TESTHIDDEN; - - if (ire->ire_ipif != NULL) - ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); - if (ire->ire_stq != NULL) - ASSERT(!MUTEX_HELD( - &((ill_t *)(ire->ire_stq->q_ptr))->ill_lock)); + if (ire->ire_ill != NULL) + ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); ASSERT(ire->ire_ipversion == IPV4_VERSION); - ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ - - /* Find the appropriate list head. 
*/ - switch (ire->ire_type) { - case IRE_HOST: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; - case IRE_CACHE: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - ire->ire_marks |= marks; - break; - case IRE_BROADCAST: - case IRE_LOCAL: - case IRE_LOOPBACK: - ire->ire_mask = IP_HOST_MASK; - ire->ire_masklen = IP_ABITS; - break; - case IRE_PREFIX: - case IRE_DEFAULT: - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr = 0; - break; - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: - ire->ire_marks |= marks; - break; - default: - ip0dbg(("ire_add_v4: ire %p has unrecognized IRE type (%d)\n", - (void *)ire, ire->ire_type)); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } /* Make sure the address is properly masked. */ ire->ire_addr &= ire->ire_mask; - /* - * ip_newroute/ip_newroute_multi are unable to prevent the deletion - * of the interface route while adding an IRE_CACHE for an on-link - * destination in the IRE_IF_RESOLVER case, since the ire has to - * go to ARP and return. We can't do a REFHOLD on the - * associated interface ire for fear of ARP freeing the message. - * Here we look up the interface ire in the forwarding table and - * make sure that the interface route has not been deleted. - */ - if (ire->ire_type == IRE_CACHE && ire->ire_gateway_addr == 0 && - ((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) { - - ASSERT(ire->ire_max_fragp == NULL); - if (CLASSD(ire->ire_addr) && !(ire->ire_flags & RTF_SETSRC)) { - /* - * The ihandle that we used in ip_newroute_multi - * comes from the interface route corresponding - * to ire_ipif. Lookup here to see if it exists - * still. 
- * If the ire has a source address assigned using - * RTF_SETSRC, ire_ipif is the logical interface holding - * this source address, so we can't use it to check for - * the existence of the interface route. Instead we rely - * on the brute force ihandle search in - * ire_ihandle_lookup_onlink() below. - */ - pire = ipif_to_ire(ire->ire_ipif); - if (pire == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } else if (pire->ire_ihandle != ire->ire_ihandle) { - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } else { - pire = ire_ihandle_lookup_onlink(ire); - if (pire == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } - /* Prevent pire from getting deleted */ - IRB_REFHOLD(pire->ire_bucket); - /* Has it been removed already ? */ - if (pire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } - } else { - ASSERT(ire->ire_max_fragp != NULL); - } - flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); - if (ire->ire_ipif != NULL) { - /* - * We use MATCH_IRE_IPIF while adding IRE_CACHES only - * for historic reasons and to maintain symmetry with - * IPv6 code path. Historically this was used by - * multicast code to create multiple IRE_CACHES on - * a single ill with different ipifs. This was used - * so that multicast packets leaving the node had the - * right source address. This is no longer needed as - * ip_wput initializes the address correctly. - */ - flags |= MATCH_IRE_IPIF; - /* - * If we are creating a hidden IRE, make sure we search for - * hidden IREs when searching for duplicates below. - * Otherwise, we might find an IRE on some other interface - * that's not marked hidden. 
- */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) - flags |= MATCH_IRE_MARK_TESTHIDDEN; + if (ire->ire_ill != NULL) { + match_flags |= MATCH_IRE_ILL; } - if ((ire->ire_type & IRE_CACHETABLE) == 0) { - irb_ptr = ire_get_bucket(ire); - need_refrele = B_TRUE; - if (irb_ptr == NULL) { - /* - * This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - return (EINVAL); - } - } else { - irb_ptr = &(ipst->ips_ip_cache_table[IRE_ADDR_HASH( - ire->ire_addr, ipst->ips_ip_cache_table_size)]); + irb_ptr = ire_get_bucket(ire); + if (irb_ptr == NULL) { + printf("no bucket for %p\n", (void *)ire); + ire_delete(ire); + return (NULL); } /* - * Start the atomic add of the ire. Grab the ill locks, - * ill_g_usesrc_lock and the bucket lock. Check for condemned - * - * If ipif or ill is changing ire_atomic_start() may queue the - * request and return EINPROGRESS. - * To avoid lock order problems, get the ndp4->ndp_g_lock. + * Start the atomic add of the ire. Grab the ill lock, + * the bucket lock. Check for condemned. */ - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - error = ire_atomic_start(irb_ptr, ire, q, mp, func); + error = ire_atomic_start(irb_ptr, ire); if (error != 0) { - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - /* - * We don't know whether it is a valid ipif or not. - * So, set it to NULL. This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; + printf("no ire_atomic_start for %p\n", (void *)ire); ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (error); + irb_refrele(irb_ptr); + return (NULL); } /* - * To avoid creating ires having stale values for the ire_max_frag - * we get the latest value atomically here. 
For more details - * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE - * in ip_rput_dlpi_writer + * If we are creating a hidden IRE, make sure we search for + * hidden IREs when searching for duplicates below. + * Otherwise, we might find an IRE on some other interface + * that's not marked hidden. */ - if (ire->ire_max_fragp == NULL) { - if (CLASSD(ire->ire_addr)) - ire->ire_max_frag = ire->ire_ipif->ipif_mtu; - else - ire->ire_max_frag = pire->ire_max_frag; - } else { - uint_t max_frag; + if (ire->ire_testhidden) + match_flags |= MATCH_IRE_TESTHIDDEN; - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - } /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { - if (ire1->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire1)) continue; - if (ire->ire_ipif != NULL) { - /* - * We do MATCH_IRE_ILL implicitly here for IREs - * with a non-null ire_ipif, including IRE_CACHEs. - * As ire_ipif and ire_stq could point to two - * different ills, we can't pass just ire_ipif to - * ire_match_args and get a match on both ills. - * This is just needed for duplicate checks here and - * so we don't add an extra argument to - * ire_match_args for this. Do it locally. - * - * NOTE : Currently there is no part of the code - * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL - * match for IRE_CACHEs. Thus we don't want to - * extend the arguments to ire_match_args. - */ - if (ire1->ire_stq != ire->ire_stq) - continue; - /* - * Multiroute IRE_CACHEs for a given destination can - * have the same ire_ipif, typically if their source - * address is forced using RTF_SETSRC, and the same - * send-to queue. We differentiate them using the parent - * handle. 
- */ - if (ire->ire_type == IRE_CACHE && - (ire1->ire_flags & RTF_MULTIRT) && - (ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_phandle != ire->ire_phandle)) - continue; - } + /* + * Here we need an exact match on zoneid, i.e., + * ire_match_args doesn't fit. + */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; + + if (ire1->ire_type != ire->ire_type) + continue; + + /* + * Note: We do not allow multiple routes that differ only + * in the gateway security attributes; such routes are + * considered duplicates. + * To change that we explicitly have to treat them as + * different here. + */ if (ire_match_args(ire1, ire->ire_addr, ire->ire_mask, - ire->ire_gateway_addr, ire->ire_type, ire->ire_ipif, - ire->ire_zoneid, 0, NULL, flags, NULL)) { + ire->ire_gateway_addr, ire->ire_type, ire->ire_ill, + ire->ire_zoneid, NULL, match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE @@ -2881,149 +1252,36 @@ ire_add_v4(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func, * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. */ - ip1dbg(("found dup ire existing %p new %p\n", - (void *)ire1, (void *)ire)); - IRE_REFHOLD(ire1); + atomic_add_32(&ire1->ire_identical_ref, 1); + DTRACE_PROBE2(ire__add__exist, ire_t *, ire1, + ire_t *, ire); + ire_refhold(ire1); ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is not removed from the - * list yet. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = ire1; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (0); + irb_refrele(irb_ptr); + return (ire1); } } - if (ire->ire_type & IRE_CACHE) { - ASSERT(ire->ire_stq != NULL); - nce = ndp_lookup_v4(ire_to_ill(ire), - ((ire->ire_gateway_addr != INADDR_ANY) ? 
- &ire->ire_gateway_addr : &ire->ire_addr), - B_TRUE); - if (nce != NULL) - mutex_enter(&nce->nce_lock); - /* - * if the nce is NCE_F_CONDEMNED, or if it is not ND_REACHABLE - * and the caller has prohibited the addition of incomplete - * ire's, we fail the add. Note that nce_state could be - * something other than ND_REACHABLE if the nce had - * just expired and the ire_create preceding the - * ire_add added a new ND_INITIAL nce. - */ - if ((nce == NULL) || - (nce->nce_flags & NCE_F_CONDEMNED) || - (!allow_unresolved && - (nce->nce_state != ND_REACHABLE))) { - if (nce != NULL) { - DTRACE_PROBE1(ire__bad__nce, nce_t *, nce); - mutex_exit(&nce->nce_lock); - } - ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - if (nce != NULL) - NCE_REFRELE(nce); - DTRACE_PROBE1(ire__no__nce, ire_t *, ire); - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - if (need_refrele) - IRB_REFRELE(irb_ptr); - return (EINVAL); - } else { - ire->ire_nce = nce; - mutex_exit(&nce->nce_lock); - /* - * We are associating this nce to the ire, so - * change the nce ref taken in ndp_lookup_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } - } /* - * Make it easy for ip_wput_ire() to hit multiple broadcast ires by - * grouping identical addresses together on the hash chain. We do - * this only for IRE_BROADCASTs as ip_wput_ire is currently interested - * in such groupings only for broadcasts. - * - * Find the first entry that matches ire_addr. *irep will be null - * if no match. - * - * Note: the loopback and non-loopback broadcast entries for an - * interface MUST be added before any MULTIRT entries. + * Normally we do head insertion since most things do not care about + * the order of the IREs in the bucket. 
Note that ip_cgtp_bcast_add + * assumes we at least do head insertion so that its IRE_BROADCAST + * arrive ahead of existing IRE_HOST for the same address. + * However, due to shared-IP zones (and restrict_interzone_loopback) + * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same + * address. For that reason we do tail insertion for IRE_IF_CLONE. + * Due to the IRE_BROADCAST on cgtp0, which must be last in the bucket, + * we do tail insertion of IRE_BROADCASTs that do not have RTF_MULTIRT + * set. */ irep = (ire_t **)irb_ptr; - while ((ire1 = *irep) != NULL && ire->ire_addr != ire1->ire_addr) - irep = &ire1->ire_next; - if (ire->ire_type == IRE_BROADCAST && *irep != NULL) { - /* - * We found some ire (i.e *irep) with a matching addr. We - * want to group ires with same addr. - */ - for (;;) { - ire1 = *irep; - if ((ire1->ire_next == NULL) || - (ire1->ire_next->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST) || - (ire1->ire_flags & RTF_MULTIRT) || - (ire1->ire_ipif->ipif_ill->ill_grp == - ire->ire_ipif->ipif_ill->ill_grp)) - break; - irep = &ire1->ire_next; - } - ASSERT(*irep != NULL); - /* - * The ire will be added before *irep, so - * if irep is a MULTIRT ire, just break to - * ire insertion code. - */ - if (((*irep)->ire_flags & RTF_MULTIRT) != 0) - goto insert_ire; - - irep = &((*irep)->ire_next); - - /* - * Either we have hit the end of the list or the address - * did not match. - */ - while (*irep != NULL) { - ire1 = *irep; - if ((ire1->ire_addr != ire->ire_addr) || - (ire1->ire_type != IRE_BROADCAST)) - break; - if (ire1->ire_ipif == ire->ire_ipif) { - irep = &ire1->ire_next; - break; - } - irep = &ire1->ire_next; - } - } else if (*irep != NULL) { - /* - * Find the last ire which matches ire_addr. - * Needed to do tail insertion among entries with the same - * ire_addr. 
- */ - while (ire->ire_addr == ire1->ire_addr) { + if ((ire->ire_type & IRE_IF_CLONE) || + ((ire->ire_type & IRE_BROADCAST) && + !(ire->ire_flags & RTF_MULTIRT))) { + while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; - ire1 = *irep; - if (ire1 == NULL) - break; - } } - -insert_ire: /* Insert at *irep */ ire1 = *irep; if (ire1 != NULL) @@ -3058,82 +1316,31 @@ insert_ire: * in the list for the first time and no one else can bump * up the reference count on this yet. */ - IRE_REFHOLD_LOCKED(ire); + ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_inserted); irb_ptr->irb_ire_cnt++; - if (irb_ptr->irb_marks & IRB_MARK_FTABLE) + if (irb_ptr->irb_marks & IRB_MARK_DYNAMIC) irb_ptr->irb_nire++; - if (ire->ire_marks & IRE_MARK_TEMPORARY) - irb_ptr->irb_tmp_ire_cnt++; - - if (ire->ire_ipif != NULL) { - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, - (char *), "ire", (void *), ire); - ire->ire_ipif->ipif_ire_cnt++; - if (ire->ire_stq != NULL) { - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt++; - } - } else { - ASSERT(ire->ire_stq == NULL); + if (ire->ire_ill != NULL) { + ire->ire_ill->ill_ire_cnt++; + ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } ire_atomic_end(irb_ptr, ire); - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - if (pire != NULL) { - /* Assert that it is not removed from the list yet */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } + /* Make any caching of the IREs be notified or updated */ + ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - if (ire->ire_type != IRE_CACHE) { - /* - * For ire's with host mask see if there is an entry - * in the cache. If there is one flush the whole cache as - * there might be multiple entries due to RTF_MULTIRT (CGTP). - * If no entry is found than there is no need to flush the - * cache. 
- */ - if (ire->ire_mask == IP_HOST_MASK) { - ire_t *lire; - lire = ire_ctable_lookup(ire->ire_addr, NULL, IRE_CACHE, - NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (lire != NULL) { - ire_refrele(lire); - ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - } - } else { - ire_flush_cache_v4(ire, IRE_FLUSH_ADD); - } - } - /* - * We had to delay the fast path probe until the ire is inserted - * in the list. Otherwise the fast path ack won't find the ire in - * the table. - */ - if (ire->ire_type == IRE_CACHE || - (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL)) { - ASSERT(ire->ire_nce != NULL); - if (ire->ire_nce->nce_state == ND_REACHABLE) - nce_fastpath(ire->ire_nce); - } - if (ire->ire_ipif != NULL) - ASSERT(!MUTEX_HELD(&ire->ire_ipif->ipif_ill->ill_lock)); - *ire_p = ire; - if (need_refrele) { - IRB_REFRELE(irb_ptr); - } - return (0); + if (ire->ire_ill != NULL) + ASSERT(!MUTEX_HELD(&ire->ire_ill->ill_lock)); + irb_refrele(irb_ptr); + return (ire); } /* - * IRB_REFRELE is the only caller of the function. ire_unlink calls to + * irb_refrele is the only caller of the function. ire_unlink calls to * do the final cleanup for this ire. */ void @@ -3162,13 +1369,13 @@ ire_cleanup(ire_t *ire) * so. */ ire->ire_next = NULL; - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); ire = ire_next; } } /* - * IRB_REFRELE is the only caller of the function. It calls to unlink + * irb_refrele is the only caller of the function. It calls to unlink * all the CONDEMNED ires from this bucket. 
*/ ire_t * @@ -3180,16 +1387,14 @@ ire_unlink(irb_t *irb) ire_t *ire_list = NULL; ASSERT(RW_WRITE_HELD(&irb->irb_lock)); - ASSERT(((irb->irb_marks & IRB_MARK_FTABLE) && irb->irb_refcnt == 1) || + ASSERT(((irb->irb_marks & IRB_MARK_DYNAMIC) && irb->irb_refcnt == 1) || (irb->irb_refcnt == 0)); ASSERT(irb->irb_marks & IRB_MARK_CONDEMNED); ASSERT(irb->irb_ire != NULL); for (ire = irb->irb_ire; ire != NULL; ire = ire1) { - ip_stack_t *ipst = ire->ire_ipst; - ire1 = ire->ire_next; - if (ire->ire_marks & IRE_MARK_CONDEMNED) { + if (IRE_IS_CONDEMNED(ire)) { ptpn = ire->ire_ptpn; ire1 = ire->ire_next; if (ire1) @@ -3197,22 +1402,10 @@ ire_unlink(irb_t *irb) *ptpn = ire1; ire->ire_ptpn = NULL; ire->ire_next = NULL; - if (ire->ire_type == IRE_DEFAULT) { - /* - * IRE is out of the list. We need to adjust - * the accounting before the caller drops - * the lock. - */ - if (ire->ire_ipversion == IPV6_VERSION) { - ASSERT(ipst-> - ips_ipv6_ire_default_count != - 0); - ipst->ips_ipv6_ire_default_count--; - } - } + /* - * We need to call ire_delete_v4 or ire_delete_v6 - * to clean up the cache or the redirects pointing at + * We need to call ire_delete_v4 or ire_delete_v6 to + * clean up dependents and the redirects pointing at * the default gateway. We need to drop the lock * as ire_flush_cache/ire_delete_host_redircts require * so. But we can't drop the lock, as ire_unlink needs @@ -3230,76 +1423,7 @@ ire_unlink(irb_t *irb) } /* - * Delete all the cache entries with this 'addr'. When IP gets a gratuitous - * ARP message on any of its interface queue, it scans the nce table and - * deletes and calls ndp_delete() for the appropriate nce. This action - * also deletes all the neighbor/ire cache entries for that address. - * This function is called from ip_arp_news in ip.c and also for - * ARP ioctl processing in ip_if.c. ip_ire_clookup_and_delete returns - * true if it finds a nce entry which is used by ip_arp_news to determine if - * it needs to do an ire_walk_v4. 
The return value is also used for the - * same purpose by ARP IOCTL processing * in ip_if.c when deleting - * ARP entries. For SIOC*IFARP ioctls in addition to the address, - * ip_if->ipif_ill also needs to be matched. - */ -boolean_t -ip_ire_clookup_and_delete(ipaddr_t addr, ipif_t *ipif, ip_stack_t *ipst) -{ - ill_t *ill; - nce_t *nce; - - ill = (ipif ? ipif->ipif_ill : NULL); - - if (ill != NULL) { - /* - * clean up the nce (and any relevant ire's) that matches - * on addr and ill. - */ - nce = ndp_lookup_v4(ill, &addr, B_FALSE); - if (nce != NULL) { - ndp_delete(nce); - return (B_TRUE); - } - } else { - /* - * ill is wildcard. clean up all nce's and - * ire's that match on addr - */ - nce_clookup_t cl; - - cl.ncecl_addr = addr; - cl.ncecl_found = B_FALSE; - - ndp_walk_common(ipst->ips_ndp4, NULL, - (pfi_t)ip_nce_clookup_and_delete, (uchar_t *)&cl, B_TRUE); - - /* - * ncecl_found would be set by ip_nce_clookup_and_delete if - * we found a matching nce. - */ - return (cl.ncecl_found); - } - return (B_FALSE); - -} - -/* Delete the supplied nce if its nce_addr matches the supplied address */ -static void -ip_nce_clookup_and_delete(nce_t *nce, void *arg) -{ - nce_clookup_t *cl = (nce_clookup_t *)arg; - ipaddr_t nce_addr; - - IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); - if (nce_addr == cl->ncecl_addr) { - cl->ncecl_found = B_TRUE; - /* clean up the nce (and any relevant ire's) */ - ndp_delete(nce); - } -} - -/* - * Clean up the radix node for this ire. Must be called by IRB_REFRELE + * Clean up the radix node for this ire. Must be called by irb_refrele * when there are no ire's left in the bucket. Returns TRUE if the bucket * is deleted and freed. */ @@ -3335,40 +1459,55 @@ irb_inactive(irb_t *irb) /* * Delete the specified IRE. + * We assume that if ire_bucket is not set then ire_ill->ill_ire_cnt was + * not incremented i.e., that the insertion in the bucket and the increment + * of that counter is done atomically. 
*/ void ire_delete(ire_t *ire) { ire_t *ire1; ire_t **ptpn; - irb_t *irb; + irb_t *irb; + nce_t *nce; ip_stack_t *ipst = ire->ire_ipst; + /* We can clear ire_nce_cache under ire_lock even if the IRE is used */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + ire->ire_nce_cache = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + if ((irb = ire->ire_bucket) == NULL) { /* * It was never inserted in the list. Should call REFRELE * to free this IRE. */ - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); return; } - rw_enter(&irb->irb_lock, RW_WRITER); - - if (irb->irb_rr_origin == ire) { - irb->irb_rr_origin = NULL; - } - /* - * In case of V4 we might still be waiting for fastpath ack. + * Move the use counts from an IRE_IF_CLONE to its parent + * IRE_INTERFACE. + * We need to do this before acquiring irb_lock. */ - if (ire->ire_ipversion == IPV4_VERSION && - (ire->ire_type == IRE_CACHE || - (ire->ire_type == IRE_BROADCAST && ire->ire_stq != NULL))) { - ASSERT(ire->ire_nce != NULL); - nce_fastpath_list_delete(ire->ire_nce); + if (ire->ire_type & IRE_IF_CLONE) { + ire_t *parent; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if ((parent = ire->ire_dep_parent) != NULL) { + parent->ire_ob_pkt_count += ire->ire_ob_pkt_count; + parent->ire_ib_pkt_count += ire->ire_ib_pkt_count; + ire->ire_ob_pkt_count = 0; + ire->ire_ib_pkt_count = 0; + } + rw_exit(&ipst->ips_ire_dep_lock); } + rw_enter(&irb->irb_lock, RW_WRITER); if (ire->ire_ptpn == NULL) { /* * Some other thread has removed us from the list. @@ -3378,13 +1517,17 @@ ire_delete(ire_t *ire) return; } - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - irb->irb_ire_cnt--; - ire->ire_marks |= IRE_MARK_CONDEMNED; - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - irb->irb_tmp_ire_cnt--; - ire->ire_marks &= ~IRE_MARK_TEMPORARY; + if (!IRE_IS_CONDEMNED(ire)) { + /* Is this an IRE representing multiple duplicate entries? 
*/ + ASSERT(ire->ire_identical_ref >= 1); + if (atomic_add_32_nv(&ire->ire_identical_ref, -1) != 0) { + /* Removed one of the identical parties */ + rw_exit(&irb->irb_lock); + return; } + + irb->irb_ire_cnt--; + ire_make_condemned(ire); } if (irb->irb_refcnt != 0) { @@ -3419,22 +1562,9 @@ ire_delete(ire_t *ire) } else { BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_deleted); } - /* - * ip_wput/ip_wput_v6 checks this flag to see whether - * it should still use the cached ire or not. - */ - if (ire->ire_type == IRE_DEFAULT) { - /* - * IRE is out of the list. We need to adjust the - * accounting before we drop the lock. - */ - if (ire->ire_ipversion == IPV6_VERSION) { - ASSERT(ipst->ips_ipv6_ire_default_count != 0); - ipst->ips_ipv6_ire_default_count--; - } - } rw_exit(&irb->irb_lock); + /* Cleanup dependents and related stuff */ if (ire->ire_ipversion == IPV6_VERSION) { ire_delete_v6(ire); } else { @@ -3444,7 +1574,7 @@ ire_delete(ire_t *ire) * We removed it from the list. Decrement the * reference count. */ - IRE_REFRELE_NOTR(ire); + ire_refrele_notr(ire); } /* @@ -3463,8 +1593,7 @@ ire_delete_v4(ire_t *ire) ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV4_VERSION); - if (ire->ire_type != IRE_CACHE) - ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); + ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away @@ -3473,20 +1602,33 @@ ire_delete_v4(ire_t *ire) */ ire_delete_host_redirects(ire->ire_gateway_addr, ipst); } + + /* + * If we are deleting an IRE_INTERFACE then we make sure we also + * delete any IRE_IF_CLONE that has been created from it. + * Those are always in ire_dep_children. 
+ */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != NULL) + ire_dep_delete_if_clone(ire); + + /* Remove from parent dependencies and child */ + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + if (ire->ire_dep_parent != NULL) + ire_dep_remove(ire); + + while (ire->ire_dep_children != NULL) + ire_dep_remove(ire->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * IRE_REFRELE/ire_refrele are the only caller of the function. It calls + * ire_refrele is the only caller of the function. It calls * to free the ire when the reference count goes to zero. */ void ire_inactive(ire_t *ire) { - nce_t *nce; - ill_t *ill = NULL; - ill_t *stq_ill = NULL; - ipif_t *ipif; - boolean_t need_wakeup = B_FALSE; + ill_t *ill; irb_t *irb; ip_stack_t *ipst = ire->ire_ipst; @@ -3494,128 +1636,71 @@ ire_inactive(ire_t *ire) ASSERT(ire->ire_ptpn == NULL); ASSERT(ire->ire_next == NULL); + /* Count how many condemned ires for kmem_cache callback */ + if (IRE_IS_CONDEMNED(ire)) + atomic_add_32(&ipst->ips_num_ire_condemned, -1); + if (ire->ire_gw_secattr != NULL) { ire_gw_secattr_free(ire->ire_gw_secattr); ire->ire_gw_secattr = NULL; } - if (ire->ire_mp != NULL) { - ASSERT(ire->ire_bucket == NULL); - mutex_destroy(&ire->ire_lock); - BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); - if (ire->ire_nce != NULL) - NCE_REFRELE_NOTR(ire->ire_nce); - freeb(ire->ire_mp); - return; - } - - if ((nce = ire->ire_nce) != NULL) { - NCE_REFRELE_NOTR(nce); - ire->ire_nce = NULL; - } - - if (ire->ire_ipif == NULL) - goto end; - - ipif = ire->ire_ipif; - ill = ipif->ipif_ill; + /* + * ire_nce_cache is cleared in ire_delete, and we make sure we don't + * set it once the ire is marked condemned. + */ + ASSERT(ire->ire_nce_cache == NULL); - if (ire->ire_bucket == NULL) { - /* The ire was never inserted in the table. */ - goto end; - } + /* + * Since any parent would have a refhold on us they would already + * have been removed. 
+ */ + ASSERT(ire->ire_dep_parent == NULL); + ASSERT(ire->ire_dep_sib_next == NULL); + ASSERT(ire->ire_dep_sib_ptpn == NULL); /* - * ipif_ire_cnt on this ipif goes down by 1. If the ire_stq is - * non-null ill_ire_count also goes down by 1. - * - * The ipif that is associated with an ire is ire->ire_ipif and - * hence when the ire->ire_ipif->ipif_ire_cnt drops to zero we call - * ipif_ill_refrele_tail. Usually stq_ill is null or the same as - * ire->ire_ipif->ipif_ill. So nothing more needs to be done. - * However, for VNI or IPMP IRE entries, stq_ill can be different. - * If this is different from ire->ire_ipif->ipif_ill and if the - * ill_ire_cnt on the stq_ill also has dropped to zero, we call - * ipif_ill_refrele_tail on the stq_ill. + * Since any children would have a refhold on us they should have + * already been removed. */ - if (ire->ire_stq != NULL) - stq_ill = ire->ire_stq->q_ptr; + ASSERT(ire->ire_dep_children == NULL); - if (stq_ill == NULL || stq_ill == ill) { - /* Optimize the most common case */ + /* + * ill_ire_ref is increased when the IRE is inserted in the + * bucket - not when the IRE is created. + */ + irb = ire->ire_bucket; + ill = ire->ire_ill; + if (irb != NULL && ill != NULL) { mutex_enter(&ill->ill_lock); - ASSERT(ipif->ipif_ire_cnt != 0); - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, + ASSERT(ill->ill_ire_cnt != 0); + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, (char *), "ire", (void *), ire); - ipif->ipif_ire_cnt--; - if (IPIF_DOWN_OK(ipif)) - need_wakeup = B_TRUE; - if (stq_ill != NULL) { - ASSERT(stq_ill->ill_ire_cnt != 0); - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt--; - if (ILL_DOWN_OK(stq_ill)) - need_wakeup = B_TRUE; - } - if (need_wakeup) { + ill->ill_ire_cnt--; + if (ILL_DOWN_OK(ill)) { /* Drops the ill lock */ ipif_ill_refrele_tail(ill); } else { mutex_exit(&ill->ill_lock); } - } else { - /* - * We can't grab all the ill locks at the same time. 
- * It can lead to recursive lock enter in the call to - * ipif_ill_refrele_tail and later. Instead do it 1 at - * a time. - */ - mutex_enter(&ill->ill_lock); - ASSERT(ipif->ipif_ire_cnt != 0); - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ipif, - (char *), "ire", (void *), ire); - ipif->ipif_ire_cnt--; - if (IPIF_DOWN_OK(ipif)) { - /* Drops the lock */ - ipif_ill_refrele_tail(ill); - } else { - mutex_exit(&ill->ill_lock); - } - if (stq_ill != NULL) { - mutex_enter(&stq_ill->ill_lock); - ASSERT(stq_ill->ill_ire_cnt != 0); - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt--; - if (ILL_DOWN_OK(stq_ill)) { - /* Drops the ill lock */ - ipif_ill_refrele_tail(stq_ill); - } else { - mutex_exit(&stq_ill->ill_lock); - } - } } -end: - /* This should be true for both V4 and V6 */ + ire->ire_ill = NULL; - if ((ire->ire_type & IRE_FORWARDTABLE) && - (ire->ire_ipversion == IPV4_VERSION) && - ((irb = ire->ire_bucket) != NULL)) { + /* This should be true for both V4 and V6 */ + if (irb != NULL && (irb->irb_marks & IRB_MARK_DYNAMIC)) { rw_enter(&irb->irb_lock, RW_WRITER); irb->irb_nire--; /* * Instead of examining the conditions for freeing * the radix node here, we do it by calling - * IRB_REFRELE which is a single point in the code + * irb_refrele which is a single point in the code * that embeds that logic. Bump up the refcnt to - * be able to call IRB_REFRELE + * be able to call irb_refrele */ - IRB_REFHOLD_LOCKED(irb); + irb_refhold_locked(irb); rw_exit(&irb->irb_lock); - IRB_REFRELE(irb); + irb_refrele(irb); } - ire->ire_ipif = NULL; #ifdef DEBUG ire_trace_cleanup(ire); @@ -3626,333 +1711,276 @@ end: } else { BUMP_IRE_STATS(ipst->ips_ire_stats_v4, ire_stats_freed); } - ASSERT(ire->ire_mp == NULL); - /* Has been allocated out of the cache */ kmem_cache_free(ire_cache, ire); } /* - * ire_walk routine to delete all IRE_CACHE/IRE_HOST types redirect - * entries that have a given gateway address. 
+ * ire_update_generation is the callback function provided by + * ire_get_bucket() to update the generation number of any + * matching shorter route when a new route is added. + * + * This fucntion always returns a failure return (B_FALSE) + * to force the caller (rn_matchaddr_args) + * to back-track up the tree looking for shorter matches. + */ +/* ARGSUSED */ +static boolean_t +ire_update_generation(struct radix_node *rn, void *arg) +{ + struct rt_entry *rt = (struct rt_entry *)rn; + + /* We need to handle all in the same bucket */ + irb_increment_generation(&rt->rt_irb); + return (B_FALSE); +} + +/* + * Take care of all the generation numbers in the bucket. */ void -ire_delete_cache_gw(ire_t *ire, char *cp) +irb_increment_generation(irb_t *irb) { - ipaddr_t gw_addr; + ire_t *ire; - if (!(ire->ire_type & IRE_CACHE) && - !(ire->ire_flags & RTF_DYNAMIC)) + if (irb == NULL || irb->irb_ire_cnt == 0) return; - bcopy(cp, &gw_addr, sizeof (gw_addr)); - if (ire->ire_gateway_addr == gw_addr) { - ip1dbg(("ire_delete_cache_gw: deleted 0x%x type %d to 0x%x\n", - (int)ntohl(ire->ire_addr), ire->ire_type, - (int)ntohl(ire->ire_gateway_addr))); - ire_delete(ire); + irb_refhold(irb); + for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { + if (!IRE_IS_CONDEMNED(ire)) + ire_increment_generation(ire); /* Ourselves */ + ire_dep_incr_generation(ire); /* Dependants */ } + irb_refrele(irb); } /* - * Remove all IRE_CACHE entries that match the ire specified. + * When an IRE is added or deleted this routine is called to make sure + * any caching of IRE information is notified or updated. * * The flag argument indicates if the flush request is due to addition - * of new route (IRE_FLUSH_ADD) or deletion of old route (IRE_FLUSH_DELETE). - * - * This routine takes only the IREs from the forwarding table and flushes - * the corresponding entries from the cache table. 
- * - * When flushing due to the deletion of an old route, it - * just checks the cache handles (ire_phandle and ire_ihandle) and - * deletes the ones that match. - * - * When flushing due to the creation of a new route, it checks - * if a cache entry's address matches the one in the IRE and - * that the cache entry's parent has a less specific mask than the - * one in IRE. The destination of such a cache entry could be the - * gateway for other cache entries, so we need to flush those as - * well by looking for gateway addresses matching the IRE's address. + * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), + * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v4(ire_t *ire, int flag) { - int i; - ire_t *cire; - irb_t *irb; - ip_stack_t *ipst = ire->ire_ipst; + irb_t *irb = ire->ire_bucket; + struct rt_entry *rt = IRB2RT(irb); + ip_stack_t *ipst = ire->ire_ipst; - if (ire->ire_type & IRE_CACHE) + /* + * IRE_IF_CLONE ire's don't provide any new information + * than the parent from which they are cloned, so don't + * perturb the generation numbers. + */ + if (ire->ire_type & IRE_IF_CLONE) return; /* - * If a default is just created, there is no point - * in going through the cache, as there will not be any - * cached ires. + * Ensure that an ire_add during a lookup serializes the updates of the + * generation numbers under the radix head lock so that the lookup gets + * either the old ire and old generation number, or a new ire and new + * generation number. + */ + RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); + + /* + * If a route was just added, we need to notify everybody that + * has cached an IRE_NOROUTE since there might now be a better + * route for them. 
*/ - if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) - return; if (flag == IRE_FLUSH_ADD) { + ire_increment_generation(ipst->ips_ire_reject_v4); + ire_increment_generation(ipst->ips_ire_blackhole_v4); + } + + /* Adding a default can't otherwise provide a better route */ + if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); + return; + } + + switch (flag) { + case IRE_FLUSH_DELETE: + case IRE_FLUSH_GWCHANGE: /* - * This selective flush is due to the addition of - * new IRE. + * Update ire_generation for all ire_dep_children chains + * starting with this IRE */ - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - /* - * If 'cire' belongs to the same subnet - * as the new ire being added, and 'cire' - * is derived from a prefix that is less - * specific than the new ire being added, - * we need to flush 'cire'; for instance, - * when a new interface comes up. - */ - if (((cire->ire_addr & ire->ire_mask) == - (ire->ire_addr & ire->ire_mask)) && - (ip_mask_to_plen(cire->ire_cmask) <= - ire->ire_masklen)) { - ire_delete(cire); - continue; - } - /* - * This is the case when the ire_gateway_addr - * of 'cire' belongs to the same subnet as - * the new ire being added. - * Flushing such ires is sometimes required to - * avoid misrouting: say we have a machine with - * two interfaces (I1 and I2), a default router - * R on the I1 subnet, and a host route to an - * off-link destination D with a gateway G on - * the I2 subnet. - * Under normal operation, we will have an - * on-link cache entry for G and an off-link - * cache entry for D with G as ire_gateway_addr, - * traffic to D will reach its destination - * through gateway G. 
- * If the administrator does 'ifconfig I2 down', - * the cache entries for D and G will be - * flushed. However, G will now be resolved as - * an off-link destination using R (the default - * router) as gateway. Then D will also be - * resolved as an off-link destination using G - * as gateway - this behavior is due to - * compatibility reasons, see comment in - * ire_ihandle_lookup_offlink(). Traffic to D - * will go to the router R and probably won't - * reach the destination. - * The administrator then does 'ifconfig I2 up'. - * Since G is on the I2 subnet, this routine - * will flush its cache entry. It must also - * flush the cache entry for D, otherwise - * traffic will stay misrouted until the IRE - * times out. - */ - if ((cire->ire_gateway_addr & ire->ire_mask) == - (ire->ire_addr & ire->ire_mask)) { - ire_delete(cire); - continue; - } - } - IRB_REFRELE(irb); - } - } else { + ire_dep_incr_generation(ire); + break; + case IRE_FLUSH_ADD: /* - * delete the cache entries based on - * handle in the IRE as this IRE is - * being deleted/changed. + * Update the generation numbers of all shorter matching routes. + * ire_update_generation takes care of the dependants by + * using ire_dep_incr_generation. */ - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - if ((cire->ire_phandle == 0 || - cire->ire_phandle != ire->ire_phandle) && - (cire->ire_ihandle == 0 || - cire->ire_ihandle != ire->ire_ihandle)) - continue; - ire_delete(cire); - } - IRB_REFRELE(irb); - } + (void) ipst->ips_ip_ftable->rnh_matchaddr_args(&rt->rt_dst, + ipst->ips_ip_ftable, ire_update_generation, NULL); + break; } + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); } /* * Matches the arguments passed with the values in the ire. 
* - * Note: for match types that match using "ipif" passed in, ipif + * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. */ boolean_t ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, - const ts_label_t *tsl, int match_flags, queue_t *wq) + int type, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags) { ill_t *ire_ill = NULL, *dst_ill; - ill_t *ipif_ill = NULL; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV4_VERSION); ASSERT((ire->ire_addr & ~ire->ire_mask) == 0); ASSERT((!(match_flags & MATCH_IRE_ILL)) || - (ipif != NULL && !ipif->ipif_isv6)); - ASSERT(!(match_flags & MATCH_IRE_WQ) || wq != NULL); + (ill != NULL && !ill->ill_isv6)); /* - * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it - * is in fact hidden, to ensure the caller gets the right one. One - * exception: if the caller passed MATCH_IRE_IHANDLE, then they - * already know the identity of the given IRE_INTERFACE entry and - * there's no point trying to hide it from them. + * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it is + * in fact hidden, to ensure the caller gets the right one. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { - if (match_flags & MATCH_IRE_IHANDLE) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + if (ire->ire_testhidden) { + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } - /* - * MATCH_IRE_MARK_PRIVATE_ADDR is set when IP_NEXTHOP option - * is used. In that case the routing table is bypassed and the - * packets are sent directly to the specified nexthop. The - * IRE_CACHE entry representing this route should be marked - * with IRE_MARK_PRIVATE_ADDR. 
- */ - - if (!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR) && - (ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) - return (B_FALSE); - if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* - * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is - * valid and does not match that of ire_zoneid, a failure to + * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid + * does not match that of ire_zoneid, a failure to * match is reported at this point. Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * - * IRE_BROADCAST, IRE_CACHE and IRE_LOOPBACK + * IRE_LOOPBACK * entries should never be matched in this situation. + * Each zone has its own IRE_LOOPBACK. + * + * IRE_LOCAL + * We allow them for any zoneid. ire_route_recursive + * does additional checks when + * ip_restrict_interzone_loopback is set. * - * IRE entries that have an interface associated with them - * should in general not match unless they are an IRE_LOCAL - * or in the case when MATCH_IRE_DEFAULT has been set in - * the caller. In the case of the former, checking of the - * other fields supplied should take place. + * If ill_usesrc_ifindex is set + * Then we check if the zone has a valid source address + * on the usesrc ill. * - * In the case where MATCH_IRE_DEFAULT has been set, - * all of the ipif's associated with the IRE's ill are - * checked to see if there is a matching zoneid. If any - * one ipif has a matching zoneid, this IRE is a - * potential candidate so checking of the other fields - * takes place. + * If ire_ill is set, then check that the zone has an ipif + * on that ill. 
* - * In the case where the IRE_INTERFACE has a usable source - * address (indicated by ill_usesrc_ifindex) in the - * correct zone then it's permitted to return this IRE + * Outside of this function (in ire_round_robin) we check + * that any IRE_OFFLINK has a gateway that reachable from the + * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); - if (ire->ire_type & (IRE_BROADCAST | IRE_CACHE | IRE_LOOPBACK)) + if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); + + if (ire->ire_type & IRE_LOCAL) + goto matchit; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. + * The normal case of IRE_ONLINK has a matching zoneid. + * Here we handle the case when shared-IP zones have been + * configured with IP addresses on vniN. In that case it + * is ok for traffic from a zone to use IRE_ONLINK routes + * if the ill has a usesrc pointing at vniN */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - dst_ill = (ill_t *)ire->ire_stq->q_ptr; + dst_ill = ire->ire_ill; + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + + /* + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. 
+ */ + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); + /* * If there is a usable source address in the - * zone, then it's ok to return an - * IRE_INTERFACE + * zone, then it's ok to return this IRE_INTERFACE */ - if (ipif_usesrc_avail(dst_ill, zoneid)) { - ip3dbg(("ire_match_args: dst_ill %p match %d\n", - (void *)dst_ill, - (ire->ire_addr == (addr & mask)))); - } else { - ip3dbg(("ire_match_args: src_ipif NULL" + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } - if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && - !(ire->ire_type & IRE_INTERFACE)) { + /* + * For exampe, with + * route add 11.0.0.0 gw1 -ifp bge0 + * route add 11.0.0.0 gw2 -ifp bge1 + * this code would differentiate based on + * where the sending zone has addresses. + * Only if the zone has an address on bge0 can it use the first + * route. It isn't clear if this behavior is documented + * anywhere. + */ + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; - if ((match_flags & MATCH_IRE_DEFAULT) == 0) { - return (B_FALSE); - } - mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); - for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { - if (IPIF_CAN_LOOKUP(tipif) && + if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } - mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); + mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) { return (B_FALSE); } } } - /* - * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to - * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means - * of getting a source address -- i.e., ire_src_addr == - * ire->ire_ipif->ipif_src_addr). ire_to_ill() handles this. 
- * - * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. - * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for - * IPMP test traffic), then the ill must match exactly. - */ +matchit: if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire_to_ill(ire); - ipif_ill = ipif->ipif_ill; + ire_ill = ire->ire_ill; + + /* + * If asked to match an ill, we *must* match + * on the ire_ill for ipmp test addresses, or + * any of the ill in the group for data addresses. + * If we don't, we may as well fail. + * However, we need an exception for IRE_LOCALs to ensure + * we loopback packets even sent to test addresses on different + * interfaces in the group. + */ + if ((match_flags & MATCH_IRE_TESTHIDDEN) && + !(ire->ire_type & IRE_LOCAL)) { + if (ire->ire_ill != ill) + return (B_FALSE); + } else { + match_flags &= ~MATCH_IRE_TESTHIDDEN; + /* + * We know that ill is not NULL, but ire_ill could be + * NULL + */ + if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) + return (B_FALSE); + } } if ((ire->ire_addr == (addr & mask)) && ((!(match_flags & MATCH_IRE_GW)) || (ire->ire_gateway_addr == gateway)) && - ((!(match_flags & MATCH_IRE_TYPE)) || - (ire->ire_type & type)) && - ((!(match_flags & MATCH_IRE_SRC)) || - (ire->ire_src_addr == ipif->ipif_src_addr)) && - ((!(match_flags & MATCH_IRE_IPIF)) || - (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || - (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && - ((!(match_flags & MATCH_IRE_MARK_PRIVATE_ADDR)) || - (ire->ire_type != IRE_CACHE || - ire->ire_marks & IRE_MARK_PRIVATE_ADDR)) && - ((!(match_flags & MATCH_IRE_WQ)) || - (ire->ire_stq == wq)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill || - (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && - ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && - ((!(match_flags & MATCH_IRE_IHANDLE)) || - (ire->ire_ihandle == ihandle)) && - ((!(match_flags & MATCH_IRE_MASK)) || - (ire->ire_mask == mask)) && + ((!(match_flags 
& MATCH_IRE_TYPE)) || (ire->ire_type & type)) && + ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && + ((!(match_flags & MATCH_IRE_MASK)) || (ire->ire_mask == mask)) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -3963,494 +1991,207 @@ ire_match_args(ire_t *ire, ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, } /* - * Lookup for a route in all the tables + * Check if the IRE_LOCAL uses the same ill as another route would use. + * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, + * then we don't allow this IRE_LOCAL to be used. + * We always return an IRE; will be RTF_REJECT if no route available. */ ire_t * -ire_route_lookup(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, - int type, const ipif_t *ipif, ire_t **pire, zoneid_t zoneid, - const ts_label_t *tsl, int flags, ip_stack_t *ipst) +ire_alt_local(ire_t *ire, zoneid_t zoneid, const ts_label_t *tsl, + const ill_t *ill, uint_t *generationp) { - ire_t *ire = NULL; + ip_stack_t *ipst = ire->ire_ipst; + ire_t *alt_ire; + uint_t ire_type; + uint_t generation; + uint_t match_flags; - /* - * ire_match_args() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. - */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + ASSERT(ire->ire_type & IRE_LOCAL); + ASSERT(ire->ire_ill != NULL); /* - * might be asking for a cache lookup, - * This is not best way to lookup cache, - * user should call ire_cache_lookup directly. - * - * If MATCH_IRE_TYPE was set, first lookup in the cache table and then - * in the forwarding table, if the applicable type flags were set. + * Need to match on everything but local. + * This might result in the creation of a IRE_IF_CLONE for the + * same address as the IRE_LOCAL when restrict_interzone_loopback is + * set. ire_add_*() ensures that the IRE_IF_CLONE are tail inserted + * to make sure the IRE_LOCAL is always found first. 
*/ - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { - ire = ire_ctable_lookup(addr, gateway, type, ipif, zoneid, - tsl, flags, ipst); - if (ire != NULL) - return (ire); + ire_type = (IRE_ONLINK | IRE_OFFLINK) & ~(IRE_LOCAL|IRE_LOOPBACK); + match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; + if (ill != NULL) + match_flags |= MATCH_IRE_ILL; + + if (ire->ire_ipversion == IPV4_VERSION) { + alt_ire = ire_route_recursive_v4(ire->ire_addr, ire_type, + ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, + &generation); + } else { + alt_ire = ire_route_recursive_v6(&ire->ire_addr_v6, ire_type, + ill, zoneid, tsl, match_flags, B_TRUE, 0, ipst, NULL, NULL, + &generation); } - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { - ire = ire_ftable_lookup(addr, mask, gateway, type, ipif, pire, - zoneid, 0, tsl, flags, ipst); + ASSERT(alt_ire != NULL); + + if (alt_ire->ire_ill == ire->ire_ill) { + /* Going out the same ILL - ok to send to IRE_LOCAL */ + ire_refrele(alt_ire); + } else { + /* Different ill - ignore IRE_LOCAL */ + ire_refrele(ire); + ire = alt_ire; + if (generationp != NULL) + *generationp = generation; } return (ire); } -/* - * Delete the IRE cache for the gateway and all IRE caches whose - * ire_gateway_addr points to this gateway, and allow them to - * be created on demand by ip_newroute. 
- */ -void -ire_clookup_delete_cache_gw(ipaddr_t addr, zoneid_t zoneid, ip_stack_t *ipst) +boolean_t +ire_find_zoneid(struct radix_node *rn, void *arg) { + struct rt_entry *rt = (struct rt_entry *)rn; irb_t *irb; ire_t *ire; + ire_ftable_args_t *margs = arg; - irb = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, - ipst->ips_ip_cache_table_size)]; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - ASSERT(ire->ire_mask == IP_HOST_MASK); - if (ire_match_args(ire, addr, ire->ire_mask, 0, IRE_CACHE, - NULL, zoneid, 0, NULL, MATCH_IRE_TYPE, NULL)) { - ire_delete(ire); - } - } - IRB_REFRELE(irb); + ASSERT(rt != NULL); - ire_walk_v4(ire_delete_cache_gw, &addr, zoneid, ipst); -} + irb = &rt->rt_irb; -/* - * Looks up cache table for a route. - * specific lookup can be indicated by - * passing the MATCH_* flags and the - * necessary parameters. - */ -ire_t * -ire_ctable_lookup(ipaddr_t addr, ipaddr_t gateway, int type, const ipif_t *ipif, - zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) -{ - ire_ctable_args_t margs; - - margs.ict_addr = &addr; - margs.ict_gateway = &gateway; - margs.ict_type = type; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = tsl; - margs.ict_flags = flags; - margs.ict_ipst = ipst; - margs.ict_wq = NULL; - - return (ip4_ctable_lookup_impl(&margs)); -} + if (irb->irb_ire_cnt == 0) + return (B_FALSE); -/* - * Check whether the IRE_LOCAL and the IRE potentially used to transmit - * (could be an IRE_CACHE, IRE_BROADCAST, or IRE_INTERFACE) are identical - * or part of the same illgrp. (In the IPMP case, usually the two IREs - * will both belong to the IPMP ill, but exceptions are possible -- e.g. - * if IPMP test addresses are on their own subnet.) 
- */ -boolean_t -ire_local_same_lan(ire_t *ire_local, ire_t *xmit_ire) -{ - ill_t *recv_ill, *xmit_ill; + rw_enter(&irb->irb_lock, RW_READER); + for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { + if (IRE_IS_CONDEMNED(ire)) + continue; - ASSERT(ire_local->ire_type & (IRE_LOCAL|IRE_LOOPBACK)); - ASSERT(xmit_ire->ire_type & (IRE_CACHETABLE|IRE_INTERFACE)); + if (ire->ire_zoneid != ALL_ZONES && + ire->ire_zoneid != margs->ift_zoneid) + continue; - recv_ill = ire_to_ill(ire_local); - xmit_ill = ire_to_ill(xmit_ire); + if (margs->ift_ill != NULL && margs->ift_ill != ire->ire_ill) + continue; - ASSERT(recv_ill != NULL); - ASSERT(xmit_ill != NULL); + if (is_system_labeled() && + tsol_ire_match_gwattr(ire, margs->ift_tsl) != 0) + continue; - return (IS_ON_SAME_LAN(recv_ill, xmit_ill)); + rw_exit(&irb->irb_lock); + return (B_TRUE); + } + rw_exit(&irb->irb_lock); + return (B_FALSE); } /* - * Check if the IRE_LOCAL uses the same ill as another route would use. - * If there is no alternate route, or the alternate is a REJECT or BLACKHOLE, - * then we don't allow this IRE_LOCAL to be used. + * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified + * gateway address. If ill is non-NULL we also match on it. + * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 
*/ boolean_t -ire_local_ok_across_zones(ire_t *ire_local, zoneid_t zoneid, void *addr, - const ts_label_t *tsl, ip_stack_t *ipst) +ire_gateway_ok_zone_v4(ipaddr_t gateway, zoneid_t zoneid, ill_t *ill, + const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { - ire_t *alt_ire; - boolean_t rval; - int flags; + struct rt_sockaddr rdst; + struct rt_entry *rt; + ire_ftable_args_t margs; - flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | MATCH_IRE_RJ_BHOLE; + ASSERT(ill == NULL || !ill->ill_isv6); + if (lock_held) + ASSERT(RW_READ_HELD(&ipst->ips_ip_ftable->rnh_lock)); + else + RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); - if (ire_local->ire_ipversion == IPV4_VERSION) { - alt_ire = ire_ftable_lookup(*((ipaddr_t *)addr), 0, 0, 0, NULL, - NULL, zoneid, 0, tsl, flags, ipst); - } else { - alt_ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, - NULL, zoneid, 0, tsl, flags, ipst); - } + rdst.rt_sin_len = sizeof (rdst); + rdst.rt_sin_family = AF_INET; + rdst.rt_sin_addr.s_addr = gateway; - if (alt_ire == NULL) - return (B_FALSE); + /* + * We only use margs for ill, zoneid, and tsl matching in + * ire_find_zoneid + */ + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_ill = ill; + margs.ift_zoneid = zoneid; + margs.ift_tsl = tsl; + rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, + ipst->ips_ip_ftable, ire_find_zoneid, (void *)&margs); - if (alt_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { - ire_refrele(alt_ire); - return (B_FALSE); - } - rval = ire_local_same_lan(ire_local, alt_ire); + if (!lock_held) + RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); - ire_refrele(alt_ire); - return (rval); + return (rt != NULL); } /* - * Lookup cache - * - * In general the zoneid has to match (where ALL_ZONES match all of them). - * But for IRE_LOCAL we also need to handle the case where L2 should - * conceptually loop back the packet. 
This is necessary since neither - * Ethernet drivers nor Ethernet hardware loops back packets sent to their - * own MAC address. This loopback is needed when the normal - * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill as the ill with which this IRE_LOCAL is associated. - * - * Earlier versions of this code always matched an IRE_LOCAL independently of - * the zoneid. We preserve that earlier behavior when - * ip_restrict_interzone_loopback is turned off. + * ire_walk routine to delete a fraction of redirect IREs and IRE_CLONE_IF IREs. + * The fraction argument tells us what fraction of the IREs to delete. + * Common for IPv4 and IPv6. + * Used when memory backpressure. */ -ire_t * -ire_cache_lookup(ipaddr_t addr, zoneid_t zoneid, const ts_label_t *tsl, - ip_stack_t *ipst) +static void +ire_delete_reclaim(ire_t *ire, char *arg) { - irb_t *irb_ptr; - ire_t *ire; - - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH(addr, - ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN | IRE_MARK_PRIVATE_ADDR)) { - continue; - } - if (ire->ire_addr == addr) { - /* - * Finally, check if the security policy has any - * restriction on using this route for the specified - * message. 
- */ - if (tsl != NULL && - ire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(ire, tsl) != 0) { - continue; - } + ip_stack_t *ipst = ire->ire_ipst; + uint_t fraction = *(uint_t *)arg; + uint_t rand; - if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || - ire->ire_zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + if ((ire->ire_flags & RTF_DYNAMIC) || + (ire->ire_type & IRE_IF_CLONE)) { - if (ire->ire_type == IRE_LOCAL) { - if (ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, zoneid, - &addr, tsl, ipst)) - continue; + /* Pick a random number */ + rand = (uint_t)lbolt + + IRE_ADDR_HASH_V6(ire->ire_addr_v6, 256); - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + /* Use truncation */ + if ((rand/fraction)*fraction == rand) { + IP_STAT(ipst, ip_ire_reclaim_deleted); + ire_delete(ire); } } - rw_exit(&irb_ptr->irb_lock); - return (NULL); -} -ire_t * -ire_cache_lookup_simple(ipaddr_t dst, ip_stack_t *ipst) -{ - irb_t *irb_ptr; - ire_t *ire; - - /* - * Look for an ire in the cachetable whose - * ire_addr matches the destination. - * Since we are being called by forwarding fastpath - * no need to check for Trusted Solaris label. - */ - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( - dst, ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED | IRE_MARK_TESTHIDDEN | - IRE_MARK_PRIVATE_ADDR)) { - continue; - } - if (ire->ire_addr == dst) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - rw_exit(&irb_ptr->irb_lock); - return (NULL); } /* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. + * kmem_cache callback to free up memory. * - * We are trying to create the cache ire for an offlink destn based - * on the cache ire of the gateway in 'cire'. 
'pire' is the prefix ire - * as found by ip_newroute(). We are called from ip_newroute() in - * the IRE_CACHE case. + * Free a fraction (ips_ip_ire_reclaim_fraction) of things IP added dynamically + * (RTF_DYNAMIC and IRE_IF_CLONE). */ -ire_t * -ire_ihandle_lookup_offlink(ire_t *cire, ire_t *pire) +static void +ip_ire_reclaim_stack(ip_stack_t *ipst) { - ire_t *ire; - int match_flags; - ipaddr_t gw_addr; - ipif_t *gw_ipif; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL && pire != NULL); - - /* - * We don't need to specify the zoneid to ire_ftable_lookup() below - * because the ihandle refers to an ipif which can be in only one zone. - */ - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute() created 'cire' for the gateway it set its - * cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup(cire->ire_addr, cire->ire_cmask, 0, - IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * Assume we are trying to ping some offlink destn, and we have the - * routing table below. - * - * Eg. default - gw1 <--- pire (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * If we already have a cache ire for gw1 in 'cire', the - * ire_ftable_lookup above would have failed, since there is no - * interface ire to reach gw1. We will fallthru below. - * - * Here we duplicate the steps that ire_ftable_lookup() did in - * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. - * The differences are the following - * i. 
We want the interface ire only, so we call ire_ftable_lookup() - * instead of ire_route_lookup() - * ii. We look for only prefix routes in the 1st call below. - * ii. We want to match on the ihandle in the 2nd call below. - */ - match_flags = MATCH_IRE_TYPE; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - ire = ire_ftable_lookup(pire->ire_gateway_addr, 0, 0, IRE_OFFSUBNET, - pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (ire == NULL) - return (NULL); - /* - * At this point 'ire' corresponds to the entry shown in line 2. - * gw_addr is 'gw2' in the example above. - */ - gw_addr = ire->ire_gateway_addr; - gw_ipif = ire->ire_ipif; - ire_refrele(ire); + uint_t fraction = ipst->ips_ip_ire_reclaim_fraction; - match_flags |= MATCH_IRE_IHANDLE; - ire = ire_ftable_lookup(gw_addr, 0, 0, IRE_INTERFACE, - gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, NULL, match_flags, - ipst); - return (ire); -} + IP_STAT(ipst, ip_ire_reclaim_calls); -/* - * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER - * ire associated with the specified ipif. - * - * This might occasionally be called when IPIF_UP is not set since - * the IP_MULTICAST_IF as well as creating interface routes - * allows specifying a down ipif (ipif_lookup* match ipifs that are down). - * - * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on - * the ipif, this routine might return NULL. - */ -ire_t * -ipif_to_ire(const ipif_t *ipif) -{ - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF | MATCH_IRE_MASK; + ire_walk(ire_delete_reclaim, &fraction, ipst); /* - * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN - * so that they aren't accidentally returned. However, if the - * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + * Walk all CONNs that can have a reference on an ire, nce or dce. 
+ * Get them to update any stale references to drop any refholds they + * have. */ - if (IS_UNDER_IPMP(ipif->ipif_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - ASSERT(!ipif->ipif_isv6); - if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup(ipif->ipif_lcl_addr, 0, IRE_LOOPBACK, - ipif, ALL_ZONES, NULL, (MATCH_IRE_TYPE | MATCH_IRE_IPIF), - ipst); - } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { - /* In this case we need to lookup destination address. */ - ire = ire_ftable_lookup(ipif->ipif_pp_dst_addr, IP_HOST_MASK, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, NULL, match_flags, - ipst); - } else { - ire = ire_ftable_lookup(ipif->ipif_subnet, - ipif->ipif_net_mask, 0, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, match_flags, ipst); - } - return (ire); + ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); } /* - * ire_walk function. - * Count the number of IRE_CACHE entries in different categories. - */ -void -ire_cache_count(ire_t *ire, char *arg) -{ - ire_cache_count_t *icc = (ire_cache_count_t *)arg; - - if (ire->ire_type != IRE_CACHE) - return; - - icc->icc_total++; - - if (ire->ire_ipversion == IPV6_VERSION) { - mutex_enter(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - mutex_exit(&ire->ire_lock); - icc->icc_onlink++; - return; - } - mutex_exit(&ire->ire_lock); - } else { - if (ire->ire_gateway_addr == 0) { - icc->icc_onlink++; - return; - } - } - - ASSERT(ire->ire_ipif != NULL); - if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) - icc->icc_pmtu++; - else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + - ire->ire_ib_pkt_count) - icc->icc_offlink++; - else - icc->icc_unused++; -} - -/* - * ire_walk function called by ip_trash_ire_reclaim(). - * Free a fraction of the IRE_CACHE cache entries. The fractions are - * different for different categories of IRE_CACHE entries. - * A fraction of zero means to not free any in that category. - * Use the hash bucket id plus lbolt as a random number. 
Thus if the fraction - * is N then every Nth hash bucket chain will be freed. + * Called by the memory allocator subsystem directly, when the system + * is running low on memory. */ +/* ARGSUSED */ void -ire_cache_reclaim(ire_t *ire, char *arg) +ip_ire_reclaim(void *args) { - ire_cache_reclaim_t *icr = (ire_cache_reclaim_t *)arg; - uint_t rand; - ip_stack_t *ipst = icr->icr_ipst; - - if (ire->ire_type != IRE_CACHE) - return; + netstack_handle_t nh; + netstack_t *ns; - if (ire->ire_ipversion == IPV6_VERSION) { - rand = (uint_t)lbolt + - IRE_ADDR_HASH_V6(ire->ire_addr_v6, - ipst->ips_ip6_cache_table_size); - mutex_enter(&ire->ire_lock); - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - mutex_exit(&ire->ire_lock); - if (icr->icr_onlink != 0 && - (rand/icr->icr_onlink)*icr->icr_onlink == rand) { - ire_delete(ire); - return; - } - goto done; - } - mutex_exit(&ire->ire_lock); - } else { - rand = (uint_t)lbolt + - IRE_ADDR_HASH(ire->ire_addr, ipst->ips_ip_cache_table_size); - if (ire->ire_gateway_addr == 0) { - if (icr->icr_onlink != 0 && - (rand/icr->icr_onlink)*icr->icr_onlink == rand) { - ire_delete(ire); - return; - } - goto done; - } - } - /* Not onlink IRE */ - ASSERT(ire->ire_ipif != NULL); - if (ire->ire_max_frag < ire->ire_ipif->ipif_mtu) { - /* Use ptmu fraction */ - if (icr->icr_pmtu != 0 && - (rand/icr->icr_pmtu)*icr->icr_pmtu == rand) { - ire_delete(ire); - return; - } - } else if (ire->ire_tire_mark != ire->ire_ob_pkt_count + - ire->ire_ib_pkt_count) { - /* Use offlink fraction */ - if (icr->icr_offlink != 0 && - (rand/icr->icr_offlink)*icr->icr_offlink == rand) { - ire_delete(ire); - return; - } - } else { - /* Use unused fraction */ - if (icr->icr_unused != 0 && - (rand/icr->icr_unused)*icr->icr_unused == rand) { - ire_delete(ire); - return; - } + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ip_ire_reclaim_stack(ns->netstack_ip); + netstack_rele(ns); } -done: - /* - * Update tire_mark so that those that haven't been 
used since this - * reclaim will be considered unused next time we reclaim. - */ - ire->ire_tire_mark = ire->ire_ob_pkt_count + ire->ire_ib_pkt_count; + netstack_next_fini(&nh); } static void @@ -4470,14 +2211,21 @@ void ip_ire_g_init() { /* - * Create ire caches, ire_reclaim() - * will give IRE_CACHE back to system when needed. + * Create kmem_caches. ip_ire_reclaim() and ip_nce_reclaim() + * will give disposable IREs back to system when needed. * This needs to be done here before anything else, since * ire_add() expects the cache to be created. */ ire_cache = kmem_cache_create("ire_cache", - sizeof (ire_t), 0, ip_ire_constructor, - ip_ire_destructor, ip_trash_ire_reclaim, NULL, NULL, 0); + sizeof (ire_t), 0, NULL, NULL, + ip_ire_reclaim, NULL, NULL, 0); + + ncec_cache = kmem_cache_create("ncec_cache", + sizeof (ncec_t), 0, NULL, NULL, + ip_nce_reclaim, NULL, NULL, 0); + nce_cache = kmem_cache_create("nce_cache", + sizeof (nce_t), 0, NULL, NULL, + NULL, NULL, NULL, 0); rt_entry_cache = kmem_cache_create("rt_entry", sizeof (struct rt_entry), 0, NULL, NULL, NULL, NULL, NULL, 0); @@ -4491,104 +2239,65 @@ ip_ire_g_init() void ip_ire_init(ip_stack_t *ipst) { - int i; - uint32_t mem_cnt; - uint32_t cpu_cnt; - uint32_t min_cnt; - pgcnt_t mem_avail; - - /* - * ip_ire_max_bucket_cnt is sized below based on the memory - * size and the cpu speed of the machine. This is upper - * bounded by the compile time value of ip_ire_max_bucket_cnt - * and is lower bounded by the compile time value of - * ip_ire_min_bucket_cnt. Similar logic applies to - * ip6_ire_max_bucket_cnt. - * - * We calculate this for each IP Instances in order to use - * the kmem_avail and ip_ire_{min,max}_bucket_cnt that are - * in effect when the zone is booted. 
- */ - mem_avail = kmem_avail(); - mem_cnt = (mem_avail >> ip_ire_mem_ratio) / - ip_cache_table_size / sizeof (ire_t); - cpu_cnt = CPU->cpu_type_info.pi_clock >> ip_ire_cpu_ratio; - - min_cnt = MIN(cpu_cnt, mem_cnt); - if (min_cnt < ip_ire_min_bucket_cnt) - min_cnt = ip_ire_min_bucket_cnt; - if (ip_ire_max_bucket_cnt > min_cnt) { - ip_ire_max_bucket_cnt = min_cnt; - } - - mem_cnt = (mem_avail >> ip_ire_mem_ratio) / - ip6_cache_table_size / sizeof (ire_t); - min_cnt = MIN(cpu_cnt, mem_cnt); - if (min_cnt < ip6_ire_min_bucket_cnt) - min_cnt = ip6_ire_min_bucket_cnt; - if (ip6_ire_max_bucket_cnt > min_cnt) { - ip6_ire_max_bucket_cnt = min_cnt; - } + ire_t *ire; + int error; mutex_init(&ipst->ips_ire_ft_init_lock, NULL, MUTEX_DEFAULT, 0); - mutex_init(&ipst->ips_ire_handle_lock, NULL, MUTEX_DEFAULT, NULL); (void) rn_inithead((void **)&ipst->ips_ip_ftable, 32); - /* Calculate the IPv4 cache table size. */ - ipst->ips_ip_cache_table_size = MAX(ip_cache_table_size, - ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / - ip_ire_max_bucket_cnt)); - if (ipst->ips_ip_cache_table_size > ip_max_cache_table_size) - ipst->ips_ip_cache_table_size = ip_max_cache_table_size; /* - * Make sure that the table size is always a power of 2. The - * hash macro IRE_ADDR_HASH() depends on that. + * Make sure that the forwarding table size is a power of 2. + * The IRE*_ADDR_HASH() macroes depend on that. */ - power2_roundup(&ipst->ips_ip_cache_table_size); - - ipst->ips_ip_cache_table = kmem_zalloc(ipst->ips_ip_cache_table_size * - sizeof (irb_t), KM_SLEEP); - - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - rw_init(&ipst->ips_ip_cache_table[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } + ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; + power2_roundup(&ipst->ips_ip6_ftable_hash_size); - /* Calculate the IPv6 cache table size. 
*/ - ipst->ips_ip6_cache_table_size = MAX(ip6_cache_table_size, - ((mem_avail >> ip_ire_mem_ratio) / sizeof (ire_t) / - ip6_ire_max_bucket_cnt)); - if (ipst->ips_ip6_cache_table_size > ip6_max_cache_table_size) - ipst->ips_ip6_cache_table_size = ip6_max_cache_table_size; /* - * Make sure that the table size is always a power of 2. The - * hash macro IRE_ADDR_HASH_V6() depends on that. + * Allocate/initialize a pair of IRE_NOROUTEs for each of IPv4 and IPv6. + * The ire_reject_v* has RTF_REJECT set, and the ire_blackhole_v* has + * RTF_BLACKHOLE set. We use the latter for transient errors such + * as memory allocation failures and tripping on IRE_IS_CONDEMNED + * entries. */ - power2_roundup(&ipst->ips_ip6_cache_table_size); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_REJECT|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_reject_v4 = ire; - ipst->ips_ip_cache_table_v6 = kmem_zalloc( - ipst->ips_ip6_cache_table_size * sizeof (irb_t), KM_SLEEP); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_REJECT|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_reject_v6 = ire; - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - rw_init(&ipst->ips_ip_cache_table_v6[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v4(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_BLACKHOLE|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_blackhole_v4 = ire; - /* - * Make sure that the forwarding table size is a power of 2. - * The IRE*_ADDR_HASH() macroes depend on that. 
- */ - ipst->ips_ip6_ftable_hash_size = ip6_ftable_hash_size; - power2_roundup(&ipst->ips_ip6_ftable_hash_size); + ire = kmem_cache_alloc(ire_cache, KM_SLEEP); + *ire = ire_null; + error = ire_init_v6(ire, 0, 0, 0, IRE_NOROUTE, NULL, ALL_ZONES, + RTF_BLACKHOLE|RTF_UP, NULL, ipst); + ASSERT(error == 0); + ipst->ips_ire_blackhole_v6 = ire; - ipst->ips_ire_handle = 1; + rw_init(&ipst->ips_ip6_ire_head_lock, NULL, RW_DEFAULT, NULL); + rw_init(&ipst->ips_ire_dep_lock, NULL, RW_DEFAULT, NULL); } void ip_ire_g_fini(void) { kmem_cache_destroy(ire_cache); + kmem_cache_destroy(ncec_cache); + kmem_cache_destroy(nce_cache); kmem_cache_destroy(rt_entry_cache); rn_fini(); @@ -4599,9 +2308,21 @@ ip_ire_fini(ip_stack_t *ipst) { int i; + rw_destroy(&ipst->ips_ire_dep_lock); + rw_destroy(&ipst->ips_ip6_ire_head_lock); + + ire_refrele_notr(ipst->ips_ire_reject_v6); + ipst->ips_ire_reject_v6 = NULL; + ire_refrele_notr(ipst->ips_ire_reject_v4); + ipst->ips_ire_reject_v4 = NULL; + ire_refrele_notr(ipst->ips_ire_blackhole_v6); + ipst->ips_ire_blackhole_v6 = NULL; + ire_refrele_notr(ipst->ips_ire_blackhole_v4); + ipst->ips_ire_blackhole_v4 = NULL; + /* * Delete all IREs - assumes that the ill/ipifs have - * been removed so what remains are just the ftable and IRE_CACHE. + * been removed so what remains are just the ftable to handle. 
*/ ire_walk(ire_delete, NULL, ipst); @@ -4609,23 +2330,6 @@ ip_ire_fini(ip_stack_t *ipst) ipst->ips_ip_ftable = NULL; mutex_destroy(&ipst->ips_ire_ft_init_lock); - mutex_destroy(&ipst->ips_ire_handle_lock); - - for (i = 0; i < ipst->ips_ip_cache_table_size; i++) { - ASSERT(ipst->ips_ip_cache_table[i].irb_ire == NULL); - rw_destroy(&ipst->ips_ip_cache_table[i].irb_lock); - } - kmem_free(ipst->ips_ip_cache_table, - ipst->ips_ip_cache_table_size * sizeof (irb_t)); - ipst->ips_ip_cache_table = NULL; - - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - ASSERT(ipst->ips_ip_cache_table_v6[i].irb_ire == NULL); - rw_destroy(&ipst->ips_ip_cache_table_v6[i].irb_lock); - } - kmem_free(ipst->ips_ip_cache_table_v6, - ipst->ips_ip6_cache_table_size * sizeof (irb_t)); - ipst->ips_ip_cache_table_v6 = NULL; for (i = 0; i < IP6_MASK_TABLE_SIZE; i++) { irb_t *ptr; @@ -4643,1116 +2347,1177 @@ ip_ire_fini(ip_stack_t *ipst) } } +#ifdef DEBUG +void +ire_trace_ref(ire_t *ire) +{ + mutex_enter(&ire->ire_lock); + if (ire->ire_trace_disable) { + mutex_exit(&ire->ire_lock); + return; + } + + if (th_trace_ref(ire, ire->ire_ipst)) { + mutex_exit(&ire->ire_lock); + } else { + ire->ire_trace_disable = B_TRUE; + mutex_exit(&ire->ire_lock); + ire_trace_cleanup(ire); + } +} + +void +ire_untrace_ref(ire_t *ire) +{ + mutex_enter(&ire->ire_lock); + if (!ire->ire_trace_disable) + th_trace_unref(ire); + mutex_exit(&ire->ire_lock); +} + +static void +ire_trace_cleanup(const ire_t *ire) +{ + th_trace_cleanup(ire, ire->ire_trace_disable); +} +#endif /* DEBUG */ + /* - * Check if another multirt route resolution is needed. - * B_TRUE is returned is there remain a resolvable route, - * or if no route for that dst is resolved yet. - * B_FALSE is returned if all routes for that dst are resolved - * or if the remaining unresolved routes are actually not - * resolvable. - * This only works in the global zone. 
+ * Find, or create if needed, the nce_t pointer to the neighbor cache + * entry ncec_t for an IPv4 address. The nce_t will be created on the ill_t + * in the non-IPMP case, or on the cast-ill in the IPMP bcast/mcast case, or + * on the next available under-ill (selected by the IPMP rotor) in the + * unicast IPMP case. + * + * If a neighbor-cache entry has to be created (i.e., one does not already + * exist in the nce list) the ncec_lladdr and ncec_state of the neighbor cache + * entry are initialized in nce_add_v4(). The broadcast, multicast, and + * link-layer type determine the contents of {ncec_state, ncec_lladdr} of + * the ncec_t created. The ncec_lladdr is non-null for all link types with + * non-zero ill_phys_addr_length, though the contents may be zero in cases + * where the link-layer type is not known at the time of creation + * (e.g., IRE_IFRESOLVER links) + * + * All IRE_BROADCAST entries have ncec_state = ND_REACHABLE, and the nce_lladr + * has the physical broadcast address of the outgoing interface. + * For unicast ire entries, + * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created + * ncec_t with 0 nce_lladr contents, and will be in the ND_INITIAL state. + * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link + * layer resolution is necessary, so that the ncec_t will be in the + * ND_REACHABLE state + * + * The link layer information needed for broadcast addresses, and for + * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that + * never needs re-verification for the lifetime of the ncec_t. These are + * therefore marked NCE_F_NONUD. + * + * The nce returned will be created such that the nce_ill == ill that + * is passed in. Note that the nce itself may not have ncec_ill == ill + * where IPMP links are involved. 
*/ -boolean_t -ire_multirt_need_resolve(ipaddr_t dst, const ts_label_t *tsl, ip_stack_t *ipst) +static nce_t * +ire_nce_init(ill_t *ill, const void *addr, int ire_type) { - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb; - irb_t *cirb; - int unres_cnt = 0; - boolean_t resolvable = B_FALSE; - - /* Retrieve the first IRE_HOST that matches the destination */ - first_fire = ire_ftable_lookup(dst, IP_HOST_MASK, 0, IRE_HOST, NULL, - NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_SECATTR, ipst); - - /* No route at all */ - if (first_fire == NULL) { - return (B_TRUE); + int err; + nce_t *nce = NULL; + uint16_t ncec_flags; + uchar_t *hwaddr; + boolean_t need_refrele = B_FALSE; + ill_t *in_ill = ill; + boolean_t is_unicast; + uint_t hwaddr_len; + + is_unicast = ((ire_type & (IRE_MULTICAST|IRE_BROADCAST)) == 0); + if (IS_IPMP(ill) || + ((ire_type & IRE_BROADCAST) && IS_UNDER_IPMP(ill))) { + if ((ill = ipmp_ill_get_xmit_ill(ill, is_unicast)) == NULL) + return (NULL); + need_refrele = B_TRUE; } + ncec_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; - firb = first_fire->ire_bucket; - ASSERT(firb != NULL); + switch (ire_type) { + case IRE_BROADCAST: + ASSERT(!ill->ill_isv6); + ncec_flags |= (NCE_F_BCAST|NCE_F_NONUD); + break; + case IRE_MULTICAST: + ncec_flags |= (NCE_F_MCAST|NCE_F_NONUD); + break; + } - /* Retrieve the first IRE_CACHE ire for that destination. */ - first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); + if (ill->ill_net_type == IRE_IF_NORESOLVER && is_unicast) { + hwaddr = ill->ill_dest_addr; + } else { + hwaddr = NULL; + } + hwaddr_len = ill->ill_phys_addr_length; - /* No resolved route. 
*/ - if (first_cire == NULL) { - ire_refrele(first_fire); - return (B_TRUE); +retry: + /* nce_state will be computed by nce_add_common() */ + if (!ill->ill_isv6) { + err = nce_lookup_then_add_v4(ill, hwaddr, hwaddr_len, addr, + ncec_flags, ND_UNCHANGED, &nce); + } else { + err = nce_lookup_then_add_v6(ill, hwaddr, hwaddr_len, addr, + ncec_flags, ND_UNCHANGED, &nce); } + switch (err) { + case 0: + break; + case EEXIST: + /* + * When subnets change or partially overlap what was once + * a broadcast address could now be a unicast, or vice versa. + */ + if (((ncec_flags ^ nce->nce_common->ncec_flags) & + NCE_F_BCAST) != 0) { + ASSERT(!ill->ill_isv6); + ncec_delete(nce->nce_common); + nce_refrele(nce); + goto retry; + } + break; + default: + DTRACE_PROBE2(nce__init__fail, ill_t *, ill, int, err); + if (need_refrele) + ill_refrele(ill); + return (NULL); + } /* - * At least one route is resolved. Here we look through the forward - * and cache tables, to compare the number of declared routes - * with the number of resolved routes. The search for a resolvable - * route is performed only if at least one route remains - * unresolved. + * If the ill was an under-ill of an IPMP group, we need to verify + * that it is still active so that we select an active interface in + * the group. However, since ipmp_ill_is_active ASSERTs for + * IS_UNDER_IPMP(), we first need to verify that the ill is an + * under-ill, and since this is being done in the data path, the + * only way to ascertain this is by holding the ill_g_lock. */ - cirb = first_cire->ire_bucket; - ASSERT(cirb != NULL); - - /* Count the number of routes to that dest that are declared. 
*/ - IRB_REFHOLD(firb); - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; - unres_cnt++; + rw_enter(&ill->ill_ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ill->ill_lock); + mutex_enter(&ill->ill_phyint->phyint_lock); + if (need_refrele && IS_UNDER_IPMP(ill) && !ipmp_ill_is_active(ill)) { + /* + * need_refrele implies that the under ill was selected by + * ipmp_ill_get_xmit_ill() because either the in_ill was an + * ipmp_ill, or we are sending a non-unicast packet on + * an under_ill. However, when we get here, the ill selected by + * ipmp_ill_get_xmit_ill was pulled out of the active set + * (for unicast) or cast_ill nomination (for + * !unicast) after it was picked as the outgoing ill. + * We have to pick an active interface and/or cast_ill in the + * group. + */ + mutex_exit(&ill->ill_phyint->phyint_lock); + nce_delete(nce); + mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_ipst->ips_ill_g_lock); + nce_refrele(nce); + ill_refrele(ill); + if ((ill = ipmp_ill_get_xmit_ill(in_ill, is_unicast)) == NULL) + return (NULL); + goto retry; + } else { + mutex_exit(&ill->ill_phyint->phyint_lock); + mutex_exit(&ill->ill_lock); + rw_exit(&ill->ill_ipst->ips_ill_g_lock); } - IRB_REFRELE(firb); +done: + ASSERT(nce->nce_ill == ill); + if (need_refrele) + ill_refrele(ill); + return (nce); +} - /* Then subtract the number of routes to that dst that are resolved */ - IRB_REFHOLD(cirb); - for (cire = first_cire; cire != NULL; cire = cire->ire_next) { - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - unres_cnt--; - } - IRB_REFRELE(cirb); +nce_t * +arp_nce_init(ill_t *ill, in_addr_t addr4, int ire_type) +{ + return (ire_nce_init(ill, &addr4, ire_type)); +} - /* At least one route is unresolved; search for a resolvable route. 
*/ - if (unres_cnt > 0) - resolvable = ire_multirt_lookup(&first_cire, &first_fire, - MULTIRT_USESTAMP | MULTIRT_CACHEGW, NULL, tsl, ipst); +nce_t * +ndp_nce_init(ill_t *ill, const in6_addr_t *addr6, int ire_type) +{ + ASSERT((ire_type & IRE_BROADCAST) == 0); + return (ire_nce_init(ill, addr6, ire_type)); +} - if (first_fire != NULL) - ire_refrele(first_fire); +/* + * The caller should hold irb_lock as a writer if the ire is in a bucket. + */ +void +ire_make_condemned(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + + mutex_enter(&ire->ire_lock); + ASSERT(ire->ire_bucket == NULL || + RW_WRITE_HELD(&ire->ire_bucket->irb_lock)); + ASSERT(!IRE_IS_CONDEMNED(ire)); + ire->ire_generation = IRE_GENERATION_CONDEMNED; + /* Count how many condemned ires for kmem_cache callback */ + atomic_add_32(&ipst->ips_num_ire_condemned, 1); + mutex_exit(&ire->ire_lock); +} - if (first_cire != NULL) - ire_refrele(first_cire); +/* + * Increment the generation avoiding the special condemned value + */ +void +ire_increment_generation(ire_t *ire) +{ + uint_t generation; - return (resolvable); + mutex_enter(&ire->ire_lock); + /* + * Even though the caller has a hold it can't prevent a concurrent + * ire_delete marking the IRE condemned + */ + if (!IRE_IS_CONDEMNED(ire)) { + generation = ire->ire_generation + 1; + if (generation == IRE_GENERATION_CONDEMNED) + generation = IRE_GENERATION_INITIAL; + ASSERT(generation != IRE_GENERATION_VERIFY); + ire->ire_generation = generation; + } + mutex_exit(&ire->ire_lock); } /* - * Explore a forward_table bucket, starting from fire_arg. - * fire_arg MUST be an IRE_HOST entry. - * - * Return B_TRUE and update *ire_arg and *fire_arg - * if at least one resolvable route is found. *ire_arg - * is the IRE entry for *fire_arg's gateway. - * - * Return B_FALSE otherwise (all routes are resolved or - * the remaining unresolved routes are all unresolvable). - * - * The IRE selection relies on a priority mechanism - * driven by the flags passed in by the caller. 
- * The caller, such as ip_newroute_ipif(), can get the most - * relevant ire at each stage of a multiple route resolution. - * - * The rules are: - * - * - if MULTIRT_CACHEGW is specified in flags, IRE_CACHETABLE - * ires are preferred for the gateway. This gives the highest - * priority to routes that can be resolved without using - * a resolver. + * Increment ire_generation on all the IRE_MULTICASTs + * Used when the default multicast interface (as determined by + * ill_lookup_multicast) might have changed. * - * - if MULTIRT_CACHEGW is not specified, or if MULTIRT_CACHEGW - * is specified but no IRE_CACHETABLE ire entry for the gateway - * is found, the following rules apply. - * - * - if MULTIRT_USESTAMP is specified in flags, IRE_INTERFACE - * ires for the gateway, that have not been tried since - * a configurable amount of time, are preferred. - * This applies when a resolver must be invoked for - * a missing route, but we don't want to use the resolver - * upon each packet emission. If no such resolver is found, - * B_FALSE is returned. - * The MULTIRT_USESTAMP flag can be combined with - * MULTIRT_CACHEGW. - * - * - if MULTIRT_USESTAMP is not specified in flags, the first - * unresolved but resolvable route is selected. - * - * - Otherwise, there is no resolvable route, and - * B_FALSE is returned. - * - * At last, MULTIRT_SETSTAMP can be specified in flags to - * request the timestamp of unresolvable routes to - * be refreshed. This prevents the useless exploration - * of those routes for a while, when MULTIRT_USESTAMP is used. - * - * The argument already_resolved_count is an output variable to track number - * of already resolved multirt routes. - * - * This only works in the global zone. + * That includes the zoneid, IFF_ flags, the IPv6 scope of the address, and + * ill unplumb. 
*/ -boolean_t -ire_multirt_lookup(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, - int *already_resolved_count, const ts_label_t *tsl, ip_stack_t *ipst) +void +ire_increment_multicast_generation(ip_stack_t *ipst, boolean_t isv6) { - clock_t delta; - ire_t *best_fire = NULL; - ire_t *best_cire = NULL; - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb = NULL; - irb_t *cirb = NULL; - ire_t *gw_ire; - boolean_t already_resolved; - boolean_t res; - ipaddr_t dst; - ipaddr_t gw; - - ip2dbg(("ire_multirt_lookup: *ire_arg %p, *fire_arg %p, flags %04x\n", - (void *)*ire_arg, (void *)*fire_arg, flags)); - - ASSERT(ire_arg != NULL); - ASSERT(fire_arg != NULL); - - /* Not an IRE_HOST ire; give up. */ - if ((*fire_arg == NULL) || ((*fire_arg)->ire_type != IRE_HOST)) { - return (B_FALSE); + ill_t *ill; + ill_walk_context_t ctx; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (isv6) + ill = ILL_START_WALK_V6(&ctx, ipst); + else + ill = ILL_START_WALK_V4(&ctx, ipst); + for (; ill != NULL; ill = ill_next(&ctx, ill)) { + if (ILL_IS_CONDEMNED(ill)) + continue; + if (ill->ill_ire_multicast != NULL) + ire_increment_generation(ill->ill_ire_multicast); } + rw_exit(&ipst->ips_ill_g_lock); +} - /* This is the first IRE_HOST ire for that destination. */ - first_fire = *fire_arg; - firb = first_fire->ire_bucket; - ASSERT(firb != NULL); +/* + * Return a held IRE_NOROUTE with RTF_REJECT set + */ +ire_t * +ire_reject(ip_stack_t *ipst, boolean_t isv6) +{ + ire_t *ire; - dst = first_fire->ire_addr; + if (isv6) + ire = ipst->ips_ire_reject_v6; + else + ire = ipst->ips_ire_reject_v4; - ip2dbg(("ire_multirt_lookup: dst %08x\n", ntohl(dst))); + ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); + ire_refhold(ire); + return (ire); +} - /* - * Retrieve the first IRE_CACHE ire for that destination; - * if we don't find one, no route for that dest is - * resolved yet. 
- */ - first_cire = ire_cache_lookup(dst, GLOBAL_ZONEID, tsl, ipst); - if (first_cire != NULL) { - cirb = first_cire->ire_bucket; - } +/* + * Return a held IRE_NOROUTE with RTF_BLACKHOLE set + */ +ire_t * +ire_blackhole(ip_stack_t *ipst, boolean_t isv6) +{ + ire_t *ire; - ip2dbg(("ire_multirt_lookup: first_cire %p\n", (void *)first_cire)); + if (isv6) + ire = ipst->ips_ire_blackhole_v6; + else + ire = ipst->ips_ire_blackhole_v4; - /* - * Search for a resolvable route, giving the top priority - * to routes that can be resolved without any call to the resolver. - */ - IRB_REFHOLD(firb); + ASSERT(ire->ire_generation != IRE_GENERATION_CONDEMNED); + ire_refhold(ire); + return (ire); +} + +/* + * Return a held IRE_MULTICAST. + */ +ire_t * +ire_multicast(ill_t *ill) +{ + ire_t *ire = ill->ill_ire_multicast; + + ASSERT(ire == NULL || ire->ire_generation != IRE_GENERATION_CONDEMNED); + if (ire == NULL) + ire = ire_blackhole(ill->ill_ipst, ill->ill_isv6); + else + ire_refhold(ire); + return (ire); +} + +/* + * Given an IRE return its nexthop IRE. The nexthop IRE is an IRE_ONLINK + * that is an exact match (i.e., a /32 for IPv4 and /128 for IPv6). + * This can return an RTF_REJECT|RTF_BLACKHOLE. + * The returned IRE is held. + * The assumption is that ip_select_route() has been called and returned the + * IRE (thus ip_select_route would have set up the ire_dep* information.) + * If some IRE is deleteted then ire_dep_remove() will have been called and + * we might not find a nexthop IRE, in which case we return NULL. + */ +ire_t * +ire_nexthop(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; - if (!CLASSD(dst)) { + /* Acquire lock to walk ire_dep_parent */ + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + while (ire != NULL) { + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + goto done; + } /* - * For all multiroute IRE_HOST ires for that destination, - * check if the route via the IRE_HOST's gateway is - * resolved yet. + * If we find an IRE_ONLINK we are done. 
This includes + * the case of IRE_MULTICAST. + * Note that in order to send packets we need a host-specific + * IRE_IF_ALL first in the ire_dep_parent chain. Normally this + * is done by inserting an IRE_IF_CLONE if the IRE_INTERFACE + * was not host specific. + * However, ip_rts_request doesn't want to send packets + * hence doesn't want to allocate an IRE_IF_CLONE. Yet + * it needs an IRE_IF_ALL to get to the ill. Thus + * we return IRE_IF_ALL that are not host specific here. */ - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; + if (ire->ire_type & IRE_ONLINK) + goto done; + ire = ire->ire_dep_parent; + } + rw_exit(&ipst->ips_ire_dep_lock); + return (NULL); - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } +done: + ire_refhold(ire); + rw_exit(&ipst->ips_ire_dep_lock); + return (ire); +} - gw = fire->ire_gateway_addr; - - ip2dbg(("ire_multirt_lookup: fire %p, " - "ire_addr %08x, ire_gateway_addr %08x\n", - (void *)fire, ntohl(fire->ire_addr), ntohl(gw))); - - already_resolved = B_FALSE; - - if (first_cire != NULL) { - ASSERT(cirb != NULL); - - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } +/* + * Find the ill used to send packets. This will be NULL in case + * of a reject or blackhole. + * The returned ill is held; caller needs to do ill_refrele when done. + */ +ill_t * +ire_nexthop_ill(ire_t *ire) +{ + ill_t *ill; - /* - * Check if the IRE_CACHE's gateway - * matches the IRE_HOST's gateway. 
- */ - if (cire->ire_gateway_addr == gw) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } + ire = ire_nexthop(ire); + if (ire == NULL) + return (NULL); - /* - * This route is already resolved; - * proceed with next one. - */ - if (already_resolved) { - ip2dbg(("ire_multirt_lookup: found cire %p, " - "already resolved\n", (void *)cire)); + /* ire_ill can not change for an existing ire */ + ill = ire->ire_ill; + if (ill != NULL) + ill_refhold(ill); + ire_refrele(ire); + return (ill); +} - if (already_resolved_count != NULL) - (*already_resolved_count)++; - continue; - } +#ifdef DEBUG +static boolean_t +parent_has_child(ire_t *parent, ire_t *child) +{ + ire_t *ire; + ire_t *prev; - /* - * The route is unresolved; is it actually - * resolvable, i.e. is there a cache or a resolver - * for the gateway? - */ - gw_ire = ire_route_lookup(gw, 0, 0, 0, NULL, NULL, - ALL_ZONES, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_SECATTR, ipst); + ire = parent->ire_dep_children; + prev = NULL; + while (ire != NULL) { + if (prev == NULL) { + ASSERT(ire->ire_dep_sib_ptpn == + &(parent->ire_dep_children)); + } else { + ASSERT(ire->ire_dep_sib_ptpn == + &(prev->ire_dep_sib_next)); + } + if (ire == child) + return (B_TRUE); + prev = ire; + ire = ire->ire_dep_sib_next; + } + return (B_FALSE); +} - ip2dbg(("ire_multirt_lookup: looked up gw_ire %p\n", - (void *)gw_ire)); +static void +ire_dep_verify(ire_t *ire) +{ + ire_t *parent = ire->ire_dep_parent; + ire_t *child = ire->ire_dep_children; - /* - * If gw_ire is typed IRE_CACHETABLE, - * this route can be resolved without any call to the - * resolver. If the MULTIRT_CACHEGW flag is set, - * give the top priority to this ire and exit the - * loop. - * This is typically the case when an ARP reply - * is processed through ip_wput_nondata(). 
- */ - if ((flags & MULTIRT_CACHEGW) && - (gw_ire != NULL) && - (gw_ire->ire_type & IRE_CACHETABLE)) { - ASSERT(gw_ire->ire_nce == NULL || - gw_ire->ire_nce->nce_state == ND_REACHABLE); - /* - * Release the resolver associated to the - * previous candidate best ire, if any. - */ - if (best_cire != NULL) { - ire_refrele(best_cire); - ASSERT(best_fire != NULL); - } + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + if (parent != NULL) { + ASSERT(parent->ire_ipversion == IPV4_VERSION || + parent->ire_ipversion == IPV6_VERSION); + ASSERT(parent->ire_refcnt >= 1); + ASSERT(parent_has_child(parent, ire)); + } + if (child != NULL) { + ASSERT(child->ire_ipversion == IPV4_VERSION || + child->ire_ipversion == IPV6_VERSION); + ASSERT(child->ire_dep_parent == ire); + ASSERT(child->ire_dep_sib_ptpn != NULL); + ASSERT(parent_has_child(ire, child)); + } +} +#endif /* DEBUG */ - best_fire = fire; - best_cire = gw_ire; +/* + * Assumes ire_dep_parent is set. Remove this child from its parent's linkage. + */ +void +ire_dep_remove(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + ire_t *parent = ire->ire_dep_parent; + ire_t *next; + nce_t *nce; - ip2dbg(("ire_multirt_lookup: found top prio " - "best_fire %p, best_cire %p\n", - (void *)best_fire, (void *)best_cire)); - break; - } + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + ASSERT(ire->ire_dep_parent != NULL); + ASSERT(ire->ire_dep_sib_ptpn != NULL); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take that - * route into account only if this time interval - * exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. 
- */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t)((delta > - ipst->ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip2dbg(("ire_multirt_lookup: fire %p, delta %lu, " - "res %d\n", - (void *)fire, delta, res)); - - if (res) { - /* - * We are here if MULTIRT_USESTAMP flag is set - * and the resolver for fire's gateway - * has not been tried since - * ip_multirt_resolution_interval, or if - * MULTIRT_USESTAMP is not set but gw_ire did - * not fill the conditions for MULTIRT_CACHEGW, - * or if neither MULTIRT_USESTAMP nor - * MULTIRT_CACHEGW are set. - */ - if (gw_ire != NULL) { - if (best_fire == NULL) { - ASSERT(best_cire == NULL); - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup:" - "found candidate " - "best_fire %p, " - "best_cire %p\n", - (void *)best_fire, - (void *)best_cire)); - - /* - * If MULTIRT_CACHEGW is not - * set, we ignore the top - * priority ires that can - * be resolved without any - * call to the resolver; - * In that case, there is - * actually no need - * to continue the loop. - */ - if (!(flags & - MULTIRT_CACHEGW)) { - break; - } - continue; - } - } else { - /* - * No resolver for the gateway: the - * route is not resolvable. - * If the MULTIRT_SETSTAMP flag is - * set, we stamp the IRE_HOST ire, - * so we will not select it again - * during this resolution interval. 
- */ - if (flags & MULTIRT_SETSTAMP) - fire->ire_last_used_time = - lbolt; - } - } +#ifdef DEBUG + ire_dep_verify(ire); + ire_dep_verify(parent); +#endif - if (gw_ire != NULL) - ire_refrele(gw_ire); - } - } else { /* CLASSD(dst) */ + next = ire->ire_dep_sib_next; + if (next != NULL) + next->ire_dep_sib_ptpn = ire->ire_dep_sib_ptpn; - for (fire = first_fire; - fire != NULL; - fire = fire->ire_next) { + ASSERT(*(ire->ire_dep_sib_ptpn) == ire); + *(ire->ire_dep_sib_ptpn) = ire->ire_dep_sib_next; - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (fire->ire_addr != dst) - continue; + ire->ire_dep_sib_ptpn = NULL; + ire->ire_dep_sib_next = NULL; - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } + mutex_enter(&ire->ire_lock); + parent = ire->ire_dep_parent; + ire->ire_dep_parent = NULL; + mutex_exit(&ire->ire_lock); - already_resolved = B_FALSE; + /* + * Make sure all our children, grandchildren, etc set + * ire_dep_parent_generation to IRE_GENERATION_VERIFY since + * we can no longer guarantee than the children have a current + * ire_nce_cache and ire_nexthop_ill(). + */ + if (ire->ire_dep_children != NULL) + ire_dep_invalidate_children(ire->ire_dep_children); - gw = fire->ire_gateway_addr; + /* + * Since the parent is gone we make sure we clear ire_nce_cache. + * We can clear it under ire_lock even if the IRE is used + */ + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + ire->ire_nce_cache = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); - gw_ire = ire_ftable_lookup(gw, 0, 0, IRE_INTERFACE, - NULL, NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); +#ifdef DEBUG + ire_dep_verify(ire); + ire_dep_verify(parent); +#endif - /* No resolver for the gateway; we skip this ire. 
*/ - if (gw_ire == NULL) { - continue; - } - ASSERT(gw_ire->ire_nce == NULL || - gw_ire->ire_nce->nce_state == ND_REACHABLE); - - if (first_cire != NULL) { - - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (cire->ire_addr != dst) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED | - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } + ire_refrele_notr(parent); + ire_refrele_notr(ire); +} - /* - * Cache entries are linked to the - * parent routes using the parent handle - * (ire_phandle). If no cache entry has - * the same handle as fire, fire is - * still unresolved. - */ - ASSERT(cire->ire_phandle != 0); - if (cire->ire_phandle == - fire->ire_phandle) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } +/* + * Insert the child in the linkage of the parent + */ +static void +ire_dep_parent_insert(ire_t *child, ire_t *parent) +{ + ip_stack_t *ipst = child->ire_ipst; + ire_t *next; - /* - * This route is already resolved; proceed with - * next one. - */ - if (already_resolved) { - ire_refrele(gw_ire); - if (already_resolved_count != NULL) - (*already_resolved_count)++; - continue; - } + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + ASSERT(child->ire_dep_parent == NULL); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take - * that route into account only if this time - * interval exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. 
- */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t)((delta > - ipst->ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip3dbg(("ire_multirt_lookup: fire %p, delta %lx, " - "flags %04x, res %d\n", - (void *)fire, delta, flags, res)); - - if (res) { - if (best_cire != NULL) { - /* - * Release the resolver associated - * to the preceding candidate best - * ire, if any. - */ - ire_refrele(best_cire); - ASSERT(best_fire != NULL); - } - best_fire = fire; - best_cire = gw_ire; - continue; - } +#ifdef DEBUG + ire_dep_verify(child); + ire_dep_verify(parent); +#endif + /* No parents => no siblings */ + ASSERT(child->ire_dep_sib_ptpn == NULL); + ASSERT(child->ire_dep_sib_next == NULL); - ire_refrele(gw_ire); - } - } + ire_refhold_notr(parent); + ire_refhold_notr(child); - if (best_fire != NULL) { - IRE_REFHOLD(best_fire); + /* Head insertion */ + next = parent->ire_dep_children; + if (next != NULL) { + ASSERT(next->ire_dep_sib_ptpn == &(parent->ire_dep_children)); + child->ire_dep_sib_next = next; + next->ire_dep_sib_ptpn = &(child->ire_dep_sib_next); } - IRB_REFRELE(firb); + parent->ire_dep_children = child; + child->ire_dep_sib_ptpn = &(parent->ire_dep_children); - /* Release the first IRE_CACHE we initially looked up, if any. */ - if (first_cire != NULL) - ire_refrele(first_cire); + mutex_enter(&child->ire_lock); + child->ire_dep_parent = parent; + mutex_exit(&child->ire_lock); - /* Found a resolvable route. */ - if (best_fire != NULL) { - ASSERT(best_cire != NULL); - - if (*fire_arg != NULL) - ire_refrele(*fire_arg); - if (*ire_arg != NULL) - ire_refrele(*ire_arg); +#ifdef DEBUG + ire_dep_verify(child); + ire_dep_verify(parent); +#endif +} - /* - * Update the passed-in arguments with the - * resolvable multirt route we found. 
- */ - *fire_arg = best_fire; - *ire_arg = best_cire; - ip2dbg(("ire_multirt_lookup: returning B_TRUE, " - "*fire_arg %p, *ire_arg %p\n", - (void *)best_fire, (void *)best_cire)); +/* + * Given count worth of ires and generations, build ire_dep_* relationships + * from ires[0] to ires[count-1]. Record generations[i+1] in + * ire_dep_parent_generation for ires[i]. + * We graft onto an existing parent chain by making sure that we don't + * touch ire_dep_parent for ires[count-1]. + * + * We check for any condemned ire_generation count and return B_FALSE in + * that case so that the caller can tear it apart. + * + * Note that generations[0] is not used. Caller handles that. + */ +boolean_t +ire_dep_build(ire_t *ires[], uint_t generations[], uint_t count) +{ + ire_t *ire = ires[0]; + ip_stack_t *ipst; + uint_t i; + ASSERT(count > 0); + if (count == 1) { + /* No work to do */ return (B_TRUE); } + ipst = ire->ire_ipst; + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + /* + * Do not remove the linkage for any existing parent chain i.e., + * ires[count-1] is left alone. + */ + for (i = 0; i < count-1; i++) { + /* Remove existing parent if we need to change it */ + if (ires[i]->ire_dep_parent != NULL && + ires[i]->ire_dep_parent != ires[i+1]) + ire_dep_remove(ires[i]); + } - ASSERT(best_cire == NULL); + for (i = 0; i < count - 1; i++) { + ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || + ires[i]->ire_ipversion == IPV6_VERSION); + /* Does it need to change? */ + if (ires[i]->ire_dep_parent != ires[i+1]) + ire_dep_parent_insert(ires[i], ires[i+1]); - ip2dbg(("ire_multirt_lookup: returning B_FALSE, *fire_arg %p, " - "*ire_arg %p\n", - (void *)*fire_arg, (void *)*ire_arg)); + mutex_enter(&ires[i+1]->ire_lock); + if (IRE_IS_CONDEMNED(ires[i+1])) { + mutex_exit(&ires[i+1]->ire_lock); + rw_exit(&ipst->ips_ire_dep_lock); + return (B_FALSE); + } + mutex_exit(&ires[i+1]->ire_lock); - /* No resolvable route. 
*/ - return (B_FALSE); + mutex_enter(&ires[i]->ire_lock); + ires[i]->ire_dep_parent_generation = generations[i+1]; + mutex_exit(&ires[i]->ire_lock); + } + rw_exit(&ipst->ips_ire_dep_lock); + return (B_TRUE); } /* - * IRE iterator for inbound and loopback broadcast processing. - * Given an IRE_BROADCAST ire, walk the ires with the same destination - * address, but skip over the passed-in ire. Returns the next ire without - * a hold - assumes that the caller holds a reference on the IRE bucket. + * Given count worth of ires, unbuild ire_dep_* relationships + * from ires[0] to ires[count-1]. */ -ire_t * -ire_get_next_bcast_ire(ire_t *curr, ire_t *ire) +void +ire_dep_unbuild(ire_t *ires[], uint_t count) { - ill_t *ill; + ip_stack_t *ipst; + uint_t i; - if (curr == NULL) { - for (curr = ire->ire_bucket->irb_ire; curr != NULL; - curr = curr->ire_next) { - if (curr->ire_addr == ire->ire_addr) - break; - } - } else { - curr = curr->ire_next; + if (count == 0) { + /* No work to do */ + return; } - ill = ire_to_ill(ire); - for (; curr != NULL; curr = curr->ire_next) { - if (curr->ire_addr != ire->ire_addr) { - /* - * All the IREs to a given destination are contiguous; - * break out once the address doesn't match. - */ - break; - } - if (curr == ire) { - /* skip over the passed-in ire */ - continue; - } - if ((curr->ire_stq != NULL && ire->ire_stq == NULL) || - (curr->ire_stq == NULL && ire->ire_stq != NULL)) { + ipst = ires[0]->ire_ipst; + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + for (i = 0; i < count; i++) { + ASSERT(ires[i]->ire_ipversion == IPV4_VERSION || + ires[i]->ire_ipversion == IPV6_VERSION); + if (ires[i]->ire_dep_parent != NULL) + ire_dep_remove(ires[i]); + mutex_enter(&ires[i]->ire_lock); + ires[i]->ire_dep_parent_generation = IRE_GENERATION_VERIFY; + mutex_exit(&ires[i]->ire_lock); + } + rw_exit(&ipst->ips_ire_dep_lock); +} + +/* + * Both the forwarding and the outbound code paths can trip on + * a condemned NCE, in which case we call this function. 
+ * We have two different behaviors: if the NCE was UNREACHABLE + * it is an indication that something failed. In that case + * we see if we should look for a different IRE (for example, + * delete any matching redirect IRE, or try a different + * IRE_DEFAULT (ECMP)). We mark the ire as bad so a hopefully + * different IRE will be picked next time we send/forward. + * + * If we are called by the output path then fail_if_better is set + * and we return NULL if there could be a better IRE. This is because the + * output path retries the IRE lookup. (The input/forward path can not retry.) + * + * If the NCE was not unreachable then we pick/allocate a + * new (most likely ND_INITIAL) NCE and proceed with it. + * + * ipha/ip6h are needed for multicast packets; ipha needs to be + * set for IPv4 and ip6h needs to be set for IPv6 packets. + */ +nce_t * +ire_handle_condemned_nce(nce_t *nce, ire_t *ire, ipha_t *ipha, ip6_t *ip6h, + boolean_t fail_if_better) +{ + if (nce->nce_common->ncec_state == ND_UNREACHABLE) { + if (ire_no_good(ire) && fail_if_better) { /* - * If the passed-in ire is loopback, skip over - * non-loopback ires and vice versa. + * Did some changes, or ECMP likely to exist. + * Make ip_output look for a different IRE */ - continue; + return (NULL); } - if (ire_to_ill(curr) != ill) { - /* skip over IREs going through a different interface */ - continue; + } + if (ire_revalidate_nce(ire) == ENETUNREACH) { + /* The ire_dep_parent chain went bad, or no memory? 
*/ + (void) ire_no_good(ire); + return (NULL); + } + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(ipha != NULL); + nce = ire_to_nce(ire, ipha->ipha_dst, NULL); + } else { + ASSERT(ip6h != NULL); + nce = ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst); + } + + if (nce == NULL) + return (NULL); + if (nce->nce_is_condemned) { + nce_refrele(nce); + return (NULL); + } + return (nce); +} + +/* + * The caller has found that the ire is bad, either due to a reference to an NCE + * in ND_UNREACHABLE state, or a MULTIRT route whose gateway can't be resolved. + * We update things so a subsequent attempt to send to the destination + * is likely to find different IRE, or that a new NCE would be created. + * + * Returns B_TRUE if it is likely that a subsequent ire_ftable_lookup would + * find a different route (either due to having deleted a redirect, or there + * being ECMP routes.) + * + * If we have a redirect (RTF_DYNAMIC) we delete it. + * Otherwise we increment ire_badcnt and increment the generation number so + * that a cached ixa_ire will redo the route selection. ire_badcnt is taken + * into account in the route selection when we have multiple choices (multiple + * default routes or ECMP in general). + * Any time ip_select_route find an ire with a condemned ire_nce_cache + * (e.g., if no equal cost route to the bad one) ip_select_route will make + * sure the NCE is revalidated to avoid getting stuck on a + * NCE_F_CONDMNED ncec that caused ire_no_good to be called. 
+ */ +boolean_t +ire_no_good(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + ire_t *ire2; + nce_t *nce; + + if (ire->ire_flags & RTF_DYNAMIC) { + ire_delete(ire); + return (B_TRUE); + } + if (ire->ire_flags & RTF_INDIRECT) { + /* Check if next IRE is a redirect */ + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (ire->ire_dep_parent != NULL && + (ire->ire_dep_parent->ire_flags & RTF_DYNAMIC)) { + ire2 = ire->ire_dep_parent; + ire_refhold(ire2); + } else { + ire2 = NULL; } - if (curr->ire_marks & IRE_MARK_CONDEMNED) { - /* skip over deleted IREs */ - continue; + rw_exit(&ipst->ips_ire_dep_lock); + if (ire2 != NULL) { + ire_delete(ire2); + ire_refrele(ire2); + return (B_TRUE); } - return (curr); } - return (NULL); + /* + * No redirect involved. Increment badcnt so that if we have ECMP + * routes we are likely to pick a different one for the next packet. + * + * If the NCE is unreachable and condemned we should drop the reference + * to it so that a new NCE can be created. + * + * Finally we increment the generation number so that any ixa_ire + * cache will be revalidated. + */ + mutex_enter(&ire->ire_lock); + ire->ire_badcnt++; + ire->ire_last_badcnt = TICK_TO_SEC(lbolt64); + nce = ire->ire_nce_cache; + if (nce != NULL && nce->nce_is_condemned && + nce->nce_common->ncec_state == ND_UNREACHABLE) + ire->ire_nce_cache = NULL; + else + nce = NULL; + mutex_exit(&ire->ire_lock); + if (nce != NULL) + nce_refrele(nce); + + ire_increment_generation(ire); + ire_dep_incr_generation(ire); + + return (ire->ire_bucket->irb_ire_cnt > 1); } -#ifdef DEBUG -void -ire_trace_ref(ire_t *ire) +/* + * Walk ire_dep_parent chain and validate that ire_dep_parent->ire_generation == + * ire_dep_parent_generation. + * If they all match we just return ire_generation from the topmost IRE. + * Otherwise we propagate the mismatch by setting all ire_dep_parent_generation + * above the mismatch to IRE_GENERATION_VERIFY and also returning + * IRE_GENERATION_VERIFY. 
+ */ +uint_t +ire_dep_validate_generations(ire_t *ire) { - mutex_enter(&ire->ire_lock); - if (ire->ire_trace_disable) { + ip_stack_t *ipst = ire->ire_ipst; + uint_t generation; + ire_t *ire1; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + generation = ire->ire_generation; /* Assuming things match */ + for (ire1 = ire; ire1 != NULL; ire1 = ire1->ire_dep_parent) { + ASSERT(ire1->ire_ipversion == IPV4_VERSION || + ire1->ire_ipversion == IPV6_VERSION); + if (ire1->ire_dep_parent == NULL) + break; + if (ire1->ire_dep_parent_generation != + ire1->ire_dep_parent->ire_generation) + goto mismatch; + } + rw_exit(&ipst->ips_ire_dep_lock); + return (generation); + +mismatch: + generation = IRE_GENERATION_VERIFY; + /* Fill from top down to the mismatch with _VERIFY */ + while (ire != ire1) { + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + mutex_enter(&ire->ire_lock); + ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; mutex_exit(&ire->ire_lock); - return; + ire = ire->ire_dep_parent; } + rw_exit(&ipst->ips_ire_dep_lock); + return (generation); +} - if (th_trace_ref(ire, ire->ire_ipst)) { - mutex_exit(&ire->ire_lock); - } else { - ire->ire_trace_disable = B_TRUE; +/* + * Used when we need to return an ire with ire_dep_parent, but we + * know the chain is invalid for instance we didn't create an IRE_IF_CLONE + * Using IRE_GENERATION_VERIFY means that next time we'll redo the + * recursive lookup. 
+ */ +void +ire_dep_invalidate_generations(ire_t *ire) +{ + ip_stack_t *ipst = ire->ire_ipst; + + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + while (ire != NULL) { + ASSERT(ire->ire_ipversion == IPV4_VERSION || + ire->ire_ipversion == IPV6_VERSION); + mutex_enter(&ire->ire_lock); + ire->ire_dep_parent_generation = IRE_GENERATION_VERIFY; mutex_exit(&ire->ire_lock); - ire_trace_cleanup(ire); + ire = ire->ire_dep_parent; } + rw_exit(&ipst->ips_ire_dep_lock); } -void -ire_untrace_ref(ire_t *ire) +/* Set _VERIFY ire_dep_parent_generation for all children recursively */ +static void +ire_dep_invalidate_children(ire_t *child) { - mutex_enter(&ire->ire_lock); - if (!ire->ire_trace_disable) - th_trace_unref(ire); - mutex_exit(&ire->ire_lock); + ip_stack_t *ipst = child->ire_ipst; + + ASSERT(RW_WRITE_HELD(&ipst->ips_ire_dep_lock)); + /* Depth first */ + if (child->ire_dep_children != NULL) + ire_dep_invalidate_children(child->ire_dep_children); + + while (child != NULL) { + mutex_enter(&child->ire_lock); + child->ire_dep_parent_generation = IRE_GENERATION_VERIFY; + mutex_exit(&child->ire_lock); + child = child->ire_dep_sib_next; + } } static void -ire_trace_cleanup(const ire_t *ire) +ire_dep_increment_children(ire_t *child) { - th_trace_cleanup(ire, ire->ire_trace_disable); + ip_stack_t *ipst = child->ire_ipst; + + ASSERT(RW_READ_HELD(&ipst->ips_ire_dep_lock)); + /* Depth first */ + if (child->ire_dep_children != NULL) + ire_dep_increment_children(child->ire_dep_children); + + while (child != NULL) { + if (!IRE_IS_CONDEMNED(child)) + ire_increment_generation(child); + child = child->ire_dep_sib_next; + } } -#endif /* DEBUG */ /* - * Generate a message chain with an arp request to resolve the in_ire. - * It is assumed that in_ire itself is currently in the ire cache table, - * so we create a fake_ire filled with enough information about ire_addr etc. - * to retrieve in_ire when the DL_UNITDATA response from the resolver - * comes back. 
The fake_ire itself is created by calling esballoc with - * the fr_rtnp (free routine) set to ire_freemblk. This routine will be - * invoked when the mblk containing fake_ire is freed. + * Walk all the children of this ire recursively and increment their + * generation number. */ void -ire_arpresolve(ire_t *in_ire) +ire_dep_incr_generation(ire_t *parent) { - areq_t *areq; - ipaddr_t *addrp; - mblk_t *ire_mp, *areq_mp; - ire_t *ire, *buf; - size_t bufsize; - frtn_t *frtnp; - ill_t *dst_ill; - ip_stack_t *ipst; + ip_stack_t *ipst = parent->ire_ipst; - ASSERT(in_ire->ire_nce != NULL); + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (parent->ire_dep_children != NULL) + ire_dep_increment_children(parent->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); +} - dst_ill = ire_to_ill(in_ire); - ipst = dst_ill->ill_ipst; +/* + * Get a new ire_nce_cache for this IRE as well as its nexthop. + * Returns zero if it succeeds. Can fail due to lack of memory or when + * the route has become unreachable. Returns ENOMEM and ENETUNREACH in those + * cases. + * + * In the in.mpathd case, the ire will have ire_testhidden + * set; so we should create the ncec for the underlying ill. + * + * Note that the error returned by ire_revalidate_nce() is ignored by most + * callers except ire_handle_condemned_nce(), which handles the ENETUNREACH + * error to mark potentially bad ire's. For all the other callers, an + * error return could indicate a transient condition like ENOMEM, or could + * be the result of an interface that is going down/unplumbing. In the former + * case (transient error), we would leave the old stale ire/ire_nce_cache + * in place, and possibly use incorrect link-layer information to send packets + * but would eventually recover. In the latter case (ill down/replumb), + * ire_revalidate_nce() might return a condemned nce back, but we would then + * recover in the packet output path. 
+ */ +int +ire_revalidate_nce(ire_t *ire) +{ + nce_t *nce, *old_nce; + ire_t *nexthop; /* - * Construct message chain for the resolver - * of the form: - * ARP_REQ_MBLK-->IRE_MBLK - * - * NOTE : If the response does not - * come back, ARP frees the packet. For this reason, - * we can't REFHOLD the bucket of save_ire to prevent - * deletions. We may not be able to REFRELE the bucket - * if the response never comes back. Thus, before - * adding the ire, ire_add_v4 will make sure that the - * interface route does not get deleted. This is the - * only case unlike ip_newroute_v6, ip_newroute_ipif_v6 - * where we can always prevent deletions because of - * the synchronous nature of adding IRES i.e - * ire_add_then_send is called after creating the IRE. + * For multicast we conceptually have an NCE but we don't store it + * in ire_nce_cache; when ire_to_nce is called we allocate the nce. */ + if (ire->ire_type & IRE_MULTICAST) + return (0); - /* - * We use esballoc to allocate the second part (IRE_MBLK) - * of the message chain depicted above. This mblk will be freed - * by arp when there is a timeout, and otherwise passed to IP - * and IP will free it after processing the ARP response. 
- */ + /* ire_testhidden should only be set on under-interfaces */ + ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); - bufsize = sizeof (ire_t) + sizeof (frtn_t); - buf = kmem_alloc(bufsize, KM_NOSLEEP); - if (buf == NULL) { - ip1dbg(("ire_arpresolve: alloc buffer failed\n")); - return; - } - frtnp = (frtn_t *)(buf + 1); - frtnp->free_arg = (caddr_t)buf; - frtnp->free_func = ire_freemblk; - - ire_mp = esballoc((unsigned char *)buf, bufsize, BPRI_MED, frtnp); - if (ire_mp == NULL) { - ip1dbg(("ire_arpresolve: esballoc failed\n")); - kmem_free(buf, bufsize); - return; + nexthop = ire_nexthop(ire); + if (nexthop == NULL) { + /* The route is potentially bad */ + (void) ire_no_good(ire); + return (ENETUNREACH); } + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK)) { + ASSERT(ire->ire_ill != NULL); - areq_mp = copyb(dst_ill->ill_resolver_mp); - if (areq_mp == NULL) { - freemsg(ire_mp); - return; + if (ire->ire_ipversion == IPV4_VERSION) + nce = nce_lookup_v4(ire->ire_ill, &ire->ire_addr); + else + nce = nce_lookup_v6(ire->ire_ill, &ire->ire_addr_v6); + } else { + ASSERT(nexthop->ire_type & IRE_ONLINK); + if (ire->ire_ipversion == IPV4_VERSION) { + nce = arp_nce_init(nexthop->ire_ill, nexthop->ire_addr, + nexthop->ire_type); + } else { + nce = ndp_nce_init(nexthop->ire_ill, + &nexthop->ire_addr_v6, nexthop->ire_type); + } + } + if (nce == NULL) { + /* + * Leave the old stale one in place to avoid a NULL + * ire_nce_cache. + */ + ire_refrele(nexthop); + return (ENOMEM); } - ire_mp->b_datap->db_type = IRE_ARPRESOLVE_TYPE; - ire = (ire_t *)buf; - /* - * keep enough info in the fake ire so that we can pull up - * the incomplete ire (in_ire) after result comes back from - * arp and make it complete. 
- */ - *ire = ire_null; - ire->ire_u = in_ire->ire_u; - ire->ire_ipif_seqid = in_ire->ire_ipif_seqid; - ire->ire_ipif_ifindex = in_ire->ire_ipif_ifindex; - ire->ire_ipif = in_ire->ire_ipif; - ire->ire_stq = dst_ill->ill_wq; - ire->ire_stq_ifindex = dst_ill->ill_phyint->phyint_ifindex; - ire->ire_zoneid = in_ire->ire_zoneid; - ire->ire_stackid = ipst->ips_netstack->netstack_stackid; - ire->ire_ipst = ipst; - - /* - * ire_freemblk will be called when ire_mp is freed, both for - * successful and failed arp resolution. IRE_MARK_UNCACHED will be set - * when the arp resolution failed. - */ - ire->ire_marks |= IRE_MARK_UNCACHED; - ire->ire_mp = ire_mp; - ire_mp->b_wptr = (uchar_t *)&ire[1]; - ire_mp->b_cont = NULL; - linkb(areq_mp, ire_mp); - - /* - * Fill in the source and dest addrs for the resolver. - * NOTE: this depends on memory layouts imposed by - * ill_init(). - */ - areq = (areq_t *)areq_mp->b_rptr; - addrp = (ipaddr_t *)((char *)areq + areq->areq_sender_addr_offset); - *addrp = ire->ire_src_addr; - - addrp = (ipaddr_t *)((char *)areq + areq->areq_target_addr_offset); - if (ire->ire_gateway_addr != INADDR_ANY) { - *addrp = ire->ire_gateway_addr; - } else { - *addrp = ire->ire_addr; + if (nexthop != ire) { + /* Update the nexthop ire */ + mutex_enter(&nexthop->ire_lock); + old_nce = nexthop->ire_nce_cache; + if (!IRE_IS_CONDEMNED(nexthop)) { + nce_refhold(nce); + nexthop->ire_nce_cache = nce; + } else { + nexthop->ire_nce_cache = NULL; + } + mutex_exit(&nexthop->ire_lock); + if (old_nce != NULL) + nce_refrele(old_nce); } + ire_refrele(nexthop); - /* Up to the resolver. 
*/ - if (canputnext(dst_ill->ill_rq)) { - putnext(dst_ill->ill_rq, areq_mp); + mutex_enter(&ire->ire_lock); + old_nce = ire->ire_nce_cache; + if (!IRE_IS_CONDEMNED(ire)) { + nce_refhold(nce); + ire->ire_nce_cache = nce; } else { - freemsg(areq_mp); + ire->ire_nce_cache = NULL; } + mutex_exit(&ire->ire_lock); + if (old_nce != NULL) + nce_refrele(old_nce); + + nce_refrele(nce); + return (0); } /* - * Esballoc free function for AR_ENTRY_QUERY request to clean up any - * unresolved ire_t and/or nce_t structures when ARP resolution fails. - * - * This function can be called by ARP via free routine for ire_mp or - * by IPv4(both host and forwarding path) via ire_delete - * in case ARP resolution fails. - * NOTE: Since IP is MT, ARP can call into IP but not vice versa - * (for IP to talk to ARP, it still has to send AR* messages). - * - * Note that the ARP/IP merge should replace the functioanlity by providing - * direct function calls to clean up unresolved entries in ire/nce lists. + * Get a held nce for a given ire. + * In the common case this is just from ire_nce_cache. + * For IRE_MULTICAST this needs to do an explicit lookup since we do not + * have an IRE_MULTICAST per address. + * Note that this explicitly returns CONDEMNED NCEs. The caller needs those + * so they can check whether the NCE went unreachable (as opposed to was + * condemned for some other reason). */ -void -ire_freemblk(ire_t *ire_mp) +nce_t * +ire_to_nce(ire_t *ire, ipaddr_t v4nexthop, const in6_addr_t *v6nexthop) { - nce_t *nce = NULL; - ill_t *ill; - ip_stack_t *ipst; - netstack_t *ns = NULL; + nce_t *nce; - ASSERT(ire_mp != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + return (NULL); - if ((ire_mp->ire_addr == NULL) && (ire_mp->ire_gateway_addr == NULL)) { - ip1dbg(("ire_freemblk(0x%p) ire_addr is NULL\n", - (void *)ire_mp)); - goto cleanup; - } - if ((ire_mp->ire_marks & IRE_MARK_UNCACHED) == 0) { - goto cleanup; /* everything succeeded. 
just free and return */ + /* ire_testhidden should only be set on under-interfaces */ + ASSERT(!ire->ire_testhidden || !IS_IPMP(ire->ire_ill)); + + mutex_enter(&ire->ire_lock); + nce = ire->ire_nce_cache; + if (nce != NULL) { + nce_refhold(nce); + mutex_exit(&ire->ire_lock); + return (nce); } + mutex_exit(&ire->ire_lock); - /* - * the arp information corresponding to this ire_mp was not - * transferred to an ire_cache entry. Need - * to clean up incomplete ire's and nce, if necessary. - */ - ASSERT(ire_mp->ire_stq != NULL); - ASSERT(ire_mp->ire_stq_ifindex != 0); - ASSERT(ire_mp->ire_ipst != NULL); + if (ire->ire_type & IRE_MULTICAST) { + ASSERT(ire->ire_ill != NULL); - ns = netstack_find_by_stackid(ire_mp->ire_stackid); - ipst = (ns ? ns->netstack_ip : NULL); - if (ipst == NULL || ipst != ire_mp->ire_ipst) /* Disapeared on us */ - goto cleanup; + if (ire->ire_ipversion == IPV4_VERSION) { + ASSERT(v6nexthop == NULL); - /* - * Get any nce's corresponding to this ire_mp. We first have to - * make sure that the ill is still around. - */ - ill = ill_lookup_on_ifindex(ire_mp->ire_stq_ifindex, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (ill == NULL || (ire_mp->ire_stq != ill->ill_wq) || - (ill->ill_state_flags & ILL_CONDEMNED)) { - /* - * ill went away. no nce to clean up. - * Note that the ill_state_flags could be set to - * ILL_CONDEMNED after this point, but if we know - * that it is CONDEMNED now, we just bail out quickly. - */ - if (ill != NULL) - ill_refrele(ill); - goto cleanup; + nce = arp_nce_init(ire->ire_ill, v4nexthop, + ire->ire_type); + } else { + ASSERT(v6nexthop != NULL); + ASSERT(v4nexthop == 0); + nce = ndp_nce_init(ire->ire_ill, v6nexthop, + ire->ire_type); + } + return (nce); } - nce = ndp_lookup_v4(ill, - ((ire_mp->ire_gateway_addr != INADDR_ANY) ? - &ire_mp->ire_gateway_addr : &ire_mp->ire_addr), - B_FALSE); - ill_refrele(ill); + return (NULL); +} - if ((nce != NULL) && (nce->nce_state != ND_REACHABLE)) { - /* - * some incomplete nce was found. 
- */ - DTRACE_PROBE2(ire__freemblk__arp__resolv__fail, - nce_t *, nce, ire_t *, ire_mp); - /* - * Send the icmp_unreachable messages for the queued mblks in - * ire->ire_nce->nce_qd_mp, since ARP resolution failed - * for this ire - */ - arp_resolv_failed(nce); - /* - * Delete the nce and clean up all ire's pointing at this nce - * in the cachetable - */ - ndp_delete(nce); - } - if (nce != NULL) - NCE_REFRELE(nce); /* release the ref taken by ndp_lookup_v4 */ +nce_t * +ire_to_nce_pkt(ire_t *ire, mblk_t *mp) +{ + ipha_t *ipha; + ip6_t *ip6h; -cleanup: - if (ns != NULL) - netstack_rele(ns); - /* - * Get rid of the ire buffer - * We call kmem_free here(instead of ire_delete()), since - * this is the freeb's callback. - */ - kmem_free(ire_mp, sizeof (ire_t) + sizeof (frtn_t)); + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha = (ipha_t *)mp->b_rptr; + return (ire_to_nce(ire, ipha->ipha_dst, NULL)); + } else { + ip6h = (ip6_t *)mp->b_rptr; + return (ire_to_nce(ire, INADDR_ANY, &ip6h->ip6_dst)); + } } /* - * find, or create if needed, a neighbor cache entry nce_t for IRE_CACHE and - * non-loopback IRE_BROADCAST ire's. - * - * If a neighbor-cache entry has to be created (i.e., one does not already - * exist in the nce list) the nce_res_mp and nce_state of the neighbor cache - * entry are initialized in ndp_add_v4(). These values are picked from - * the src_nce, if one is passed in. Otherwise (if src_nce == NULL) the - * ire->ire_type and the outgoing interface (ire_to_ill(ire)) values - * determine the {nce_state, nce_res_mp} of the nce_t created. All - * IRE_BROADCAST entries have nce_state = ND_REACHABLE, and the nce_res_mp - * is set to the ill_bcast_mp of the outgoing inerface. For unicast ire - * entries, - * - if the outgoing interface is of type IRE_IF_RESOLVER, a newly created - * nce_t will have a null nce_res_mp, and will be in the ND_INITIAL state. 
- * - if the outgoing interface is a IRE_IF_NORESOLVER interface, no link - * layer resolution is necessary, so that the nce_t will be in the - * ND_REACHABLE state and the nce_res_mp will have a copy of the - * ill_resolver_mp of the outgoing interface. - * - * The link layer information needed for broadcast addresses, and for - * packets sent on IRE_IF_NORESOLVER interfaces is a constant mapping that - * never needs re-verification for the lifetime of the nce_t. These are - * therefore marked NCE_F_PERMANENT, and never allowed to expire via - * NCE_EXPIRED. - * - * IRE_CACHE ire's contain the information for the nexthop (ire_gateway_addr) - * in the case of indirect routes, and for the dst itself (ire_addr) in the - * case of direct routes, with the nce_res_mp containing a template - * DL_UNITDATA request. - * - * The actual association of the ire_nce to the nce created here is - * typically done in ire_add_v4 for IRE_CACHE entries. Exceptions - * to this rule are SO_DONTROUTE ire's (IRE_MARK_NO_ADD), for which - * the ire_nce assignment is done in ire_add_then_send. + * Given an IRE_INTERFACE (that matches more than one address) create + * and return an IRE_IF_CLONE for the specific address. + * Return the generation number. + * Returns NULL is no memory for the IRE. + * Handles both IPv4 and IPv6. 
*/ -int -ire_nce_init(ire_t *ire, nce_t *src_nce) +ire_t * +ire_create_if_clone(ire_t *ire_if, const in6_addr_t *addr, uint_t *generationp) { - in_addr_t addr4; - int err; - nce_t *nce = NULL; - ill_t *ire_ill; - uint16_t nce_flags = 0; - ip_stack_t *ipst; - - if (ire->ire_stq == NULL) - return (0); /* no need to create nce for local/loopback */ - - switch (ire->ire_type) { - case IRE_CACHE: - if (ire->ire_gateway_addr != INADDR_ANY) - addr4 = ire->ire_gateway_addr; /* 'G' route */ - else - addr4 = ire->ire_addr; /* direct route */ - break; - case IRE_BROADCAST: - addr4 = ire->ire_addr; - nce_flags |= (NCE_F_PERMANENT|NCE_F_BCAST); - break; - default: - return (0); + ire_t *ire; + ire_t *nire; + + if (ire_if->ire_ipversion == IPV4_VERSION) { + ipaddr_t v4addr; + ipaddr_t mask = IP_HOST_MASK; + + ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); + IN6_V4MAPPED_TO_IPADDR(addr, v4addr); + + ire = ire_create( + (uchar_t *)&v4addr, /* dest address */ + (uchar_t *)&mask, /* mask */ + (uchar_t *)&ire_if->ire_gateway_addr, + IRE_IF_CLONE, /* IRE type */ + ire_if->ire_ill, + ire_if->ire_zoneid, + ire_if->ire_flags | RTF_HOST, + NULL, /* No security attr for IRE_IF_ALL */ + ire_if->ire_ipst); + } else { + ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); + ire = ire_create_v6( + addr, /* dest address */ + &ipv6_all_ones, /* mask */ + &ire_if->ire_gateway_addr_v6, /* gateway addr */ + IRE_IF_CLONE, /* IRE type */ + ire_if->ire_ill, + ire_if->ire_zoneid, + ire_if->ire_flags | RTF_HOST, + NULL, /* No security attr for IRE_IF_ALL */ + ire_if->ire_ipst); } + if (ire == NULL) + return (NULL); - /* - * ire_ipif is picked based on RTF_SETSRC, usesrc etc. - * rules in ire_forward_src_ipif. We want the dlureq_mp - * for the outgoing interface, which we get from the ire_stq. - */ - ire_ill = ire_to_ill(ire); - ipst = ire_ill->ill_ipst; - - /* - * IRE_IF_NORESOLVER entries never need re-verification and - * do not expire, so we mark them as NCE_F_PERMANENT. 
- */ - if (ire_ill->ill_net_type == IRE_IF_NORESOLVER) - nce_flags |= NCE_F_PERMANENT; - -retry_nce: - err = ndp_lookup_then_add_v4(ire_ill, &addr4, nce_flags, - &nce, src_nce); + /* Take the metrics, in particular the mtu, from the IRE_IF */ + ire->ire_metrics = ire_if->ire_metrics; - if (err == EEXIST && NCE_EXPIRED(nce, ipst)) { - /* - * We looked up an expired nce. - * Go back and try to create one again. - */ - ndp_delete(nce); - NCE_REFRELE(nce); - nce = NULL; - goto retry_nce; - } + nire = ire_add(ire); + if (nire == NULL) /* Some failure */ + return (NULL); - ip1dbg(("ire 0x%p addr 0x%lx type 0x%x; found nce 0x%p err %d\n", - (void *)ire, (ulong_t)addr4, ire->ire_type, (void *)nce, err)); + if (generationp != NULL) + *generationp = nire->ire_generation; - switch (err) { - case 0: - case EEXIST: - /* - * return a pointer to a newly created or existing nce_t; - * note that the ire-nce mapping is many-one, i.e., - * multiple ire's could point to the same nce_t. - */ - break; - default: - DTRACE_PROBE2(nce__init__fail, ill_t *, ire_ill, int, err); - return (EINVAL); - } /* - * IRE_BROADCAST ire's must be linked to NCE_F_BCAST nce's and - * vice-versa (IRE_CACHE <-> unicast nce entries). We may have found an - * existing unicast (or bcast) nce when trying to add a BROADCAST (or - * unicast) ire, e.g., when address/netmask modifications were in - * progress, and the ipif_ndp_down() call to quiesce existing state - * during the addr/mask modification may have skipped the ndp_delete() - * because the ipif being affected was not the last one on the ill. We - * recover from the missed ndp_delete() now, by deleting the old nce and - * adding a new one with the correct NCE_F_BCAST state. + * Make sure races don't add a duplicate by + * catching the case when an identical was returned. 
*/ - if (ire->ire_type == IRE_BROADCAST) { - if ((nce->nce_flags & NCE_F_BCAST) == 0) { - /* IRE_BROADCAST needs NCE_F_BCAST */ - ndp_delete(nce); - NCE_REFRELE(nce); - goto retry_nce; - } - /* - * Two bcast ires are created for each interface; - * 1. loopback copy (which does not have an - * ire_stq, and therefore has no ire_nce), and, - * 2. the non-loopback copy, which has the nce_res_mp - * initialized to a copy of the ill_bcast_mp, and - * is marked as ND_REACHABLE at this point. - * This nce does not undergo any further state changes, - * and exists as long as the interface is plumbed. - * Note: the assignment of ire_nce here is a historical - * artifact of old code that used to inline ire_add(). - */ - ire->ire_nce = nce; - /* - * We are associating this nce to the ire, - * so change the nce ref taken in - * ndp_lookup_then_add_v4() from - * NCE_REFHOLD to NCE_REFHOLD_NOTR - */ - NCE_REFHOLD_TO_REFHOLD_NOTR(ire->ire_nce); - } else { - if ((nce->nce_flags & NCE_F_BCAST) != 0) { - /* IRE_CACHE needs unicast nce */ - ndp_delete(nce); - NCE_REFRELE(nce); - goto retry_nce; - } - /* - * We are not using this nce_t just yet so release - * the ref taken in ndp_lookup_then_add_v4() - */ - NCE_REFRELE(nce); + if (nire != ire) { + ASSERT(nire->ire_identical_ref > 1); + ire_delete(nire); } - return (0); + return (nire); } /* - * This is the implementation of the IPv4 IRE cache lookup procedure. - * Separating the interface from the implementation allows additional - * flexibility when specifying search criteria. + * The argument is an IRE_INTERFACE. Delete all of IRE_IF_CLONE in the + * ire_dep_children (just walk the ire_dep_sib_next since they are all + * immediate children.) + * Since we hold a lock while we remove them we need to defer the actual + * calls to ire_delete() until we have dropped the lock. This makes things + * less efficient since we restart at the top after dropping the lock. But + * we only run when an IRE_INTERFACE is deleted which is infrquent. 
+ * + * Note that ire_dep_children can be any mixture of offlink routes and + * IRE_IF_CLONE entries. */ -static ire_t * -ip4_ctable_lookup_impl(ire_ctable_args_t *margs) +void +ire_dep_delete_if_clone(ire_t *parent) { - irb_t *irb_ptr; - ire_t *ire; - ip_stack_t *ipst = margs->ict_ipst; + ip_stack_t *ipst = parent->ire_ipst; + ire_t *child, *next; - if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && - (margs->ict_ipif == NULL)) { - return (NULL); +restart: + rw_enter(&ipst->ips_ire_dep_lock, RW_READER); + if (parent->ire_dep_children == NULL) { + rw_exit(&ipst->ips_ire_dep_lock); + return; } - - irb_ptr = &ipst->ips_ip_cache_table[IRE_ADDR_HASH( - *((ipaddr_t *)margs->ict_addr), ipst->ips_ip_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - ASSERT(ire->ire_mask == IP_HOST_MASK); - if (ire_match_args(ire, *((ipaddr_t *)margs->ict_addr), - ire->ire_mask, *((ipaddr_t *)margs->ict_gateway), - margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, - margs->ict_tsl, margs->ict_flags, margs->ict_wq)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); + child = parent->ire_dep_children; + while (child != NULL) { + next = child->ire_dep_sib_next; + if ((child->ire_type & IRE_IF_CLONE) && + !IRE_IS_CONDEMNED(child)) { + ire_refhold(child); + rw_exit(&ipst->ips_ire_dep_lock); + ire_delete(child); + ASSERT(IRE_IS_CONDEMNED(child)); + ire_refrele(child); + goto restart; } + child = next; } - - rw_exit(&irb_ptr->irb_lock); - return (NULL); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * This function locates IRE_CACHE entries which were added by the - * ire_forward() path. We can fully specify the IRE we are looking for by - * providing the ipif (MATCH_IRE_IPIF) *and* the stq (MATCH_IRE_WQ). 
+ * ire_pref() is used in recursive route-resolution for a destination to + * determine the preference of an ire, where "preference" is determined + * based on the level of indirection to the destination of the ire. + * A higher preference indicates that fewer lookups are needed to complete + * recursive route lookup. Thus + * ire_pref(RTF_INDIRECT) < ire_pref(IRE_IF_RESOLVER) < ire_pref(IRE_PREF_CLONE) */ -ire_t * -ire_arpresolve_lookup(ipaddr_t addr, ipaddr_t gw, ipif_t *ipif, - zoneid_t zoneid, ip_stack_t *ipst, queue_t *wq) -{ - ire_ctable_args_t margs; - - margs.ict_addr = &addr; - margs.ict_gateway = &gw; - margs.ict_type = IRE_CACHE; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = NULL; - margs.ict_flags = MATCH_IRE_GW | MATCH_IRE_IPIF | MATCH_IRE_ZONEONLY | - MATCH_IRE_TYPE | MATCH_IRE_WQ; - margs.ict_ipst = ipst; - margs.ict_wq = wq; - - return (ip4_ctable_lookup_impl(&margs)); +int +ire_pref(ire_t *ire) +{ + if (ire->ire_flags & RTF_INDIRECT) + return (1); + if (ire->ire_type & IRE_OFFLINK) + return (2); + if (ire->ire_type & (IRE_IF_RESOLVER|IRE_IF_NORESOLVER)) + return (3); + if (ire->ire_type & IRE_IF_CLONE) + return (4); + if (ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) + return (5); + return (-1); /* unknown ire_type */ } diff --git a/usr/src/uts/common/inet/ip/ip_mroute.c b/usr/src/uts/common/inet/ip/ip_mroute.c index 5418c2d8d4..41f4f3f221 100644 --- a/usr/src/uts/common/inet/ip/ip_mroute.c +++ b/usr/src/uts/common/inet/ip/ip_mroute.c @@ -1,8 +1,4 @@ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ -/* * CDDL HEADER START * * The contents of this file are subject to the terms of the @@ -23,8 +19,8 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
*/ /* Copyright (c) 1990 Mentat Inc. */ @@ -65,6 +61,7 @@ #include <netinet/in.h> #include <net/if_dl.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/nd.h> @@ -79,6 +76,7 @@ #include <netinet/ip_mroute.h> #include <inet/ip_multi.h> #include <inet/ip_ire.h> +#include <inet/ip_ndp.h> #include <inet/ip_if.h> #include <inet/ipclassifier.h> @@ -98,7 +96,7 @@ * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates * that vif is being initalized. * Each structure is freed when the refcnt goes down to zero. If a delete comes - * in when the the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED + * in when the recfnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED * which prevents the struct from further use. When the refcnt goes to zero * the struct is freed and is marked VIF_MARK_NOTINUSE. * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill @@ -171,9 +169,9 @@ /* Function declarations */ static int add_mfc(struct mfcctl *, ip_stack_t *); -static int add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *); +static int add_vif(struct vifctl *, conn_t *, ip_stack_t *); static int del_mfc(struct mfcctl *, ip_stack_t *); -static int del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *); +static int del_vif(vifi_t *, ip_stack_t *); static void del_vifp(struct vif *); static void encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); static void expire_upcalls(void *); @@ -188,7 +186,7 @@ static int ip_mdq(mblk_t *, ipha_t *, ill_t *, ipaddr_t, struct mfc *); static int ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *); static void phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); -static int register_mforward(queue_t *, mblk_t *, ill_t *); +static int register_mforward(mblk_t *, ip_recv_attr_t *); static void register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t); static int set_assert(int *, ip_stack_t *); @@ -331,10 +329,9 @@ static ipha_t 
multicast_encap_iphdr = { * Handle MRT setsockopt commands to modify the multicast routing tables. */ int -ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, - int datalen, mblk_t *first_mp) +ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data, + int datalen) { - conn_t *connp = Q_TO_CONN(q); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; mutex_enter(&ipst->ips_ip_g_mrouter_mutex); @@ -376,11 +373,9 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, switch (cmd) { case MRT_INIT: return (ip_mrouter_init(connp, data, datalen, ipst)); - case MRT_DONE: return (ip_mrouter_done(first_mp, ipst)); - case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, - first_mp, ipst)); - case MRT_DEL_VIF: return (del_vif((vifi_t *)data, connp, first_mp, - ipst)); + case MRT_DONE: return (ip_mrouter_done(ipst)); + case MRT_ADD_VIF: return (add_vif((struct vifctl *)data, connp, ipst)); + case MRT_DEL_VIF: return (del_vif((vifi_t *)data, ipst)); case MRT_ADD_MFC: return (add_mfc((struct mfcctl *)data, ipst)); case MRT_DEL_MFC: return (del_mfc((struct mfcctl *)data, ipst)); case MRT_ASSERT: return (set_assert((int *)data, ipst)); @@ -392,9 +387,8 @@ ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data, * Handle MRT getsockopt commands */ int -ip_mrouter_get(int cmd, queue_t *q, uchar_t *data) +ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data) { - conn_t *connp = Q_TO_CONN(q); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; if (connp != ipst->ips_ip_g_mrouter) @@ -611,7 +605,7 @@ ip_mrouter_stack_init(ip_stack_t *ipst) * Didn't use global timeout_val (BSD version), instead check the mfctable. 
*/ int -ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) +ip_mrouter_done(ip_stack_t *ipst) { conn_t *mrouter; vifi_t vifi; @@ -665,47 +659,19 @@ ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst) /* Phyint only */ if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ipif_t *ipif = vifp->v_ipif; - ipsq_t *ipsq; - boolean_t suc; - ill_t *ill; + ilm_t *ilm = vifp->v_ilm; - ill = ipif->ipif_ill; - suc = B_FALSE; - if (mp == NULL) { - /* - * being called from ip_close, - * lets do it synchronously. - * Clear VIF_MARK_GOOD and - * set VIF_MARK_CONDEMNED. - */ - vifp->v_marks &= ~VIF_MARK_GOOD; - vifp->v_marks |= VIF_MARK_CONDEMNED; - mutex_exit(&(vifp)->v_lock); - suc = ipsq_enter(ill, B_FALSE, NEW_OP); - ipsq = ill->ill_phyint->phyint_ipsq; - } else { - ipsq = ipsq_try_enter(ipif, NULL, - mrouter->conn_wq, mp, - ip_restart_optmgmt, NEW_OP, B_TRUE); - if (ipsq == NULL) { - mutex_exit(&(vifp)->v_lock); - ipif_refrele(ipif); - return (EINPROGRESS); - } - /* - * Clear VIF_MARK_GOOD and - * set VIF_MARK_CONDEMNED. - */ - vifp->v_marks &= ~VIF_MARK_GOOD; - vifp->v_marks |= VIF_MARK_CONDEMNED; - mutex_exit(&(vifp)->v_lock); - suc = B_TRUE; - } + vifp->v_ilm = NULL; + vifp->v_marks &= ~VIF_MARK_GOOD; + vifp->v_marks |= VIF_MARK_CONDEMNED; - if (suc) { - (void) ip_delmulti(INADDR_ANY, ipif, - B_TRUE, B_TRUE); - ipsq_exit(ipsq); + mutex_exit(&(vifp)->v_lock); + if (ilm != NULL) { + ill_t *ill = ipif->ipif_ill; + + (void) ip_delmulti(ilm); + ASSERT(ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ill->ill_mrouter_cnt); } mutex_enter(&vifp->v_lock); } @@ -866,14 +832,15 @@ lock_good_vif(struct vif *vifp) * Add a vif to the vif table. 
*/ static int -add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) +add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst) { struct vif *vifp = ipst->ips_vifs + vifcp->vifc_vifi; ipif_t *ipif; - int error; + int error = 0; struct tbf *v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi; - ipsq_t *ipsq; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ilm_t *ilm; + ill_t *ill; ASSERT(connp != NULL); @@ -913,28 +880,12 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) mutex_exit(&vifp->v_lock); /* Find the interface with the local address */ ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL, - connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp, - ip_restart_optmgmt, &error, ipst); + IPCL_ZONEID(connp), ipst); if (ipif == NULL) { VIF_REFRELE(vifp); - if (error == EINPROGRESS) - return (error); return (EADDRNOTAVAIL); } - /* - * We have to be exclusive as we have to call ip_addmulti() - * This is the best position to try to be exclusive in case - * we have to wait. 
- */ - ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp, - ip_restart_optmgmt, NEW_OP, B_TRUE); - if ((ipsq) == NULL) { - VIF_REFRELE(vifp); - ipif_refrele(ipif); - return (EINPROGRESS); - } - if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "add_vif: src 0x%x enter", @@ -959,7 +910,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) "add_vif: source route tunnels not supported\n"); VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (EOPNOTSUPP); } vifp->v_rmt_addr = vifcp->vifc_rmt_addr; @@ -981,7 +931,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) mutex_exit(&ipst->ips_numvifs_mutex); VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (EADDRINUSE); } } @@ -995,22 +944,39 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) ipst->ips_reg_vif_num = ALL_VIFS; mutex_exit(&ipst->ips_numvifs_mutex); } - ipsq_exit(ipsq); return (EOPNOTSUPP); } /* Enable promiscuous reception of all IP mcasts from the if */ mutex_exit(&vifp->v_lock); - error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE, - MODE_IS_EXCLUDE, NULL); + + ill = ipif->ipif_ill; + if (IS_UNDER_IPMP(ill)) + ill = ipmp_ill_hold_ipmp_ill(ill); + + if (ill == NULL) { + ilm = NULL; + } else { + ilm = ip_addmulti(&ipv6_all_zeros, ill, + ipif->ipif_zoneid, &error); + if (ilm != NULL) + atomic_inc_32(&ill->ill_mrouter_cnt); + if (IS_UNDER_IPMP(ipif->ipif_ill)) { + ill_refrele(ill); + ill = ipif->ipif_ill; + } + } + mutex_enter(&vifp->v_lock); /* * since we released the lock lets make sure that * ip_mrouter_done() has not been called. 
*/ - if (error != 0 || is_mrouter_off(ipst)) { - if (error == 0) - (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, - B_TRUE); + if (ilm == NULL || is_mrouter_off(ipst)) { + if (ilm != NULL) { + (void) ip_delmulti(ilm); + ASSERT(ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ill->ill_mrouter_cnt); + } if (vifcp->vifc_flags & VIFF_REGISTER) { mutex_enter(&ipst->ips_numvifs_mutex); ipst->ips_reg_vif_num = ALL_VIFS; @@ -1018,9 +984,9 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) } VIF_REFRELE_LOCKED(vifp); ipif_refrele(ipif); - ipsq_exit(ipsq); return (error?error:EINVAL); } + vifp->v_ilm = ilm; } /* Define parameters for the tbf structure */ vifp->v_tbf = v_tbf; @@ -1063,7 +1029,6 @@ add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) vifp->v_marks = VIF_MARK_GOOD; mutex_exit(&vifp->v_lock); - ipsq_exit(ipsq); return (0); } @@ -1131,10 +1096,9 @@ del_vifp(struct vif *vifp) } static int -del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) +del_vif(vifi_t *vifip, ip_stack_t *ipst) { struct vif *vifp = ipst->ips_vifs + *vifip; - ipsq_t *ipsq; if (*vifip >= ipst->ips_numvifs) return (EINVAL); @@ -1151,41 +1115,6 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) return (EADDRNOTAVAIL); } - /* - * This is an optimization, if first_mp == NULL - * than we are being called from reset_mrt_vif_ipif() - * so we already have exclusive access to the ipsq. - * the ASSERT below is a check for this condition. - */ - if (first_mp != NULL && - !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { - ASSERT(connp != NULL); - /* - * We have to be exclusive as we have to call ip_delmulti() - * This is the best position to try to be exclusive in case - * we have to wait. 
- */ - ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp), - first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE); - if ((ipsq) == NULL) { - mutex_exit(&vifp->v_lock); - return (EINPROGRESS); - } - /* recheck after being exclusive */ - if (vifp->v_lcl_addr.s_addr == 0 || - !vifp->v_marks & VIF_MARK_GOOD) { - /* - * someone beat us. - */ - mutex_exit(&vifp->v_lock); - ipsq_exit(ipsq); - return (EADDRNOTAVAIL); - } - } - - - ASSERT(IAM_WRITER_IPIF(vifp->v_ipif)); - /* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */ vifp->v_marks &= ~VIF_MARK_GOOD; vifp->v_marks |= VIF_MARK_CONDEMNED; @@ -1193,18 +1122,30 @@ del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst) /* Phyint only */ if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { ipif_t *ipif = vifp->v_ipif; + ilm_t *ilm = vifp->v_ilm; + + vifp->v_ilm = NULL; + ASSERT(ipif != NULL); /* * should be OK to drop the lock as we * have marked this as CONDEMNED. */ mutex_exit(&(vifp)->v_lock); - (void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE); - if (first_mp != NULL) - ipsq_exit(ipsq); + if (ilm != NULL) { + (void) ip_delmulti(ilm); + ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0); + atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt); + } mutex_enter(&(vifp)->v_lock); } + if (vifp->v_flags & VIFF_REGISTER) { + mutex_enter(&ipst->ips_numvifs_mutex); + ipst->ips_reg_vif_num = ALL_VIFS; + mutex_exit(&ipst->ips_numvifs_mutex); + } + /* * decreases the refcnt added in add_vif. 
*/ @@ -1584,16 +1525,21 @@ del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst) * 1 - pkt came in on tunnel */ int -ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) +ip_mforward(mblk_t *mp, ip_recv_attr_t *ira) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ill_t *ill = ira->ira_ill; struct mfc *rt; ipaddr_t src, dst, tunnel_src = 0; static int srctun = 0; vifi_t vifi; boolean_t pim_reg_packet = B_FALSE; - struct mfcb *mfcbp; + struct mfcb *mfcbp; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ill_t *rill = ira->ira_rill; + + ASSERT(ira->ira_pktlen == msgdsize(mp)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -1603,10 +1549,10 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) } dst = ipha->ipha_dst; - if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER) + if (ira->ira_flags & IRAF_PIM_REGISTER) pim_reg_packet = B_TRUE; - else - tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev; + else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET) + tunnel_src = ira->ira_mroute_tunnel; /* * Don't forward a packet with time-to-live of zero or one, @@ -1620,7 +1566,6 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) " dst 0x%x ill %s", ipha->ipha_ttl, ntohl(dst), ill->ill_name); } - mp->b_prev = NULL; if (tunnel_src != 0) return (1); else @@ -1630,10 +1575,8 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) if ((tunnel_src != 0) || pim_reg_packet) { /* * Packet arrived over an encapsulated tunnel or via a PIM - * register message. Both ip_mroute_decap() and pim_input() - * encode information in mp->b_prev. + * register message. 
*/ - mp->b_prev = NULL; if (ipst->ips_ip_mrtdebug > 1) { if (tunnel_src != 0) { (void) mi_strlog(mrouter->conn_rq, 1, @@ -1926,10 +1869,16 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) mutex_exit(&mfc_rt->mfc_mutex); mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + ira->ira_ill = ira->ira_rill = NULL; + (mrouter->conn_recv)(mrouter, mp_copy, NULL, ira); + ira->ira_ill = ill; + ira->ira_rill = rill; } else { mutex_exit(&mfc_rt->mfc_mutex); mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock)); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mforward - upcall already waiting", + mp_copy, ill); freemsg(mp_copy); } @@ -1945,8 +1894,11 @@ ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp) mi_free((char *)mfc_rt); if (rte != NULL) mi_free((char *)rte); - if (mp_copy != NULL) + if (mp_copy != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mforward error", mp_copy, ill); freemsg(mp_copy); + } if (mp0 != NULL) freemsg(mp0); return (-1); @@ -2023,7 +1975,6 @@ static int ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, struct mfc *rt) { - ill_t *vill; vifi_t vifi; struct vif *vifp; ipaddr_t dst = ipha->ipha_dst; @@ -2031,6 +1982,7 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, vifi_t num_of_vifs; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ip_recv_attr_t iras; if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2091,19 +2043,19 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, * Don't forward if it didn't arrive from the parent vif for its * origin. 
*/ - vill = ipst->ips_vifs[vifi].v_ipif->ipif_ill; - if ((vill != ill && !IS_IN_SAME_ILLGRP(vill, ill)) || + if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) || (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) { /* Came in the wrong interface */ ip1dbg(("ip_mdq: arrived wrong if, vifi %d " "numvifs %d ill %s viftable ill %s\n", (int)vifi, (int)ipst->ips_numvifs, ill->ill_name, - vill->ill_name)); + ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name)); if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "ip_mdq: arrived wrong if, vifi %d ill " "%s viftable ill %s\n", - (int)vifi, ill->ill_name, vill->ill_name); + (int)vifi, ill->ill_name, + ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name); } ipst->ips_mrtstat->mrts_wrong_if++; rt->mfc_wrong_if++; @@ -2137,7 +2089,14 @@ ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src, im->im_mbz = 0; im->im_vif = (ushort_t)vifi; /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ip_hdr_length = + IPH_HDR_LENGTH(mp_copy->b_rptr); + iras.ira_pktlen = msgdsize(mp_copy); + (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } unlock_good_vif(&ipst->ips_vifs[vifi]); if (tunnel_src != 0) @@ -2239,8 +2198,10 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) struct igmpmsg *im; mblk_t *mp_copy; ipha_t *ipha_copy; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ip_recv_attr_t iras; if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2307,16 +2268,24 @@ register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) im->im_mbz = 0; ++ipst->ips_mrtstat->mrts_upcalls; - if (!canputnext(mrouter->conn_rq)) { + if (IPCL_IS_NONSTR(mrouter) ? 
mrouter->conn_flow_cntrld : + !canputnext(mrouter->conn_rq)) { ++ipst->ips_mrtstat->mrts_pim_regsend_drops; if (ipst->ips_ip_mrtdebug > 3) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "register_send: register upcall failure."); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill); freemsg(mp_copy); } else { /* Pass to RAWIP */ - (mrouter->conn_recv)(mrouter, mp_copy, NULL); + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ip_hdr_length = sizeof (ipha_t); + iras.ira_pktlen = msgdsize(mp_copy); + (mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); } } @@ -2349,18 +2318,22 @@ pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp) } /* - * int - * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets. - * IP Protocol 103. Register messages are decapsulated and sent - * onto multicast forwarding. + * Process PIM protocol packets i.e. IP Protocol 103. + * Register messages are decapsulated and sent onto multicast forwarding. + * + * Return NULL for a bad packet that is discarded here. + * Return mp if the message is OK and should be handed to "raw" receivers. + * Callers of pim_input() may need to reinitialize variables that were copied + * from the mblk as this calls pullupmsg(). 
*/ -int -pim_input(queue_t *q, mblk_t *mp, ill_t *ill) +mblk_t * +pim_input(mblk_t *mp, ip_recv_attr_t *ira) { ipha_t *eip, *ip; int iplen, pimlen, iphlen; struct pim *pimp; /* pointer to a pim struct */ uint32_t *reghdr; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; @@ -2369,8 +2342,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) */ if (pullupmsg(mp, -1) == 0) { ++ipst->ips_mrtstat->mrts_pim_nomemory; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_nomemory", mp, ill); freemsg(mp); - return (-1); + return (NULL); } ip = (ipha_t *)mp->b_rptr; @@ -2387,8 +2362,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: length not at least minlen"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_malformed", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* @@ -2405,8 +2382,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: unknown version of PIM"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_badversion", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* @@ -2418,12 +2397,14 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: invalid checksum"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("pim_rcv_badcsum", mp, ill); freemsg(mp); - return (-1); + return (NULL); } if (pimp->pim_type != PIM_REGISTER) - return (0); + return (mp); reghdr = (uint32_t *)(pimp + 1); eip = (ipha_t *)(reghdr + 1); @@ -2437,8 +2418,10 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "pim_input: Inner pkt not mcast .. 
!"); } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_badregisters", mp, ill); freemsg(mp); - return (-1); + return (NULL); } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, @@ -2450,27 +2433,36 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) /* * If the null register bit is not set, decapsulate * the packet before forwarding it. + * Avoid this in no register vif */ - if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) { + if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) && + ipst->ips_reg_vif_num != ALL_VIFS) { mblk_t *mp_copy; + uint_t saved_pktlen; /* Copy the message */ if ((mp_copy = copymsg(mp)) == NULL) { ++ipst->ips_mrtstat->mrts_pim_nomemory; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim_nomemory", mp, ill); freemsg(mp); - return (-1); + return (NULL); } /* * Decapsulate the packet and give it to * register_mforward. */ - mp_copy->b_rptr += iphlen + sizeof (pim_t) + - sizeof (*reghdr); - if (register_mforward(q, mp_copy, ill) != 0) { + mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr); + saved_pktlen = ira->ira_pktlen; + ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr); + if (register_mforward(mp_copy, ira) != 0) { + /* register_mforward already called ip_drop_input */ freemsg(mp); - return (-1); + ira->ira_pktlen = saved_pktlen; + return (NULL); } + ira->ira_pktlen = saved_pktlen; } /* @@ -2478,7 +2470,7 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) * PIM socket. For Solaris it is done right after pim_input() is * called. */ - return (0); + return (mp); } /* @@ -2486,38 +2478,52 @@ pim_input(queue_t *q, mblk_t *mp, ill_t *ill) * the packet. Loop back the packet, as if we have received it. * In pim_input() we have to check if the destination is a multicast address. 
*/ -/* ARGSUSED */ static int -register_mforward(queue_t *q, mblk_t *mp, ill_t *ill) +register_mforward(mblk_t *mp, ip_recv_attr_t *ira) { + ire_t *ire; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs); if (ipst->ips_ip_mrtdebug > 3) { - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "register_mforward: src %x, dst %x\n", ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst)); } /* * Need to pass in to ip_mforward() the information that the - * packet has arrived on the register_vif. We use the solution that - * ip_mroute_decap() employs: use mp->b_prev to pass some information - * to ip_mforward(). Nonzero value means the packet has arrived on a - * tunnel (ip_mroute_decap() puts the address of the other side of the - * tunnel there.) This is safe since ip_rput() either frees the packet - * or passes it to ip_mforward(). We use - * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the - * register vif. If in the future we have more than one register vifs, - * then this will need re-examination. + * packet has arrived on the register_vif. We mark it with + * the IRAF_PIM_REGISTER attribute. + * pim_input verified that the (inner) destination is multicast, + * hence we skip the generic code in ip_input. 
*/ - mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER; + ira->ira_flags |= IRAF_PIM_REGISTER; ++ipst->ips_mrtstat->mrts_pim_regforwards; - ip_rput(q, mp); + + if (!CLASSD(ipha->ipha_dst)) { + ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES, + ira->ira_tsl, MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, + NULL, NULL); + } else { + ire = ire_multicast(ill); + } + ASSERT(ire != NULL); + /* Normally this will return the IRE_MULTICAST */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_pim RTF_REJECT", mp, ill); + freemsg(mp); + ire_refrele(ire); + return (-1); + } + ASSERT(ire->ire_type & IRE_MULTICAST); + (*ire->ire_recvfn)(ire, mp, ipha, ira); + ire_refrele(ire); + return (0); } @@ -2575,6 +2581,8 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + ipha_copy->ipha_ttl = ipha->ipha_ttl; + if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "encap_send: group 0x%x", ntohl(ipha->ipha_dst)); @@ -2587,21 +2595,53 @@ encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst) } /* - * De-encapsulate a packet and feed it back through IP input. + * De-encapsulate a packet and feed it back through IP input if it + * matches one of our multicast tunnels. + * * This routine is called whenever IP gets a packet with prototype - * IPPROTO_ENCAP and a local destination address. + * IPPROTO_ENCAP and a local destination address and the packet didn't + * match one of our configured IP-in-IP tunnels. 
*/ void -ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) +ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira) { ipha_t *ipha = (ipha_t *)mp->b_rptr; ipha_t *ipha_encap; int hlen = IPH_HDR_LENGTH(ipha); + int hlen_encap; ipaddr_t src; struct vif *vifp; + ire_t *ire; + ill_t *ill = ira->ira_ill; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + /* Make sure we have all of the inner header */ + ipha_encap = (ipha_t *)((char *)ipha + hlen); + if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) { + ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira); + if (ipha == NULL) { + ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: too short", mp, ill); + freemsg(mp); + return; + } + ipha_encap = (ipha_t *)((char *)ipha + hlen); + } + hlen_encap = IPH_HDR_LENGTH(ipha_encap); + if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) { + ipha = ip_pullup(mp, hlen + hlen_encap, ira); + if (ipha == NULL) { + ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: too short", mp, ill); + freemsg(mp); + return; + } + ipha_encap = (ipha_t *)((char *)ipha + hlen); + } + /* * Dump the packet if it's not to a multicast destination or if * we don't have an encapsulating tunnel with the source. @@ -2609,10 +2649,11 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) * uniquely identifies the tunnel (i.e., that this site has * at most one tunnel with the remote site). 
*/ - ipha_encap = (ipha_t *)((char *)ipha + hlen); if (!CLASSD(ipha_encap->ipha_dst)) { ipst->ips_mrtstat->mrts_bad_tunnel++; ip1dbg(("ip_mroute_decap: bad tunnel\n")); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_bad_tunnel", mp, ill); freemsg(mp); return; } @@ -2648,6 +2689,8 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) if ((vifp = ipst->ips_last_encap_vif) == 0) { mutex_exit(&ipst->ips_last_encap_lock); ipst->ips_mrtstat->mrts_bad_tunnel++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("mrts_bad_tunnel", mp, ill); freemsg(mp); ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n", (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src))); @@ -2657,14 +2700,43 @@ ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill) /* * Need to pass in the tunnel source to ip_mforward (so that it can - * verify that the packet arrived over the correct vif.) We use b_prev - * to pass this information. This is safe since the ip_rput either - * frees the packet or passes it to ip_mforward. + * verify that the packet arrived over the correct vif.) */ - mp->b_prev = (mblk_t *)(uintptr_t)src; + ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET; + ira->ira_mroute_tunnel = src; mp->b_rptr += hlen; - /* Feed back into ip_rput as an M_DATA. */ - ip_rput(q, mp); + ira->ira_pktlen -= hlen; + ira->ira_ip_hdr_length = hlen_encap; + + /* + * We don't redo any of the filtering in ill_input_full_v4 and we + * have checked that all of ipha_encap and any IP options are + * pulled up. Hence we call ire_recv_multicast_v4 directly. + * However, we have to check for RSVP as in ip_input_full_v4 + * and if so we pass it to ire_recv_broadcast_v4 for local delivery + * to the rsvpd. 
+ */ + if (ipha_encap->ipha_protocol == IPPROTO_RSVP && + ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) { + ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill, + ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR, + B_TRUE, 0, ipst, NULL, NULL, NULL); + } else { + ire = ire_multicast(ill); + } + ASSERT(ire != NULL); + /* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill); + freemsg(mp); + ire_refrele(ire); + return; + } + ire->ire_ib_pkt_count++; + ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST)); + (*ire->ire_recvfn)(ire, mp, ipha_encap, ira); + ire_refrele(ire); } /* @@ -2687,7 +2759,7 @@ reset_mrt_vif_ipif(ipif_t *ipif) for (vifi = num_of_vifs; vifi != 0; vifi--) { tmp_vifi = vifi - 1; if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) { - (void) del_vif(&tmp_vifi, NULL, NULL, ipst); + (void) del_vif(&tmp_vifi, ipst); } } } @@ -2696,11 +2768,12 @@ reset_mrt_vif_ipif(ipif_t *ipif) void reset_mrt_ill(ill_t *ill) { - struct mfc *rt; + struct mfc *rt; struct rtdetq *rte; - int i; + int i; ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + timeout_id_t id; for (i = 0; i < MFCTBLSIZ; i++) { MFCB_REFHOLD(&ipst->ips_mfcs[i]); @@ -2713,6 +2786,18 @@ reset_mrt_ill(ill_t *ill) while (rt != NULL) { mutex_enter(&rt->mfc_mutex); while ((rte = rt->mfc_rte) != NULL) { + if (rte->ill == ill && + (id = rt->mfc_timeout_id) != 0) { + /* + * Its ok to drop the lock, the + * struct cannot be freed since + * we have a ref on the hash + * bucket. 
+ */ + mutex_exit(&rt->mfc_mutex); + (void) untimeout(id); + mutex_enter(&rt->mfc_mutex); + } if (rte->ill == ill) { if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog( @@ -2744,12 +2829,15 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) size_t p_len = msgdsize(mp); struct tbf *t = vifp->v_tbf; timeout_id_t id = 0; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; /* Drop if packet is too large */ if (p_len > MAX_BKT_SIZE) { ipst->ips_mrtstat->mrts_pkt2large++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_control - too large", mp, ill); freemsg(mp); return; } @@ -2800,6 +2888,9 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) if ((mp->b_wptr - mp->b_rptr) < hdr_length) { if (!pullupmsg(mp, hdr_length)) { + BUMP_MIB(ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("tbf_control - pullup", mp, ill); freemsg(mp); ip1dbg(("tbf_ctl: couldn't pullup udp hdr, " "vif %ld src 0x%x dst 0x%x\n", @@ -2818,6 +2909,8 @@ tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha) */ if (!tbf_dq_sel(vifp, ipha)) { ipst->ips_mrtstat->mrts_q_overflow++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("mrts_q_overflow", mp, ill); freemsg(mp); } else { tbf_queue(vifp, mp); @@ -2958,7 +3051,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) struct tbf *t = vifp->v_tbf; mblk_t **np; mblk_t *last, *mp; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ill_t *ill = vifp->v_ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; if (ipst->ips_ip_mrtdebug > 1) { @@ -2979,6 +3073,8 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) if (mp == t->tbf_t) t->tbf_t = last; mp->b_prev = mp->b_next = NULL; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_dq_send", mp, ill); freemsg(mp); /* * It's impossible for the queue to be empty, but @@ 
-3000,76 +3096,97 @@ tbf_dq_sel(struct vif *vifp, ipha_t *ipha) static void tbf_send_packet(struct vif *vifp, mblk_t *mp) { - ipif_t *ipif; - ip_stack_t *ipst = vifp->v_ipif->ipif_ill->ill_ipst; + ipif_t *ipif = vifp->v_ipif; + ill_t *ill = ipif->ipif_ill; + ip_stack_t *ipst = ill->ill_ipst; conn_t *mrouter = ipst->ips_ip_g_mrouter; + ipha_t *ipha; + ipha = (ipha_t *)mp->b_rptr; /* If encap tunnel options */ if (vifp->v_flags & VIFF_TUNNEL) { + ip_xmit_attr_t ixas; + if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, - "tbf_send_pkt: ENCAP tunnel vif %ld", + "tbf_send_packet: ENCAP tunnel vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs)); } + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ + ixas.ixa_pktlen = ntohs(ipha->ipha_length); + ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); /* - * Feed into ip_wput which will set the ident field and - * checksum the encapsulating header. + * Feed into ip_output_simple which will set the ident field + * and checksum the encapsulating header. * BSD gets the cached route vifp->v_route from ip_output() * to speed up route table lookups. Not necessary in SunOS 5.x. + * One could make multicast forwarding faster by putting an + * ip_xmit_attr_t in each vif thereby caching the ire/nce. */ - put(vifp->v_ipif->ipif_wq, mp); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); return; /* phyint */ } else { /* Need to loop back to members on the outgoing interface. */ - ipha_t *ipha; - ipaddr_t dst; - ipha = (ipha_t *)mp->b_rptr; - dst = ipha->ipha_dst; - ipif = vifp->v_ipif; - - if (ilm_lookup_ipif(ipif, dst) != NULL) { - /* - * The packet is not yet reassembled, thus we need to - * pass it to ip_rput_local for checksum verification - * and reassembly (and fanout the user stream). 
- */ - mblk_t *mp_loop; - ire_t *ire; - - if (ipst->ips_ip_mrtdebug > 1) { - (void) mi_strlog(mrouter->conn_rq, 1, - SL_TRACE, - "tbf_send_pkt: loopback vif %ld", - (ptrdiff_t)(vifp - ipst->ips_vifs)); - } - mp_loop = copymsg(mp); - ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL, - ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - - if (mp_loop != NULL && ire != NULL) { - IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop, - ((ipha_t *)mp_loop->b_rptr), - ire, (ill_t *)ipif->ipif_rq->q_ptr); - } else { - /* Either copymsg failed or no ire */ - (void) mi_strlog(mrouter->conn_rq, 1, - SL_TRACE, - "tbf_send_pkt: mp_loop 0x%p, ire 0x%p " - "vif %ld\n", (void *)mp_loop, (void *)ire, - (ptrdiff_t)(vifp - ipst->ips_vifs)); - } - if (ire != NULL) - ire_refrele(ire); + ipaddr_t dst; + ip_recv_attr_t iras; + nce_t *nce; + + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */ + iras.ira_pktlen = ntohs(ipha->ipha_length); + iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha); + + dst = ipha->ipha_dst; + if (ill_hasmembers_v4(ill, dst)) { + iras.ira_flags |= IRAF_LOOPBACK_COPY; } if (ipst->ips_ip_mrtdebug > 1) { (void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE, "tbf_send_pkt: phyint forward vif %ld dst = 0x%x", (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst)); } - ip_rput_forward_multicast(dst, mp, ipif); + /* + * Find an NCE which matches the nexthop. + * For a pt-pt interface we use the other end of the pt-pt + * link. + */ + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + dst = ipif->ipif_pp_dst_addr; + nce = arp_nce_init(ill, dst, ill->ill_net_type); + } else { + nce = arp_nce_init(ill, dst, IRE_MULTICAST); + } + if (nce == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tbf_send_packet - no nce", mp, ill); + freemsg(mp); + return; + } + + /* + * We don't remeber the incoming ill. 
Thus we + * pretend the packet arrived on the outbound ill. This means + * statistics for input errors will be increased on the wrong + * ill but that isn't a big deal. + */ + ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mtu, 0); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); + + nce_refrele(nce); } } diff --git a/usr/src/uts/common/inet/ip/ip_multi.c b/usr/src/uts/common/inet/ip/ip_multi.c index d7be67cd26..0912d87227 100644 --- a/usr/src/uts/common/inet/ip/ip_multi.c +++ b/usr/src/uts/common/inet/ip/ip_multi.c @@ -66,29 +66,41 @@ static void ilm_bld_flists(conn_t *conn, void *arg); static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist); -static ilm_t *ilm_add_v6(ipif_t *ipif, const in6_addr_t *group, +static ilm_t *ilm_add(ill_t *ill, const in6_addr_t *group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid); static void ilm_delete(ilm_t *ilm); -static int ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static int ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *group); -static ilg_t *ilg_lookup_ipif(conn_t *connp, ipaddr_t group, - ipif_t *ipif); -static int ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, - mcast_record_t fmode, ipaddr_t src); -static int ilg_add_v6(conn_t *connp, const in6_addr_t *group, ill_t *ill, - mcast_record_t fmode, const in6_addr_t *v6src); +static int ilm_numentries(ill_t *, const in6_addr_t *); + +static ilm_t *ip_addmulti_serial(const in6_addr_t *, ill_t *, zoneid_t, + ilg_stat_t, mcast_record_t, slist_t *, int *); +static ilm_t *ip_addmulti_impl(const in6_addr_t *, ill_t *, + zoneid_t, ilg_stat_t, mcast_record_t, slist_t *, int *); +static int ip_delmulti_serial(ilm_t *, boolean_t, boolean_t); +static int ip_delmulti_impl(ilm_t *, boolean_t, boolean_t); + +static int ip_ll_multireq(ill_t *ill, const in6_addr_t *group, + t_uscalar_t); +static ilg_t *ilg_lookup(conn_t *, const in6_addr_t *, ipaddr_t ifaddr, + uint_t ifindex); + +static int 
ilg_add(conn_t *connp, const in6_addr_t *group, + ipaddr_t ifaddr, uint_t ifindex, ill_t *ill, mcast_record_t fmode, + const in6_addr_t *v6src); static void ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src); static mblk_t *ill_create_dl(ill_t *ill, uint32_t dl_primitive, - uint32_t length, uint32_t *addr_lenp, uint32_t *addr_offp); -static void conn_ilg_reap(conn_t *connp); -static int ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, - ipif_t *ipif, mcast_record_t fmode, ipaddr_t src); -static int ip_opt_delete_group_excl_v6(conn_t *connp, - const in6_addr_t *v6group, ill_t *ill, mcast_record_t fmode, - const in6_addr_t *v6src); -static void ill_ilm_walker_hold(ill_t *ill); -static void ill_ilm_walker_rele(ill_t *ill); + uint32_t *addr_lenp, uint32_t *addr_offp); +static int ip_opt_delete_group_excl(conn_t *connp, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src); + +static ilm_t *ilm_lookup(ill_t *, const in6_addr_t *, zoneid_t); + +static int ip_msfilter_ill(conn_t *, mblk_t *, const ip_ioctl_cmd_t *, + ill_t **); + +static void ilg_check_detach(conn_t *, ill_t *); +static void ilg_check_reattach(conn_t *); /* * MT notes: @@ -98,124 +110,122 @@ static void ill_ilm_walker_rele(ill_t *ill); * need to synchronize when operating on the ilg. Multiple threads * potentially operating on different conn (socket endpoints) trying to * do multicast joins could eventually end up trying to manipulate the - * ilm simultaneously and need to synchronize access to the ilm. Currently, - * this is done by synchronizing join/leave via per-phyint ipsq_t - * serialization. + * ilm simulatenously and need to synchronize on the access to the ilm. + * The access and lookup of the ilm, as well as other ill multicast state, + * is under ill_mcast_lock. + * The modifications and lookup of ilg entries is serialized using conn_ilg_lock + * rwlock. An ilg will not be freed until ilg_refcnt drops to zero. 
+ * + * In some cases we hold ill_mcast_lock and then acquire conn_ilg_lock, but + * never the other way around. * * An ilm is an IP data structure used to track multicast join/leave. * An ilm is associated with a <multicast group, ipif> tuple in IPv4 and * with just <multicast group> in IPv6. ilm_refcnt is the number of ilg's - * referencing the ilm. ilms are created / destroyed only as writer. ilms - * are not passed around, instead they are looked up and used under the - * ill_lock or as writer. So we don't need a dynamic refcount of the number + * referencing the ilm. + * The modifications and lookup of ilm entries is serialized using the + * ill_mcast_lock rwlock; that lock handles all the igmp/mld modifications + * of the ilm state. + * ilms are created / destroyed only as writer. ilms + * are not passed around. The datapath (anything outside of this file + * and igmp.c) use functions that do not return ilms - just the number + * of members. So we don't need a dynamic refcount of the number * of threads holding reference to an ilm. * - * Multicast Join operation: - * - * The first step is to determine the ipif (v4) or ill (v6) on which - * the join operation is to be done. The join is done after becoming - * exclusive on the ipsq associated with the ipif or ill. The conn->conn_ilg - * and ill->ill_ilm are thus accessed and modified exclusively per ill. - * Multiple threads can attempt to join simultaneously on different ipif/ill - * on the same conn. In this case the ipsq serialization does not help in - * protecting the ilg. It is the conn_lock that is used to protect the ilg. - * The conn_lock also protects all the ilg_t members. 
+ * In the cases where we serially access the ilg and ilm, which happens when + * we handle the applications requests to join or leave groups and sources, + * we use the ill_mcast_serializer mutex to ensure that a multithreaded + * application which does concurrent joins and/or leaves on the same group on + * the same socket always results in a consistent order for the ilg and ilm + * modifications. * - * Leave operation. - * - * Similar to the join operation, the first step is to determine the ipif - * or ill (v6) on which the leave operation is to be done. The leave operation - * is done after becoming exclusive on the ipsq associated with the ipif or ill. - * As with join ilg modification is done under the protection of the conn lock. + * When a multicast operation results in needing to send a message to + * the driver (to join/leave a L2 multicast address), we use ill_dlpi_queue() + * which serialized the DLPI requests. The IGMP/MLD code uses ill_mcast_queue() + * to send IGMP/MLD IP packet to avoid dropping the lock just to send a packet. */ -#define IPSQ_ENTER_IPIF(ipif, connp, first_mp, func, ipsq, type) \ - ASSERT(connp != NULL); \ - (ipsq) = ipsq_try_enter((ipif), NULL, CONNP_TO_WQ(connp), \ - (first_mp), (func), (type), B_TRUE); \ - if ((ipsq) == NULL) { \ - ipif_refrele(ipif); \ - return (EINPROGRESS); \ - } - -#define IPSQ_ENTER_ILL(ill, connp, first_mp, func, ipsq, type) \ - ASSERT(connp != NULL); \ - (ipsq) = ipsq_try_enter(NULL, ill, CONNP_TO_WQ(connp), \ - (first_mp), (func), (type), B_TRUE); \ - if ((ipsq) == NULL) { \ - ill_refrele(ill); \ - return (EINPROGRESS); \ - } - -#define IPSQ_EXIT(ipsq) \ - if (ipsq != NULL) \ - ipsq_exit(ipsq); +#define GETSTRUCT(structure, number) \ + ((structure *)mi_zalloc(sizeof (structure) * (number))) -#define ILG_WALKER_HOLD(connp) (connp)->conn_ilg_walker_cnt++ +/* + * Caller must ensure that the ilg has not been condemned + * The condemned flag is only set in ilg_delete under conn_ilg_lock. 
+ * + * The caller must hold conn_ilg_lock as writer. + */ +static void +ilg_refhold(ilg_t *ilg) +{ + ASSERT(ilg->ilg_refcnt != 0); + ASSERT(!ilg->ilg_condemned); + ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock)); -#define ILG_WALKER_RELE(connp) \ - { \ - (connp)->conn_ilg_walker_cnt--; \ - if ((connp)->conn_ilg_walker_cnt == 0) \ - conn_ilg_reap(connp); \ - } + ilg->ilg_refcnt++; +} static void -conn_ilg_reap(conn_t *connp) +ilg_inactive(ilg_t *ilg) { - int to; - int from; - ilg_t *ilg; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ilg->ilg_ill == NULL); + ASSERT(ilg->ilg_ilm == NULL); + ASSERT(ilg->ilg_filter == NULL); + ASSERT(ilg->ilg_condemned); - to = 0; - from = 0; - while (from < connp->conn_ilg_inuse) { - if (connp->conn_ilg[from].ilg_flags & ILG_DELETED) { - ilg = &connp->conn_ilg[from]; - FREE_SLIST(ilg->ilg_filter); - ilg->ilg_flags &= ~ILG_DELETED; - from++; - continue; - } - if (to != from) - connp->conn_ilg[to] = connp->conn_ilg[from]; - to++; - from++; - } + /* Unlink from list */ + *ilg->ilg_ptpn = ilg->ilg_next; + if (ilg->ilg_next != NULL) + ilg->ilg_next->ilg_ptpn = ilg->ilg_ptpn; + ilg->ilg_next = NULL; + ilg->ilg_ptpn = NULL; - connp->conn_ilg_inuse = to; + ilg->ilg_connp = NULL; + kmem_free(ilg, sizeof (*ilg)); +} - if (connp->conn_ilg_inuse == 0) { - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = NULL; - cv_broadcast(&connp->conn_refcv); - } +/* + * The caller must hold conn_ilg_lock as writer. + */ +static void +ilg_refrele(ilg_t *ilg) +{ + ASSERT(RW_WRITE_HELD(&ilg->ilg_connp->conn_ilg_lock)); + ASSERT(ilg->ilg_refcnt != 0); + if (--ilg->ilg_refcnt == 0) + ilg_inactive(ilg); } -#define GETSTRUCT(structure, number) \ - ((structure *)mi_zalloc(sizeof (structure) * (number))) +/* + * Acquire reference on ilg and drop reference on held_ilg. + * In the case when held_ilg is the same as ilg we already have + * a reference, but the held_ilg might be condemned. 
In that case + * we avoid the ilg_refhold/rele so that we can assert in ire_refhold + * that the ilg isn't condemned. + */ +static void +ilg_transfer_hold(ilg_t *held_ilg, ilg_t *ilg) +{ + if (held_ilg == ilg) + return; -#define ILG_ALLOC_CHUNK 16 + ilg_refhold(ilg); + if (held_ilg != NULL) + ilg_refrele(held_ilg); +} /* - * Returns a pointer to the next available ilg in conn_ilg. Allocs more - * buffers in size of ILG_ALLOC_CHUNK ilgs when needed, and updates conn's - * ilg tracking fields appropriately (conn_ilg_inuse reflects usage of the - * returned ilg). Returns NULL on failure, in which case `*errp' will be + * Allocate a new ilg_t and links it into conn_ilg. + * Returns NULL on failure, in which case `*errp' will be * filled in with the reason. * - * Assumes connp->conn_lock is held. + * Assumes connp->conn_ilg_lock is held. */ static ilg_t * conn_ilg_alloc(conn_t *connp, int *errp) { - ilg_t *new, *ret; - int curcnt; + ilg_t *ilg; - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(connp->conn_ilg_inuse <= connp->conn_ilg_allocated); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); /* * If CONN_CLOSING is set, conn_ilg cleanup has begun and we must not @@ -226,44 +236,23 @@ conn_ilg_alloc(conn_t *connp, int *errp) return (NULL); } - if (connp->conn_ilg == NULL) { - connp->conn_ilg = GETSTRUCT(ilg_t, ILG_ALLOC_CHUNK); - if (connp->conn_ilg == NULL) { - *errp = ENOMEM; - return (NULL); - } - connp->conn_ilg_allocated = ILG_ALLOC_CHUNK; - connp->conn_ilg_inuse = 0; - } - if (connp->conn_ilg_inuse == connp->conn_ilg_allocated) { - if (connp->conn_ilg_walker_cnt != 0) { - /* - * XXX We cannot grow the array at this point - * because a list walker could be in progress, and - * we cannot wipe out the existing array until the - * walker is done. Just return NULL for now. - * ilg_delete_all() will have to be changed when - * this logic is changed. 
- */ - *errp = EBUSY; - return (NULL); - } - curcnt = connp->conn_ilg_allocated; - new = GETSTRUCT(ilg_t, curcnt + ILG_ALLOC_CHUNK); - if (new == NULL) { - *errp = ENOMEM; - return (NULL); - } - bcopy(connp->conn_ilg, new, sizeof (ilg_t) * curcnt); - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = new; - connp->conn_ilg_allocated += ILG_ALLOC_CHUNK; + ilg = kmem_zalloc(sizeof (ilg_t), KM_NOSLEEP); + if (ilg == NULL) { + *errp = ENOMEM; + return (NULL); } - ret = &connp->conn_ilg[connp->conn_ilg_inuse++]; - ASSERT((ret->ilg_flags & ILG_DELETED) == 0); - bzero(ret, sizeof (*ret)); - return (ret); + ilg->ilg_refcnt = 1; + + /* Insert at head */ + if (connp->conn_ilg != NULL) + connp->conn_ilg->ilg_ptpn = &ilg->ilg_next; + ilg->ilg_next = connp->conn_ilg; + ilg->ilg_ptpn = &connp->conn_ilg; + connp->conn_ilg = ilg; + + ilg->ilg_connp = connp; + return (ilg); } typedef struct ilm_fbld_s { @@ -275,15 +264,18 @@ typedef struct ilm_fbld_s { boolean_t fbld_in_overflow; } ilm_fbld_t; +/* + * Caller must hold ill_mcast_lock + */ static void -ilm_bld_flists(conn_t *conn, void *arg) +ilm_bld_flists(conn_t *connp, void *arg) { - int i; + ilg_t *ilg; ilm_fbld_t *fbld = (ilm_fbld_t *)(arg); ilm_t *ilm = fbld->fbld_ilm; in6_addr_t *v6group = &ilm->ilm_v6addr; - if (conn->conn_ilg_inuse == 0) + if (connp->conn_ilg == NULL) return; /* @@ -300,12 +292,26 @@ ilm_bld_flists(conn_t *conn, void *arg) * ilm (group, interface match). If so, update the master * include and exclude lists we're building in the fbld struct * with this ilg's filter info. + * + * Note that the caller has already serialized on the ill we care + * about. 
*/ - mutex_enter(&conn->conn_lock); - for (i = 0; i < conn->conn_ilg_inuse; i++) { - ilg_t *ilg = &conn->conn_ilg[i]; + ASSERT(MUTEX_HELD(&ilm->ilm_ill->ill_mcast_serializer)); + + rw_enter(&connp->conn_ilg_lock, RW_READER); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; + + /* + * Since we are under the ill_mcast_serializer we know + * that any ilg+ilm operations on this ilm have either + * not started or completed, except for the last ilg + * (the one that caused us to be called) which doesn't + * have ilg_ilm set yet. Hence we compare using ilg_ill + * and the address. + */ if ((ilg->ilg_ill == ilm->ilm_ill) && - (ilg->ilg_ipif == ilm->ilm_ipif) && IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (ilg->ilg_fmode == MODE_IS_INCLUDE) { fbld->fbld_in_cnt++; @@ -337,9 +343,12 @@ ilm_bld_flists(conn_t *conn, void *arg) break; } } - mutex_exit(&conn->conn_lock); + rw_exit(&connp->conn_ilg_lock); } +/* + * Caller must hold ill_mcast_lock + */ static void ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) { @@ -385,15 +394,17 @@ ilm_gen_filter(ilm_t *ilm, mcast_record_t *fmode, slist_t *flist) } } +/* + * Caller must hold ill_mcast_lock + */ static int -ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, - boolean_t isv6) +ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist) { mcast_record_t fmode; slist_t *flist; boolean_t fdefault; char buf[INET6_ADDRSTRLEN]; - ill_t *ill = isv6 ? 
ilm->ilm_ill : ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; /* * There are several cases where the ilm's filter state @@ -444,7 +455,7 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, /* send the state change report */ if (!IS_LOOPBACK(ill)) { - if (isv6) + if (ill->ill_isv6) mld_statechange(ilm, fmode, flist); else igmp_statechange(ilm, fmode, flist); @@ -464,12 +475,15 @@ ilm_update_add(ilm_t *ilm, ilg_stat_t ilgstat, slist_t *ilg_flist, return (0); } +/* + * Caller must hold ill_mcast_lock + */ static int -ilm_update_del(ilm_t *ilm, boolean_t isv6) +ilm_update_del(ilm_t *ilm) { mcast_record_t fmode; slist_t *flist; - ill_t *ill = isv6 ? ilm->ilm_ill : ilm->ilm_ipif->ipif_ill; + ill_t *ill = ilm->ilm_ill; ip1dbg(("ilm_update_del: still %d left; updating state\n", ilm->ilm_refcnt)); @@ -500,7 +514,7 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } if (!IS_LOOPBACK(ill)) { - if (isv6) + if (ill->ill_isv6) mld_statechange(ilm, fmode, flist); else igmp_statechange(ilm, fmode, flist); @@ -531,240 +545,245 @@ ilm_update_del(ilm_t *ilm, boolean_t isv6) } /* - * INADDR_ANY means all multicast addresses. - * INADDR_ANY is stored as IPv6 unspecified addr. + * Create/update the ilm for the group/ill. Used by other parts of IP to + * do the ILGSTAT_NONE (no ilg), MODE_IS_EXCLUDE, with no slist join. + * Returns with a refhold on the ilm. + * + * The unspecified address means all multicast addresses for in both the + * case of IPv4 and IPv6. + * + * The caller should have already mapped an IPMP under ill to the upper. 
*/ -int -ip_addmulti(ipaddr_t group, ipif_t *ipif, ilg_stat_t ilgstat, - mcast_record_t ilg_fmode, slist_t *ilg_flist) +ilm_t * +ip_addmulti(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + int *errorp) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - int ret; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (!CLASSD(group) && group != INADDR_ANY) - return (EINVAL); - - if (IS_UNDER_IPMP(ill)) - return (EINVAL); - - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); - - ilm = ilm_lookup_ipif(ipif, group); - /* - * Since we are writer, we know the ilm_flags itself cannot - * change at this point, and ilm_lookup_ipif would not have - * returned a DELETED ilm. However, the data path can free - * ilm->ilm_next via ilm_walker_cleanup() so we can safely - * access anything in ilm except ilm_next (for safe access to - * ilm_next we'd have to take the ill_lock). 
- */ - if (ilm != NULL) - return (ilm_update_add(ilm, ilgstat, ilg_flist, B_FALSE)); - - ilm = ilm_add_v6(ipif, &v6group, ilgstat, ilg_fmode, ilg_flist, - ipif->ipif_zoneid); - if (ilm == NULL) - return (ENOMEM); - - if (group == INADDR_ANY) { - /* - * Check how many ipif's have members in this group - - * if more then one we should not tell the driver to join - * this time - */ - if (ilm_numentries_v6(ill, &v6group) > 1) - return (0); - ret = ill_join_allmulti(ill); - if (ret != 0) - ilm_delete(ilm); - return (ret); - } - - if (!IS_LOOPBACK(ill)) - igmp_joingroup(ilm); - - if (ilm_numentries_v6(ill, &v6group) > 1) - return (0); + ilm_t *ilm; - ret = ip_ll_addmulti_v6(ipif, &v6group); - if (ret != 0) - ilm_delete(ilm); - return (ret); + /* Acquire serializer to keep assert in ilm_bld_flists happy */ + mutex_enter(&ill->ill_mcast_serializer); + ilm = ip_addmulti_serial(v6group, ill, zoneid, ILGSTAT_NONE, + MODE_IS_EXCLUDE, NULL, errorp); + mutex_exit(&ill->ill_mcast_serializer); + return (ilm); } /* - * The unspecified address means all multicast addresses. + * Create/update the ilm for the group/ill. If ILGSTAT_CHANGE is not set + * then this returns with a refhold on the ilm. + * + * Internal routine which assumes the caller has already acquired + * ill_multi_serializer. * - * ill identifies the interface to join on. + * The unspecified address means all multicast addresses for in both the + * case of IPv4 and IPv6. * * ilgstat tells us if there's an ilg associated with this join, * and if so, if it's a new ilg or a change to an existing one. * ilg_fmode and ilg_flist give us the current filter state of * the ilg (and will be EXCLUDE {NULL} in the case of no ilg). + * + * The caller should have already mapped an IPMP under ill to the upper. 
*/ -int -ip_addmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, - ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist) +static ilm_t * +ip_addmulti_serial(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, + int *errorp) { - ilm_t *ilm; - int ret; + ilm_t *ilm; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); - if (!IN6_IS_ADDR_MULTICAST(v6group) && - !IN6_IS_ADDR_UNSPECIFIED(v6group)) { - return (EINVAL); + if (ill->ill_isv6) { + if (!IN6_IS_ADDR_MULTICAST(v6group) && + !IN6_IS_ADDR_UNSPECIFIED(v6group)) { + *errorp = EINVAL; + return (NULL); + } + } else { + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + ipaddr_t v4group; + + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); + if (!CLASSD(v4group)) { + *errorp = EINVAL; + return (NULL); + } + } else if (!IN6_IS_ADDR_UNSPECIFIED(v6group)) { + *errorp = EINVAL; + return (NULL); + } } - if (IS_UNDER_IPMP(ill) && !IN6_IS_ADDR_MC_SOLICITEDNODE(v6group)) - return (EINVAL); + if (IS_UNDER_IPMP(ill)) { + *errorp = EINVAL; + return (NULL); + } + + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + /* + * We do the equivalent of a lookup by checking after we get the lock + * This is needed since the ill could have been condemned after + * we looked it up, and we need to check condemned after we hold + * ill_mcast_lock to synchronize with the unplumb code. 
+ */ + if (ill->ill_state_flags & ILL_CONDEMNED) { + rw_exit(&ill->ill_mcast_lock); + *errorp = ENXIO; + return (NULL); + } + ilm = ip_addmulti_impl(v6group, ill, zoneid, ilgstat, ilg_fmode, + ilg_flist, errorp); + rw_exit(&ill->ill_mcast_lock); + + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); + return (ilm); +} + +static ilm_t * +ip_addmulti_impl(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, + ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, + int *errorp) +{ + ilm_t *ilm; + int ret = 0; + + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); + *errorp = 0; /* * An ilm is uniquely identified by the tuple of (group, ill) where * `group' is the multicast group address, and `ill' is the interface * on which it is currently joined. */ - ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); - if (ilm != NULL) - return (ilm_update_add(ilm, ilgstat, ilg_flist, B_TRUE)); - ilm = ilm_add_v6(ill->ill_ipif, v6group, ilgstat, ilg_fmode, - ilg_flist, zoneid); - if (ilm == NULL) - return (ENOMEM); + ilm = ilm_lookup(ill, v6group, zoneid); + if (ilm != NULL) { + /* ilm_update_add bumps ilm_refcnt unless ILGSTAT_CHANGE */ + ret = ilm_update_add(ilm, ilgstat, ilg_flist); + if (ret == 0) + return (ilm); - if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { - /* - * Check how many ipif's that have members in this group - - * if more then one we should not tell the driver to join - * this time - */ - if (ilm_numentries_v6(ill, v6group) > 1) - return (0); - ret = ill_join_allmulti(ill); - if (ret != 0) - ilm_delete(ilm); - return (ret); + *errorp = ret; + return (NULL); } - if (!IS_LOOPBACK(ill)) - mld_joingroup(ilm); - /* - * If we have more then one we should not tell the driver - * to join this time. + * The callers checks on the ilg and the ilg+ilm consistency under + * ill_mcast_serializer ensures that we can not have ILGSTAT_CHANGE + * and no ilm. 
*/ - if (ilm_numentries_v6(ill, v6group) > 1) - return (0); - - ret = ip_ll_addmulti_v6(ill->ill_ipif, v6group); - if (ret != 0) - ilm_delete(ilm); - return (ret); -} + ASSERT(ilgstat != ILGSTAT_CHANGE); + ilm = ilm_add(ill, v6group, ilgstat, ilg_fmode, ilg_flist, zoneid); + if (ilm == NULL) { + *errorp = ENOMEM; + return (NULL); + } -/* - * Mapping the given IP multicast address to the L2 multicast mac address. - */ -static void -ill_multicast_mapping(ill_t *ill, ipaddr_t ip_addr, uint8_t *hw_addr, - uint32_t hw_addrlen) -{ - dl_unitdata_req_t *dlur; - ipaddr_t proto_extract_mask; - uint8_t *from, *bcast_addr; - uint32_t hw_extract_start; - int len; + if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { + /* + * If we have more then one we should not tell the driver + * to join this time. + */ + if (ilm_numentries(ill, v6group) == 1) { + ret = ill_join_allmulti(ill); + } + } else { + if (!IS_LOOPBACK(ill)) { + if (ill->ill_isv6) + mld_joingroup(ilm); + else + igmp_joingroup(ilm); + } - ASSERT(IN_CLASSD(ntohl(ip_addr))); - ASSERT(hw_addrlen == ill->ill_phys_addr_length); - ASSERT((ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) == 0); - ASSERT((ill->ill_flags & ILLF_MULTICAST) != 0); + /* + * If we have more then one we should not tell the driver + * to join this time. + */ + if (ilm_numentries(ill, v6group) == 1) { + ret = ip_ll_multireq(ill, v6group, DL_ENABMULTI_REQ); + } + } + if (ret != 0) { + if (ret == ENETDOWN) { + char buf[INET6_ADDRSTRLEN]; - /* - * Find the physical broadcast address. 
- */ - dlur = (dl_unitdata_req_t *)ill->ill_bcast_mp->b_rptr; - bcast_addr = (uint8_t *)dlur + dlur->dl_dest_addr_offset; - if (ill->ill_sap_length > 0) - bcast_addr += ill->ill_sap_length; - - VERIFY(MEDIA_V4MINFO(ill->ill_media, hw_addrlen, bcast_addr, - hw_addr, &hw_extract_start, &proto_extract_mask)); - - len = MIN((int)hw_addrlen - hw_extract_start, IP_ADDR_LEN); - ip_addr &= proto_extract_mask; - from = (uint8_t *)&ip_addr; - while (len-- > 0) - hw_addr[hw_extract_start + len] |= from[len]; + ip0dbg(("ip_addmulti: ENETDOWN for %s on %s", + inet_ntop(AF_INET6, &ilm->ilm_v6addr, + buf, sizeof (buf)), ill->ill_name)); + } + ilm_delete(ilm); + *errorp = ret; + return (NULL); + } else { + return (ilm); + } } /* - * Send a multicast request to the driver for enabling multicast reception - * for v6groupp address. The caller has already checked whether it is - * appropriate to send one or not. + * Send a multicast request to the driver for enabling or disabling + * multicast reception for v6groupp address. The caller has already + * checked whether it is appropriate to send one or not. + * + * For IPMP we switch to the cast_ill since it has the right hardware + * information. */ -int -ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) +static int +ip_ll_send_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim) { mblk_t *mp; uint32_t addrlen, addroff; - char group_buf[INET6_ADDRSTRLEN]; - - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * If we're on the IPMP ill, use the nominated multicast interface to - * send and receive DLPI messages, if one exists. (If none exists, - * there are no usable interfaces and thus nothing to do.) - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - /* - * Create a DL_ENABMULTI_REQ. 
- */ - mp = ill_create_dl(ill, DL_ENABMULTI_REQ, sizeof (dl_enabmulti_req_t), - &addrlen, &addroff); - if (!mp) - return (ENOMEM); - - if (IN6_IS_ADDR_V4MAPPED(v6groupp)) { - ipaddr_t v4group; + ill_t *release_ill = NULL; + int err = 0; - IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); - ill_multicast_mapping(ill, v4group, - mp->b_rptr + addroff, addrlen); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ip_ll_send_multireq: no cast_ill for %s %d\n", + ill->ill_name, ill->ill_isv6)); + return (0); + } + ill = release_ill; + } + /* Create a DL_ENABMULTI_REQ or DL_DISABMULTI_REQ message. */ + mp = ill_create_dl(ill, prim, &addrlen, &addroff); + if (mp == NULL) { + err = ENOMEM; + goto done; + } - ip1dbg(("ip_ll_send_enabmulti_req: IPv4 %s on %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); + mp = ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp); + if (mp == NULL) { + ip0dbg(("null from ndp_mcastreq(ill %s)\n", ill->ill_name)); + err = ENOMEM; + goto done; + } + switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { + case DL_ENABMULTI_REQ: + mutex_enter(&ill->ill_lock); /* Track the state if this is the first enabmulti */ if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) ill->ill_dlpi_multicast_state = IDS_INPROGRESS; - ill_dlpi_send(ill, mp); - } else { - ip1dbg(("ip_ll_send_enabmulti_req: IPv6 ndp_mcastreq %s on" - " %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); - return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp)); + mutex_exit(&ill->ill_lock); + break; } - return (0); + ill_dlpi_queue(ill, mp); +done: + if (release_ill != NULL) + ill_refrele(release_ill); + return (err); } /* @@ -772,132 +791,71 
@@ ip_ll_send_enabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) * membership for v6group if appropriate. */ static int -ip_ll_addmulti_v6(ipif_t *ipif, const in6_addr_t *v6groupp) +ip_ll_multireq(ill_t *ill, const in6_addr_t *v6groupp, t_uscalar_t prim) { - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); - if (ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - ip1dbg(("ip_ll_addmulti_v6: not resolver\n")); + ill->ill_ipif->ipif_flags & IPIF_POINTOPOINT) { + ip1dbg(("ip_ll_multireq: not resolver\n")); return (0); /* Must be IRE_IF_NORESOLVER */ } if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ip_ll_addmulti_v6: MULTI_BCAST\n")); - return (0); - } - if (!ill->ill_dl_up) { - /* - * Nobody there. All multicast addresses will be re-joined - * when we get the DL_BIND_ACK bringing the interface up. - */ - ip1dbg(("ip_ll_addmulti_v6: nobody up\n")); + ip1dbg(("ip_ll_multireq: MULTI_BCAST\n")); return (0); } - return (ip_ll_send_enabmulti_req(ill, v6groupp)); + return (ip_ll_send_multireq(ill, v6groupp, prim)); } /* - * INADDR_ANY means all multicast addresses. - * INADDR_ANY is stored as the IPv6 unspecified addr. + * Delete the ilm. Used by other parts of IP for the case of no_ilg/leaving + * being true. */ int -ip_delmulti(ipaddr_t group, ipif_t *ipif, boolean_t no_ilg, boolean_t leaving) +ip_delmulti(ilm_t *ilm) { - ill_t *ill = ipif->ipif_ill; - ilm_t *ilm; - in6_addr_t v6group; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (!CLASSD(group) && group != INADDR_ANY) - return (EINVAL); - - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + ill_t *ill = ilm->ilm_ill; + int error; - /* - * Look for a match on the ipif. - * (IP_DROP_MEMBERSHIP specifies an ipif using an IP address). 
- */ - ilm = ilm_lookup_ipif(ipif, group); - if (ilm == NULL) - return (ENOENT); - - /* Update counters */ - if (no_ilg) - ilm->ilm_no_ilg_cnt--; - - if (leaving) - ilm->ilm_refcnt--; - - if (ilm->ilm_refcnt > 0) - return (ilm_update_del(ilm, B_FALSE)); - - if (group == INADDR_ANY) { - ilm_delete(ilm); - /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. - */ - if (ilm_numentries_v6(ill, &v6group) != 0) - return (0); - - /* If we never joined, then don't leave. */ - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); - - return (0); - } - - if (!IS_LOOPBACK(ill)) - igmp_leavegroup(ilm); - - ilm_delete(ilm); - /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. - */ - if (ilm_numentries_v6(ill, &v6group) != 0) - return (0); - return (ip_ll_delmulti_v6(ipif, &v6group)); + /* Acquire serializer to keep assert in ilm_bld_flists happy */ + mutex_enter(&ill->ill_mcast_serializer); + error = ip_delmulti_serial(ilm, B_TRUE, B_TRUE); + mutex_exit(&ill->ill_mcast_serializer); + return (error); } + /* - * The unspecified address means all multicast addresses. + * Delete the ilm. + * Assumes ill_multi_serializer is held by the caller. 
*/ -int -ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, - boolean_t no_ilg, boolean_t leaving) +static int +ip_delmulti_serial(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving) { - ipif_t *ipif; - ilm_t *ilm; + ill_t *ill = ilm->ilm_ill; + int ret; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); + ASSERT(!(IS_UNDER_IPMP(ill))); - if (!IN6_IS_ADDR_MULTICAST(v6group) && - !IN6_IS_ADDR_UNSPECIFIED(v6group)) - return (EINVAL); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); + ret = ip_delmulti_impl(ilm, no_ilg, leaving); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); - /* - * Look for a match on the ill. - */ - ilm = ilm_lookup_ill_v6(ill, v6group, B_TRUE, zoneid); - if (ilm == NULL) - return (ENOENT); + return (ret); +} - ASSERT(ilm->ilm_ill == ill); +static int +ip_delmulti_impl(ilm_t *ilm, boolean_t no_ilg, boolean_t leaving) +{ + ill_t *ill = ilm->ilm_ill; + int error; + in6_addr_t v6group; - ipif = ill->ill_ipif; + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); /* Update counters */ if (no_ilg) @@ -907,150 +865,90 @@ ip_delmulti_v6(const in6_addr_t *v6group, ill_t *ill, zoneid_t zoneid, ilm->ilm_refcnt--; if (ilm->ilm_refcnt > 0) - return (ilm_update_del(ilm, B_TRUE)); + return (ilm_update_del(ilm)); - if (IN6_IS_ADDR_UNSPECIFIED(v6group)) { + v6group = ilm->ilm_v6addr; + + if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr)) { ilm_delete(ilm); /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. + * If we have some left then one we should not tell the driver + * to leave. */ - if (ilm_numentries_v6(ill, v6group) != 0) + if (ilm_numentries(ill, &v6group) != 0) return (0); - /* If we never joined, then don't leave. 
*/ - if (ill->ill_join_allmulti) - ill_leave_allmulti(ill); + ill_leave_allmulti(ill); return (0); } - if (!IS_LOOPBACK(ill)) - mld_leavegroup(ilm); + if (!IS_LOOPBACK(ill)) { + if (ill->ill_isv6) + mld_leavegroup(ilm); + else + igmp_leavegroup(ilm); + } ilm_delete(ilm); /* - * Check how many ipif's that have members in this group - - * if there are still some left then don't tell the driver - * to drop it. + * If we have some left then one we should not tell the driver + * to leave. */ - if (ilm_numentries_v6(ill, v6group) != 0) + if (ilm_numentries(ill, &v6group) != 0) return (0); - return (ip_ll_delmulti_v6(ipif, v6group)); -} -/* - * Send a multicast request to the driver for disabling multicast reception - * for v6groupp address. The caller has already checked whether it is - * appropriate to send one or not. - */ -int -ip_ll_send_disabmulti_req(ill_t *ill, const in6_addr_t *v6groupp) -{ - mblk_t *mp; - char group_buf[INET6_ADDRSTRLEN]; - uint32_t addrlen, addroff; + error = ip_ll_multireq(ill, &v6group, DL_DISABMULTI_REQ); + /* We ignore the case when ill_dl_up is not set */ + if (error == ENETDOWN) { + char buf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - /* - * Create a DL_DISABMULTI_REQ. 
- */ - mp = ill_create_dl(ill, DL_DISABMULTI_REQ, - sizeof (dl_disabmulti_req_t), &addrlen, &addroff); - if (!mp) - return (ENOMEM); - - if (IN6_IS_ADDR_V4MAPPED(v6groupp)) { - ipaddr_t v4group; - - IN6_V4MAPPED_TO_IPADDR(v6groupp, v4group); - - ill_multicast_mapping(ill, v4group, - mp->b_rptr + addroff, addrlen); - - ip1dbg(("ip_ll_send_disabmulti_req: IPv4 %s on %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), + ip0dbg(("ip_delmulti: ENETDOWN for %s on %s", + inet_ntop(AF_INET6, &v6group, buf, sizeof (buf)), ill->ill_name)); - ill_dlpi_send(ill, mp); - } else { - ip1dbg(("ip_ll_send_disabmulti_req: IPv6 ndp_mcastreq %s on" - " %s\n", - inet_ntop(AF_INET6, v6groupp, group_buf, - sizeof (group_buf)), - ill->ill_name)); - return (ndp_mcastreq(ill, v6groupp, addrlen, addroff, mp)); - } - return (0); -} - -/* - * Send a multicast request to the driver for disabling multicast - * membership for v6group if appropriate. - */ -static int -ip_ll_delmulti_v6(ipif_t *ipif, const in6_addr_t *v6group) -{ - ill_t *ill = ipif->ipif_ill; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - if (ill->ill_net_type != IRE_IF_RESOLVER || - ipif->ipif_flags & IPIF_POINTOPOINT) { - return (0); /* Must be IRE_IF_NORESOLVER */ - } - if (ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST) { - ip1dbg(("ip_ll_delmulti_v6: MULTI_BCAST\n")); - return (0); } - if (!ill->ill_dl_up) { - /* - * Nobody there. All multicast addresses will be re-joined - * when we get the DL_BIND_ACK bringing the interface up. - */ - ip1dbg(("ip_ll_delmulti_v6: nobody up\n")); - return (0); - } - return (ip_ll_send_disabmulti_req(ill, v6group)); + return (error); } /* - * Make the driver pass up all multicast packets. NOTE: to keep callers - * IPMP-unaware, if an IPMP ill is passed in, the ill_join_allmulti flag is - * set on it (rather than the cast ill). + * Make the driver pass up all multicast packets. 
*/ int ill_join_allmulti(ill_t *ill) { - mblk_t *promiscon_mp, *promiscoff_mp; + mblk_t *promiscon_mp, *promiscoff_mp = NULL; uint32_t addrlen, addroff; - ill_t *join_ill = ill; + ill_t *release_ill = NULL; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (!ill->ill_dl_up) { /* * Nobody there. All multicast addresses will be re-joined * when we get the DL_BIND_ACK bringing the interface up. */ - return (0); + return (ENETDOWN); } - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return (0); - - ASSERT(!join_ill->ill_join_allmulti); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ill_join_allmulti: no cast_ill for %s %d\n", + ill->ill_name, ill->ill_isv6)); + return (0); + } + ill = release_ill; + if (!ill->ill_dl_up) { + ill_refrele(ill); + return (ENETDOWN); + } + } /* * Create a DL_PROMISCON_REQ message and send it directly to the DLPI @@ -1062,19 +960,24 @@ ill_join_allmulti(ill_t *ill) if ((ill->ill_net_type == IRE_IF_RESOLVER) && !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { promiscon_mp = ill_create_dl(ill, DL_PROMISCON_REQ, - sizeof (dl_promiscon_req_t), &addrlen, &addroff); - promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ, - sizeof (dl_promiscoff_req_t), &addrlen, &addroff); - if (promiscon_mp == NULL || promiscoff_mp == NULL) { + &addrlen, &addroff); + if (ill->ill_promiscoff_mp == NULL) + promiscoff_mp = ill_create_dl(ill, DL_PROMISCOFF_REQ, + &addrlen, &addroff); + if (promiscon_mp == NULL || + (ill->ill_promiscoff_mp == NULL && promiscoff_mp == NULL)) { freemsg(promiscon_mp); freemsg(promiscoff_mp); + if (release_ill != NULL) + ill_refrele(release_ill); return (ENOMEM); } - ill->ill_promiscoff_mp = 
promiscoff_mp; - ill_dlpi_send(ill, promiscon_mp); + if (ill->ill_promiscoff_mp == NULL) + ill->ill_promiscoff_mp = promiscoff_mp; + ill_dlpi_queue(ill, promiscon_mp); } - - join_ill->ill_join_allmulti = B_TRUE; + if (release_ill != NULL) + ill_refrele(release_ill); return (0); } @@ -1085,9 +988,9 @@ void ill_leave_allmulti(ill_t *ill) { mblk_t *promiscoff_mp; - ill_t *leave_ill = ill; + ill_t *release_ill = NULL; - ASSERT(IAM_WRITER_ILL(ill)); + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); if (!ill->ill_dl_up) { /* @@ -1097,105 +1000,130 @@ ill_leave_allmulti(ill_t *ill) return; } - /* - * See comment in ip_ll_send_enabmulti_req(). - */ - if (IS_IPMP(ill) && (ill = ipmp_illgrp_cast_ill(ill->ill_grp)) == NULL) - return; - - ASSERT(leave_ill->ill_join_allmulti); + if (IS_IPMP(ill)) { + /* On the upper IPMP ill. */ + release_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + if (release_ill == NULL) { + /* + * Avoid sending it down to the ipmpstub. + * We will be called again once the members of the + * group are in place + */ + ip1dbg(("ill_leave_allmulti: no cast_ill on %s %d\n", + ill->ill_name, ill->ill_isv6)); + return; + } + ill = release_ill; + if (!ill->ill_dl_up) + goto done; + } /* - * Create a DL_PROMISCOFF_REQ message and send it directly to - * the DLPI provider. We don't need to do this for certain - * media types for which we never need to turn promiscuous - * mode on. + * In the case of IPMP and ill_dl_up not being set when we joined + * we didn't allocate a promiscoff_mp. In that case we have + * nothing to do when we leave. 
+ * Ditto for PHYI_MULTI_BCAST */ - if ((ill->ill_net_type == IRE_IF_RESOLVER) && - !(ill->ill_phyint->phyint_flags & PHYI_MULTI_BCAST)) { - promiscoff_mp = ill->ill_promiscoff_mp; - ASSERT(promiscoff_mp != NULL); + promiscoff_mp = ill->ill_promiscoff_mp; + if (promiscoff_mp != NULL) { ill->ill_promiscoff_mp = NULL; - ill_dlpi_send(ill, promiscoff_mp); - } - - leave_ill->ill_join_allmulti = B_FALSE; -} - -static ill_t * -ipsq_enter_byifindex(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) -{ - ill_t *ill; - boolean_t in_ipsq; - - ill = ill_lookup_on_ifindex(ifindex, isv6, NULL, NULL, NULL, NULL, - ipst); - if (ill != NULL) { - if (!ill_waiter_inc(ill)) { - ill_refrele(ill); - return (NULL); - } - ill_refrele(ill); - in_ipsq = ipsq_enter(ill, B_FALSE, NEW_OP); - ill_waiter_dcr(ill); - if (!in_ipsq) - ill = NULL; + ill_dlpi_queue(ill, promiscoff_mp); } - return (ill); +done: + if (release_ill != NULL) + ill_refrele(release_ill); } int ip_join_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; - int ret = 0; + int ret; + ilm_t *ilm; - if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) return (ENODEV); /* - * The ip_addmulti*() functions won't allow IPMP underlying interfaces + * The ip_addmulti() function doesn't allow IPMP underlying interfaces * to join allmulti since only the nominated underlying interface in * the group should receive multicast. We silently succeed to avoid * having to teach IPobs (currently the only caller of this routine) * to ignore failures in this case. 
*/ - if (IS_UNDER_IPMP(ill)) - goto out; + if (IS_UNDER_IPMP(ill)) { + ill_refrele(ill); + return (0); + } + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt > 0) { + /* Already joined */ + ASSERT(ill->ill_ipallmulti_ilm != NULL); + ill->ill_ipallmulti_cnt++; + mutex_exit(&ill->ill_lock); + goto done; + } + mutex_exit(&ill->ill_lock); - if (isv6) { - ret = ip_addmulti_v6(&ipv6_all_zeros, ill, ill->ill_zoneid, - ILGSTAT_NONE, MODE_IS_EXCLUDE, NULL); - } else { - ret = ip_addmulti(INADDR_ANY, ill->ill_ipif, ILGSTAT_NONE, - MODE_IS_EXCLUDE, NULL); + ilm = ip_addmulti(&ipv6_all_zeros, ill, ill->ill_zoneid, &ret); + if (ilm == NULL) { + ASSERT(ret != 0); + ill_refrele(ill); + return (ret); } + + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt > 0) { + /* Another thread added it concurrently */ + (void) ip_delmulti(ilm); + mutex_exit(&ill->ill_lock); + goto done; + } + ASSERT(ill->ill_ipallmulti_ilm == NULL); + ill->ill_ipallmulti_ilm = ilm; ill->ill_ipallmulti_cnt++; -out: - ipsq_exit(ill->ill_phyint->phyint_ipsq); - return (ret); + mutex_exit(&ill->ill_lock); +done: + ill_refrele(ill); + return (0); } - int ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) { ill_t *ill; + ilm_t *ilm; - if ((ill = ipsq_enter_byifindex(ifindex, isv6, ipst)) == NULL) + ill = ill_lookup_on_ifindex(ifindex, isv6, ipst); + if (ill == NULL) return (ENODEV); - if (ill->ill_ipallmulti_cnt > 0) { - if (isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, - B_TRUE); - } - ill->ill_ipallmulti_cnt--; + if (IS_UNDER_IPMP(ill)) { + ill_refrele(ill); + return (0); + } + + mutex_enter(&ill->ill_lock); + if (ill->ill_ipallmulti_cnt == 0) { + /* ip_purge_allmulti could have removed them all */ + mutex_exit(&ill->ill_lock); + goto done; + } + ill->ill_ipallmulti_cnt--; + if (ill->ill_ipallmulti_cnt == 0) { + /* Last one */ + ilm = ill->ill_ipallmulti_ilm; + 
ill->ill_ipallmulti_ilm = NULL; + } else { + ilm = NULL; } - ipsq_exit(ill->ill_phyint->phyint_ipsq); + mutex_exit(&ill->ill_lock); + if (ilm != NULL) + (void) ip_delmulti(ilm); + +done: + ill_refrele(ill); return (0); } @@ -1206,108 +1134,34 @@ ip_leave_allmulti(uint_t ifindex, boolean_t isv6, ip_stack_t *ipst) void ip_purge_allmulti(ill_t *ill) { - ASSERT(IAM_WRITER_ILL(ill)); - - for (; ill->ill_ipallmulti_cnt > 0; ill->ill_ipallmulti_cnt--) { - if (ill->ill_isv6) { - (void) ip_delmulti_v6(&ipv6_all_zeros, ill, - ill->ill_zoneid, B_TRUE, B_TRUE); - } else { - (void) ip_delmulti(INADDR_ANY, ill->ill_ipif, B_TRUE, - B_TRUE); - } - } -} - -/* - * Copy mp_orig and pass it in as a local message. - */ -void -ip_multicast_loopback(queue_t *q, ill_t *ill, mblk_t *mp_orig, int fanout_flags, - zoneid_t zoneid) -{ - mblk_t *mp; - mblk_t *ipsec_mp; - ipha_t *iph; - ip_stack_t *ipst = ill->ill_ipst; - - if (DB_TYPE(mp_orig) == M_DATA && - ((ipha_t *)mp_orig->b_rptr)->ipha_protocol == IPPROTO_UDP) { - uint_t hdrsz; - - hdrsz = IPH_HDR_LENGTH((ipha_t *)mp_orig->b_rptr) + - sizeof (udpha_t); - ASSERT(MBLKL(mp_orig) >= hdrsz); - - if (((mp = allocb(hdrsz, BPRI_MED)) != NULL) && - (mp_orig = dupmsg(mp_orig)) != NULL) { - cred_t *cr; - - bcopy(mp_orig->b_rptr, mp->b_rptr, hdrsz); - mp->b_wptr += hdrsz; - mp->b_cont = mp_orig; - mp_orig->b_rptr += hdrsz; - if (is_system_labeled() && - (cr = msg_getcred(mp_orig, NULL)) != NULL) - mblk_setcred(mp, cr, NOPID); - if (MBLKL(mp_orig) == 0) { - mp->b_cont = mp_orig->b_cont; - mp_orig->b_cont = NULL; - freeb(mp_orig); - } - } else if (mp != NULL) { - freeb(mp); - mp = NULL; - } - } else { - mp = ip_copymsg(mp_orig); /* No refcnt on ipsec_out netstack */ - } - - if (mp == NULL) - return; - if (DB_TYPE(mp) == M_CTL) { - ipsec_mp = mp; - mp = mp->b_cont; - } else { - ipsec_mp = mp; - } - - iph = (ipha_t *)mp->b_rptr; - - /* - * DTrace this as ip:::send. A blocked packet will fire the send - * probe, but not the receive probe. 
- */ - DTRACE_IP7(send, mblk_t *, ipsec_mp, conn_t *, NULL, void_ip_t *, iph, - __dtrace_ipsr_ill_t *, ill, ipha_t *, iph, ip6_t *, NULL, int, 1); - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, ill, - ipha_t *, iph, mblk_t *, ipsec_mp); + ilm_t *ilm; - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, ill, iph, ipsec_mp, mp, HPE_MULTICAST, ipst); + ASSERT(IAM_WRITER_ILL(ill)); - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, ipsec_mp); + mutex_enter(&ill->ill_lock); + ilm = ill->ill_ipallmulti_ilm; + ill->ill_ipallmulti_ilm = NULL; + ill->ill_ipallmulti_cnt = 0; + mutex_exit(&ill->ill_lock); - if (ipsec_mp != NULL) - ip_wput_local(q, ill, iph, ipsec_mp, NULL, - fanout_flags, zoneid); + if (ilm != NULL) + (void) ip_delmulti(ilm); } /* - * Create a DLPI message; for DL_{ENAB,DISAB}MULTI_REQ, room is left for - * the hardware address. + * Create a dlpi message with room for phys+sap. Later + * we will strip the sap for those primitives which + * only need a physical address. 
*/ static mblk_t * -ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, +ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t *addr_lenp, uint32_t *addr_offp) { mblk_t *mp; uint32_t hw_addr_length; char *cp; uint32_t offset; + uint32_t length; uint32_t size; *addr_lenp = *addr_offp = 0; @@ -1318,14 +1172,18 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, return (NULL); } - size = length; switch (dl_primitive) { case DL_ENABMULTI_REQ: + length = sizeof (dl_enabmulti_req_t); + size = length + hw_addr_length; + break; case DL_DISABMULTI_REQ: - size += hw_addr_length; + length = sizeof (dl_disabmulti_req_t); + size = length + hw_addr_length; break; case DL_PROMISCON_REQ: case DL_PROMISCOFF_REQ: + size = length = sizeof (dl_promiscon_req_t); break; default: return (NULL); @@ -1373,33 +1231,29 @@ ill_create_dl(ill_t *ill, uint32_t dl_primitive, uint32_t length, } /* - * Rejoin any groups which have been explicitly joined by the application (we - * left all explicitly joined groups as part of ill_leave_multicast() prior to - * bringing the interface down). Note that because groups can be joined and - * left while an interface is down, this may not be the same set of groups - * that we left in ill_leave_multicast(). + * Rejoin any groups for which we have ilms. + * + * This is only needed for IPMP when the cast_ill changes since that + * change is invisible to the ilm. Other interface changes are handled + * by conn_update_ill. */ void ill_recover_multicast(ill_t *ill) { ilm_t *ilm; - ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - ill->ill_need_recover_multicast = 0; - ill_ilm_walker_hold(ill); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* - * Check how many ipif's that have members in this group - - * if more then one we make sure that this entry is first - * in the list. 
+ * If we have more then one ilm for the group (e.g., with + * different zoneid) then we should not tell the driver + * to join unless this is the first ilm for the group. */ - if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, - ALL_ZONES) != ilm) { + if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 && + ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) { continue; } @@ -1414,38 +1268,42 @@ ill_recover_multicast(ill_t *ill) else igmp_joingroup(ilm); - (void) ip_ll_addmulti_v6(ipif, &ilm->ilm_v6addr); + (void) ip_ll_multireq(ill, &ilm->ilm_v6addr, + DL_ENABMULTI_REQ); } } - ill_ilm_walker_rele(ill); - + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); } /* * The opposite of ill_recover_multicast() -- leaves all multicast groups * that were explicitly joined. + * + * This is only needed for IPMP when the cast_ill changes since that + * change is invisible to the ilm. Other interface changes are handled + * by conn_update_ill. */ void ill_leave_multicast(ill_t *ill) { ilm_t *ilm; - ipif_t *ipif = ill->ill_ipif; char addrbuf[INET6_ADDRSTRLEN]; - ASSERT(IAM_WRITER_ILL(ill)); - ill->ill_need_recover_multicast = 1; - ill_ilm_walker_hold(ill); + rw_enter(&ill->ill_mcast_lock, RW_WRITER); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* - * Check how many ipif's that have members in this group - - * if more then one we make sure that this entry is first - * in the list. + * If we have more then one ilm for the group (e.g., with + * different zoneid) then we should not tell the driver + * to leave unless this is the first ilm for the group. 
*/ - if (ilm_numentries_v6(ill, &ilm->ilm_v6addr) > 1 && - ilm_lookup_ill_v6(ill, &ilm->ilm_v6addr, B_TRUE, - ALL_ZONES) != ilm) { + if (ilm_numentries(ill, &ilm->ilm_v6addr) > 1 && + ilm_lookup(ill, &ilm->ilm_v6addr, ALL_ZONES) != ilm) { continue; } @@ -1460,126 +1318,186 @@ ill_leave_multicast(ill_t *ill) else igmp_leavegroup(ilm); - (void) ip_ll_delmulti_v6(ipif, &ilm->ilm_v6addr); + (void) ip_ll_multireq(ill, &ilm->ilm_v6addr, + DL_DISABMULTI_REQ); } } - ill_ilm_walker_rele(ill); + rw_exit(&ill->ill_mcast_lock); + /* Send any deferred/queued DLPI or IP packets */ + ill_mcast_send_queued(ill); + ill_dlpi_send_queued(ill); + ill_mcast_timer_start(ill->ill_ipst); } -/* Find an ilm for matching the ill */ -ilm_t * -ilm_lookup_ill(ill_t *ill, ipaddr_t group, zoneid_t zoneid) +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid. + */ +boolean_t +ill_hasmembers_v6(ill_t *ill, const in6_addr_t *v6group) +{ + ilm_t *ilm; + + rw_enter(&ill->ill_mcast_lock, RW_READER); + ilm = ilm_lookup(ill, v6group, ALL_ZONES); + rw_exit(&ill->ill_mcast_lock); + return (ilm != NULL); +} + +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid. + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. + */ +boolean_t +ill_hasmembers_v4(ill_t *ill, ipaddr_t group) { in6_addr_t v6group; - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. - */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + return (ill_hasmembers_v6(ill, &v6group)); +} + +/* + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid except skipzone. 
+ */ +boolean_t +ill_hasmembers_otherzones_v6(ill_t *ill, const in6_addr_t *v6group, + zoneid_t skipzone) +{ + ilm_t *ilm; - return (ilm_lookup_ill_v6(ill, &v6group, B_TRUE, zoneid)); + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && + ilm->ilm_zoneid != skipzone) { + rw_exit(&ill->ill_mcast_lock); + return (B_TRUE); + } + } + rw_exit(&ill->ill_mcast_lock); + return (B_FALSE); } /* - * Find an ilm for address `v6group' on `ill' and zone `zoneid' (which may be - * ALL_ZONES). In general, if `ill' is in an IPMP group, we will match - * against any ill in the group. However, if `restrict_solicited' is set, - * then specifically for IPv6 solicited-node multicast, the match will be - * restricted to the specified `ill'. + * Interface used by IP input/output. + * Returns true if there is a member on the ill for any zoneid except skipzone. + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilm_t * -ilm_lookup_ill_v6(ill_t *ill, const in6_addr_t *v6group, - boolean_t restrict_solicited, zoneid_t zoneid) +boolean_t +ill_hasmembers_otherzones_v4(ill_t *ill, ipaddr_t group, zoneid_t skipzone) { - ilm_t *ilm; - ilm_walker_t ilw; - boolean_t restrict_ill = B_FALSE; + in6_addr_t v6group; - /* - * In general, underlying interfaces cannot have multicast memberships - * and thus lookups always match across the illgrp. However, we must - * allow IPv6 solicited-node multicast memberships on underlying - * interfaces, and thus an IPMP meta-interface and one of its - * underlying ills may have the same solicited-node multicast address. - * In that case, we need to restrict the lookup to the requested ill. 
- * However, we may receive packets on an underlying interface that - * are for the corresponding IPMP interface's solicited-node multicast - * address, and thus in that case we need to match across the group -- - * hence the unfortunate `restrict_solicited' argument. - */ - if (IN6_IS_ADDR_MC_SOLICITEDNODE(v6group) && restrict_solicited) - restrict_ill = (IS_IPMP(ill) || IS_UNDER_IPMP(ill)); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + return (ill_hasmembers_otherzones_v6(ill, &v6group, skipzone)); +} - ilm = ilm_walker_start(&ilw, ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) - continue; - if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) - continue; - if (!restrict_ill || ill == (ill->ill_isv6 ? - ilm->ilm_ill : ilm->ilm_ipif->ipif_ill)) { - break; +/* + * Interface used by IP input. + * Returns the next numerically larger zoneid that has a member. If none exist + * then returns -1 (ALL_ZONES). + * The normal usage is for the caller to start with a -1 zoneid (ALL_ZONES) + * to find the first zoneid which has a member, and then pass that in for + * subsequent calls until ALL_ZONES is returned. + * + * The implementation of ill_hasmembers_nextzone() assumes the ilms + * are sorted by zoneid for efficiency. + */ +zoneid_t +ill_hasmembers_nextzone_v6(ill_t *ill, const in6_addr_t *v6group, + zoneid_t zoneid) +{ + ilm_t *ilm; + + rw_enter(&ill->ill_mcast_lock, RW_READER); + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group) && + ilm->ilm_zoneid > zoneid) { + zoneid = ilm->ilm_zoneid; + rw_exit(&ill->ill_mcast_lock); + return (zoneid); } } - ilm_walker_finish(&ilw); - return (ilm); + rw_exit(&ill->ill_mcast_lock); + return (ALL_ZONES); } /* - * Find an ilm for the ipif. Only needed for IPv4 which does - * ipif specific socket options. + * Interface used by IP input. + * Returns the next numerically larger zoneid that has a member. 
If none exist + * then returns -1 (ALL_ZONES). + * + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilm_t * -ilm_lookup_ipif(ipif_t *ipif, ipaddr_t group) +zoneid_t +ill_hasmembers_nextzone_v4(ill_t *ill, ipaddr_t group, zoneid_t zoneid) { - ilm_t *ilm; - ilm_walker_t ilw; + in6_addr_t v6group; - ilm = ilm_walker_start(&ilw, ipif->ipif_ill); - for (; ilm != NULL; ilm = ilm_walker_step(&ilw, ilm)) { - if (ilm->ilm_ipif == ipif && ilm->ilm_addr == group) - break; + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + + return (ill_hasmembers_nextzone_v6(ill, &v6group, zoneid)); +} + +/* + * Find an ilm matching the ill, group, and zoneid. + */ +static ilm_t * +ilm_lookup(ill_t *ill, const in6_addr_t *v6group, zoneid_t zoneid) +{ + ilm_t *ilm; + + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); + + for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { + if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) + continue; + if (zoneid != ALL_ZONES && zoneid != ilm->ilm_zoneid) + continue; + + ASSERT(ilm->ilm_ill == ill); + return (ilm); } - ilm_walker_finish(&ilw); - return (ilm); + return (NULL); } /* * How many members on this ill? + * Since each shared-IP zone has a separate ilm for the same group/ill + * we can have several. 
*/ -int -ilm_numentries_v6(ill_t *ill, const in6_addr_t *v6group) +static int +ilm_numentries(ill_t *ill, const in6_addr_t *v6group) { ilm_t *ilm; int i = 0; - mutex_enter(&ill->ill_lock); + ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock)); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { - if (ilm->ilm_flags & ILM_DELETED) - continue; if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group)) { i++; } } - mutex_exit(&ill->ill_lock); return (i); } /* Caller guarantees that the group is not already on the list */ static ilm_t * -ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, +ilm_add(ill_t *ill, const in6_addr_t *v6group, ilg_stat_t ilgstat, mcast_record_t ilg_fmode, slist_t *ilg_flist, zoneid_t zoneid) { - ill_t *ill = ipif->ipif_ill; ilm_t *ilm; ilm_t *ilm_cur; ilm_t **ilm_ptpn; - ASSERT(IAM_WRITER_IPIF(ipif)); - + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); ilm = GETSTRUCT(ilm_t, 1); if (ilm == NULL) return (NULL); @@ -1596,44 +1514,23 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, ilm->ilm_timer = INFINITY; ilm->ilm_rtx.rtx_timer = INFINITY; - /* - * IPv4 Multicast groups are joined using ipif. - * IPv6 Multicast groups are joined using ill. 
- */ - if (ill->ill_isv6) { - ilm->ilm_ill = ill; - ilm->ilm_ipif = NULL; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ill->ill_ilm_cnt++; - } else { - ASSERT(ilm->ilm_zoneid == ipif->ipif_zoneid); - ilm->ilm_ipif = ipif; - ilm->ilm_ill = NULL; - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ipif, - (char *), "ilm", (void *), ilm); - ipif->ipif_ilm_cnt++; - } + ilm->ilm_ill = ill; + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, + (char *), "ilm", (void *), ilm); + ill->ill_ilm_cnt++; ASSERT(ill->ill_ipst); ilm->ilm_ipst = ill->ill_ipst; /* No netstack_hold */ - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); - ASSERT(!(ill->ill_state_flags & ILL_CONDEMNED)); + /* The ill/ipif could have just been marked as condemned */ /* - * Grab lock to give consistent view to readers - */ - mutex_enter(&ill->ill_lock); - /* - * All ilms in the same zone are contiguous in the ill_ilm list. - * The loops in ip_proto_input() and ip_wput_local() use this to avoid - * sending duplicates up when two applications in the same zone join the - * same group on different logical interfaces. + * To make ill_hasmembers_nextzone_v6 work we keep the list + * sorted by zoneid. 
*/ ilm_cur = ill->ill_ilm; ilm_ptpn = &ill->ill_ilm; - while (ilm_cur != NULL && ilm_cur->ilm_zoneid != ilm->ilm_zoneid) { + while (ilm_cur != NULL && ilm_cur->ilm_zoneid < ilm->ilm_zoneid) { ilm_ptpn = &ilm_cur->ilm_next; ilm_cur = ilm_cur->ilm_next; } @@ -1653,7 +1550,6 @@ ilm_add_v6(ipif_t *ipif, const in6_addr_t *v6group, ilg_stat_t ilgstat, ilm->ilm_fmode = MODE_IS_EXCLUDE; } - mutex_exit(&ill->ill_lock); return (ilm); } @@ -1668,118 +1564,40 @@ ilm_inactive(ilm_t *ilm) mi_free((char *)ilm); } -void -ilm_walker_cleanup(ill_t *ill) -{ - ilm_t **ilmp; - ilm_t *ilm; - boolean_t need_wakeup = B_FALSE; - - ASSERT(MUTEX_HELD(&ill->ill_lock)); - ASSERT(ill->ill_ilm_walker_cnt == 0); - - ilmp = &ill->ill_ilm; - while (*ilmp != NULL) { - if ((*ilmp)->ilm_flags & ILM_DELETED) { - ilm = *ilmp; - *ilmp = ilm->ilm_next; - /* - * check if there are any pending FREE or unplumb - * operations that need to be restarted. - */ - if (ilm->ilm_ipif != NULL) { - /* - * IPv4 ilms hold a ref on the ipif. - */ - DTRACE_PROBE3(ipif__decr__cnt, - (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ilm->ilm_ipif->ipif_ilm_cnt--; - if (IPIF_FREE_OK(ilm->ilm_ipif)) - need_wakeup = B_TRUE; - } else { - /* - * IPv6 ilms hold a ref on the ill. - */ - ASSERT(ilm->ilm_ill == ill); - DTRACE_PROBE3(ill__decr__cnt, - (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ASSERT(ill->ill_ilm_cnt > 0); - ill->ill_ilm_cnt--; - if (ILL_FREE_OK(ill)) - need_wakeup = B_TRUE; - } - ilm_inactive(ilm); /* frees ilm */ - } else { - ilmp = &(*ilmp)->ilm_next; - } - } - ill->ill_ilm_cleanup_reqd = 0; - if (need_wakeup) - ipif_ill_refrele_tail(ill); - else - mutex_exit(&ill->ill_lock); -} - /* * Unlink ilm and free it. 
*/ static void ilm_delete(ilm_t *ilm) { - ill_t *ill; + ill_t *ill = ilm->ilm_ill; ilm_t **ilmp; boolean_t need_wakeup; - - if (ilm->ilm_ipif != NULL) { - ASSERT(IAM_WRITER_IPIF(ilm->ilm_ipif)); - ASSERT(ilm->ilm_ill == NULL); - ill = ilm->ilm_ipif->ipif_ill; - ASSERT(!ill->ill_isv6); - } else { - ASSERT(IAM_WRITER_ILL(ilm->ilm_ill)); - ASSERT(ilm->ilm_ipif == NULL); - ill = ilm->ilm_ill; - ASSERT(ill->ill_isv6); - } /* * Delete under lock protection so that readers don't stumble * on bad ilm_next */ - mutex_enter(&ill->ill_lock); - if (ill->ill_ilm_walker_cnt != 0) { - ilm->ilm_flags |= ILM_DELETED; - ill->ill_ilm_cleanup_reqd = 1; - mutex_exit(&ill->ill_lock); - return; - } + ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock)); for (ilmp = &ill->ill_ilm; *ilmp != ilm; ilmp = &(*ilmp)->ilm_next) - ; + ; + *ilmp = ilm->ilm_next; + mutex_enter(&ill->ill_lock); /* - * if we are the last reference to the ipif (for IPv4 ilms) - * or the ill (for IPv6 ilms), we may need to wakeup any - * pending FREE or unplumb operations. + * if we are the last reference to the ill, we may need to wakeup any + * pending FREE or unplumb operations. This is because conn_update_ill + * bails if there is a ilg_delete_all in progress. 
*/ need_wakeup = B_FALSE; - if (ilm->ilm_ipif != NULL) { - DTRACE_PROBE3(ipif__decr__cnt, (ipif_t *), ilm->ilm_ipif, - (char *), "ilm", (void *), ilm); - ilm->ilm_ipif->ipif_ilm_cnt--; - if (IPIF_FREE_OK(ilm->ilm_ipif)) - need_wakeup = B_TRUE; - } else { - DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, - (char *), "ilm", (void *), ilm); - ASSERT(ill->ill_ilm_cnt > 0); - ill->ill_ilm_cnt--; - if (ILL_FREE_OK(ill)) - need_wakeup = B_TRUE; - } + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, + (char *), "ilm", (void *), ilm); + ASSERT(ill->ill_ilm_cnt > 0); + ill->ill_ilm_cnt--; + if (ILL_FREE_OK(ill)) + need_wakeup = B_TRUE; ilm_inactive(ilm); /* frees this ilm */ @@ -1791,185 +1609,103 @@ ilm_delete(ilm_t *ilm) } } -/* Increment the ILM walker count for `ill' */ -static void -ill_ilm_walker_hold(ill_t *ill) -{ - mutex_enter(&ill->ill_lock); - ill->ill_ilm_walker_cnt++; - mutex_exit(&ill->ill_lock); -} - -/* Decrement the ILM walker count for `ill' */ -static void -ill_ilm_walker_rele(ill_t *ill) -{ - mutex_enter(&ill->ill_lock); - ill->ill_ilm_walker_cnt--; - if (ill->ill_ilm_walker_cnt == 0 && ill->ill_ilm_cleanup_reqd) - ilm_walker_cleanup(ill); /* drops ill_lock */ - else - mutex_exit(&ill->ill_lock); -} - -/* - * Start walking the ILMs associated with `ill'; the first ILM in the walk - * (if any) is returned. State associated with the walk is stored in `ilw'. - * Note that walks associated with interfaces under IPMP also walk the ILMs - * on the associated IPMP interface; this is handled transparently to callers - * via ilm_walker_step(). (Usually with IPMP all ILMs will be on the IPMP - * interface; the only exception is to support IPv6 test addresses, which - * require ILMs for their associated solicited-node multicast addresses.) 
- */ -ilm_t * -ilm_walker_start(ilm_walker_t *ilw, ill_t *ill) -{ - ilw->ilw_ill = ill; - if (IS_UNDER_IPMP(ill)) - ilw->ilw_ipmp_ill = ipmp_ill_hold_ipmp_ill(ill); - else - ilw->ilw_ipmp_ill = NULL; - - ill_ilm_walker_hold(ill); - if (ilw->ilw_ipmp_ill != NULL) - ill_ilm_walker_hold(ilw->ilw_ipmp_ill); - - if (ilw->ilw_ipmp_ill != NULL && ilw->ilw_ipmp_ill->ill_ilm != NULL) - ilw->ilw_walk_ill = ilw->ilw_ipmp_ill; - else - ilw->ilw_walk_ill = ilw->ilw_ill; - - return (ilm_walker_step(ilw, NULL)); -} - /* - * Helper function for ilm_walker_step() that returns the next ILM - * associated with `ilw', regardless of whether it's deleted. + * Lookup an ill based on the group, ifindex, ifaddr, and zoneid. + * Applies to both IPv4 and IPv6, although ifaddr is only used with + * IPv4. + * Returns an error for IS_UNDER_IPMP and VNI interfaces. + * On error it sets *errorp. */ -static ilm_t * -ilm_walker_step_all(ilm_walker_t *ilw, ilm_t *ilm) +static ill_t * +ill_mcast_lookup(const in6_addr_t *group, ipaddr_t ifaddr, uint_t ifindex, + zoneid_t zoneid, ip_stack_t *ipst, int *errorp) { - if (ilm == NULL) - return (ilw->ilw_walk_ill->ill_ilm); + ill_t *ill; + ipaddr_t v4group; - if (ilm->ilm_next != NULL) - return (ilm->ilm_next); + if (IN6_IS_ADDR_V4MAPPED(group)) { + IN6_V4MAPPED_TO_IPADDR(group, v4group); - if (ilw->ilw_ipmp_ill != NULL && IS_IPMP(ilw->ilw_walk_ill)) { - ilw->ilw_walk_ill = ilw->ilw_ill; - /* - * It's possible that ilw_ill left the group during our walk, - * so we can't ASSERT() that it's under IPMP. Callers that - * care will be writer on the IPSQ anyway. - */ - return (ilw->ilw_walk_ill->ill_ilm); - } - return (NULL); -} + if (ifindex != 0) { + ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, + B_FALSE, ipst); + } else if (ifaddr != INADDR_ANY) { + ipif_t *ipif; -/* - * Step to the next ILM associated with `ilw'. 
- */ -ilm_t * -ilm_walker_step(ilm_walker_t *ilw, ilm_t *ilm) -{ - while ((ilm = ilm_walker_step_all(ilw, ilm)) != NULL) { - if (!(ilm->ilm_flags & ILM_DELETED)) - break; - } - return (ilm); -} - -/* - * Finish the ILM walk associated with `ilw'. - */ -void -ilm_walker_finish(ilm_walker_t *ilw) -{ - ill_ilm_walker_rele(ilw->ilw_ill); - if (ilw->ilw_ipmp_ill != NULL) { - ill_ilm_walker_rele(ilw->ilw_ipmp_ill); - ill_refrele(ilw->ilw_ipmp_ill); + ipif = ipif_lookup_addr(ifaddr, NULL, zoneid, ipst); + if (ipif == NULL) { + ill = NULL; + } else { + ill = ipif->ipif_ill; + ill_refhold(ill); + ipif_refrele(ipif); + } + } else { + ill = ill_lookup_group_v4(v4group, zoneid, ipst, NULL, + NULL); + } + } else { + if (ifindex != 0) { + ill = ill_lookup_on_ifindex_zoneid(ifindex, zoneid, + B_TRUE, ipst); + } else { + ill = ill_lookup_group_v6(group, zoneid, ipst, NULL, + NULL); + } } - bzero(&ilw, sizeof (ilw)); -} - -/* - * Looks up the appropriate ipif given a v4 multicast group and interface - * address. On success, returns 0, with *ipifpp pointing to the found - * struct. On failure, returns an errno and *ipifpp is NULL. 
- */ -int -ip_opt_check(conn_t *connp, ipaddr_t group, ipaddr_t src, ipaddr_t ifaddr, - uint_t *ifindexp, mblk_t *first_mp, ipsq_func_t func, ipif_t **ipifpp) -{ - ipif_t *ipif; - int err = 0; - zoneid_t zoneid; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - if (!CLASSD(group) || CLASSD(src)) { - return (EINVAL); + if (ill == NULL) { + if (ifindex != 0) + *errorp = ENXIO; + else + *errorp = EADDRNOTAVAIL; + return (NULL); } - *ipifpp = NULL; - - zoneid = IPCL_ZONEID(connp); - - ASSERT(!(ifaddr != INADDR_ANY && ifindexp != NULL && *ifindexp != 0)); - if (ifaddr != INADDR_ANY) { - ipif = ipif_lookup_addr(ifaddr, NULL, zoneid, - CONNP_TO_WQ(connp), first_mp, func, &err, ipst); - if (err != 0 && err != EINPROGRESS) - err = EADDRNOTAVAIL; - } else if (ifindexp != NULL && *ifindexp != 0) { - ipif = ipif_lookup_on_ifindex(*ifindexp, B_FALSE, zoneid, - CONNP_TO_WQ(connp), first_mp, func, &err, ipst); - } else { - ipif = ipif_lookup_group(group, zoneid, ipst); - if (ipif == NULL) - return (EADDRNOTAVAIL); + /* operation not supported on the virtual network interface */ + if (IS_UNDER_IPMP(ill) || IS_VNI(ill)) { + ill_refrele(ill); + *errorp = EINVAL; + return (NULL); } - if (ipif == NULL) - return (err); - - *ipifpp = ipif; - return (0); + return (ill); } /* - * Looks up the appropriate ill (or ipif if v4mapped) given an interface - * index and IPv6 multicast group. On success, returns 0, with *illpp (or - * *ipifpp if v4mapped) pointing to the found struct. On failure, returns - * an errno and *illpp and *ipifpp are undefined. + * Looks up the appropriate ill given an interface index (or interface address) + * and multicast group. On success, returns 0, with *illpp pointing to the + * found struct. On failure, returns an errno and *illpp is set to NULL. + * + * Returns an error for IS_UNDER_IPMP and VNI interfaces. + * + * Handles both IPv4 and IPv6. The ifaddr argument only applies in the + * case of IPv4. 
*/ int -ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group, - const in6_addr_t *v6src, ipaddr_t *v4src, boolean_t *isv6, int ifindex, - mblk_t *first_mp, ipsq_func_t func, ill_t **illpp, ipif_t **ipifpp) +ip_opt_check(conn_t *connp, const in6_addr_t *v6group, + const in6_addr_t *v6src, ipaddr_t ifaddr, uint_t ifindex, ill_t **illpp) { boolean_t src_unspec; ill_t *ill = NULL; - ipif_t *ipif = NULL; - int err; - zoneid_t zoneid = connp->conn_zoneid; - queue_t *wq = CONNP_TO_WQ(connp); ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + int error = 0; + + *illpp = NULL; src_unspec = IN6_IS_ADDR_UNSPECIFIED(v6src); if (IN6_IS_ADDR_V4MAPPED(v6group)) { + ipaddr_t v4group; + ipaddr_t v4src; + if (!IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec) return (EINVAL); - IN6_V4MAPPED_TO_IPADDR(v6group, *v4group); + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); if (src_unspec) { - *v4src = INADDR_ANY; + v4src = INADDR_ANY; } else { - IN6_V4MAPPED_TO_IPADDR(v6src, *v4src); + IN6_V4MAPPED_TO_IPADDR(v6src, v4src); } - if (!CLASSD(*v4group) || CLASSD(*v4src)) + if (!CLASSD(v4group) || CLASSD(v4src)) return (EINVAL); - *ipifpp = NULL; - *isv6 = B_FALSE; } else { if (IN6_IS_ADDR_V4MAPPED(v6src) && !src_unspec) return (EINVAL); @@ -1977,43 +1713,17 @@ ip_opt_check_v6(conn_t *connp, const in6_addr_t *v6group, ipaddr_t *v4group, IN6_IS_ADDR_MULTICAST(v6src)) { return (EINVAL); } - *illpp = NULL; - *isv6 = B_TRUE; } - if (ifindex == 0) { - if (*isv6) - ill = ill_lookup_group_v6(v6group, zoneid, ipst); - else - ipif = ipif_lookup_group(*v4group, zoneid, ipst); - if (ill == NULL && ipif == NULL) - return (EADDRNOTAVAIL); - } else { - if (*isv6) { - ill = ill_lookup_on_ifindex(ifindex, B_TRUE, - wq, first_mp, func, &err, ipst); - if (ill != NULL && - !ipif_lookup_zoneid(ill, zoneid, 0, NULL)) { - ill_refrele(ill); - ill = NULL; - err = EADDRNOTAVAIL; - } - } else { - ipif = ipif_lookup_on_ifindex(ifindex, B_FALSE, - zoneid, wq, first_mp, func, &err, ipst); - } - if (ill 
== NULL && ipif == NULL) - return (err); - } - - *ipifpp = ipif; + ill = ill_mcast_lookup(v6group, ifaddr, ifindex, IPCL_ZONEID(connp), + ipst, &error); *illpp = ill; - return (0); + return (error); } static int ip_get_srcfilter(conn_t *connp, struct group_filter *gf, - struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped) + struct ip_msfilter *imsf, const struct in6_addr *group, boolean_t issin6) { ilg_t *ilg; int i, numsrc, fmode, outsrcs; @@ -2022,24 +1732,30 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, struct in_addr *addrp; slist_t *fp; boolean_t is_v4only_api; - - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ipif(connp, grp, ipif); - if (ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } + ipaddr_t ifaddr; + uint_t ifindex; if (gf == NULL) { ASSERT(imsf != NULL); - ASSERT(!isv4mapped); + ASSERT(!issin6); is_v4only_api = B_TRUE; outsrcs = imsf->imsf_numsrc; + ifaddr = imsf->imsf_interface.s_addr; + ifindex = 0; } else { ASSERT(imsf == NULL); is_v4only_api = B_FALSE; outsrcs = gf->gf_numsrc; + ifaddr = INADDR_ANY; + ifindex = gf->gf_interface; + } + + /* No need to use ill_mcast_serializer for the reader */ + rw_enter(&connp->conn_ilg_lock, RW_READER); + ilg = ilg_lookup(connp, group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (EADDRNOTAVAIL); } /* @@ -2055,7 +1771,7 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, for (i = 0; i < outsrcs; i++) { if (i == fp->sl_numsrc) break; - if (isv4mapped) { + if (issin6) { sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i]; sin6->sin6_family = AF_INET6; sin6->sin6_addr = fp->sl_addr[i]; @@ -2082,57 +1798,18 @@ ip_get_srcfilter(conn_t *connp, struct group_filter *gf, gf->gf_fmode = fmode; } - mutex_exit(&connp->conn_lock); - - return (0); -} - -static int -ip_get_srcfilter_v6(conn_t *connp, struct group_filter *gf, - const struct in6_addr *grp, ill_t *ill) -{ - ilg_t *ilg; - int i; - struct sockaddr_storage 
*sl; - struct sockaddr_in6 *sin6; - slist_t *fp; - - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ill_v6(connp, grp, ill); - if (ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } - - /* - * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE - * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE. - * So we need to translate here. - */ - gf->gf_fmode = (ilg->ilg_fmode == MODE_IS_INCLUDE) ? - MCAST_INCLUDE : MCAST_EXCLUDE; - if ((fp = ilg->ilg_filter) == NULL) { - gf->gf_numsrc = 0; - } else { - for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) { - if (i == fp->sl_numsrc) - break; - sin6 = (struct sockaddr_in6 *)sl; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = fp->sl_addr[i]; - } - gf->gf_numsrc = fp->sl_numsrc; - } - - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (0); } +/* + * Common for IPv4 and IPv6. + */ static int ip_set_srcfilter(conn_t *connp, struct group_filter *gf, - struct ip_msfilter *imsf, ipaddr_t grp, ipif_t *ipif, boolean_t isv4mapped) + struct ip_msfilter *imsf, const struct in6_addr *group, ill_t *ill, + boolean_t issin6) { ilg_t *ilg; int i, err, infmode, new_fmode; @@ -2143,20 +1820,27 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, slist_t *orig_filter = NULL; slist_t *new_filter = NULL; mcast_record_t orig_fmode; - boolean_t leave_grp, is_v4only_api; + boolean_t leave_group, is_v4only_api; ilg_stat_t ilgstat; + ilm_t *ilm; + ipaddr_t ifaddr; + uint_t ifindex; if (gf == NULL) { ASSERT(imsf != NULL); - ASSERT(!isv4mapped); + ASSERT(!issin6); is_v4only_api = B_TRUE; insrcs = imsf->imsf_numsrc; infmode = imsf->imsf_fmode; + ifaddr = imsf->imsf_interface.s_addr; + ifindex = 0; } else { ASSERT(imsf == NULL); is_v4only_api = B_FALSE; insrcs = gf->gf_numsrc; infmode = gf->gf_fmode; + ifaddr = INADDR_ANY; + ifindex = gf->gf_interface; } /* Make sure we can handle the source list */ @@ -2167,32 +1851,52 @@ ip_set_srcfilter(conn_t 
*connp, struct group_filter *gf, * setting the filter to (INCLUDE, NULL) is treated * as a request to leave the group. */ - leave_grp = (infmode == MCAST_INCLUDE && insrcs == 0); - - ASSERT(IAM_WRITER_IPIF(ipif)); + leave_group = (infmode == MCAST_INCLUDE && insrcs == 0); - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ipif(connp, grp, ipif); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, group, ifaddr, ifindex); if (ilg == NULL) { /* * if the request was actually to leave, and we * didn't find an ilg, there's nothing to do. */ - if (!leave_grp) - ilg = conn_ilg_alloc(connp, &err); - if (leave_grp || ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : err); + if (leave_group) { + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + return (0); + } + ilg = conn_ilg_alloc(connp, &err); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + return (err); } ilgstat = ILGSTAT_NEW; - IN6_IPADDR_TO_V4MAPPED(grp, &ilg->ilg_v6group); - ilg->ilg_ipif = ipif; - ilg->ilg_ill = NULL; - } else if (leave_grp) { + ilg->ilg_v6group = *group; + ilg->ilg_ill = ill; + ilg->ilg_ifaddr = ifaddr; + ilg->ilg_ifindex = ifindex; + } else if (leave_group) { + /* + * Make sure we have the correct serializer. The ill argument + * might not match ilg_ill. 
+ */ + ilg_refhold(ilg); + mutex_exit(&ill->ill_mcast_serializer); + ill = ilg->ilg_ill; + rw_exit(&connp->conn_ilg_lock); + + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti(grp, ipif, B_FALSE, B_TRUE); + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + mutex_exit(&ill->ill_mcast_serializer); return (0); } else { ilgstat = ILGSTAT_CHANGE; @@ -2203,7 +1907,8 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, } else { orig_filter = l_alloc_copy(ilg->ilg_filter); if (orig_filter == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); return (ENOMEM); } } @@ -2214,7 +1919,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, * we make any changes, so we can bail if it fails. */ if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); err = ENOMEM; goto free_and_exit; } @@ -2228,7 +1933,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, if (fp == NULL) { if (ilgstat == ILGSTAT_NEW) ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); err = ENOMEM; goto free_and_exit; } @@ -2236,7 +1941,7 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, fp = ilg->ilg_filter; } for (i = 0; i < insrcs; i++) { - if (isv4mapped) { + if (issin6) { sin6 = (struct sockaddr_in6 *)&gf->gf_slist[i]; fp->sl_addr[i] = sin6->sin6_addr; } else { @@ -2263,177 +1968,70 @@ ip_set_srcfilter(conn_t *connp, struct group_filter *gf, /* * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. + * so we can release conn_ilg_lock now. 
*/ new_fmode = ilg->ilg_fmode; l_copy(ilg->ilg_filter, new_filter); - mutex_exit(&connp->conn_lock); - - err = ip_addmulti(grp, ipif, ilgstat, new_fmode, new_filter); - if (err != 0) { - /* - * Restore the original filter state, or delete the - * newly-created ilg. We need to look up the ilg - * again, though, since we've not been holding the - * conn_lock. - */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, grp, ipif); - ASSERT(ilg != NULL); - if (ilgstat == ILGSTAT_NEW) { - ilg_delete(connp, ilg, NULL); - } else { - ilg->ilg_fmode = orig_fmode; - if (SLIST_IS_EMPTY(orig_filter)) { - CLEAR_SLIST(ilg->ilg_filter); - } else { - /* - * We didn't free the filter, even if we - * were trying to make the source list empty; - * so if orig_filter isn't empty, the ilg - * must still have a filter alloc'd. - */ - l_copy(orig_filter, ilg->ilg_filter); - } - } - mutex_exit(&connp->conn_lock); - } - -free_and_exit: - l_free(orig_filter); - l_free(new_filter); - - return (err); -} - -static int -ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, - const struct in6_addr *grp, ill_t *ill) -{ - ilg_t *ilg; - int i, orig_fmode, new_fmode, err; - slist_t *orig_filter = NULL; - slist_t *new_filter = NULL; - struct sockaddr_storage *sl; - struct sockaddr_in6 *sin6; - boolean_t leave_grp; - ilg_stat_t ilgstat; - - /* Make sure we can handle the source list */ - if (gf->gf_numsrc > MAX_FILTER_SIZE) - return (ENOBUFS); + rw_exit(&connp->conn_ilg_lock); /* - * setting the filter to (INCLUDE, NULL) is treated - * as a request to leave the group. + * Now update the ill. We wait to do this until after the ilg + * has been updated because we need to update the src filter + * info for the ill, which involves looking at the status of + * all the ilgs associated with this group/interface pair. 
*/ - leave_grp = (gf->gf_fmode == MCAST_INCLUDE && gf->gf_numsrc == 0); - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, grp, ill); - if (ilg == NULL) { - /* - * if the request was actually to leave, and we - * didn't find an ilg, there's nothing to do. - */ - if (!leave_grp) - ilg = conn_ilg_alloc(connp, &err); - if (leave_grp || ilg == NULL) { - mutex_exit(&connp->conn_lock); - return (leave_grp ? 0 : err); - } - ilgstat = ILGSTAT_NEW; - ilg->ilg_v6group = *grp; - ilg->ilg_ipif = NULL; - ilg->ilg_ill = ill; - } else if (leave_grp) { - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(grp, ill, connp->conn_zoneid, B_FALSE, - B_TRUE); - return (0); - } else { - ilgstat = ILGSTAT_CHANGE; - /* preserve existing state in case ip_addmulti() fails */ - orig_fmode = ilg->ilg_fmode; - if (ilg->ilg_filter == NULL) { - orig_filter = NULL; - } else { - orig_filter = l_alloc_copy(ilg->ilg_filter); - if (orig_filter == NULL) { - mutex_exit(&connp->conn_lock); - return (ENOMEM); - } - } - } + ilm = ip_addmulti_serial(group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &err); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); /* - * Alloc buffer to copy new state into (see below) before - * we make any changes, so we can bail if it fails. + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. 
*/ - if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); - err = ENOMEM; + ilg = ilg_lookup(connp, group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) { + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + } + err = ENXIO; goto free_and_exit; } - if (gf->gf_numsrc == 0) { - CLEAR_SLIST(ilg->ilg_filter); - } else { - slist_t *fp; - if (ilg->ilg_filter == NULL) { - fp = l_alloc(); - if (fp == NULL) { - if (ilgstat == ILGSTAT_NEW) - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - err = ENOMEM; - goto free_and_exit; - } + if (ilm != NULL) { + /* Succeeded. Update the ilg to point at the ilm */ + if (ilgstat == ILGSTAT_NEW) { + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ } else { - fp = ilg->ilg_filter; - } - for (i = 0, sl = gf->gf_slist; i < gf->gf_numsrc; i++, sl++) { - sin6 = (struct sockaddr_in6 *)sl; - fp->sl_addr[i] = sin6->sin6_addr; + /* + * ip_addmulti didn't get a held ilm for + * ILGSTAT_CHANGE; ilm_refcnt was unchanged. + */ + ASSERT(ilg->ilg_ilm == ilm); } - fp->sl_numsrc = gf->gf_numsrc; - ilg->ilg_filter = fp; - } - /* - * In the kernel, we use the state definitions MODE_IS_[IN|EX]CLUDE - * to identify the filter mode; but the API uses MCAST_[IN|EX]CLUDE. - * So we need to translate here. - */ - ilg->ilg_fmode = (gf->gf_fmode == MCAST_INCLUDE) ? - MODE_IS_INCLUDE : MODE_IS_EXCLUDE; - - /* - * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. - */ - new_fmode = ilg->ilg_fmode; - l_copy(ilg->ilg_filter, new_filter); - - mutex_exit(&connp->conn_lock); - - err = ip_addmulti_v6(grp, ill, connp->conn_zoneid, ilgstat, new_fmode, - new_filter); - if (err != 0) { + } else { + ASSERT(err != 0); /* + * Failed to allocate the ilm. * Restore the original filter state, or delete the - * newly-created ilg. 
We need to look up the ilg - * again, though, since we've not been holding the - * conn_lock. + * newly-created ilg. + * If ENETDOWN just clear ill_ilg since so that we + * will rejoin when the ill comes back; don't report ENETDOWN + * to application. */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, grp, ill); - ASSERT(ilg != NULL); if (ilgstat == ILGSTAT_NEW) { - ilg_delete(connp, ilg, NULL); + if (err == ENETDOWN) { + ilg->ilg_ill = NULL; + err = 0; + } else { + ilg_delete(connp, ilg, NULL); + } } else { ilg->ilg_fmode = orig_fmode; if (SLIST_IS_EMPTY(orig_filter)) { @@ -2448,10 +2046,11 @@ ip_set_srcfilter_v6(conn_t *connp, struct group_filter *gf, l_copy(orig_filter, ilg->ilg_filter); } } - mutex_exit(&connp->conn_lock); } + rw_exit(&connp->conn_ilg_lock); free_and_exit: + mutex_exit(&ill->ill_mcast_serializer); l_free(orig_filter); l_free(new_filter); @@ -2475,11 +2074,17 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, boolean_t isv6, is_v4only_api, getcmd; struct sockaddr_in *gsin; struct sockaddr_in6 *gsin6; - ipaddr_t v4grp; - in6_addr_t v6grp; + ipaddr_t v4group; + in6_addr_t v6group; struct group_filter *gf = NULL; struct ip_msfilter *imsf = NULL; mblk_t *ndp; + ill_t *ill; + + connp = Q_TO_CONN(q); + err = ip_msfilter_ill(connp, mp, ipip, &ill); + if (err != 0) + return (err); if (data_mp->b_cont != NULL) { if ((ndp = msgpullup(data_mp, -1)) == NULL) @@ -2519,132 +2124,119 @@ ip_sioctl_msfilter(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, if (datalen < expsize) return (EINVAL); - connp = Q_TO_CONN(q); - - /* operation not supported on the virtual network interface */ - if (IS_VNI(ipif->ipif_ill)) - return (EINVAL); - if (isv6) { - ill_t *ill = ipif->ipif_ill; - ill_refhold(ill); - gsin6 = (struct sockaddr_in6 *)&gf->gf_group; - v6grp = gsin6->sin6_addr; - if (getcmd) - err = ip_get_srcfilter_v6(connp, gf, &v6grp, ill); - else - err = ip_set_srcfilter_v6(connp, gf, &v6grp, ill); - - 
ill_refrele(ill); + v6group = gsin6->sin6_addr; + if (getcmd) { + err = ip_get_srcfilter(connp, gf, NULL, &v6group, + B_TRUE); + } else { + err = ip_set_srcfilter(connp, gf, NULL, &v6group, ill, + B_TRUE); + } } else { - boolean_t isv4mapped = B_FALSE; + boolean_t issin6 = B_FALSE; if (is_v4only_api) { - v4grp = (ipaddr_t)imsf->imsf_multiaddr.s_addr; + v4group = (ipaddr_t)imsf->imsf_multiaddr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { if (gf->gf_group.ss_family == AF_INET) { gsin = (struct sockaddr_in *)&gf->gf_group; - v4grp = (ipaddr_t)gsin->sin_addr.s_addr; + v4group = (ipaddr_t)gsin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { gsin6 = (struct sockaddr_in6 *)&gf->gf_group; IN6_V4MAPPED_TO_IPADDR(&gsin6->sin6_addr, - v4grp); - isv4mapped = B_TRUE; + v4group); + issin6 = B_TRUE; } } - if (getcmd) - err = ip_get_srcfilter(connp, gf, imsf, v4grp, ipif, - isv4mapped); + /* + * INADDR_ANY is represented as the IPv6 unspecifed addr. + */ + if (v4group == INADDR_ANY) + v6group = ipv6_all_zeros; else - err = ip_set_srcfilter(connp, gf, imsf, v4grp, ipif, - isv4mapped); + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); + + if (getcmd) { + err = ip_get_srcfilter(connp, gf, imsf, &v6group, + issin6); + } else { + err = ip_set_srcfilter(connp, gf, imsf, &v6group, ill, + issin6); + } } + ill_refrele(ill); return (err); } /* - * Finds the ipif based on information in the ioctl headers. Needed to make - * ip_process_ioctl() happy (it needs to know the ipif for IPI_WR-flagged - * ioctls prior to calling the ioctl's handler function). + * Determine the ill for the SIOC*MSFILTER ioctls + * + * Returns an error for IS_UNDER_IPMP interfaces. + * + * Finds the ill based on information in the ioctl headers. 
*/ -int -ip_extract_msfilter(queue_t *q, mblk_t *mp, const ip_ioctl_cmd_t *ipip, - cmd_info_t *ci, ipsq_func_t func) +static int +ip_msfilter_ill(conn_t *connp, mblk_t *mp, const ip_ioctl_cmd_t *ipip, + ill_t **illp) { int cmd = ipip->ipi_cmd; int err = 0; - conn_t *connp; - ipif_t *ipif; + ill_t *ill; /* caller has verified this mblk exists */ char *dbuf = (char *)mp->b_cont->b_cont->b_rptr; struct ip_msfilter *imsf; struct group_filter *gf; - ipaddr_t v4addr, v4grp; - in6_addr_t v6grp; + ipaddr_t v4addr, v4group; + in6_addr_t v6group; uint32_t index; - zoneid_t zoneid; ip_stack_t *ipst; - connp = Q_TO_CONN(q); - zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; + *illp = NULL; + /* don't allow multicast operations on a tcp conn */ if (IPCL_IS_TCP(connp)) return (ENOPROTOOPT); if (cmd == SIOCSIPMSFILTER || cmd == SIOCGIPMSFILTER) { /* don't allow v4-specific ioctls on v6 socket */ - if (connp->conn_af_isv6) + if (connp->conn_family == AF_INET6) return (EAFNOSUPPORT); imsf = (struct ip_msfilter *)dbuf; v4addr = imsf->imsf_interface.s_addr; - v4grp = imsf->imsf_multiaddr.s_addr; - if (v4addr == INADDR_ANY) { - ipif = ipif_lookup_group(v4grp, zoneid, ipst); - if (ipif == NULL) - err = EADDRNOTAVAIL; - } else { - ipif = ipif_lookup_addr(v4addr, NULL, zoneid, q, mp, - func, &err, ipst); - } + v4group = imsf->imsf_multiaddr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); + ill = ill_mcast_lookup(&v6group, v4addr, 0, IPCL_ZONEID(connp), + ipst, &err); + if (ill == NULL && v4addr != INADDR_ANY) + err = ENXIO; } else { - boolean_t isv6 = B_FALSE; gf = (struct group_filter *)dbuf; index = gf->gf_interface; if (gf->gf_group.ss_family == AF_INET6) { struct sockaddr_in6 *sin6; + sin6 = (struct sockaddr_in6 *)&gf->gf_group; - v6grp = sin6->sin6_addr; - if (IN6_IS_ADDR_V4MAPPED(&v6grp)) - IN6_V4MAPPED_TO_IPADDR(&v6grp, v4grp); - else - isv6 = B_TRUE; + v6group = sin6->sin6_addr; } else if (gf->gf_group.ss_family == AF_INET) { struct sockaddr_in *sin; + 
sin = (struct sockaddr_in *)&gf->gf_group; - v4grp = sin->sin_addr.s_addr; + v4group = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4group, &v6group); } else { return (EAFNOSUPPORT); } - if (index == 0) { - if (isv6) { - ipif = ipif_lookup_group_v6(&v6grp, zoneid, - ipst); - } else { - ipif = ipif_lookup_group(v4grp, zoneid, ipst); - } - if (ipif == NULL) - err = EADDRNOTAVAIL; - } else { - ipif = ipif_lookup_on_ifindex(index, isv6, zoneid, - q, mp, func, &err, ipst); - } + ill = ill_mcast_lookup(&v6group, INADDR_ANY, index, + IPCL_ZONEID(connp), ipst, &err); } - - ci->ci_ipif = ipif; + *illp = ill; return (err); } @@ -2695,6 +2287,7 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp) /* * Handle the following optmgmt: * IP_ADD_MEMBERSHIP must not have joined already + * IPV6_JOIN_GROUP must not have joined already * MCAST_JOIN_GROUP must not have joined already * IP_BLOCK_SOURCE must have joined already * MCAST_BLOCK_SOURCE must have joined already @@ -2702,91 +2295,15 @@ ip_copyin_msfilter(queue_t *q, mblk_t *mp) * MCAST_JOIN_SOURCE_GROUP may have joined already * * fmode and src parameters may be used to determine which option is - * being set, as follows (the IP_* and MCAST_* versions of each option - * are functionally equivalent): - * opt fmode src - * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE INADDR_ANY - * MCAST_JOIN_GROUP MODE_IS_EXCLUDE INADDR_ANY - * IP_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * - * Changing the filter mode is not allowed; if a matching ilg already - * exists and fmode != ilg->ilg_fmode, EINVAL is returned. - * - * Verifies that there is a source address of appropriate scope for - * the group; if not, EADDRNOTAVAIL is returned. - * - * The interface to be used may be identified by an address or by an - * index. A pointer to the index is passed; if it is NULL, use the - * address, otherwise, use the index. 
- */ -int -ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, - ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) -{ - ipif_t *ipif; - ipsq_t *ipsq; - int err = 0; - ill_t *ill; - - err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp, - ip_restart_optmgmt, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_add_group: no ipif for group 0x%x, " - "ifaddr 0x%x, ifindex %d\n", ntohl(group), - ntohl(ifaddr), (ifindexp == NULL) ? 0 : *ifindexp)); - } - return (err); - } - ASSERT(ipif != NULL); - - ill = ipif->ipif_ill; - /* Operation not supported on a virtual network interface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - - if (checkonly) { - /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. - */ - ipif_refrele(ipif); - return (0); - } - - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq, - NEW_OP); - - /* unspecified source addr => no source filtering */ - err = ilg_add(connp, group, ipif, fmode, src); - - IPSQ_EXIT(ipsq); - - ipif_refrele(ipif); - return (err); -} - -/* - * Handle the following optmgmt: - * IPV6_JOIN_GROUP must not have joined already - * MCAST_JOIN_GROUP must not have joined already - * MCAST_BLOCK_SOURCE must have joined already - * MCAST_JOIN_SOURCE_GROUP may have joined already - * - * fmode and src parameters may be used to determine which option is * being set, as follows (IPV6_JOIN_GROUP and MCAST_JOIN_GROUP options * are functionally equivalent): * opt fmode v6src + * IP_ADD_MEMBERSHIP MODE_IS_EXCLUDE unspecified * IPV6_JOIN_GROUP MODE_IS_EXCLUDE unspecified * MCAST_JOIN_GROUP MODE_IS_EXCLUDE unspecified + * IP_BLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr * MCAST_BLOCK_SOURCE MODE_IS_EXCLUDE v6 addr + * IP_JOIN_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr * MCAST_JOIN_SOURCE_GROUP MODE_IS_INCLUDE v6 addr 
* * Changing the filter mode is not allowed; if a matching ilg already @@ -2795,47 +2312,29 @@ ip_opt_add_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, * Verifies that there is a source address of appropriate scope for * the group; if not, EADDRNOTAVAIL is returned. * + * The interface to be used may be identified by an IPv4 address or by an + * interface index. + * * Handles IPv4-mapped IPv6 multicast addresses by associating them - * with the link-local ipif. Assumes that if v6group is v4-mapped, + * with the IPv4 address. Assumes that if v6group is v4-mapped, * v6src is also v4-mapped. */ int -ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly, - const in6_addr_t *v6group, int ifindex, mcast_record_t fmode, - const in6_addr_t *v6src, mblk_t *first_mp) +ip_opt_add_group(conn_t *connp, boolean_t checkonly, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src) { ill_t *ill; - ipif_t *ipif; char buf[INET6_ADDRSTRLEN]; - ipaddr_t v4group, v4src; - boolean_t isv6; - ipsq_t *ipsq; int err; - err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6, - ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif); + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, &ill); if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_add_group_v6: no ill for group %s/" - "index %d\n", inet_ntop(AF_INET6, v6group, buf, - sizeof (buf)), ifindex)); - } + ip1dbg(("ip_opt_add_group: no ill for group %s/" + "index %d\n", inet_ntop(AF_INET6, v6group, buf, + sizeof (buf)), ifindex)); return (err); } - ASSERT((!isv6 && ipif != NULL) || (isv6 && ill != NULL)); - - /* operation is not supported on the virtual network interface */ - if (isv6) { - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - } else { - if (IS_VNI(ipif->ipif_ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } if (checkonly) { /* @@ -2843,104 +2342,70 @@ ip_opt_add_group_v6(conn_t *connp, boolean_t checkonly, * 
semantics. The error return case above if encountered * considered a good enough "check" here. */ - if (isv6) - ill_refrele(ill); - else - ipif_refrele(ipif); - return (0); - } - - if (!isv6) { - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ilg_add(connp, v4group, ipif, fmode, v4src); - IPSQ_EXIT(ipsq); - ipif_refrele(ipif); - } else { - IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ilg_add_v6(connp, v6group, ill, fmode, v6src); - IPSQ_EXIT(ipsq); ill_refrele(ill); + return (0); } + mutex_enter(&ill->ill_mcast_serializer); + err = ilg_add(connp, v6group, ifaddr, ifindex, ill, fmode, v6src); + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); return (err); } +/* + * Common for IPv6 and IPv4. + * Here we handle ilgs that are still attached to their original ill + * (the one ifaddr/ifindex points at), as well as detached ones. + * The detached ones might have been attached to some other ill. + */ static int -ip_opt_delete_group_excl(conn_t *connp, ipaddr_t group, ipif_t *ipif, - mcast_record_t fmode, ipaddr_t src) +ip_opt_delete_group_excl(conn_t *connp, const in6_addr_t *v6group, + ipaddr_t ifaddr, uint_t ifindex, mcast_record_t fmode, + const in6_addr_t *v6src) { ilg_t *ilg; - in6_addr_t v6src; - boolean_t leaving = B_FALSE; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - /* - * The ilg is valid only while we hold the conn lock. Once we drop - * the lock, another thread can locate another ilg on this connp, - * but on a different ipif, and delete it, and cause the ilg array - * to be reallocated and copied. Hence do the ilg_delete before - * dropping the lock. 
- */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); - } + boolean_t leaving; + ilm_t *ilm; + ill_t *ill; + int err = 0; - /* - * Decide if we're actually deleting the ilg or just removing a - * source filter address; if just removing an addr, make sure we - * aren't trying to change the filter mode, and that the addr is - * actually in our filter list already. If we're removing the - * last src in an include list, just delete the ilg. - */ - if (src == INADDR_ANY) { - v6src = ipv6_all_zeros; - leaving = B_TRUE; - } else { - int err = 0; - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - if (fmode != ilg->ilg_fmode) - err = EINVAL; - else if (ilg->ilg_filter == NULL || - !list_has_addr(ilg->ilg_filter, &v6src)) +retry: + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + /* + * Since we didn't have any ilg we now do the error checks + * to determine the best errno. 
+ */ + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, + &ill); + if (ill != NULL) { + /* The only error was a missing ilg for the group */ + ill_refrele(ill); err = EADDRNOTAVAIL; - if (err != 0) { - mutex_exit(&connp->conn_lock); - return (err); - } - if (fmode == MODE_IS_INCLUDE && - ilg->ilg_filter->sl_numsrc == 1) { - v6src = ipv6_all_zeros; - leaving = B_TRUE; } + return (err); } - ilg_delete(connp, ilg, &v6src); - mutex_exit(&connp->conn_lock); - - (void) ip_delmulti(group, ipif, B_FALSE, leaving); - return (0); -} - -static int -ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, - ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) -{ - ilg_t *ilg; - boolean_t leaving = B_TRUE; - - ASSERT(IAM_WRITER_ILL(ill)); - - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, v6group, ill); - if ((ilg == NULL) || (ilg->ilg_flags & ILG_DELETED)) { - mutex_exit(&connp->conn_lock); - return (EADDRNOTAVAIL); + /* If the ilg is attached then we serialize using that ill */ + ill = ilg->ilg_ill; + if (ill != NULL) { + /* Prevent the ill and ilg from being freed */ + ill_refhold(ill); + ilg_refhold(ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + if (ilg->ilg_condemned) { + /* Disappeared */ + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); + goto retry; + } } /* @@ -2950,198 +2415,107 @@ ip_opt_delete_group_excl_v6(conn_t *connp, const in6_addr_t *v6group, * actually in our filter list already. If we're removing the * last src in an include list, just delete the ilg. 
*/ - if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { - int err = 0; + if (IN6_IS_ADDR_UNSPECIFIED(v6src)) { + leaving = B_TRUE; + } else { if (fmode != ilg->ilg_fmode) err = EINVAL; else if (ilg->ilg_filter == NULL || !list_has_addr(ilg->ilg_filter, v6src)) err = EADDRNOTAVAIL; if (err != 0) { - mutex_exit(&connp->conn_lock); - return (err); + if (ill != NULL) + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); + goto done; } if (fmode == MODE_IS_INCLUDE && - ilg->ilg_filter->sl_numsrc == 1) + ilg->ilg_filter->sl_numsrc == 1) { + leaving = B_TRUE; v6src = NULL; - else + } else { leaving = B_FALSE; + } } + ilm = ilg->ilg_ilm; + if (leaving) + ilg->ilg_ilm = NULL; ilg_delete(connp, ilg, v6src); - mutex_exit(&connp->conn_lock); - (void) ip_delmulti_v6(v6group, ill, connp->conn_zoneid, B_FALSE, - leaving); - - return (0); -} - -/* - * Handle the following optmgmt: - * IP_DROP_MEMBERSHIP will leave - * MCAST_LEAVE_GROUP will leave - * IP_UNBLOCK_SOURCE will not leave - * MCAST_UNBLOCK_SOURCE will not leave - * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source) - * MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source) - * - * fmode and src parameters may be used to determine which option is - * being set, as follows (the IP_* and MCAST_* versions of each option - * are functionally equivalent): - * opt fmode src - * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE INADDR_ANY - * MCAST_LEAVE_GROUP MODE_IS_INCLUDE INADDR_ANY - * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v4 addr - * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v4 addr - * - * Changing the filter mode is not allowed; if a matching ilg already - * exists and fmode != ilg->ilg_fmode, EINVAL is returned. - * - * The interface to be used may be identified by an address or by an - * index. A pointer to the index is passed; if it is NULL, use the - * address, otherwise, use the index. 
- */ -int -ip_opt_delete_group(conn_t *connp, boolean_t checkonly, ipaddr_t group, - ipaddr_t ifaddr, uint_t *ifindexp, mcast_record_t fmode, ipaddr_t src, - mblk_t *first_mp) -{ - ipif_t *ipif; - ipsq_t *ipsq; - int err; - ill_t *ill; - - err = ip_opt_check(connp, group, src, ifaddr, ifindexp, first_mp, - ip_restart_optmgmt, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_delete_group: no ipif for group " - "0x%x, ifaddr 0x%x\n", - (int)ntohl(group), (int)ntohl(ifaddr))); - } - return (err); - } - ASSERT(ipif != NULL); + if (ill != NULL) + ilg_refrele(ilg); + rw_exit(&connp->conn_ilg_lock); - ill = ipif->ipif_ill; - /* Operation not supported on a virtual network interface */ - if (IS_VNI(ill)) { - ipif_refrele(ipif); - return (EINVAL); + if (ilm != NULL) { + ASSERT(ill != NULL); + (void) ip_delmulti_serial(ilm, B_FALSE, leaving); } - - if (checkonly) { - /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. 
- */ - ipif_refrele(ipif); - return (0); +done: + if (ill != NULL) { + mutex_exit(&ill->ill_mcast_serializer); + ill_refrele(ill); } - - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, ipsq, - NEW_OP); - err = ip_opt_delete_group_excl(connp, group, ipif, fmode, src); - IPSQ_EXIT(ipsq); - - ipif_refrele(ipif); return (err); } /* * Handle the following optmgmt: + * IP_DROP_MEMBERSHIP will leave * IPV6_LEAVE_GROUP will leave * MCAST_LEAVE_GROUP will leave + * IP_UNBLOCK_SOURCE will not leave * MCAST_UNBLOCK_SOURCE will not leave + * IP_LEAVE_SOURCE_GROUP may leave (if leaving last source) * MCAST_LEAVE_SOURCE_GROUP may leave (if leaving last source) * * fmode and src parameters may be used to determine which option is - * being set, as follows (IPV6_LEAVE_GROUP and MCAST_LEAVE_GROUP options - * are functionally equivalent): + * being set, as follows: * opt fmode v6src + * IP_DROP_MEMBERSHIP MODE_IS_INCLUDE unspecified * IPV6_LEAVE_GROUP MODE_IS_INCLUDE unspecified * MCAST_LEAVE_GROUP MODE_IS_INCLUDE unspecified + * IP_UNBLOCK_SOURCE MODE_IS_EXCLUDE IPv4-mapped addr * MCAST_UNBLOCK_SOURCE MODE_IS_EXCLUDE v6 addr + * IP_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE IPv4-mapped addr * MCAST_LEAVE_SOURCE_GROUP MODE_IS_INCLUDE v6 addr * * Changing the filter mode is not allowed; if a matching ilg already * exists and fmode != ilg->ilg_fmode, EINVAL is returned. * + * The interface to be used may be identified by an IPv4 address or by an + * interface index. + * * Handles IPv4-mapped IPv6 multicast addresses by associating them - * with the link-local ipif. Assumes that if v6group is v4-mapped, + * with the IPv4 address. Assumes that if v6group is v4-mapped, * v6src is also v4-mapped. 
*/ int -ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly, - const in6_addr_t *v6group, int ifindex, mcast_record_t fmode, - const in6_addr_t *v6src, mblk_t *first_mp) +ip_opt_delete_group(conn_t *connp, boolean_t checkonly, + const in6_addr_t *v6group, ipaddr_t ifaddr, uint_t ifindex, + mcast_record_t fmode, const in6_addr_t *v6src) { - ill_t *ill; - ipif_t *ipif; - char buf[INET6_ADDRSTRLEN]; - ipaddr_t v4group, v4src; - boolean_t isv6; - ipsq_t *ipsq; - int err; - - err = ip_opt_check_v6(connp, v6group, &v4group, v6src, &v4src, &isv6, - ifindex, first_mp, ip_restart_optmgmt, &ill, &ipif); - if (err != 0) { - if (err != EINPROGRESS) { - ip1dbg(("ip_opt_delete_group_v6: no ill for group %s/" - "index %d\n", inet_ntop(AF_INET6, v6group, buf, - sizeof (buf)), ifindex)); - } - return (err); - } - ASSERT((isv6 && ill != NULL) || (!isv6 && ipif != NULL)); - - /* operation is not supported on the virtual network interface */ - if (isv6) { - if (IS_VNI(ill)) { - ill_refrele(ill); - return (EINVAL); - } - } else { - if (IS_VNI(ipif->ipif_ill)) { - ipif_refrele(ipif); - return (EINVAL); - } - } + /* + * In the normal case below we don't check for the ill existing. + * Instead we look for an existing ilg in _excl. + * If checkonly we sanity check the arguments + */ if (checkonly) { + ill_t *ill; + int err; + + err = ip_opt_check(connp, v6group, v6src, ifaddr, ifindex, + &ill); /* - * do not do operation, just pretend to - new T_CHECK - * semantics. The error return case above if encountered - * considered a good enough "check" here. + * do not do operation, just pretend to - new T_CHECK semantics. + * ip_opt_check is considered a good enough "check" here. 
*/ - if (isv6) + if (ill != NULL) ill_refrele(ill); - else - ipif_refrele(ipif); - return (0); - } - - if (!isv6) { - IPSQ_ENTER_IPIF(ipif, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ip_opt_delete_group_excl(connp, v4group, ipif, fmode, - v4src); - IPSQ_EXIT(ipsq); - ipif_refrele(ipif); - } else { - IPSQ_ENTER_ILL(ill, connp, first_mp, ip_restart_optmgmt, - ipsq, NEW_OP); - err = ip_opt_delete_group_excl_v6(connp, v6group, ill, fmode, - v6src); - IPSQ_EXIT(ipsq); - ill_refrele(ill); + return (err); } - - return (err); + return (ip_opt_delete_group_excl(connp, v6group, ifaddr, ifindex, + fmode, v6src)); } /* @@ -3155,185 +2529,26 @@ ip_opt_delete_group_v6(conn_t *connp, boolean_t checkonly, /* * Add a group to an upper conn group data structure and pass things down * to the interface multicast list (and DLPI) + * Common for IPv4 and IPv6; for IPv4 we can have an ifaddr. */ static int -ilg_add(conn_t *connp, ipaddr_t group, ipif_t *ipif, mcast_record_t fmode, - ipaddr_t src) -{ - int error = 0; - ill_t *ill; - ilg_t *ilg; - ilg_stat_t ilgstat; - slist_t *new_filter = NULL; - int new_fmode; - - ASSERT(IAM_WRITER_IPIF(ipif)); - - ill = ipif->ipif_ill; - - if (!(ill->ill_flags & ILLF_MULTICAST)) - return (EADDRNOTAVAIL); - - /* - * conn_ilg[] is protected by conn_lock. Need to hold the conn_lock - * to walk the conn_ilg[] list in ilg_lookup_ipif(); also needed to - * serialize 2 threads doing join (sock, group1, hme0:0) and - * (sock, group2, hme1:0) where hme0 and hme1 map to different ipsqs, - * but both operations happen on the same conn. - */ - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - - /* - * Depending on the option we're handling, may or may not be okay - * if group has already been added. Figure out our rules based - * on fmode and src params. Also make sure there's enough room - * in the filter if we're adding a source to an existing filter. 
- */ - if (src == INADDR_ANY) { - /* we're joining for all sources, must not have joined */ - if (ilg != NULL) - error = EADDRINUSE; - } else { - if (fmode == MODE_IS_EXCLUDE) { - /* (excl {addr}) => block source, must have joined */ - if (ilg == NULL) - error = EADDRNOTAVAIL; - } - /* (incl {addr}) => join source, may have joined */ - - if (ilg != NULL && - SLIST_CNT(ilg->ilg_filter) == MAX_FILTER_SIZE) - error = ENOBUFS; - } - if (error != 0) { - mutex_exit(&connp->conn_lock); - return (error); - } - - ASSERT(!(ipif->ipif_state_flags & IPIF_CONDEMNED)); - - /* - * Alloc buffer to copy new state into (see below) before - * we make any changes, so we can bail if it fails. - */ - if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); - return (ENOMEM); - } - - if (ilg == NULL) { - ilgstat = ILGSTAT_NEW; - if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); - } - if (src != INADDR_ANY) { - ilg->ilg_filter = l_alloc(); - if (ilg->ilg_filter == NULL) { - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (ENOMEM); - } - ilg->ilg_filter->sl_numsrc = 1; - IN6_IPADDR_TO_V4MAPPED(src, - &ilg->ilg_filter->sl_addr[0]); - } - if (group == INADDR_ANY) { - ilg->ilg_v6group = ipv6_all_zeros; - } else { - IN6_IPADDR_TO_V4MAPPED(group, &ilg->ilg_v6group); - } - ilg->ilg_ipif = ipif; - ilg->ilg_ill = NULL; - ilg->ilg_fmode = fmode; - } else { - int index; - in6_addr_t v6src; - ilgstat = ILGSTAT_CHANGE; - if (ilg->ilg_fmode != fmode || src == INADDR_ANY) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (EINVAL); - } - if (ilg->ilg_filter == NULL) { - ilg->ilg_filter = l_alloc(); - if (ilg->ilg_filter == NULL) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (ENOMEM); - } - } - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - if (list_has_addr(ilg->ilg_filter, &v6src)) { - mutex_exit(&connp->conn_lock); - l_free(new_filter); - 
return (EADDRNOTAVAIL); - } - index = ilg->ilg_filter->sl_numsrc++; - ilg->ilg_filter->sl_addr[index] = v6src; - } - - /* - * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. - */ - new_fmode = ilg->ilg_fmode; - l_copy(ilg->ilg_filter, new_filter); - - mutex_exit(&connp->conn_lock); - - error = ip_addmulti(group, ipif, ilgstat, new_fmode, new_filter); - if (error != 0) { - /* - * Need to undo what we did before calling ip_addmulti()! - * Must look up the ilg again since we've not been holding - * conn_lock. - */ - in6_addr_t v6src; - if (ilgstat == ILGSTAT_NEW) - v6src = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(src, &v6src); - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ipif(connp, group, ipif); - ASSERT(ilg != NULL); - ilg_delete(connp, ilg, &v6src); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); - } - - l_free(new_filter); - return (0); -} - -static int -ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, - mcast_record_t fmode, const in6_addr_t *v6src) +ilg_add(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr, + uint_t ifindex, ill_t *ill, mcast_record_t fmode, const in6_addr_t *v6src) { int error = 0; ilg_t *ilg; ilg_stat_t ilgstat; slist_t *new_filter = NULL; int new_fmode; - - ASSERT(IAM_WRITER_ILL(ill)); + ilm_t *ilm; if (!(ill->ill_flags & ILLF_MULTICAST)) return (EADDRNOTAVAIL); - /* - * conn_lock protects the ilg list. Serializes 2 threads doing - * join (sock, group1, hme0) and (sock, group2, hme1) where hme0 - * and hme1 map to different ipsq's, but both operations happen - * on the same conn. - */ - mutex_enter(&connp->conn_lock); - - ilg = ilg_lookup_ill_v6(connp, v6group, ill); + /* conn_ilg_lock protects the ilg list. 
*/ + ASSERT(MUTEX_HELD(&ill->ill_mcast_serializer)); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); /* * Depending on the option we're handling, may or may not be okay @@ -3358,7 +2573,7 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, error = ENOBUFS; } if (error != 0) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (error); } @@ -3367,21 +2582,23 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * we make any changes, so we can bail if it fails. */ if ((new_filter = l_alloc()) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); return (ENOMEM); } if (ilg == NULL) { if ((ilg = conn_ilg_alloc(connp, &error)) == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (error); } + ilg->ilg_ifindex = ifindex; + ilg->ilg_ifaddr = ifaddr; if (!IN6_IS_ADDR_UNSPECIFIED(v6src)) { ilg->ilg_filter = l_alloc(); if (ilg->ilg_filter == NULL) { ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (ENOMEM); } @@ -3391,25 +2608,24 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, ilgstat = ILGSTAT_NEW; ilg->ilg_v6group = *v6group; ilg->ilg_fmode = fmode; - ilg->ilg_ipif = NULL; ilg->ilg_ill = ill; } else { int index; if (ilg->ilg_fmode != fmode || IN6_IS_ADDR_UNSPECIFIED(v6src)) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (EINVAL); } if (ilg->ilg_filter == NULL) { ilg->ilg_filter = l_alloc(); if (ilg->ilg_filter == NULL) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (ENOMEM); } } if (list_has_addr(ilg->ilg_filter, v6src)) { - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); l_free(new_filter); return (EADDRNOTAVAIL); } @@ -3420,12 +2636,12 @@ ilg_add_v6(conn_t *connp, const in6_addr_t 
*v6group, ill_t *ill, /* * Save copy of ilg's filter state to pass to other functions, - * so we can release conn_lock now. + * so we can release conn_ilg_lock now. */ new_fmode = ilg->ilg_fmode; l_copy(ilg->ilg_filter, new_filter); - mutex_exit(&connp->conn_lock); + rw_exit(&connp->conn_ilg_lock); /* * Now update the ill. We wait to do this until after the ilg @@ -3433,72 +2649,105 @@ ilg_add_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill, * info for the ill, which involves looking at the status of * all the ilgs associated with this group/interface pair. */ - error = ip_addmulti_v6(v6group, ill, connp->conn_zoneid, ilgstat, - new_fmode, new_filter); - if (error != 0) { + ilm = ip_addmulti_serial(v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &error); + + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. + */ + ilg = ilg_lookup(connp, v6group, ifaddr, ifindex); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) { + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + } + error = ENXIO; + goto free_and_exit; + } + + if (ilm != NULL) { + /* Succeeded. Update the ilg to point at the ilm */ + if (ilgstat == ILGSTAT_NEW) { + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ + } else { + /* + * ip_addmulti didn't get a held ilm for + * ILGSTAT_CHANGE; ilm_refcnt was unchanged. + */ + ASSERT(ilg->ilg_ilm == ilm); + } + } else { + ASSERT(error != 0); /* - * But because we waited, we have to undo the ilg update - * if ip_addmulti_v6() fails. We also must lookup ilg - * again, since we've not been holding conn_lock. + * Failed to allocate the ilm. 
+ * Need to undo what we did before calling ip_addmulti() + * If ENETDOWN just clear ill_ilg since so that we + * will rejoin when the ill comes back; don't report ENETDOWN + * to application. */ - in6_addr_t delsrc = - (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; - mutex_enter(&connp->conn_lock); - ilg = ilg_lookup_ill_v6(connp, v6group, ill); - ASSERT(ilg != NULL); - ilg_delete(connp, ilg, &delsrc); - mutex_exit(&connp->conn_lock); - l_free(new_filter); - return (error); + if (ilgstat == ILGSTAT_NEW && error == ENETDOWN) { + ilg->ilg_ill = NULL; + error = 0; + } else { + in6_addr_t delsrc = + (ilgstat == ILGSTAT_NEW) ? ipv6_all_zeros : *v6src; + + ilg_delete(connp, ilg, &delsrc); + } } + rw_exit(&connp->conn_ilg_lock); +free_and_exit: l_free(new_filter); - - return (0); + return (error); } /* - * Find an IPv4 ilg matching group, ill and source + * Find an IPv4 ilg matching group, ill and source. + * The group and source can't be INADDR_ANY here so no need to translate to + * the unspecified IPv6 address. */ -ilg_t * -ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) +boolean_t +conn_hasmembers_ill_withsrc_v4(conn_t *connp, ipaddr_t group, ipaddr_t src, + ill_t *ill) { in6_addr_t v6group, v6src; int i; boolean_t isinlist; ilg_t *ilg; - ipif_t *ipif; - ill_t *ilg_ill; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - /* - * INADDR_ANY is represented as the IPv6 unspecified addr. 
- */ - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + rw_enter(&connp->conn_ilg_lock, RW_READER); + IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ipif = ilg->ilg_ipif) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) + /* ilg_ill could be NULL if an add is in progress */ + if (ilg->ilg_ill != ill) continue; - ASSERT(ilg->ilg_ill == NULL); - ilg_ill = ipif->ipif_ill; - ASSERT(!ilg_ill->ill_isv6); - if (IS_ON_SAME_LAN(ilg_ill, ill) && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { + + /* The callers use upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ - return (ilg); + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } break; } } - if (i == connp->conn_ilg_inuse) - return (NULL); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); + } /* * we have an ilg with matching ill and group; but @@ -3514,44 +2763,49 @@ ilg_lookup_ill_withsrc(conn_t *connp, ipaddr_t group, ipaddr_t src, ill_t *ill) } if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) || - (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) - return (ilg); - - return (NULL); + (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) { + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); + } + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); } /* * Find an IPv6 ilg matching group, ill, and source */ -ilg_t * -ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, +boolean_t +conn_hasmembers_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, const in6_addr_t *v6src, ill_t *ill) { int i; boolean_t isinlist; ilg_t *ilg; - ill_t *ilg_ill; - ASSERT(MUTEX_HELD(&connp->conn_lock)); + 
rw_enter(&connp->conn_ilg_lock, RW_READER); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ilg_ill = ilg->ilg_ill) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) + /* ilg_ill could be NULL if an add is in progress */ + if (ilg->ilg_ill != ill) continue; - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(ilg_ill->ill_isv6); - if (IS_ON_SAME_LAN(ilg_ill, ill) && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { + + /* The callers use upper ill for IPMP */ + ASSERT(!IS_UNDER_IPMP(ill)); + if (IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) { if (SLIST_IS_EMPTY(ilg->ilg_filter)) { /* no source filter, so this is a match */ - return (ilg); + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } break; } } - if (i == connp->conn_ilg_inuse) - return (NULL); + if (ilg == NULL) { + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); + } /* * we have an ilg with matching ill and group; but @@ -3566,61 +2820,34 @@ ilg_lookup_ill_withsrc_v6(conn_t *connp, const in6_addr_t *v6group, } if ((isinlist && ilg->ilg_fmode == MODE_IS_INCLUDE) || - (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) - return (ilg); - - return (NULL); -} - -/* - * Find an IPv6 ilg matching group and ill - */ -ilg_t * -ilg_lookup_ill_v6(conn_t *connp, const in6_addr_t *v6group, ill_t *ill) -{ - ilg_t *ilg; - int i; - ill_t *mem_ill; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((mem_ill = ilg->ilg_ill) == NULL || - (ilg->ilg_flags & ILG_DELETED) != 0) - continue; - ASSERT(ilg->ilg_ipif == NULL); - ASSERT(mem_ill->ill_isv6); - if (mem_ill == ill && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) - return (ilg); + (!isinlist && ilg->ilg_fmode == MODE_IS_EXCLUDE)) { + rw_exit(&connp->conn_ilg_lock); + return (B_TRUE); } - return (NULL); + rw_exit(&connp->conn_ilg_lock); + return (B_FALSE); } 
/* - * Find an IPv4 ilg matching group and ipif + * Find an ilg matching group and ifaddr/ifindex. + * We check both ifaddr and ifindex even though at most one of them + * will be non-zero; that way we always find the right one. */ static ilg_t * -ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif) +ilg_lookup(conn_t *connp, const in6_addr_t *v6group, ipaddr_t ifaddr, + uint_t ifindex) { - in6_addr_t v6group; - int i; ilg_t *ilg; - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(!ipif->ipif_ill->ill_isv6); + ASSERT(RW_LOCK_HELD(&connp->conn_ilg_lock)); - if (group == INADDR_ANY) - v6group = ipv6_all_zeros; - else - IN6_IPADDR_TO_V4MAPPED(group, &v6group); + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; - for (i = 0; i < connp->conn_ilg_inuse; i++) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_flags & ILG_DELETED) == 0 && - IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, &v6group) && - ilg->ilg_ipif == ipif) + if (ilg->ilg_ifaddr == ifaddr && + ilg->ilg_ifindex == ifindex && + IN6_ARE_ADDR_EQUAL(&ilg->ilg_v6group, v6group)) return (ilg); } return (NULL); @@ -3634,363 +2861,479 @@ ilg_lookup_ipif(conn_t *connp, ipaddr_t group, ipif_t *ipif) static void ilg_delete(conn_t *connp, ilg_t *ilg, const in6_addr_t *src) { - int i; - - ASSERT((ilg->ilg_ipif != NULL) ^ (ilg->ilg_ill != NULL)); - ASSERT(ilg->ilg_ipif == NULL || IAM_WRITER_IPIF(ilg->ilg_ipif)); - ASSERT(ilg->ilg_ill == NULL || IAM_WRITER_ILL(ilg->ilg_ill)); - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(!(ilg->ilg_flags & ILG_DELETED)); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); + ASSERT(ilg->ilg_ptpn != NULL); + ASSERT(!ilg->ilg_condemned); if (src == NULL || IN6_IS_ADDR_UNSPECIFIED(src)) { - if (connp->conn_ilg_walker_cnt != 0) { - ilg->ilg_flags |= ILG_DELETED; - return; - } - FREE_SLIST(ilg->ilg_filter); + ilg->ilg_filter = NULL; - i = ilg - &connp->conn_ilg[0]; - ASSERT(i >= 0 && i < connp->conn_ilg_inuse); - - /* Move other entries up one step 
*/ - connp->conn_ilg_inuse--; - for (; i < connp->conn_ilg_inuse; i++) - connp->conn_ilg[i] = connp->conn_ilg[i+1]; + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ill = NULL; + ilg->ilg_condemned = B_TRUE; - if (connp->conn_ilg_inuse == 0) { - mi_free((char *)connp->conn_ilg); - connp->conn_ilg = NULL; - cv_broadcast(&connp->conn_refcv); - } + /* ilg_inactive will unlink from the list */ + ilg_refrele(ilg); } else { l_remove(ilg->ilg_filter, src); } } /* - * Called from conn close. No new ilg can be added or removed. + * Called from conn close. No new ilg can be added or removed * because CONN_CLOSING has been set by ip_close. ilg_add / ilg_delete * will return error if conn has started closing. + * + * We handle locking as follows. + * Under conn_ilg_lock we get the first ilg. As we drop the conn_ilg_lock to + * proceed with the ilm part of the delete we hold a reference on both the ill + * and the ilg. This doesn't prevent changes to the ilg, but prevents it from + * being deleted. + * + * Since the ilg_add code path uses two locks (conn_ilg_lock for the ilg part, + * and ill_mcast_lock for the ip_addmulti part) we can run at a point between + * the two. At that point ilg_ill is set, but ilg_ilm hasn't yet been set. In + * that case we delete the ilg here, which makes ilg_add discover that the ilg + * has disappeared when ip_addmulti returns, so it will discard the ilm it just + * added. */ void ilg_delete_all(conn_t *connp) { - int i; - ipif_t *ipif = NULL; - ill_t *ill = NULL; - ilg_t *ilg; - in6_addr_t v6group; - boolean_t success; - ipsq_t *ipsq; + ilg_t *ilg, *next_ilg, *held_ilg; + ilm_t *ilm; + ill_t *ill; + boolean_t need_refrele; + /* + * Can not run if there is a conn_update_ill already running. + * Wait for it to complete. Caller should have already set CONN_CLOSING + * which prevents any new threads to run in conn_update_ill. 
+ */ mutex_enter(&connp->conn_lock); -retry: - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - /* - * Since this walk is not atomic (we drop the - * conn_lock and wait in ipsq_enter) we need - * to check for the ILG_DELETED flag. - */ - if (ilg->ilg_flags & ILG_DELETED) - continue; - - if (IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)) { - ipif = ilg->ilg_ipif; - ill = ipif->ipif_ill; - } else { - ipif = NULL; - ill = ilg->ilg_ill; - } + ASSERT(connp->conn_state_flags & CONN_CLOSING); + while (connp->conn_state_flags & CONN_UPDATE_ILL) + cv_wait(&connp->conn_cv, &connp->conn_lock); + mutex_exit(&connp->conn_lock); - /* - * We may not be able to refhold the ill if the ill/ipif - * is changing. But we need to make sure that the ill will - * not vanish. So we just bump up the ill_waiter count. - * If we are unable to do even that, then the ill is closing, - * in which case the unplumb thread will handle the cleanup, - * and we move on to the next ilg. - */ - if (!ill_waiter_inc(ill)) + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + ilg = connp->conn_ilg; + held_ilg = NULL; + while (ilg != NULL) { + if (ilg->ilg_condemned) { + ilg = ilg->ilg_next; continue; - - mutex_exit(&connp->conn_lock); - /* - * To prevent deadlock between ill close which waits inside - * the perimeter, and conn close, ipsq_enter returns error, - * the moment ILL_CONDEMNED is set, in which case ill close - * takes responsibility to cleanup the ilgs. Note that we - * have not yet set condemned flag, otherwise the conn can't - * be refheld for cleanup by those routines and it would be - * a mutual deadlock. 
- */ - success = ipsq_enter(ill, B_FALSE, NEW_OP); - ipsq = ill->ill_phyint->phyint_ipsq; - ill_waiter_dcr(ill); - mutex_enter(&connp->conn_lock); - if (!success) + } + /* If the ilg is detached then no need to serialize */ + if (ilg->ilg_ilm == NULL) { + next_ilg = ilg->ilg_next; + ilg_delete(connp, ilg, NULL); + ilg = next_ilg; continue; + } + ill = ilg->ilg_ilm->ilm_ill; /* - * Move on if the ilg was deleted while conn_lock was dropped. + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock and then + * check that we still have an ilm. */ - if (ilg->ilg_flags & ILG_DELETED) { - mutex_exit(&connp->conn_lock); - ipsq_exit(ipsq); - mutex_enter(&connp->conn_lock); - continue; + need_refrele = B_FALSE; + if (!mutex_tryenter(&ill->ill_mcast_serializer)) { + ill_refhold(ill); + need_refrele = B_TRUE; + ilg_refhold(ilg); + if (held_ilg != NULL) + ilg_refrele(held_ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + if (ilg->ilg_condemned) { + ilg = ilg->ilg_next; + goto next; + } } - v6group = ilg->ilg_v6group; + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + next_ilg = ilg->ilg_next; ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + ilg = next_ilg; + rw_exit(&connp->conn_ilg_lock); - if (ipif != NULL) { - (void) ip_delmulti(V4_PART_OF_V6(v6group), ipif, - B_FALSE, B_TRUE); - } else { - (void) ip_delmulti_v6(&v6group, ill, - connp->conn_zoneid, B_FALSE, B_TRUE); - } - ipsq_exit(ipsq); - mutex_enter(&connp->conn_lock); - } - ILG_WALKER_RELE(connp); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); - /* If any ill was skipped above wait and retry */ - if (connp->conn_ilg_inuse != 0) { - cv_wait(&connp->conn_refcv, &connp->conn_lock); - goto retry; + next: + mutex_exit(&ill->ill_mcast_serializer); + if (need_refrele) { + /* Drop ill reference while we hold no locks */ + ill_refrele(ill); + } + 
rw_enter(&connp->conn_ilg_lock, RW_WRITER); } - mutex_exit(&connp->conn_lock); + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); } /* - * Called from ill close by ipcl_walk for clearing conn_ilg and - * conn_multicast_ipif for a given ipif. conn is held by caller. - * Note that ipcl_walk only walks conns that are not yet condemned. - * condemned conns can't be refheld. For this reason, conn must become clean - * first, i.e. it must not refer to any ill/ire/ipif and then only set - * condemned flag. + * Attach the ilg to an ilm on the ill. If it fails we leave ilg_ill as NULL so + * that a subsequent attempt can attach it. + * Drops and reacquires conn_ilg_lock. */ static void -conn_delete_ipif(conn_t *connp, caddr_t arg) +ilg_attach(conn_t *connp, ilg_t *ilg, ill_t *ill) { - ipif_t *ipif = (ipif_t *)arg; - int i; - char group_buf1[INET6_ADDRSTRLEN]; - char group_buf2[INET6_ADDRSTRLEN]; - ipaddr_t group; - ilg_t *ilg; + ilg_stat_t ilgstat; + slist_t *new_filter; + int new_fmode; + in6_addr_t v6group; + ipaddr_t ifaddr; + uint_t ifindex; + ilm_t *ilm; + int error = 0; + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); /* - * Even though conn_ilg_inuse can change while we are in this loop, - * i.e.ilgs can be created or deleted on this connp, no new ilgs can - * be created or deleted for this connp, on this ill, since this ill - * is the perimeter. So we won't miss any ilg in this cleanup. + * Alloc buffer to copy new state into (see below) before + * we make any changes, so we can bail if it fails. */ - mutex_enter(&connp->conn_lock); + if ((new_filter = l_alloc()) == NULL) + return; /* - * Increment the walker count, so that ilg repacking does not - * occur while we are in the loop. + * Save copy of ilg's filter state to pass to other functions, so + * we can release conn_ilg_lock now. + * Set ilg_ill so that an unplumb can find us. 
*/ - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if (ilg->ilg_ipif != ipif || (ilg->ilg_flags & ILG_DELETED)) - continue; - /* - * ip_close cannot be cleaning this ilg at the same time. - * since it also has to execute in this ill's perimeter which - * we are now holding. Only a clean conn can be condemned. - */ - ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED)); - - /* Blow away the membership */ - ip1dbg(("conn_delete_ilg_ipif: %s on %s (%s)\n", - inet_ntop(AF_INET6, &connp->conn_ilg[i].ilg_v6group, - group_buf1, sizeof (group_buf1)), - inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, - group_buf2, sizeof (group_buf2)), - ipif->ipif_ill->ill_name)); - - /* ilg_ipif is NULL for V6, so we won't be here */ - ASSERT(IN6_IS_ADDR_V4MAPPED(&ilg->ilg_v6group)); + new_fmode = ilg->ilg_fmode; + l_copy(ilg->ilg_filter, new_filter); + v6group = ilg->ilg_v6group; + ifaddr = ilg->ilg_ifaddr; + ifindex = ilg->ilg_ifindex; + ilgstat = ILGSTAT_NEW; - group = V4_PART_OF_V6(ilg->ilg_v6group); - ilg_delete(connp, &connp->conn_ilg[i], NULL); - mutex_exit(&connp->conn_lock); + ilg->ilg_ill = ill; + ASSERT(ilg->ilg_ilm == NULL); + rw_exit(&connp->conn_ilg_lock); - (void) ip_delmulti(group, ipif, B_FALSE, B_TRUE); - mutex_enter(&connp->conn_lock); - } + ilm = ip_addmulti_serial(&v6group, ill, connp->conn_zoneid, ilgstat, + new_fmode, new_filter, &error); + l_free(new_filter); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); /* - * If we are the last walker, need to physically delete the - * ilgs and repack. + * Must look up the ilg again since we've not been holding + * conn_ilg_lock. The ilg could have disappeared due to an unplumb + * having called conn_update_ill, which can run once we dropped the + * conn_ilg_lock above. 
*/ - ILG_WALKER_RELE(connp); - - if (connp->conn_multicast_ipif == ipif) { - /* Revert to late binding */ - connp->conn_multicast_ipif = NULL; + ilg = ilg_lookup(connp, &v6group, ifaddr, ifindex); + if (ilg == NULL) { + if (ilm != NULL) { + rw_exit(&connp->conn_ilg_lock); + (void) ip_delmulti_serial(ilm, B_FALSE, + (ilgstat == ILGSTAT_NEW)); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + return; } - mutex_exit(&connp->conn_lock); - - conn_delete_ire(connp, (caddr_t)ipif); + if (ilm == NULL) { + ilg->ilg_ill = NULL; + return; + } + ASSERT(ilg->ilg_ilm == NULL); + ilg->ilg_ilm = ilm; + ilm->ilm_ifaddr = ifaddr; /* For netstat */ } /* - * Called from ill close by ipcl_walk for clearing conn_ilg and - * conn_multicast_ill for a given ill. conn is held by caller. + * Called when an ill is unplumbed to make sure that there are no + * dangling conn references to that ill. In that case ill is non-NULL and + * we make sure we remove all references to it. + * Also called when we should revisit the ilg_ill used for multicast + * memberships, in which case ill is NULL. + * + * conn is held by caller. + * * Note that ipcl_walk only walks conns that are not yet condemned. * condemned conns can't be refheld. For this reason, conn must become clean - * first, i.e. it must not refer to any ill/ire/ipif and then only set + * first, i.e. it must not refer to any ill/ire and then only set * condemned flag. + * + * We leave ixa_multicast_ifindex in place. We prefer dropping + * packets instead of sending them out the wrong interface. + * + * We keep the ilg around in a detached state (with ilg_ill and ilg_ilm being + * NULL) so that the application can leave it later. Also, if ilg_ifaddr and + * ilg_ifindex are zero, indicating that the system should pick the interface, + * then we attempt to reselect the ill and join on it. + * + * Locking notes: + * Under conn_ilg_lock we get the first ilg. 
As we drop the conn_ilg_lock to + * proceed with the ilm part of the delete we hold a reference on both the ill + * and the ilg. This doesn't prevent changes to the ilg, but prevents it from + * being deleted. + * + * Note: if this function is called when new ill/ipif's arrive or change status + * (SIOCSLIFINDEX, SIOCSLIFADDR) then we will attempt to attach any ilgs with + * a NULL ilg_ill to an ill/ilm. */ static void -conn_delete_ill(conn_t *connp, caddr_t arg) +conn_update_ill(conn_t *connp, caddr_t arg) { ill_t *ill = (ill_t *)arg; - int i; - char group_buf[INET6_ADDRSTRLEN]; - in6_addr_t v6group; - ilg_t *ilg; /* - * Even though conn_ilg_inuse can change while we are in this loop, - * no new ilgs can be created/deleted for this connp, on this - * ill, since this ill is the perimeter. So we won't miss any ilg - * in this cleanup. + * We have to prevent ip_close/ilg_delete_all from running at + * the same time. ip_close sets CONN_CLOSING before doing the ilg_delete + * all, and we set CONN_UPDATE_ILL. That ensures that only one of + * ilg_delete_all and conn_update_ill run at a time for a given conn. + * If ilg_delete_all got here first, then we have nothing to do. */ mutex_enter(&connp->conn_lock); + if (connp->conn_state_flags & (CONN_CLOSING|CONN_UPDATE_ILL)) { + /* Caller has to wait for ill_ilm_cnt to drop to zero */ + mutex_exit(&connp->conn_lock); + return; + } + connp->conn_state_flags |= CONN_UPDATE_ILL; + mutex_exit(&connp->conn_lock); - /* - * Increment the walker count, so that ilg repacking does not - * occur while we are in the loop. - */ - ILG_WALKER_HOLD(connp); - for (i = connp->conn_ilg_inuse - 1; i >= 0; i--) { - ilg = &connp->conn_ilg[i]; - if ((ilg->ilg_ill == ill) && !(ilg->ilg_flags & ILG_DELETED)) { - /* - * ip_close cannot be cleaning this ilg at the same - * time, since it also has to execute in this ill's - * perimeter which we are now holding. Only a clean - * conn can be condemned. 
- */ - ASSERT(!(connp->conn_state_flags & CONN_CONDEMNED)); - - /* Blow away the membership */ - ip1dbg(("conn_delete_ilg_ill: %s on %s\n", - inet_ntop(AF_INET6, &ilg->ilg_v6group, - group_buf, sizeof (group_buf)), - ill->ill_name)); + if (ill != NULL) + ilg_check_detach(connp, ill); - v6group = ilg->ilg_v6group; - ilg_delete(connp, ilg, NULL); - mutex_exit(&connp->conn_lock); + ilg_check_reattach(connp); - (void) ip_delmulti_v6(&v6group, ill, - connp->conn_zoneid, B_FALSE, B_TRUE); - mutex_enter(&connp->conn_lock); - } - } - /* - * If we are the last walker, need to physically delete the - * ilgs and repack. - */ - ILG_WALKER_RELE(connp); - - if (connp->conn_multicast_ill == ill) { - /* Revert to late binding */ - connp->conn_multicast_ill = NULL; - } + /* Do we need to wake up a thread in ilg_delete_all? */ + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_UPDATE_ILL; + if (connp->conn_state_flags & CONN_CLOSING) + cv_broadcast(&connp->conn_cv); mutex_exit(&connp->conn_lock); } -/* - * Called when an ipif is unplumbed to make sure that there are no - * dangling conn references to that ipif. - * Handles ilg_ipif and conn_multicast_ipif - */ -void -reset_conn_ipif(ipif) - ipif_t *ipif; +/* Detach from an ill that is going away */ +static void +ilg_check_detach(conn_t *connp, ill_t *ill) { - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + char group_buf[INET6_ADDRSTRLEN]; + ilg_t *ilg, *held_ilg; + ilm_t *ilm; - ipcl_walk(conn_delete_ipif, (caddr_t)ipif, ipst); -} + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + held_ilg = NULL; + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; -/* - * Called when an ill is unplumbed to make sure that there are no - * dangling conn references to that ill. - * Handles ilg_ill, conn_multicast_ill. 
- */ -void -reset_conn_ill(ill_t *ill) -{ - ip_stack_t *ipst = ill->ill_ipst; + if (ilg->ilg_ill != ill) + continue; + + /* Detach from current ill */ + ip1dbg(("ilg_check_detach: detach %s on %s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ilg->ilg_ill->ill_name)); + + /* Detach this ilg from the ill/ilm */ + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + ilg->ilg_ill = NULL; + if (ilm == NULL) + continue; - ipcl_walk(conn_delete_ill, (caddr_t)ill, ipst); + /* Prevent ilg from disappearing */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); + mutex_exit(&ill->ill_mcast_serializer); } -#ifdef DEBUG /* - * Walk functions walk all the interfaces in the system to make - * sure that there is no refernece to the ipif or ill that is - * going away. + * Check if there is a place to attach the conn_ilgs. We do this for both + * detached ilgs and attached ones, since for the latter there could be + * a better ill to attach them to. 
*/ -int -ilm_walk_ill(ill_t *ill) +static void +ilg_check_reattach(conn_t *connp) { - int cnt = 0; - ill_t *till; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ill->ill_ipst; - - rw_enter(&ipst->ips_ill_g_lock, RW_READER); - till = ILL_START_WALK_ALL(&ctx, ipst); - for (; till != NULL; till = ill_next(&ctx, till)) { - mutex_enter(&till->ill_lock); - for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_ill == ill) { - cnt++; + ill_t *ill; + char group_buf[INET6_ADDRSTRLEN]; + ilg_t *ilg, *held_ilg; + ilm_t *ilm; + zoneid_t zoneid = IPCL_ZONEID(connp); + int error; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + held_ilg = NULL; + for (ilg = connp->conn_ilg; ilg != NULL; ilg = ilg->ilg_next) { + if (ilg->ilg_condemned) + continue; + + /* Check if the conn_ill matches what we would pick now */ + ill = ill_mcast_lookup(&ilg->ilg_v6group, ilg->ilg_ifaddr, + ilg->ilg_ifindex, zoneid, ipst, &error); + + /* + * Make sure the ill is usable for multicast and that + * we can send the DL_ADDMULTI_REQ before we create an + * ilm. + */ + if (ill != NULL && + (!(ill->ill_flags & ILLF_MULTICAST) || !ill->ill_dl_up)) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + ill = NULL; + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + + /* Is the ill unchanged, even if both are NULL? 
*/ + if (ill == ilg->ilg_ill) { + if (ill != NULL) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); } + continue; } - mutex_exit(&till->ill_lock); - } - rw_exit(&ipst->ips_ill_g_lock); - return (cnt); + /* Something changed; detach from old first if needed */ + if (ilg->ilg_ill != NULL) { + ill_t *ill2 = ilg->ilg_ill; + boolean_t need_refrele = B_FALSE; + + /* + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock. + */ + if (!mutex_tryenter(&ill2->ill_mcast_serializer)) { + ill_refhold(ill2); + need_refrele = B_TRUE; + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill2->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + /* + * Check that nobody else re-attached the ilg while we + * dropped the lock. + */ + if (ilg->ilg_ill == ill2) { + ASSERT(!ilg->ilg_condemned); + /* Detach from current ill */ + ip1dbg(("conn_check_reattach: detach %s/%s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ill2->ill_name)); + + ilm = ilg->ilg_ilm; + ilg->ilg_ilm = NULL; + ilg->ilg_ill = NULL; + } else { + ilm = NULL; + } + rw_exit(&connp->conn_ilg_lock); + if (ilm != NULL) + (void) ip_delmulti_serial(ilm, B_FALSE, B_TRUE); + mutex_exit(&ill2->ill_mcast_serializer); + if (need_refrele) { + /* Drop ill reference while we hold no locks */ + ill_refrele(ill2); + } + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* + * While we dropped conn_ilg_lock some other thread + * could have attached this ilg, thus we check again. 
+ */ + if (ilg->ilg_ill != NULL) { + if (ill != NULL) { + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, + RW_WRITER); + } + continue; + } + } + if (ill != NULL) { + /* + * In order to serialize on the ill we try to enter + * and if that fails we unlock and relock. + */ + if (!mutex_tryenter(&ill->ill_mcast_serializer)) { + /* Already have a refhold on ill */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + mutex_enter(&ill->ill_mcast_serializer); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + /* Note that ilg could have become condemned */ + } + + /* + * Check that nobody else attached the ilg and that + * it wasn't condemned while we dropped the lock. + */ + if (ilg->ilg_ill == NULL && !ilg->ilg_condemned) { + /* + * Attach to the new ill. Can fail in which + * case ilg_ill will remain NULL. ilg_attach + * drops and reacquires conn_ilg_lock. + */ + ip1dbg(("conn_check_reattach: attach %s/%s\n", + inet_ntop(AF_INET6, &ilg->ilg_v6group, + group_buf, sizeof (group_buf)), + ill->ill_name)); + ilg_attach(connp, ilg, ill); + ASSERT(RW_WRITE_HELD(&connp->conn_ilg_lock)); + } + mutex_exit(&ill->ill_mcast_serializer); + /* Drop locks across ill_refrele */ + ilg_transfer_hold(held_ilg, ilg); + held_ilg = ilg; + rw_exit(&connp->conn_ilg_lock); + ill_refrele(ill); + rw_enter(&connp->conn_ilg_lock, RW_WRITER); + } + } + if (held_ilg != NULL) + ilg_refrele(held_ilg); + rw_exit(&connp->conn_ilg_lock); } /* - * This function is called before the ipif is freed. + * Called when an ill is unplumbed to make sure that there are no + * dangling conn references to that ill. In that case ill is non-NULL and + * we make sure we remove all references to it. + * Also called when we should revisit the ilg_ill used for multicast + * memberships, in which case ill is NULL. 
*/ -int -ilm_walk_ipif(ipif_t *ipif) +void +update_conn_ill(ill_t *ill, ip_stack_t *ipst) { - int cnt = 0; - ill_t *till; - ilm_t *ilm; - ill_walk_context_t ctx; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - till = ILL_START_WALK_ALL(&ctx, ipst); - for (; till != NULL; till = ill_next(&ctx, till)) { - mutex_enter(&till->ill_lock); - for (ilm = till->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { - if (ilm->ilm_ipif == ipif) { - cnt++; - } - } - mutex_exit(&till->ill_lock); - } - return (cnt); + ipcl_walk(conn_update_ill, (caddr_t)ill, ipst); } -#endif diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index 35f9d541e8..97096bea99 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -40,6 +40,7 @@ #include <sys/zone.h> #include <sys/ethernet.h> #include <sys/sdt.h> +#include <sys/mac.h> #include <net/if.h> #include <net/if_types.h> @@ -61,53 +62,93 @@ #include <inet/ip_rts.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> -#include <inet/ipsec_impl.h> -#include <inet/ipsec_info.h> #include <inet/sctp_ip.h> +#include <inet/ip_arp.h> #include <inet/ip2mac_impl.h> +#define ANNOUNCE_INTERVAL(isv6) \ + (isv6 ? ipst->ips_ip_ndp_unsolicit_interval : \ + ipst->ips_ip_arp_publish_interval) + +#define DEFENSE_INTERVAL(isv6) \ + (isv6 ? ipst->ips_ndp_defend_interval : \ + ipst->ips_arp_defend_interval) + +/* Non-tunable probe interval, based on link capabilities */ +#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) + +/* + * The IPv4 Link Local address space is special; we do extra duplicate checking + * there, as the entire assignment mechanism rests on random numbers. + */ +#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ + ((uchar_t *)ptr)[1] == 254) + +/* + * NCE_EXTERNAL_FLAGS_MASK defines the set of ncec_flags that may be passed + * in to the ncec*add* functions. 
+ * + * NCE_F_AUTHORITY means that we ignore any incoming adverts for that + * mapping (though DAD is performed for the mapping). NCE_F_PUBLISH means + * that we will respond to requests for the protocol address. + */ +#define NCE_EXTERNAL_FLAGS_MASK \ + (NCE_F_MYADDR | NCE_F_ISROUTER | NCE_F_NONUD | \ + NCE_F_ANYCAST | NCE_F_UNSOL_ADV | NCE_F_BCAST | NCE_F_MCAST | \ + NCE_F_AUTHORITY | NCE_F_PUBLISH | NCE_F_STATIC) + /* * Function names with nce_ prefix are static while function * names with ndp_ prefix are used by rest of the IP. * * Lock ordering: * - * ndp_g_lock -> ill_lock -> nce_lock + * ndp_g_lock -> ill_lock -> ncec_lock * * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and - * nce_next. Nce_lock protects the contents of the NCE (particularly - * nce_refcnt). - */ - -static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, - uint32_t ll_addr_len); -static void nce_ire_delete(nce_t *nce); -static void nce_ire_delete1(ire_t *ire, char *nce_arg); -static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); -static nce_t *nce_lookup_addr(ill_t *, boolean_t, const in6_addr_t *, - nce_t *); -static nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); -static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, - uchar_t *addr); -static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); -static void nce_queue_mp(nce_t *nce, mblk_t *mp); -static mblk_t *nce_udreq_alloc(ill_t *ill); -static void nce_update(nce_t *nce, uint16_t new_state, - uchar_t *new_ll_addr); -static uint32_t nce_solicit(nce_t *nce, in6_addr_t src); -static boolean_t nce_xmit(ill_t *ill, uint8_t type, - boolean_t use_lla_addr, const in6_addr_t *sender, + * ncec_next. ncec_lock protects the contents of the NCE (particularly + * ncec_refcnt). 
+ */ + +static void nce_cleanup_list(ncec_t *ncec); +static void nce_set_ll(ncec_t *ncec, uchar_t *ll_addr); +static ncec_t *ncec_lookup_illgrp(ill_t *, const in6_addr_t *, + ncec_t *); +static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *); +static int nce_set_multicast_v6(ill_t *ill, const in6_addr_t *addr, + uint16_t ncec_flags, nce_t **newnce); +static int nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, + uint16_t ncec_flags, nce_t **newnce); +static boolean_t ndp_xmit(ill_t *ill, uint32_t operation, + uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender, const in6_addr_t *target, int flag); -static boolean_t nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, - const in6_addr_t *target, uint_t flags); -static boolean_t nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, - const in6_addr_t *src, uint_t flags); -static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, - nce_t **, nce_t *); -static ipif_t *ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill); +static void ncec_refhold_locked(ncec_t *); +static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *); +static void nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t); +static int nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *, + uint16_t, uint16_t, nce_t **); +static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *); +static nce_t *nce_add(ill_t *, ncec_t *); +static void nce_inactive(nce_t *); +extern nce_t *nce_lookup(ill_t *, const in6_addr_t *); +static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *); +static int nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *, + uint16_t, uint16_t, nce_t **); +static int nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *, + uint16_t, uint16_t, nce_t **); +static int nce_add_v6_postprocess(nce_t *); +static int nce_add_v4_postprocess(nce_t *); +static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *); +static clock_t nce_fuzz_interval(clock_t, boolean_t); +static void nce_resolv_ipmp_ok(ncec_t *); 
+static void nce_walk_common(ill_t *, pfi_t, void *); +static void nce_start_timer(ncec_t *, uint_t); +static nce_t *nce_fastpath_create(ill_t *, ncec_t *); +static void nce_fastpath_trigger(nce_t *); +static nce_t *nce_fastpath(ncec_t *, boolean_t, nce_t *); #ifdef DEBUG -static void nce_trace_cleanup(const nce_t *); +static void ncec_trace_cleanup(const ncec_t *); #endif #define NCE_HASH_PTR_V4(ipst, addr) \ @@ -117,233 +158,245 @@ static void nce_trace_cleanup(const nce_t *); (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) -/* Non-tunable probe interval, based on link capabilities */ -#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) +extern kmem_cache_t *ncec_cache; +extern kmem_cache_t *nce_cache; + +/* + * Send out a IPv6 (unicast) or IPv4 (broadcast) DAD probe + * If src_ill is not null, the ncec_addr is bound to src_ill. The + * src_ill is ignored by nce_dad for IPv4 Neighbor Cache entries where + * the probe is sent on the ncec_ill (in the non-IPMP case) or the + * IPMP cast_ill (in the IPMP case). + * + * Note that the probe interval is based on ncec->ncec_ill which + * may be the ipmp_ill. + */ +static void +nce_dad(ncec_t *ncec, ill_t *src_ill, boolean_t send_probe) +{ + boolean_t dropped; + uint32_t probe_interval; + + ASSERT(!(ncec->ncec_flags & NCE_F_MCAST)); + ASSERT(!(ncec->ncec_flags & NCE_F_BCAST)); + if (ncec->ncec_ipversion == IPV6_VERSION) { + dropped = ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &ipv6_all_zeros, &ncec->ncec_addr, NDP_PROBE); + probe_interval = ILL_PROBE_INTERVAL(ncec->ncec_ill); + } else { + /* IPv4 DAD delay the initial probe. 
*/ + if (send_probe) + dropped = arp_probe(ncec); + else + dropped = B_TRUE; + probe_interval = nce_fuzz_interval(ncec->ncec_xmit_interval, + !send_probe); + } + if (!dropped) { + mutex_enter(&ncec->ncec_lock); + ncec->ncec_pcnt--; + mutex_exit(&ncec->ncec_lock); + } + nce_restart_timer(ncec, probe_interval); +} + +/* + * Compute default flags to use for an advertisement of this ncec's address. + */ +static int +nce_advert_flags(const ncec_t *ncec) +{ + int flag = 0; + + if (ncec->ncec_flags & NCE_F_ISROUTER) + flag |= NDP_ISROUTER; + if (!(ncec->ncec_flags & NCE_F_ANYCAST)) + flag |= NDP_ORIDE; + + return (flag); +} /* * NDP Cache Entry creation routine. * Mapped entries will never do NUD . * This routine must always be called with ndp6->ndp_g_lock held. - * Prior to return, nce_refcnt is incremented. */ int -ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *extract_mask, - uint32_t hw_extract_start, uint16_t flags, uint16_t state, - nce_t **newnce) +nce_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - static nce_t nce_nil; - nce_t *nce; - mblk_t *mp; - mblk_t *template; - nce_t **ncep; int err; - boolean_t dropped = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); ASSERT(ill != NULL && ill->ill_isv6); - if (IN6_IS_ADDR_UNSPECIFIED(addr)) { - ip0dbg(("ndp_add_v6: no addr\n")); - return (EINVAL); - } - if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { - ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags)); - return (EINVAL); - } - if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) && - (flags & NCE_F_MAPPING)) { - ip0dbg(("ndp_add_v6: extract mask zero for mapping")); - return (EINVAL); - } - /* - * Allocate the mblk to hold the nce. - * - * XXX This can come out of a separate cache - nce_cache. 
- * We don't need the mp anymore as there are no more - * "qwriter"s - */ - mp = allocb(sizeof (nce_t), BPRI_MED); - if (mp == NULL) - return (ENOMEM); - nce = (nce_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&nce[1]; - *nce = nce_nil; - - /* - * This one holds link layer address - */ - if (ill->ill_net_type == IRE_IF_RESOLVER) { - template = nce_udreq_alloc(ill); - } else { - if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && - ill->ill_mactype != DL_IPV6) { - /* - * We create a nce_res_mp with the IP nexthop address - * as the destination address if the physical length - * is exactly 16 bytes for point-to-multipoint links - * that do their own resolution from IP to link-layer - * address. - */ - template = ill_dlur_gen((uchar_t *)addr, - ill->ill_phys_addr_length, ill->ill_sap, - ill->ill_sap_length); - } else { - if (ill->ill_resolver_mp == NULL) { - freeb(mp); - return (EINVAL); - } - ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER)); - template = copyb(ill->ill_resolver_mp); - } - } - if (template == NULL) { - freeb(mp); - return (ENOMEM); - } - nce->nce_ill = ill; - nce->nce_ipversion = IPV6_VERSION; - nce->nce_flags = flags; - nce->nce_state = state; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - nce->nce_rcnt = ill->ill_xmit_count; - nce->nce_addr = *addr; - nce->nce_mask = *mask; - nce->nce_extract_mask = *extract_mask; - nce->nce_ll_extract_start = hw_extract_start; - nce->nce_fp_mp = NULL; - nce->nce_res_mp = template; - if (state == ND_REACHABLE) - nce->nce_last = TICK_TO_MSEC(lbolt64); - else - nce->nce_last = 0; - nce->nce_qd_mp = NULL; - nce->nce_mp = mp; - if (hw_addr != NULL) - nce_set_ll(nce, hw_addr); - /* This one is for nce getting created */ - nce->nce_refcnt = 1; - mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); - if (nce->nce_flags & NCE_F_MAPPING) { - ASSERT(IN6_IS_ADDR_MULTICAST(addr)); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask)); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); - ncep = &ipst->ips_ndp6->nce_mask_entries; 
- } else { - ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - } + err = nce_add_common(ill, hw_addr, hw_addr_len, addr, flags, state, + &nce); + if (err != 0) + return (err); + ASSERT(newnce != NULL); + *newnce = nce; + return (err); +} - nce->nce_trace_disable = B_FALSE; +/* + * Post-processing routine to be executed after nce_add_v6(). This function + * triggers fastpath (if appropriate) and DAD on the newly added nce entry + * and must be called without any locks held. + */ +int +nce_add_v6_postprocess(nce_t *nce) +{ + ncec_t *ncec = nce->nce_common; + boolean_t dropped = B_FALSE; + uchar_t *hw_addr = ncec->ncec_lladdr; + uint_t hw_addr_len = ncec->ncec_lladdr_length; + ill_t *ill = ncec->ncec_ill; + int err = 0; + uint16_t flags = ncec->ncec_flags; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t trigger_fastpath = B_TRUE; - list_create(&nce->nce_cb, sizeof (nce_cb_t), - offsetof(nce_cb_t, nce_cb_node)); /* - * Atomically ensure that the ill is not CONDEMNED, before - * adding the NCE. + * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then + * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
+ * We call nce_fastpath from nce_update if the link layer address of + * the peer changes from nce_update */ - mutex_enter(&ill->ill_lock); - if (ill->ill_state_flags & ILL_CONDEMNED) { - mutex_exit(&ill->ill_lock); - freeb(mp); - freeb(template); - return (EINVAL); - } - if ((nce->nce_next = *ncep) != NULL) - nce->nce_next->nce_ptpn = &nce->nce_next; - *ncep = nce; - nce->nce_ptpn = ncep; - *newnce = nce; - /* This one is for nce being used by an active thread */ - NCE_REFHOLD(*newnce); - - /* Bump up the number of nce's referencing this ill */ - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt++; - mutex_exit(&ill->ill_lock); - - err = 0; - if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) { - mutex_enter(&nce->nce_lock); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || + (hw_addr == NULL && ill->ill_net_type != IRE_IF_NORESOLVER)) + trigger_fastpath = B_FALSE; + + if (trigger_fastpath) + nce_fastpath_trigger(nce); + if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { + ill_t *hwaddr_ill; + /* + * Unicast entry that needs DAD. + */ + if (IS_IPMP(ill)) { + hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, + hw_addr, hw_addr_len); + } else { + hwaddr_ill = ill; } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce_dad(ncec, hwaddr_ill, B_TRUE); err = EINPROGRESS; } else if (flags & NCE_F_UNSOL_ADV) { /* * We account for the transmit below by assigning one * less than the ndd variable. Subsequent decrements - * are done in ndp_timer. + * are done in nce_timer. 
*/ - mutex_enter(&nce->nce_lock); - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_advert(nce, B_TRUE, &ipv6_all_hosts_mcast, - 0); - mutex_enter(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); + ncec->ncec_unsolicit_count = + ipst->ips_ip_ndp_unsolicit_count - 1; + mutex_exit(&ncec->ncec_lock); + dropped = ndp_xmit(ill, + ND_NEIGHBOR_ADVERT, + hw_addr, + hw_addr_len, + &ncec->ncec_addr, /* Source and target of the adv */ + &ipv6_all_hosts_mcast, /* Destination of the packet */ + nce_advert_flags(ncec)); + mutex_enter(&ncec->ncec_lock); if (dropped) - nce->nce_unsolicit_count++; - if (nce->nce_unsolicit_count != 0) { - ASSERT(nce->nce_timeout_id == 0); - nce->nce_timeout_id = timeout(ndp_timer, nce, - MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = ddi_get_lbolt(); + if (ncec->ncec_unsolicit_count != 0) { + nce_start_timer(ncec, + ipst->ips_ip_ndp_unsolicit_interval); } - mutex_exit(&nce->nce_lock); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + mutex_exit(&ncec->ncec_lock); } - - /* - * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then - * we call nce_fastpath as soon as the nce is resolved in ndp_process. - * We call nce_fastpath from nce_update if the link layer address of - * the peer changes from nce_update - */ - if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) - nce_fastpath(nce); return (err); } +/* + * Atomically lookup and add (if needed) Neighbor Cache information for + * an address. + * + * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses + * are always added pointing at the ipmp_ill. Thus, when the ill passed + * to nce_add_v6 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t + * entries will be created, both pointing at the same ncec_t. 
The nce_t + * entries will have their nce_ill set to the ipmp_ill and the under_ill + * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. + * Local addresses are always created on the ill passed to nce_add_v6. + */ int -ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, - const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, - uint16_t state, nce_t **newnce) +nce_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - int err = 0; - nce_t *nce; + int err = 0; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce, *upper_nce = NULL; + ill_t *in_ill = ill; + boolean_t need_ill_refrele = B_FALSE; + if (flags & NCE_F_MCAST) { + /* + * hw_addr will be figured out in nce_set_multicast_v6; + * caller has to select the cast_ill + */ + ASSERT(hw_addr == NULL); + ASSERT(!IS_IPMP(ill)); + err = nce_set_multicast_v6(ill, addr, flags, newnce); + return (err); + } ASSERT(ill->ill_isv6); - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ENXIO); + need_ill_refrele = B_TRUE; + } - /* Get head of v6 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, match_illgrp, addr, nce); + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce = nce_lookup_addr(ill, addr); if (nce == NULL) { - err = ndp_add_v6(ill, - hw_addr, - addr, - mask, - extract_mask, - hw_extract_start, - flags, - state, - newnce); + err = nce_add_v6(ill, hw_addr, hw_addr_len, addr, flags, state, + &nce); } else { - *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + if (err == 0) + err = nce_add_v6_postprocess(nce); + if (in_ill != ill && nce != NULL) { + nce_t *under_nce; + + /* + * in_ill was the under_ill. Try to create the under_nce. 
+ * Hold the ill_g_lock to prevent changes to group membership + * until we are done. + */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(in_ill, ill)) { + under_nce = nce_fastpath_create(in_ill, + nce->nce_common); + upper_nce = nce; + if ((nce = under_nce) == NULL) + err = EINVAL; + } + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) + nce_fastpath_trigger(under_nce); + } + if (nce != NULL) { + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + } + /* nce_refrele is deferred until the lock is dropped */ + if (upper_nce != NULL) + nce_refrele(upper_nce); + if (need_ill_refrele) + ill_refrele(ill); return (err); } @@ -351,53 +404,51 @@ ndp_lookup_then_add_v6(ill_t *ill, boolean_t match_illgrp, uchar_t *hw_addr, * Remove all the CONDEMNED nces from the appropriate hash table. * We create a private list of NCEs, these may have ires pointing * to them, so the list will be passed through to clean up dependent - * ires and only then we can do NCE_REFRELE which can make NCE inactive. + * ires and only then we can do ncec_refrele() which can make NCE inactive. 
*/ static void -nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list) +nce_remove(ndp_g_t *ndp, ncec_t *ncec, ncec_t **free_nce_list) { - nce_t *nce1; - nce_t **ptpn; + ncec_t *ncec1; + ncec_t **ptpn; ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); ASSERT(ndp->ndp_g_walker == 0); - for (; nce; nce = nce1) { - nce1 = nce->nce_next; - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { - ptpn = nce->nce_ptpn; - nce1 = nce->nce_next; - if (nce1 != NULL) - nce1->nce_ptpn = ptpn; - *ptpn = nce1; - nce->nce_ptpn = NULL; - nce->nce_next = NULL; - nce->nce_next = *free_nce_list; - *free_nce_list = nce; + for (; ncec; ncec = ncec1) { + ncec1 = ncec->ncec_next; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { + ptpn = ncec->ncec_ptpn; + ncec1 = ncec->ncec_next; + if (ncec1 != NULL) + ncec1->ncec_ptpn = ptpn; + *ptpn = ncec1; + ncec->ncec_ptpn = NULL; + ncec->ncec_next = NULL; + ncec->ncec_next = *free_nce_list; + *free_nce_list = ncec; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } } /* - * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup() - * will return this NCE. Also no new IREs will be created that - * point to this NCE (See ire_add_v6). Also no new timeouts will - * be started (See NDP_RESTART_TIMER). + * 1. Mark the entry CONDEMNED. This ensures that no new nce_lookup() + * will return this NCE. Also no new timeouts will + * be started (See nce_restart_timer). * 2. Cancel any currently running timeouts. * 3. If there is an ndp walker, return. The walker will do the cleanup. * This ensures that walkers see a consistent list of NCEs while walking. * 4. Otherwise remove the NCE from the list of NCEs - * 5. Delete all IREs pointing to this NCE. 
*/ void -ndp_delete(nce_t *nce) +ncec_delete(ncec_t *ncec) { - nce_t **ptpn; - nce_t *nce1; - int ipversion = nce->nce_ipversion; + ncec_t **ptpn; + ncec_t *ncec1; + int ipversion = ncec->ncec_ipversion; ndp_g_t *ndp; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + ip_stack_t *ipst = ncec->ncec_ipst; if (ipversion == IPV4_VERSION) ndp = ipst->ips_ndp4; @@ -405,40 +456,42 @@ ndp_delete(nce_t *nce) ndp = ipst->ips_ndp6; /* Serialize deletes */ - mutex_enter(&nce->nce_lock); - if (nce->nce_flags & NCE_F_CONDEMNED) { + mutex_enter(&ncec->ncec_lock); + if (NCE_ISCONDEMNED(ncec)) { /* Some other thread is doing the delete */ - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return; } /* * Caller has a refhold. Also 1 ref for being in the list. Thus * refcnt has to be >= 2 */ - ASSERT(nce->nce_refcnt >= 2); - nce->nce_flags |= NCE_F_CONDEMNED; - mutex_exit(&nce->nce_lock); + ASSERT(ncec->ncec_refcnt >= 2); + ncec->ncec_flags |= NCE_F_CONDEMNED; + mutex_exit(&ncec->ncec_lock); - nce_fastpath_list_delete(nce); + /* Count how many condemned ires for kmem_cache callback */ + atomic_add_32(&ipst->ips_num_nce_condemned, 1); + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); /* Complete any waiting callbacks */ - nce_cb_dispatch(nce); + ncec_cb_dispatch(ncec); /* * Cancel any running timer. Timeout can't be restarted - * since CONDEMNED is set. Can't hold nce_lock across untimeout. + * since CONDEMNED is set. Can't hold ncec_lock across untimeout. * Passing invalid timeout id is fine. 
*/ - if (nce->nce_timeout_id != 0) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + if (ncec->ncec_timeout_id != 0) { + (void) untimeout(ncec->ncec_timeout_id); + ncec->ncec_timeout_id = 0; } mutex_enter(&ndp->ndp_g_lock); - if (nce->nce_ptpn == NULL) { + if (ncec->ncec_ptpn == NULL) { /* - * The last ndp walker has already removed this nce from - * the list after we marked the nce CONDEMNED and before + * The last ndp walker has already removed this ncec from + * the list after we marked the ncec CONDEMNED and before * we grabbed the global lock. */ mutex_exit(&ndp->ndp_g_lock); @@ -454,62 +507,68 @@ ndp_delete(nce_t *nce) } /* - * Now remove the nce from the list. NDP_RESTART_TIMER won't restart + * Now remove the ncec from the list. nce_restart_timer won't restart * the timer since it is marked CONDEMNED. */ - ptpn = nce->nce_ptpn; - nce1 = nce->nce_next; - if (nce1 != NULL) - nce1->nce_ptpn = ptpn; - *ptpn = nce1; - nce->nce_ptpn = NULL; - nce->nce_next = NULL; + ptpn = ncec->ncec_ptpn; + ncec1 = ncec->ncec_next; + if (ncec1 != NULL) + ncec1->ncec_ptpn = ptpn; + *ptpn = ncec1; + ncec->ncec_ptpn = NULL; + ncec->ncec_next = NULL; mutex_exit(&ndp->ndp_g_lock); - nce_ire_delete(nce); + /* Removed from ncec_ptpn/ncec_next list */ + ncec_refrele_notr(ncec); } void -ndp_inactive(nce_t *nce) +ncec_inactive(ncec_t *ncec) { mblk_t **mpp; - ill_t *ill; + ill_t *ill = ncec->ncec_ill; + ip_stack_t *ipst = ncec->ncec_ipst; - ASSERT(nce->nce_refcnt == 0); - ASSERT(MUTEX_HELD(&nce->nce_lock)); - ASSERT(nce->nce_fastpath == NULL); + ASSERT(ncec->ncec_refcnt == 0); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - /* Free all nce allocated messages */ - mpp = &nce->nce_first_mp_to_free; - do { - while (*mpp != NULL) { - mblk_t *mp; + /* Count how many condemned nces for kmem_cache callback */ + if (NCE_ISCONDEMNED(ncec)) + atomic_add_32(&ipst->ips_num_nce_condemned, -1); - mp = *mpp; - *mpp = mp->b_next; + /* Free all allocated messages */ + mpp = 
&ncec->ncec_qd_mp; + while (*mpp != NULL) { + mblk_t *mp; - inet_freemsg(mp); - } - } while (mpp++ != &nce->nce_last_mp_to_free); + mp = *mpp; + *mpp = mp->b_next; - if (nce->nce_ipversion == IPV6_VERSION) { - /* - * must have been cleaned up in nce_delete - */ - ASSERT(list_is_empty(&nce->nce_cb)); - list_destroy(&nce->nce_cb); + inet_freemsg(mp); } + /* + * must have been cleaned up in ncec_delete + */ + ASSERT(list_is_empty(&ncec->ncec_cb)); + list_destroy(&ncec->ncec_cb); + /* + * free the ncec_lladdr if one was allocated in nce_add_common() + */ + if (ncec->ncec_lladdr_length > 0) + kmem_free(ncec->ncec_lladdr, ncec->ncec_lladdr_length); + #ifdef DEBUG - nce_trace_cleanup(nce); + ncec_trace_cleanup(ncec); #endif - ill = nce->nce_ill; mutex_enter(&ill->ill_lock); DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt--; + (char *), "ncec", (void *), ncec); + ill->ill_ncec_cnt--; + ncec->ncec_ill = NULL; /* - * If the number of nce's associated with this ill have dropped + * If the number of ncec's associated with this ill have dropped * to zero, check whether we need to restart any operation that * is waiting for this to happen. */ @@ -519,104 +578,59 @@ ndp_inactive(nce_t *nce) } else { mutex_exit(&ill->ill_lock); } - mutex_destroy(&nce->nce_lock); - if (nce->nce_mp != NULL) - inet_freemsg(nce->nce_mp); + + mutex_destroy(&ncec->ncec_lock); + kmem_cache_free(ncec_cache, ncec); } /* - * ndp_walk routine. Delete the nce if it is associated with the ill + * ncec_walk routine. Delete the ncec if it is associated with the ill * that is going away. Always called as a writer. */ void -ndp_delete_per_ill(nce_t *nce, uchar_t *arg) +ncec_delete_per_ill(ncec_t *ncec, uchar_t *arg) { - if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) { - ndp_delete(nce); + if ((ncec != NULL) && ncec->ncec_ill == (ill_t *)arg) { + ncec_delete(ncec); } } /* - * Walk a list of to be inactive NCEs and blow away all the ires. 
+ * Neighbor Cache cleanup logic for a list of ncec_t entries. */ static void -nce_ire_delete_list(nce_t *nce) +nce_cleanup_list(ncec_t *ncec) { - nce_t *nce_next; + ncec_t *ncec_next; - ASSERT(nce != NULL); - while (nce != NULL) { - nce_next = nce->nce_next; - nce->nce_next = NULL; + ASSERT(ncec != NULL); + while (ncec != NULL) { + ncec_next = ncec->ncec_next; + ncec->ncec_next = NULL; /* * It is possible for the last ndp walker (this thread) - * to come here after ndp_delete has marked the nce CONDEMNED - * and before it has removed the nce from the fastpath list + * to come here after ncec_delete has marked the ncec CONDEMNED + * and before it has removed the ncec from the fastpath list * or called untimeout. So we need to do it here. It is safe - * for both ndp_delete and this thread to do it twice or + * for both ncec_delete and this thread to do it twice or * even simultaneously since each of the threads has a - * reference on the nce. + * reference on the ncec. */ - nce_fastpath_list_delete(nce); + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); /* * Cancel any running timer. Timeout can't be restarted - * since CONDEMNED is set. Can't hold nce_lock across untimeout. - * Passing invalid timeout id is fine. + * since CONDEMNED is set. The ncec_lock can't be + * held across untimeout though passing invalid timeout + * id is fine. 
*/ - if (nce->nce_timeout_id != 0) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + if (ncec->ncec_timeout_id != 0) { + (void) untimeout(ncec->ncec_timeout_id); + ncec->ncec_timeout_id = 0; } - /* - * We might hit this func thus in the v4 case: - * ipif_down->ipif_ndp_down->ndp_walk - */ - - if (nce->nce_ipversion == IPV4_VERSION) { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); - } else { - ASSERT(nce->nce_ipversion == IPV6_VERSION); - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, - IRE_CACHE, nce_ire_delete1, nce, nce->nce_ill); - } - NCE_REFRELE_NOTR(nce); - nce = nce_next; - } -} - -/* - * Delete an ire when the nce goes away. - */ -/* ARGSUSED */ -static void -nce_ire_delete(nce_t *nce) -{ - if (nce->nce_ipversion == IPV6_VERSION) { - ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - nce_ire_delete1, (char *)nce, nce->nce_ill); - NCE_REFRELE_NOTR(nce); - } else { - ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - nce_ire_delete1, (char *)nce, nce->nce_ill); - NCE_REFRELE_NOTR(nce); - } -} - -/* - * ire_walk routine used to delete every IRE that shares this nce - */ -static void -nce_ire_delete1(ire_t *ire, char *nce_arg) -{ - nce_t *nce = (nce_t *)nce_arg; - - ASSERT(ire->ire_type == IRE_CACHE); - - if (ire->ire_nce == nce) { - ASSERT(ire->ire_ipversion == nce->nce_ipversion); - ire_delete(ire); + /* Removed from ncec_ptpn/ncec_next list */ + ncec_refrele_notr(ncec); + ncec = ncec_next; } } @@ -624,100 +638,97 @@ nce_ire_delete1(ire_t *ire, char *nce_arg) * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. 
*/ boolean_t -ndp_restart_dad(nce_t *nce) +nce_restart_dad(ncec_t *ncec) { boolean_t started; - boolean_t dropped; + ill_t *ill, *hwaddr_ill; - if (nce == NULL) + if (ncec == NULL) return (B_FALSE); - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_PROBE) { - mutex_exit(&nce->nce_lock); + ill = ncec->ncec_ill; + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_state == ND_PROBE) { + mutex_exit(&ncec->ncec_lock); started = B_TRUE; - } else if (nce->nce_state == ND_REACHABLE) { - nce->nce_state = ND_PROBE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, NULL, NDP_PROBE); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + } else if (ncec->ncec_state == ND_REACHABLE) { + ASSERT(ncec->ncec_lladdr != NULL); + ncec->ncec_state = ND_PROBE; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + /* + * Slight cheat here: we don't use the initial probe delay + * for IPv4 in this obscure case. + */ + mutex_exit(&ncec->ncec_lock); + if (IS_IPMP(ill)) { + hwaddr_ill = ipmp_illgrp_find_ill(ill->ill_grp, + ncec->ncec_lladdr, ncec->ncec_lladdr_length); + } else { + hwaddr_ill = ill; } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill)); + nce_dad(ncec, hwaddr_ill, B_TRUE); started = B_TRUE; } else { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); started = B_FALSE; } return (started); } /* - * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed. - * If one is found, the refcnt on the nce will be incremented. + * IPv6 Cache entry lookup. Try to find an ncec matching the parameters passed. + * If one is found, the refcnt on the ncec will be incremented. 
*/ -nce_t * -ndp_lookup_v6(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, - boolean_t caller_holds_lock) +ncec_t * +ncec_lookup_illgrp_v6(ill_t *ill, const in6_addr_t *addr) { - nce_t *nce; - ip_stack_t *ipst = ill->ill_ipst; + ncec_t *ncec; + ip_stack_t *ipst = ill->ill_ipst; - ASSERT(ill->ill_isv6); - if (!caller_holds_lock) - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* Get head of v6 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - nce = nce_lookup_addr(ill, match_illgrp, addr, nce); - if (nce == NULL) - nce = nce_lookup_mapping(ill, addr); - if (!caller_holds_lock) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - return (nce); + ncec = *((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); + ncec = ncec_lookup_illgrp(ill, addr, ncec); + mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ncec); } /* - * IPv4 Cache entry lookup. Try to find an nce matching the parameters passed. - * If one is found, the refcnt on the nce will be incremented. - * Since multicast mappings are handled in arp, there are no nce_mcast_entries - * so we skip the nce_lookup_mapping call. - * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL + * IPv4 Cache entry lookup. Try to find an ncec matching the parameters passed. + * If one is found, the refcnt on the ncec will be incremented. 
*/ -nce_t * -ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) +ncec_t * +ncec_lookup_illgrp_v4(ill_t *ill, const in_addr_t *addr) { - nce_t *nce; + ncec_t *ncec = NULL; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; - if (!caller_holds_lock) - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); /* Get head of v4 hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); + ncec = *((ncec_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - /* - * NOTE: IPv4 never matches across the illgrp since the NCE's we're - * looking up have fastpath headers that are inherently per-ill. - */ - nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); - if (!caller_holds_lock) - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - return (nce); + ncec = ncec_lookup_illgrp(ill, &addr6, ncec); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + rw_exit(&ipst->ips_ill_g_lock); + return (ncec); } /* - * Cache entry lookup. Try to find an nce matching the parameters passed. - * Look only for exact entries (no mappings). If an nce is found, increment - * the hold count on that nce. The caller passes in the start of the - * appropriate hash table, and must be holding the appropriate global - * lock (ndp_g_lock). + * Cache entry lookup. Try to find an ncec matching the parameters passed. + * If an ncec is found, increment the hold count on that ncec. + * The caller passes in the start of the appropriate hash table, and must + * be holding the appropriate global lock (ndp_g_lock). In addition, since + * this function matches ncec_t entries across the illgrp, the ips_ill_g_lock + * must be held as reader. + * + * This function always matches across the ipmp group. 
*/ -static nce_t * -nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, - nce_t *nce) +ncec_t * +ncec_lookup_illgrp(ill_t *ill, const in6_addr_t *addr, ncec_t *ncec) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; @@ -727,348 +738,246 @@ nce_lookup_addr(ill_t *ill, boolean_t match_illgrp, const in6_addr_t *addr, else ndp = ipst->ips_ndp4; + ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); - for (; nce != NULL; nce = nce->nce_next) { - if (nce->nce_ill == ill || - match_illgrp && IS_IN_SAME_ILLGRP(ill, nce->nce_ill)) { - if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, - &ipv6_all_ones)) { - mutex_enter(&nce->nce_lock); - if (!(nce->nce_flags & NCE_F_CONDEMNED)) { - NCE_REFHOLD_LOCKED(nce); - mutex_exit(&nce->nce_lock); + for (; ncec != NULL; ncec = ncec->ncec_next) { + if (ncec->ncec_ill == ill || + IS_IN_SAME_ILLGRP(ill, ncec->ncec_ill)) { + if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { + mutex_enter(&ncec->ncec_lock); + if (!NCE_ISCONDEMNED(ncec)) { + ncec_refhold_locked(ncec); + mutex_exit(&ncec->ncec_lock); break; } - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } } } + return (ncec); +} + +/* + * Find an nce_t on ill with nce_addr == addr. Lookup the nce_t + * entries for ill only, i.e., when ill is part of an ipmp group, + * nce_lookup_v4 will never try to match across the group. + */ +nce_t * +nce_lookup_v4(ill_t *ill, const in_addr_t *addr) +{ + nce_t *nce; + in6_addr_t addr6; + ip_stack_t *ipst = ill->ill_ipst; + + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); + nce = nce_lookup_addr(ill, &addr6); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); } /* - * Cache entry lookup. Try to find an nce matching the parameters passed. - * Look only for mappings. + * Find an nce_t on ill with nce_addr == addr. 
Lookup the nce_t + * entries for ill only, i.e., when ill is part of an ipmp group, + * nce_lookup_v6 will never try to match across the group. */ +nce_t * +nce_lookup_v6(ill_t *ill, const in6_addr_t *addr6) +{ + nce_t *nce; + ip_stack_t *ipst = ill->ill_ipst; + + mutex_enter(&ipst->ips_ndp6->ndp_g_lock); + nce = nce_lookup_addr(ill, addr6); + mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + return (nce); +} + static nce_t * -nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) +nce_lookup_addr(ill_t *ill, const in6_addr_t *addr) { - nce_t *nce; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(ill != NULL && ill->ill_isv6); - ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); - if (!IN6_IS_ADDR_MULTICAST(addr)) - return (NULL); - nce = ipst->ips_ndp6->nce_mask_entries; - for (; nce != NULL; nce = nce->nce_next) - if (nce->nce_ill == ill && - (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) { - mutex_enter(&nce->nce_lock); - if (!(nce->nce_flags & NCE_F_CONDEMNED)) { - NCE_REFHOLD_LOCKED(nce); - mutex_exit(&nce->nce_lock); - break; - } - mutex_exit(&nce->nce_lock); - } + ASSERT(ill != NULL); +#ifdef DEBUG + if (ill->ill_isv6) + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp6->ndp_g_lock)); + else + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); +#endif + mutex_enter(&ill->ill_lock); + nce = nce_lookup(ill, addr); + mutex_exit(&ill->ill_lock); return (nce); } + +/* + * Router turned to host. We need to make sure that cached copies of the ncec + * are not used for forwarding packets if they were derived from the default + * route, and that the default route itself is removed, as required by + * section 7.2.5 of RFC 2461. + * + * Note that the ncec itself probably has valid link-layer information for the + * nexthop, so that there is no reason to delete the ncec, as long as the + * ISROUTER flag is turned off. 
+ */ +static void +ncec_router_to_host(ncec_t *ncec) +{ + ire_t *ire; + ip_stack_t *ipst = ncec->ncec_ipst; + + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags &= ~NCE_F_ISROUTER; + mutex_exit(&ncec->ncec_lock); + + ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, + &ncec->ncec_addr, IRE_DEFAULT, ncec->ncec_ill, ALL_ZONES, NULL, + MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW, 0, ipst, NULL); + if (ire != NULL) { + ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); + ire_delete(ire); + ire_refrele(ire); + } +} + /* * Process passed in parameters either from an incoming packet or via * user ioctl. */ -static void -nce_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) +void +nce_process(ncec_t *ncec, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { - ill_t *ill = nce->nce_ill; - uint32_t hw_addr_len = ill->ill_nd_lla_len; - mblk_t *mp; + ill_t *ill = ncec->ncec_ill; + uint32_t hw_addr_len = ill->ill_phys_addr_length; boolean_t ll_updated = B_FALSE; boolean_t ll_changed; - ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; - ASSERT(nce->nce_ipversion == IPV6_VERSION); + ASSERT(ncec->ncec_ipversion == IPV6_VERSION); /* * No updates of link layer address or the neighbor state is * allowed, when the cache is in NONUD state. This still * allows for responding to reachability solicitation. */ - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_INCOMPLETE) { + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_state == ND_INCOMPLETE) { if (hw_addr == NULL) { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); return; } - nce_set_ll(nce, hw_addr); + nce_set_ll(ncec, hw_addr); /* - * Update nce state and send the queued packets + * Update ncec state and send the queued packets * back to ip this time ire will be added. 
*/ if (flag & ND_NA_FLAG_SOLICITED) { - nce_update(nce, ND_REACHABLE, NULL); + nce_update(ncec, ND_REACHABLE, NULL); } else { - nce_update(nce, ND_STALE, NULL); - } - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - nce_cb_dispatch(nce); /* complete callbacks */ - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - while (mp != NULL) { - mblk_t *nxt_mp, *data_mp; - - nxt_mp = mp->b_next; - mp->b_next = NULL; - - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - if (data_mp->b_prev != NULL) { - ill_t *inbound_ill; - queue_t *fwdq = NULL; - uint_t ifindex; - - ifindex = (uint_t)(uintptr_t)data_mp->b_prev; - inbound_ill = ill_lookup_on_ifindex(ifindex, - B_TRUE, NULL, NULL, NULL, NULL, ipst); - if (inbound_ill == NULL) { - data_mp->b_prev = NULL; - freemsg(mp); - return; - } else { - fwdq = inbound_ill->ill_rq; - } - data_mp->b_prev = NULL; - /* - * Send a forwarded packet back into ip_rput_v6 - * just as in ire_send_v6(). - * Extract the queue from b_prev (set in - * ip_rput_data_v6). - */ - if (fwdq != NULL) { - /* - * Forwarded packets hop count will - * get decremented in ip_rput_data_v6 - */ - if (data_mp != mp) - freeb(mp); - put(fwdq, data_mp); - } else { - /* - * Send locally originated packets back - * into ip_wput_v6. 
- */ - put(ill->ill_wq, mp); - } - ill_refrele(inbound_ill); - } else { - put(ill->ill_wq, mp); - } - mp = nxt_mp; + nce_update(ncec, ND_STALE, NULL); } + mutex_exit(&ncec->ncec_lock); + nce = nce_fastpath(ncec, B_TRUE, NULL); + nce_resolv_ok(ncec); + if (nce != NULL) + nce_refrele(nce); return; } - ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len); + ll_changed = nce_cmp_ll_addr(ncec, hw_addr, hw_addr_len); if (!is_adv) { /* If this is a SOLICITATION request only */ if (ll_changed) - nce_update(nce, ND_STALE, hw_addr); - mutex_exit(&nce->nce_lock); - nce_cb_dispatch(nce); + nce_update(ncec, ND_STALE, hw_addr); + mutex_exit(&ncec->ncec_lock); + ncec_cb_dispatch(ncec); return; } if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { /* If in any other state than REACHABLE, ignore */ - if (nce->nce_state == ND_REACHABLE) { - nce_update(nce, ND_STALE, NULL); + if (ncec->ncec_state == ND_REACHABLE) { + nce_update(ncec, ND_STALE, NULL); } - mutex_exit(&nce->nce_lock); - nce_cb_dispatch(nce); + mutex_exit(&ncec->ncec_lock); + ncec_cb_dispatch(ncec); return; } else { if (ll_changed) { - nce_update(nce, ND_UNCHANGED, hw_addr); + nce_update(ncec, ND_UNCHANGED, hw_addr); ll_updated = B_TRUE; } if (flag & ND_NA_FLAG_SOLICITED) { - nce_update(nce, ND_REACHABLE, NULL); + nce_update(ncec, ND_REACHABLE, NULL); } else { if (ll_updated) { - nce_update(nce, ND_STALE, NULL); + nce_update(ncec, ND_STALE, NULL); } } - mutex_exit(&nce->nce_lock); - if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags & + mutex_exit(&ncec->ncec_lock); + if (!(flag & ND_NA_FLAG_ROUTER) && (ncec->ncec_flags & NCE_F_ISROUTER)) { - ire_t *ire; - - /* - * Router turned to host. We need to remove the - * entry as well as any default route that may be - * using this as a next hop. This is required by - * section 7.2.5 of RFC 2461. 
- */ - ire = ire_ftable_lookup_v6(&ipv6_all_zeros, - &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT, - nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL, - MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW | - MATCH_IRE_DEFAULT, ipst); - if (ire != NULL) { - ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); - ire_delete(ire); - ire_refrele(ire); - } - ndp_delete(nce); /* will do nce_cb_dispatch */ + ncec_router_to_host(ncec); } else { - nce_cb_dispatch(nce); + ncec_cb_dispatch(ncec); } } } /* - * Walker state structure used by ndp_process() / ndp_process_entry(). - */ -typedef struct ndp_process_data { - ill_t *np_ill; /* ill/illgrp to match against */ - const in6_addr_t *np_addr; /* IPv6 address to match */ - uchar_t *np_hw_addr; /* passed to nce_process() */ - uint32_t np_flag; /* passed to nce_process() */ - boolean_t np_is_adv; /* passed to nce_process() */ -} ndp_process_data_t; - -/* - * Walker callback used by ndp_process() for IPMP groups: calls nce_process() - * for each NCE with a matching address that's in the same IPMP group. - */ -static void -ndp_process_entry(nce_t *nce, void *arg) -{ - ndp_process_data_t *npp = arg; - - if (IS_IN_SAME_ILLGRP(nce->nce_ill, npp->np_ill) && - IN6_ARE_ADDR_EQUAL(&nce->nce_addr, npp->np_addr) && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { - nce_process(nce, npp->np_hw_addr, npp->np_flag, npp->np_is_adv); - } -} - -/* - * Wrapper around nce_process() that handles IPMP. In particular, for IPMP, - * NCEs are per-underlying-ill (because of nce_fp_mp) and thus we may have - * more than one NCE for a given IPv6 address to tend to. In that case, we - * need to walk all NCEs and callback nce_process() for each one. Since this - * is expensive, in the non-IPMP case we just directly call nce_process(). - * Ultimately, nce_fp_mp needs to be moved out of the nce_t so that all IP - * interfaces in an IPMP group share the same NCEs -- at which point this - * function can be removed entirely. 
- */ -void -ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) -{ - ill_t *ill = nce->nce_ill; - struct ndp_g_s *ndp = ill->ill_ipst->ips_ndp6; - ndp_process_data_t np; - - if (ill->ill_grp == NULL) { - nce_process(nce, hw_addr, flag, is_adv); - return; - } - - /* IPMP case: walk all NCEs */ - np.np_ill = ill; - np.np_addr = &nce->nce_addr; - np.np_flag = flag; - np.np_is_adv = is_adv; - np.np_hw_addr = hw_addr; - - ndp_walk_common(ndp, NULL, (pfi_t)ndp_process_entry, &np, ALL_ZONES); -} - -/* - * Pass arg1 to the pfi supplied, along with each nce in existence. - * ndp_walk() places a REFHOLD on the nce and drops the lock when + * Pass arg1 to the pfi supplied, along with each ncec in existence. + * ncec_walk() places a REFHOLD on the ncec and drops the lock when * walking the hash list. */ void -ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, +ncec_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { - nce_t *nce; - nce_t *nce1; - nce_t **ncep; - nce_t *free_nce_list = NULL; + ncec_t *ncec; + ncec_t *ncec1; + ncec_t **ncep; + ncec_t *free_nce_list = NULL; mutex_enter(&ndp->ndp_g_lock); - /* Prevent ndp_delete from unlink and free of NCE */ + /* Prevent ncec_delete from unlink and free of NCE */ ndp->ndp_g_walker++; mutex_exit(&ndp->ndp_g_lock); for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { - for (nce = *ncep; nce != NULL; nce = nce1) { - nce1 = nce->nce_next; - if (ill == NULL || nce->nce_ill == ill) { + for (ncec = *ncep; ncec != NULL; ncec = ncec1) { + ncec1 = ncec->ncec_next; + if (ill == NULL || ncec->ncec_ill == ill) { if (trace) { - NCE_REFHOLD(nce); - (*pfi)(nce, arg1); - NCE_REFRELE(nce); + ncec_refhold(ncec); + (*pfi)(ncec, arg1); + ncec_refrele(ncec); } else { - NCE_REFHOLD_NOTR(nce); - (*pfi)(nce, arg1); - NCE_REFRELE_NOTR(nce); + ncec_refhold_notr(ncec); + (*pfi)(ncec, arg1); + ncec_refrele_notr(ncec); } } } } - for (nce = ndp->nce_mask_entries; nce != NULL; 
nce = nce1) { - nce1 = nce->nce_next; - if (ill == NULL || nce->nce_ill == ill) { - if (trace) { - NCE_REFHOLD(nce); - (*pfi)(nce, arg1); - NCE_REFRELE(nce); - } else { - NCE_REFHOLD_NOTR(nce); - (*pfi)(nce, arg1); - NCE_REFRELE_NOTR(nce); - } - } - } mutex_enter(&ndp->ndp_g_lock); ndp->ndp_g_walker--; - /* - * While NCE's are removed from global list they are placed - * in a private list, to be passed to nce_ire_delete_list(). - * The reason is, there may be ires pointing to this nce - * which needs to cleaned up. - */ if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { /* Time to delete condemned entries */ for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { - nce = *ncep; - if (nce != NULL) { - nce_remove(ndp, nce, &free_nce_list); + ncec = *ncep; + if (ncec != NULL) { + nce_remove(ndp, ncec, &free_nce_list); } } - nce = ndp->nce_mask_entries; - if (nce != NULL) { - nce_remove(ndp, nce, &free_nce_list); - } ndp->ndp_g_walker_cleanup = B_FALSE; } mutex_exit(&ndp->ndp_g_lock); if (free_nce_list != NULL) { - nce_ire_delete_list(free_nce_list); + nce_cleanup_list(free_nce_list); } } @@ -1077,198 +986,10 @@ ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, * Note that ill can be NULL hence can't derive the ipst from it. */ void -ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) -{ - ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); - ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); -} - -/* - * Process resolve requests. Handles both mapped entries - * as well as cases that needs to be send out on the wire. - * Lookup a NCE for a given IRE. Regardless of whether one exists - * or one is created, we defer making ire point to nce until the - * ire is actually added at which point the nce_refcnt on the nce is - * incremented. This is done primarily to have symmetry between ire_add() - * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 
- */ -int -ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) -{ - nce_t *nce, *hw_nce = NULL; - int err; - ill_t *ipmp_ill; - uint16_t nce_flags; - mblk_t *mp_nce = NULL; - ip_stack_t *ipst = ill->ill_ipst; - uchar_t *hwaddr = NULL; - - ASSERT(ill->ill_isv6); - - if (IN6_IS_ADDR_MULTICAST(dst)) - return (nce_set_multicast(ill, dst)); - - nce_flags = (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0; - - /* - * If `ill' is under IPMP, then first check to see if there's an NCE - * for `dst' on the IPMP meta-interface (e.g., because an application - * explicitly did an SIOCLIFSETND to tie a hardware address to `dst'). - * If so, we use that hardware address when creating the NCE below. - * Note that we don't yet have a mechanism to remove these NCEs if the - * NCE for `dst' on the IPMP meta-interface is subsequently removed -- - * but rather than build such a beast, we should fix NCEs so that they - * can be properly shared across an IPMP group. - */ - if (IS_UNDER_IPMP(ill)) { - if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { - hw_nce = ndp_lookup_v6(ipmp_ill, B_FALSE, dst, B_FALSE); - if (hw_nce != NULL && hw_nce->nce_res_mp != NULL) { - hwaddr = hw_nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ipmp_ill); - nce_flags |= hw_nce->nce_flags; - } - ill_refrele(ipmp_ill); - } - } - - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* NCE fastpath is per ill; don't match across group */ - hwaddr, - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - nce_flags, - hwaddr != NULL ? ND_REACHABLE : ND_INCOMPLETE, - &nce); - - if (hw_nce != NULL) - NCE_REFRELE(hw_nce); - - switch (err) { - case 0: - /* - * New cache entry was created. Make sure that the state - * is not ND_INCOMPLETE. It can be in some other state - * even before we send out the solicitation as we could - * get un-solicited advertisements. - * - * If this is an XRESOLV interface, simply return 0, - * since we don't want to solicit just yet. 
- */ - if (ill->ill_flags & ILLF_XRESOLV) { - NCE_REFRELE(nce); - return (0); - } - - mutex_enter(&nce->nce_lock); - if (nce->nce_state != ND_INCOMPLETE) { - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - return (0); - } - if (nce->nce_rcnt == 0) { - /* The caller will free mp */ - mutex_exit(&nce->nce_lock); - ndp_delete(nce); - NCE_REFRELE(nce); - return (ESRCH); - } - mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); - if (mp_nce == NULL) { - /* The caller will free mp */ - mutex_exit(&nce->nce_lock); - ndp_delete(nce); - NCE_REFRELE(nce); - return (ENOMEM); - } - nce_queue_mp(nce, mp_nce); - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - return (EINPROGRESS); - case EEXIST: - /* Resolution in progress just queue the packet */ - mutex_enter(&nce->nce_lock); - if (nce->nce_state == ND_INCOMPLETE) { - mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); - if (mp_nce == NULL) { - err = ENOMEM; - } else { - nce_queue_mp(nce, mp_nce); - err = EINPROGRESS; - } - } else { - /* - * Any other state implies we have - * a nce but IRE needs to be added ... - * ire_add_v6() will take care of the - * the case when the nce becomes CONDEMNED - * before the ire is added to the table. - */ - err = 0; - } - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); - break; - default: - ip1dbg(("ndp_resolver: Can't create NCE %d\n", err)); - break; - } - return (err); -} - -/* - * When there is no resolver, the link layer template is passed in - * the IRE. - * Lookup a NCE for a given IRE. Regardless of whether one exists - * or one is created, we defer making ire point to nce until the - * ire is actually added at which point the nce_refcnt on the nce is - * incremented. This is done primarily to have symmetry between ire_add() - * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. 
- */ -int -ndp_noresolver(ill_t *ill, const in6_addr_t *dst) +ncec_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) { - nce_t *nce; - int err = 0; - - ASSERT(ill != NULL); - ASSERT(ill->ill_isv6); - if (IN6_IS_ADDR_MULTICAST(dst)) { - err = nce_set_multicast(ill, dst); - return (err); - } - - err = ndp_lookup_then_add_v6(ill, - B_FALSE, /* NCE fastpath is per ill; don't match across group */ - ill->ill_dest_addr, /* hardware address is NULL in most cases */ - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, - ND_REACHABLE, - &nce); - - switch (err) { - case 0: - /* - * Cache entry with a proper resolver cookie was - * created. - */ - NCE_REFRELE(nce); - break; - case EEXIST: - err = 0; - NCE_REFRELE(nce); - break; - default: - ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err)); - break; - } - return (err); + ncec_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); + ncec_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); } /* @@ -1277,83 +998,73 @@ ndp_noresolver(ill_t *ill, const in6_addr_t *dst) * multicast destination. */ static int -nce_set_multicast(ill_t *ill, const in6_addr_t *dst) +nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst, + uint16_t flags, nce_t **newnce) { - nce_t *mnce; /* Multicast mapping entry */ - nce_t *nce; - uchar_t *hw_addr = NULL; + uchar_t *hw_addr; int err = 0; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce; ASSERT(ill != NULL); ASSERT(ill->ill_isv6); ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); - nce = nce_lookup_addr(ill, B_FALSE, dst, nce); + nce = nce_lookup_addr(ill, dst); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - NCE_REFRELE(nce); - return (0); - } - /* No entry, now lookup for a mapping this should never fail */ - mnce = nce_lookup_mapping(ill, dst); - if (mnce == NULL) { - /* Something broken for the interface. 
*/ - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - return (ESRCH); + goto done; } - ASSERT(mnce->nce_flags & NCE_F_MAPPING); if (ill->ill_net_type == IRE_IF_RESOLVER) { /* * For IRE_IF_RESOLVER a hardware mapping can be - * generated, for IRE_IF_NORESOLVER, resolution cookie - * in the ill is copied in ndp_add_v6(). + * generated. */ hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); if (hw_addr == NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - NCE_REFRELE(mnce); return (ENOMEM); } - nce_make_mapping(mnce, hw_addr, (uchar_t *)dst); + ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); + } else { + /* + * So no hw_addr is needed for IRE_IF_NORESOLVER. + */ + hw_addr = NULL; } - NCE_REFRELE(mnce); - /* - * IRE_IF_NORESOLVER type simply copies the resolution - * cookie passed in. So no hw_addr is needed. - */ - err = ndp_add_v6(ill, - hw_addr, - dst, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, - NCE_F_NONUD, - ND_REACHABLE, - &nce); + ASSERT((flags & NCE_F_MCAST) != 0); + ASSERT((flags & NCE_F_NONUD) != 0); + /* nce_state will be computed by nce_add_common() */ + err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, + ND_UNCHANGED, &nce); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + if (err == 0) + err = nce_add_v6_postprocess(nce); if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_nd_lla_len); if (err != 0) { - ip1dbg(("nce_set_multicast: create failed" "%d\n", err)); + ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err)); return (err); } - NCE_REFRELE(nce); +done: + ASSERT(nce->nce_common->ncec_state == ND_REACHABLE); + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); return (0); } /* - * Return the link layer address, and any flags of a nce. + * Return the link layer address, and any flags of a ncec. 
*/ int ndp_query(ill_t *ill, struct lif_nd_req *lnr) { - nce_t *nce; + ncec_t *ncec; in6_addr_t *addr; sin6_t *sin6; - dl_unitdata_req_t *dl; ASSERT(ill != NULL && ill->ill_isv6); sin6 = (sin6_t *)&lnr->lnr_addr; @@ -1363,158 +1074,135 @@ ndp_query(ill_t *ill, struct lif_nd_req *lnr) * NOTE: if the ill is an IPMP interface, then match against the whole * illgrp. This e.g. allows in.ndpd to retrieve the link layer * addresses for the data addresses on an IPMP interface even though - * ipif_ndp_up() created them with an nce_ill of ipif_bound_ill. + * ipif_ndp_up() created them with an ncec_ill of ipif_bound_ill. */ - nce = ndp_lookup_v6(ill, IS_IPMP(ill), addr, B_FALSE); - if (nce == NULL) + ncec = ncec_lookup_illgrp_v6(ill, addr); + if (ncec == NULL) return (ESRCH); - /* If in INCOMPLETE state, no link layer address is available yet */ - if (!NCE_ISREACHABLE(nce)) { - NCE_REFRELE(nce); + /* If no link layer address is available yet, return ESRCH */ + if (!NCE_ISREACHABLE(ncec)) { + ncec_refrele(ncec); return (ESRCH); } - dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr; - if (ill->ill_flags & ILLF_XRESOLV) - lnr->lnr_hdw_len = dl->dl_dest_addr_length; - else - lnr->lnr_hdw_len = ill->ill_nd_lla_len; - ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <= - sizeof (lnr->lnr_hdw_addr)); - bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), - (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len); - if (nce->nce_flags & NCE_F_ISROUTER) + lnr->lnr_hdw_len = ill->ill_phys_addr_length; + bcopy(ncec->ncec_lladdr, (uchar_t *)&lnr->lnr_hdw_addr, + lnr->lnr_hdw_len); + if (ncec->ncec_flags & NCE_F_ISROUTER) lnr->lnr_flags = NDF_ISROUTER_ON; - if (nce->nce_flags & NCE_F_ANYCAST) + if (ncec->ncec_flags & NCE_F_ANYCAST) lnr->lnr_flags |= NDF_ANYCAST_ON; - NCE_REFRELE(nce); + ncec_refrele(ncec); return (0); } /* - * Send Enable/Disable multicast reqs to driver. + * Finish setting up the Enable/Disable multicast for the driver. 
*/ -int -ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, +mblk_t * +ndp_mcastreq(ill_t *ill, const in6_addr_t *v6group, uint32_t hw_addr_len, uint32_t hw_addr_offset, mblk_t *mp) { - nce_t *nce; uchar_t *hw_addr; - ip_stack_t *ipst = ill->ill_ipst; + ipaddr_t v4group; + uchar_t *addr; - ASSERT(ill != NULL && ill->ill_isv6); ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); - hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); - if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) { - freemsg(mp); - return (EINVAL); + if (IN6_IS_ADDR_V4MAPPED(v6group)) { + IN6_V4MAPPED_TO_IPADDR(v6group, v4group); + + ASSERT(CLASSD(v4group)); + ASSERT(!(ill->ill_isv6)); + + addr = (uchar_t *)&v4group; + } else { + ASSERT(IN6_IS_ADDR_MULTICAST(v6group)); + ASSERT(ill->ill_isv6); + + addr = (uchar_t *)v6group; } - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - nce = nce_lookup_mapping(ill, addr); - if (nce == NULL) { - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); + if (hw_addr == NULL) { + ip0dbg(("ndp_mcastreq NULL hw_addr\n")); freemsg(mp); - return (ESRCH); - } - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - /* - * Update dl_addr_length and dl_addr_offset for primitives that - * have physical addresses as opposed to full saps - */ - switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { - case DL_ENABMULTI_REQ: - /* Track the state if this is the first enabmulti */ - if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) - ill->ill_dlpi_multicast_state = IDS_INPROGRESS; - ip1dbg(("ndp_mcastreq: ENABMULTI\n")); - break; - case DL_DISABMULTI_REQ: - ip1dbg(("ndp_mcastreq: DISABMULTI\n")); - break; - default: - NCE_REFRELE(nce); - ip1dbg(("ndp_mcastreq: default\n")); - return (EINVAL); + return (NULL); } - nce_make_mapping(nce, hw_addr, (uchar_t *)addr); - NCE_REFRELE(nce); - ill_dlpi_send(ill, mp); - return (0); -} + ip_mcast_mapping(ill, addr, hw_addr); + return (mp); +} -/* - * Send out a NS for 
resolving the ip address in nce. - */ void -ip_ndp_resolve(nce_t *nce) +ip_ndp_resolve(ncec_t *ncec) { + in_addr_t sender4 = INADDR_ANY; in6_addr_t sender6 = ipv6_all_zeros; + ill_t *src_ill; uint32_t ms; - mblk_t *mp; - ip6_t *ip6h; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - /* - * Pick the src from outgoing packet, if one is available. - * Otherwise let nce_xmit figure out the src. - */ - if ((mp = nce->nce_qd_mp) != NULL) { - /* Handle ip_newroute_v6 giving us IPSEC packets */ - if (mp->b_datap->db_type == M_CTL) - mp = mp->b_cont; - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because - * the message could be from the nce_qd_mp which could - * have b_next/b_prev non-NULL. - */ - ASSERT(MBLKL(mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); - ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); - } - sender6 = ip6h->ip6_src; + src_ill = nce_resolve_src(ncec, &sender6); + if (src_ill == NULL) { + /* Make sure we try again later */ + ms = ncec->ncec_ill->ill_reachable_retrans_time; + nce_restart_timer(ncec, (clock_t)ms); + return; } - ms = nce_solicit(nce, sender6); - mutex_exit(&nce->nce_lock); + if (ncec->ncec_ipversion == IPV4_VERSION) + IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_ipversion == IPV6_VERSION) + ms = ndp_solicit(ncec, sender6, src_ill); + else + ms = arp_request(ncec, sender4, src_ill); + mutex_exit(&ncec->ncec_lock); if (ms == 0) { - if (nce->nce_state != ND_REACHABLE) { - nce_resolv_failed(nce); - ndp_delete(nce); + if (ncec->ncec_state != ND_REACHABLE) { + if (ncec->ncec_ipversion == IPV6_VERSION) + ndp_resolv_failed(ncec); + else + arp_resolv_failed(ncec); + ASSERT((ncec->ncec_flags & NCE_F_STATIC) == 0); + nce_make_unreachable(ncec); + ncec_delete(ncec); } } else { - NDP_RESTART_TIMER(nce, (clock_t)ms); + nce_restart_timer(ncec, (clock_t)ms); } - mutex_enter(&nce->nce_lock); +done: + 
ill_refrele(src_ill); } /* - * Send a neighbor solicitation. + * Send an IPv6 neighbor solicitation. * Returns number of milliseconds after which we should either rexmit or abort. * Return of zero means we should abort. - * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt. + * The caller holds the ncec_lock to protect ncec_qd_mp and ncec_rcnt. + * The optional source address is used as a hint to ndp_solicit for + * which source to use in the packet. * - * NOTE: This routine drops nce_lock (and later reacquires it) when sending + * NOTE: This routine drops ncec_lock (and later reacquires it) when sending * the packet. */ uint32_t -nce_solicit(nce_t *nce, in6_addr_t sender) +ndp_solicit(ncec_t *ncec, in6_addr_t src, ill_t *ill) { - boolean_t dropped; + in6_addr_t dst; + boolean_t dropped = B_FALSE; - ASSERT(nce->nce_ipversion == IPV6_VERSION); - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(ncec->ncec_ipversion == IPV6_VERSION); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (nce->nce_rcnt == 0) + if (ncec->ncec_rcnt == 0) return (0); - nce->nce_rcnt--; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_TRUE, &sender, 0); - mutex_enter(&nce->nce_lock); + dst = ncec->ncec_addr; + ncec->ncec_rcnt--; + mutex_exit(&ncec->ncec_lock); + dropped = ndp_xmit(ill, ND_NEIGHBOR_SOLICIT, ill->ill_phys_addr, + ill->ill_phys_addr_length, &src, &dst, 0); + mutex_enter(&ncec->ncec_lock); if (dropped) - nce->nce_rcnt++; - return (nce->nce_ill->ill_reachable_retrans_time); + ncec->ncec_rcnt++; + return (ncec->ncec_ill->ill_reachable_retrans_time); } /* @@ -1528,23 +1216,30 @@ nce_solicit(nce_t *nce, in6_addr_t sender) * ip_ndp_excl. 
*/ /* ARGSUSED */ -static void -ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +void +ip_addr_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - in6_addr_t *addr = (in6_addr_t *)mp->b_rptr; + in6_addr_t *addr6 = (in6_addr_t *)mp->b_rptr; + in_addr_t *addr4 = (in_addr_t *)mp->b_rptr; + boolean_t addr_equal; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* * We do not support recovery of proxy ARP'd interfaces, * because the system lacks a complete proxy ARP mechanism. */ - if ((ipif->ipif_flags & IPIF_POINTOPOINT) || - !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) { - continue; + if (ill->ill_isv6) { + addr_equal = IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + addr6); + } else { + addr_equal = (ipif->ipif_lcl_addr == *addr4); } + if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !addr_equal) + continue; + /* * If we have already recovered or if the interface is going * away, then ignore. @@ -1561,13 +1256,20 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; - VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); - (void) ipif_up_done_v6(ipif); + if (ill->ill_isv6) { + VERIFY(ipif_ndp_up(ipif, B_TRUE) != EINPROGRESS); + (void) ipif_up_done_v6(ipif); + } else { + VERIFY(ipif_arp_up(ipif, Res_act_initial, B_TRUE) != + EINPROGRESS); + (void) ipif_up_done(ipif); + } } freeb(mp); } /* + * * Attempt to recover an IPv6 interface that's been shut down as a duplicate. * As long as someone else holds the address, the interface will stay down. * When that conflict goes away, the interface is brought back up. This is @@ -1579,8 +1281,8 @@ ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) * * This function is entered on a timer expiry; the ID is in ipif_recovery_id. 
*/ -static void -ipif6_dup_recovery(void *arg) +void +ipif_dup_recovery(void *arg) { ipif_t *ipif = arg; @@ -1598,7 +1300,7 @@ ipif6_dup_recovery(void *arg) if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) return; - ndp_do_recovery(ipif); + ipif_do_recovery(ipif); } /* @@ -1608,18 +1310,24 @@ ipif6_dup_recovery(void *arg) * Called both by recovery timer expiry and link-up notification. */ void -ndp_do_recovery(ipif_t *ipif) +ipif_do_recovery(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *mp; ip_stack_t *ipst = ill->ill_ipst; + size_t mp_size; - mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED); + if (ipif->ipif_isv6) + mp_size = sizeof (ipif->ipif_v6lcl_addr); + else + mp_size = sizeof (ipif->ipif_lcl_addr); + mp = allocb(mp_size, BPRI_MED); if (mp == NULL) { mutex_enter(&ill->ill_lock); - if (ipif->ipif_recovery_id == 0 && + if (ipst->ips_ip_dup_recovery > 0 && + ipif->ipif_recovery_id == 0 && !(ipif->ipif_state_flags & IPIF_CONDEMNED)) { - ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); @@ -1632,10 +1340,15 @@ ndp_do_recovery(ipif_t *ipif) (void) untimeout(ipif->ipif_recovery_id); ipif->ipif_recovery_id = 0; - bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, - sizeof (ipif->ipif_v6lcl_addr)); + if (ipif->ipif_isv6) { + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, + sizeof (ipif->ipif_v6lcl_addr)); + } else { + bcopy(&ipif->ipif_lcl_addr, mp->b_rptr, + sizeof (ipif->ipif_lcl_addr)); + } ill_refhold(ill); - qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP, + qwriter_ip(ill, ill->ill_rq, mp, ip_addr_recover, NEW_OP, B_FALSE); } } @@ -1644,80 +1357,19 @@ ndp_do_recovery(ipif_t *ipif) * Find the MAC and IP addresses in an NA/NS message. 
*/ static void -ip_ndp_find_addresses(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, in6_addr_t *targp, - uchar_t **haddr, uint_t *haddrlenp) +ip_ndp_find_addresses(mblk_t *mp, ip_recv_attr_t *ira, ill_t *ill, + in6_addr_t *targp, uchar_t **haddr, uint_t *haddrlenp) { - ip6_t *ip6h = (ip6_t *)mp->b_rptr; icmp6_t *icmp6 = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); - nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; uchar_t *addr; - int alen = 0; + int alen; - if (dl_mp == NULL) { - nd_opt_hdr_t *opt = NULL; - int len; - - /* - * If it's from the fast-path, then it can't be a probe - * message, and thus must include a linkaddr option. - * Extract that here. - */ - switch (icmp6->icmp6_type) { - case ND_NEIGHBOR_SOLICIT: - len = mp->b_wptr - (uchar_t *)ns; - if ((len -= sizeof (*ns)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), - len, ND_OPT_SOURCE_LINKADDR); - } - break; - case ND_NEIGHBOR_ADVERT: - len = mp->b_wptr - (uchar_t *)na; - if ((len -= sizeof (*na)) > 0) { - opt = ndp_get_option((nd_opt_hdr_t *)(na + 1), - len, ND_OPT_TARGET_LINKADDR); - } - break; - } - - if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= - ill->ill_nd_lla_len) { - addr = (uchar_t *)(opt + 1); - alen = ill->ill_nd_lla_len; - } - - /* - * We cheat a bit here for the sake of printing usable log - * messages in the rare case where the reply we got was unicast - * without a source linkaddr option, and the interface is in - * fastpath mode. (Sigh.) 
- */ - if (alen == 0 && ill->ill_type == IFT_ETHER && - MBLKHEAD(mp) >= sizeof (struct ether_header)) { - struct ether_header *pether; - - pether = (struct ether_header *)((char *)ip6h - - sizeof (*pether)); - addr = pether->ether_shost.ether_addr_octet; - alen = ETHERADDRL; - } - } else { - dl_unitdata_ind_t *dlu; - - dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr; - alen = dlu->dl_src_addr_length; - if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) && - dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) { - addr = dl_mp->b_rptr + dlu->dl_src_addr_offset; - if (ill->ill_sap_length < 0) { - alen += ill->ill_sap_length; - } else { - addr += ill->ill_sap_length; - alen -= ill->ill_sap_length; - } - } - } + /* icmp_inbound_v6 ensures this */ + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + addr = ira->ira_l2src; + alen = ill->ill_phys_addr_length; if (alen > 0) { *haddr = addr; *haddrlenp = alen; @@ -1740,35 +1392,58 @@ ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; - mblk_t *dl_mp = NULL; uchar_t *haddr; uint_t haddrlen; ip_stack_t *ipst = ill->ill_ipst; in6_addr_t targ; - - if (DB_TYPE(mp) != M_DATA) { - dl_mp = mp; - mp = mp->b_cont; + ip_recv_attr_t iras; + mblk_t *attrmp; + + attrmp = mp; + mp = mp->b_cont; + attrmp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(attrmp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ip_recv_attr_from_mblk", mp, ill); + freemsg(mp); + ira_cleanup(&iras, B_TRUE); + return; } - ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); + ASSERT(ill == iras.ira_rill); + + ip_ndp_find_addresses(mp, &iras, ill, &targ, &haddr, &haddrlen); if (haddr != NULL && haddrlen == ill->ill_phys_addr_length) { /* * Ignore conflicts generated by misbehaving switches that * just reflect our own messages back to us. For IPMP, we may * see reflections across any ill in the illgrp. 
+ * + * RFC2462 and revisions tried to detect both the case + * when a statically configured IPv6 address is a duplicate, + * and the case when the L2 address itself is a duplicate. The + * latter is important because, with stateless address autoconf, + * if the L2 address is a duplicate, the resulting IPv6 + * address(es) would also be duplicates. We rely on DAD of the + * IPv6 address itself to detect the latter case. */ + /* For an under ill, ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || IS_UNDER_IPMP(ill) && - ipmp_illgrp_find_ill(ill->ill_grp, haddr, haddrlen) != NULL) + ipmp_illgrp_find_ill(ill->ill_grp, haddr, + haddrlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); goto ignore_conflict; + } + rw_exit(&ipst->ips_ill_g_lock); } /* * Look up the appropriate ipif. */ - ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, NULL, NULL, NULL, - NULL, ipst); + ipif = ipif_lookup_addr_v6(&targ, ill, ALL_ZONES, ipst); if (ipif == NULL) goto ignore_conflict;
Note - * that tearing down the ipif also means deleting the nce through ipif_down, so - * it's not possible to do recovery by just restarting the nce timer. Instead, + * that tearing down the ipif also means deleting the ncec through ipif_down, so + * it's not possible to do recovery by just restarting the ncec timer. Instead, * we start a timer on the ipif. + * Caller has to free mp; */ static void -ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_failure(mblk_t *mp, ip_recv_attr_t *ira) { + const uchar_t *haddr; + ill_t *ill = ira->ira_rill; + + /* + * Ignore conflicts generated by misbehaving switches that just + * reflect our own messages back to us. + */ + + /* icmp_inbound_v6 ensures this */ + ASSERT(ira->ira_flags & IRAF_L2SRC_SET); + haddr = ira->ira_l2src; + if (haddr != NULL && + bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + return; + } + if ((mp = copymsg(mp)) != NULL) { - if (dl_mp == NULL) - dl_mp = mp; - else if ((dl_mp = copyb(dl_mp)) != NULL) - dl_mp->b_cont = mp; - if (dl_mp == NULL) { + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); freemsg(mp); } else { + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + mp = attrmp; ill_refhold(ill); - qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP, + qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_excl, NEW_OP, B_FALSE); } } @@ -1848,20 +1544,39 @@ ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * Handle a discovered conflict: some other system is advertising that it owns * one of our IP addresses. We need to defend ourselves, or just shut down the * interface. 
+ * + * Handles both IPv4 and IPv6 */ -static void -ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +boolean_t +ip_nce_conflict(mblk_t *mp, ip_recv_attr_t *ira, ncec_t *ncec) { - ipif_t *ipif; - uint32_t now; - uint_t maxdefense; - uint_t defs; - ip_stack_t *ipst = ill->ill_ipst; + ipif_t *ipif; + clock_t now; + uint_t maxdefense; + uint_t defs; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + uint32_t elapsed; + boolean_t isv6 = ill->ill_isv6; + ipaddr_t ncec_addr; - ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, - NULL, NULL, ipst); + if (isv6) { + ipif = ipif_lookup_addr_v6(&ncec->ncec_addr, ill, ALL_ZONES, + ipst); + } else { + if (arp_no_defense) { + /* + * Yes, there is a conflict, but no, we do not + * defend ourself. + */ + return (B_TRUE); + } + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + ipif = ipif_lookup_addr(ncec_addr, ill, ALL_ZONES, + ipst); + } if (ipif == NULL) - return; + return (B_FALSE); /* * First, figure out if this address is disposable. @@ -1875,50 +1590,51 @@ ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) * Now figure out how many times we've defended ourselves. Ignore * defenses that happened long in the past. */ - now = gethrestime_sec(); - mutex_enter(&nce->nce_lock); - if ((defs = nce->nce_defense_count) > 0 && - now - nce->nce_defense_time > ipst->ips_ip_defend_interval) { - nce->nce_defense_count = defs = 0; + now = ddi_get_lbolt(); + elapsed = (drv_hztousec(now - ncec->ncec_last_time_defended))/1000000; + mutex_enter(&ncec->ncec_lock); + if ((defs = ncec->ncec_defense_count) > 0 && + elapsed > ipst->ips_ip_defend_interval) { + /* + * ip_defend_interval has elapsed. + * reset the defense count. 
+ */ + ncec->ncec_defense_count = defs = 0; } - nce->nce_defense_count++; - nce->nce_defense_time = now; - mutex_exit(&nce->nce_lock); + ncec->ncec_defense_count++; + ncec->ncec_last_time_defended = now; + mutex_exit(&ncec->ncec_lock); ipif_refrele(ipif); /* * If we've defended ourselves too many times already, then give up and - * tear down the interface(s) using this address. Otherwise, defend by - * sending out an unsolicited Neighbor Advertisement. + * tear down the interface(s) using this address. + * Otherwise, caller has to defend by sending out an announce. */ if (defs >= maxdefense) { - ip_ndp_failure(ill, mp, dl_mp); + if (isv6) + ndp_failure(mp, ira); + else + arp_failure(mp, ira); } else { - char hbuf[MAC_STR_LEN]; - char sbuf[INET6_ADDRSTRLEN]; - uchar_t *haddr; - uint_t haddrlen; - in6_addr_t targ; - - ip_ndp_find_addresses(mp, dl_mp, ill, &targ, &haddr, &haddrlen); - cmn_err(CE_WARN, "node %s is using our IP address %s on %s", - mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)), - inet_ntop(AF_INET6, &targ, sbuf, sizeof (sbuf)), - ill->ill_name); - - (void) nce_xmit_advert(nce, B_FALSE, &ipv6_all_hosts_mcast, 0); + return (B_TRUE); /* caller must defend this address */ } + return (B_FALSE); } +/* + * Handle reception of Neighbor Solicitation messages. 
+ */ static void -ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input_solicit(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill, *under_ill; nd_neighbor_solicit_t *ns; - uint32_t hlen = ill->ill_nd_lla_len; + uint32_t hlen = ill->ill_phys_addr_length; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; - nce_t *our_nce = NULL; + ncec_t *our_ncec = NULL; in6_addr_t target; in6_addr_t src; int len; @@ -1926,6 +1642,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) nd_opt_hdr_t *opt = NULL; boolean_t bad_solicit = B_FALSE; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; + boolean_t need_ill_refrele = B_FALSE; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); @@ -1951,7 +1668,6 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) bad_solicit = B_TRUE; goto done; } - } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ @@ -1974,20 +1690,20 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * e.g. the IPMP ill's data link-local. So we match across the illgrp * to ensure we find the associated NCE. */ - our_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE); + our_ncec = ncec_lookup_illgrp_v6(ill, &target); /* - * If this is a valid Solicitation, a permanent - * entry should exist in the cache + * If this is a valid Solicitation for an address we are publishing, + * then a PUBLISH entry should exist in the cache */ - if (our_nce == NULL || - !(our_nce->nce_flags & NCE_F_PERMANENT)) { + if (our_ncec == NULL || !NCE_PUBLISH(our_ncec)) { ip1dbg(("ndp_input_solicit: Wrong target in NS?!" 
"ifname=%s ", ill->ill_name)); if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg(" dst %s\n", AF_INET6, &target); } - bad_solicit = B_TRUE; + if (our_ncec == NULL) + bad_solicit = B_TRUE; goto done; } @@ -1998,7 +1714,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { - ip1dbg(("ndp_input_solicit: bad SLLA\n")); + ip1dbg(("ndp_input_advert: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } @@ -2010,7 +1726,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) flag |= NDP_UNICAST; /* - * Create/update the entry for the soliciting node. + * Create/update the entry for the soliciting node on the ipmp_ill. * or respond to outstanding queries, don't if * the source is unspecified address. */ @@ -2035,7 +1751,7 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * process of verifying the address, then don't respond at all * and don't keep track of the sender. */ - if (our_nce->nce_state == ND_PROBE) + if (our_ncec->ncec_state == ND_PROBE) goto done; /* @@ -2048,27 +1764,37 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) if (haddr == NULL) goto no_source; - err = ndp_lookup_then_add_v6(ill, - B_FALSE, - haddr, + under_ill = ill; + if (IS_UNDER_IPMP(under_ill)) { + ill = ipmp_ill_hold_ipmp_ill(under_ill); + if (ill == NULL) + ill = under_ill; + else + need_ill_refrele = B_TRUE; + } + err = nce_lookup_then_add_v6(ill, + haddr, hlen, &src, /* Soliciting nodes address */ - &ipv6_all_ones, - &ipv6_all_zeros, - 0, 0, ND_STALE, &nnce); + + if (need_ill_refrele) { + ill_refrele(ill); + ill = under_ill; + need_ill_refrele = B_FALSE; + } switch (err) { case 0: /* done with this entry */ - NCE_REFRELE(nnce); + nce_refrele(nnce); break; case EEXIST: /* * B_FALSE indicates this is not an an advertisement. 
*/ - ndp_process(nnce, haddr, 0, B_FALSE); - NCE_REFRELE(nnce); + nce_process(nnce->nce_common, haddr, 0, B_FALSE); + nce_refrele(nnce); break; default: ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", @@ -2088,19 +1814,18 @@ no_source: bad_solicit = B_TRUE; goto done; } - if (our_nce->nce_state == ND_PROBE) { + if (our_ncec->ncec_state == ND_PROBE) { /* - * Internally looped-back probes won't have DLPI - * attached to them. External ones (which are sent by - * multicast) always will. Just ignore our own + * Internally looped-back probes will have + * IRAF_L2SRC_LOOPBACK set so we can ignore our own * transmissions. */ - if (dl_mp != NULL) { + if (!(ira->ira_flags & IRAF_L2SRC_LOOPBACK)) { /* * If someone else is probing our address, then * we've crossed wires. Declare failure. */ - ip_ndp_failure(ill, mp, dl_mp); + ndp_failure(mp, ira); } goto done; } @@ -2110,24 +1835,34 @@ no_source: */ src = ipv6_all_hosts_mcast; } - /* Response to a solicitation */ - (void) nce_xmit_advert(our_nce, B_TRUE, &src, flag); + flag |= nce_advert_flags(our_ncec); + (void) ndp_xmit(ill, + ND_NEIGHBOR_ADVERT, + our_ncec->ncec_lladdr, + our_ncec->ncec_lladdr_length, + &target, /* Source and target of the advertisement pkt */ + &src, /* IP Destination (source of original pkt) */ + flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); - if (our_nce != NULL) - NCE_REFRELE(our_nce); + if (our_ncec != NULL) + ncec_refrele(our_ncec); } +/* + * Handle reception of Neighbor Advertisement messages + */ void -ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input_advert(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_ill; nd_neighbor_advert_t *na; - uint32_t hlen = ill->ill_nd_lla_len; + uint32_t hlen = ill->ill_phys_addr_length; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; - nce_t *dst_nce = NULL; + ncec_t *dst_ncec = NULL; in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; @@ -2138,6 +1873,7 @@ ndp_input_advert(ill_t *ill, mblk_t
*mp, mblk_t *dl_mp) icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; na = (nd_neighbor_advert_t *)icmp_nd; + if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { ip1dbg(("ndp_input_advert: Target is multicast but the " @@ -2179,17 +1915,25 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) * our local addresses, and those are spread across all the active * ills in the group. */ - if ((dst_nce = ndp_lookup_v6(ill, B_TRUE, &target, B_FALSE)) == NULL) + if ((dst_ncec = ncec_lookup_illgrp_v6(ill, &target)) == NULL) return; - if (dst_nce->nce_flags & NCE_F_PERMANENT) { + if (NCE_PUBLISH(dst_ncec)) { /* - * Someone just advertised one of our local addresses. First, + * Someone just advertised an address that we publish. First, * check if it was us -- if so, we can safely ignore it. + * We don't get the haddr from the ira_l2src because, in the + * case that the packet originated from us, on an IPMP group, + * the ira_l2src may be the link-layer address of the + * cast_ill used to send the packet, which may not be the same + * as the dst_ncec->ncec_lladdr of the address. */ if (haddr != NULL) { - if (!nce_cmp_ll_addr(dst_nce, haddr, hlen)) - goto out; /* from us -- no conflict */ + if (ira->ira_flags & IRAF_L2SRC_LOOPBACK) + goto out; + + if (!nce_cmp_ll_addr(dst_ncec, haddr, hlen)) + goto out; /* from us -- no conflict */ /* * If we're in an IPMP group, check if this is an echo @@ -2209,59 +1953,96 @@ ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) } /* - * Our own (looped-back) unsolicited neighbor advertisements - * will get here with dl_mp == NULL. (These will usually be - * filtered by the `haddr' checks above, but point-to-point - * links have no hardware address and thus make it here.) - */ - if (dl_mp == NULL && dst_nce->nce_state != ND_PROBE) - goto out; - - /* * This appears to be a real conflict.
If we're trying to * configure this NCE (ND_PROBE), then shut it down. * Otherwise, handle the discovered conflict. - * - * In the ND_PROBE case, dl_mp might be NULL if we're getting - * a unicast reply. This isn't typically done (multicast is - * the norm in response to a probe), but we can handle it. */ - if (dst_nce->nce_state == ND_PROBE) - ip_ndp_failure(ill, mp, dl_mp); - else - ip_ndp_conflict(ill, mp, dl_mp, dst_nce); + if (dst_ncec->ncec_state == ND_PROBE) { + ndp_failure(mp, ira); + } else { + if (ip_nce_conflict(mp, ira, dst_ncec)) { + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + + cmn_err(CE_WARN, + "node '%s' is using %s on %s", + inet_ntop(AF_INET6, &target, sbuf, + sizeof (sbuf)), + haddr == NULL ? "<none>" : + mac_colon_addr(haddr, hlen, hbuf, + sizeof (hbuf)), ill->ill_name); + /* + * RFC 4862, Section 5.4.4 does not mandate + * any specific behavior when an NA matches + * a non-tentative address assigned to the + * receiver. We make the choice of defending + * our address, based on the assumption that + * the sender has not detected the Duplicate. + * + * ncec_last_time_defended has been adjusted + * in ip_nce_conflict() + */ + (void) ndp_announce(dst_ncec); + } + } } else { if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) - dst_nce->nce_flags |= NCE_F_ISROUTER; + dst_ncec->ncec_flags |= NCE_F_ISROUTER; /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); + nce_process(dst_ncec, haddr, na->nd_na_flags_reserved, B_TRUE); } out: - NCE_REFRELE(dst_nce); + ncec_refrele(dst_ncec); } /* * Process NDP neighbor solicitation/advertisement messages. * The checksum has already checked o.k before reaching here. + * Information about the datalink header is contained in ira_l2src, but + * that should be ignored for loopback packets. 
*/ void -ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) +ndp_input(mblk_t *mp, ip_recv_attr_t *ira) { + ill_t *ill = ira->ira_rill; icmp6_t *icmp_nd; ip6_t *ip6h; int len; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; + ill_t *orig_ill = NULL; - + /* + * Since ira_ill is where the IRE_LOCAL was hosted we use ira_rill + * and make it be the IPMP upper so avoid being confused by a packet + * addressed to a unicast address on a different ill. + */ + if (IS_UNDER_IPMP(ill)) { + orig_ill = ill; + ill = ipmp_ill_hold_ipmp_ill(orig_ill); + if (ill == NULL) { + ill = orig_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - IPMP ill", + mp, ill); + freemsg(mp); + return; + } + ASSERT(ill != orig_ill); + orig_ill = ira->ira_ill; + ira->ira_ill = ill; + mib = ill->ill_icmp6_mib; + } if (!pullupmsg(mp, -1)) { ip1dbg(("ndp_input: pullupmsg failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards - pullupmsg", mp, ill); goto done; } ip6h = (ip6_t *)mp->b_rptr; if (ip6h->ip6_hops != IPV6_MAX_HOPS) { ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); + ip_drop_input("ipv6IfIcmpBadHoplimit", mp, ill); BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); goto done; } @@ -2275,6 +2056,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { ip1dbg(("ndp_input: Wrong next header 0x%x\n", ip6h->ip6_nxt)); + ip_drop_input("Wrong next header", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } @@ -2283,6 +2065,7 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); if (icmp_nd->icmp6_code != 0) { ip1dbg(("ndp_input: icmp6 code != 0 \n")); + ip_drop_input("code non-zero", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } @@ -2293,54 +2076,25 @@ ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) */ if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { ip1dbg(("ndp_input: packet too short\n")); + 
ip_drop_input("packet too short", mp, ill); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { - ndp_input_solicit(ill, mp, dl_mp); + ndp_input_solicit(mp, ira); } else { - ndp_input_advert(ill, mp, dl_mp); + ndp_input_advert(mp, ira); } done: freemsg(mp); + if (orig_ill != NULL) { + ill_refrele(ill); + ira->ira_ill = orig_ill; + } } /* - * Utility routine to send an advertisement. Assumes that the NCE cannot - * go away (e.g., because it's refheld). - */ -static boolean_t -nce_xmit_advert(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *target, - uint_t flags) -{ - ASSERT((flags & NDP_PROBE) == 0); - - if (nce->nce_flags & NCE_F_ISROUTER) - flags |= NDP_ISROUTER; - if (!(nce->nce_flags & NCE_F_ANYCAST)) - flags |= NDP_ORIDE; - - return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_ADVERT, use_nd_lla, - &nce->nce_addr, target, flags)); -} - -/* - * Utility routine to send a solicitation. Assumes that the NCE cannot - * go away (e.g., because it's refheld). - */ -static boolean_t -nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, - uint_t flags) -{ - if (flags & NDP_PROBE) - sender = &ipv6_all_zeros; - - return (nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, use_nd_lla, - sender, &nce->nce_addr, flags)); -} - -/* - * nce_xmit is called to form and transmit a ND solicitation or + * ndp_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * * If the source address is unspecified and this isn't a probe (used for @@ -2353,112 +2107,123 @@ nce_xmit_solicit(nce_t *nce, boolean_t use_nd_lla, const in6_addr_t *sender, * corresponding ill's ill_wq otherwise returns B_TRUE. 
*/ static boolean_t -nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, +ndp_xmit(ill_t *ill, uint32_t operation, uint8_t *hw_addr, uint_t hw_addr_len, const in6_addr_t *sender, const in6_addr_t *target, int flag) { - ill_t *hwaddr_ill; uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; - uint_t plen, maxplen; - ip6i_t *ip6i; - ipif_t *src_ipif = NULL; - uint8_t *hw_addr; + uint_t plen; zoneid_t zoneid = GLOBAL_ZONEID; - char buf[INET6_ADDRSTRLEN]; + ill_t *hwaddr_ill = ill; + ip_xmit_attr_t ixas; + ip_stack_t *ipst = ill->ill_ipst; + boolean_t need_refrele = B_FALSE; + boolean_t probe = B_FALSE; - ASSERT(!IS_IPMP(ill)); + if (IS_UNDER_IPMP(ill)) { + probe = ipif_lookup_testaddr_v6(ill, sender, NULL); + /* + * We send non-probe packets on the upper IPMP interface. + * ip_output_simple() will use cast_ill for sending any + * multicast packets. Note that we can't follow the same + * logic for probe packets because all interfaces in the ipmp + * group may have failed, so that we really want to only try + * to send the ND packet on the ill corresponding to the src + * address. + */ + if (!probe) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill != NULL) + need_refrele = B_TRUE; + else + ill = hwaddr_ill; + } + } /* - * Check that the sender is actually a usable address on `ill', and if - * so, track that as the src_ipif. If not, for solicitations, set the - * sender to :: so that a new one will be picked below; for adverts, - * drop the packet since we expect nce_xmit_advert() to always provide - * a valid sender. + * If we have a unspecified source(sender) address, select a + * proper source address for the solicitation here itself so + * that we can initialize the h/w address correctly. + * + * If the sender is specified then we use this address in order + * to lookup the zoneid before calling ip_output_v6(). 
This is to + * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly + * by IP (we cannot guarantee that the global zone has an interface + * route to the destination). + * + * Note that the NA never comes here with the unspecified source + * address. */ - if (!IN6_IS_ADDR_UNSPECIFIED(sender)) { - if ((src_ipif = ip_ndp_lookup_addr_v6(sender, ill)) == NULL || - !src_ipif->ipif_addr_ready) { - if (src_ipif != NULL) { - ipif_refrele(src_ipif); - src_ipif = NULL; - } - if (type == ND_NEIGHBOR_ADVERT) { - ip1dbg(("nce_xmit: No source ipif for src %s\n", - inet_ntop(AF_INET6, sender, buf, - sizeof (buf)))); - return (B_TRUE); - } - sender = &ipv6_all_zeros; - } - } /* - * If we still have an unspecified source (sender) address and this - * isn't a probe, select a source address from `ill'. + * Probes will have unspec src at this point. */ - if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { - ASSERT(type != ND_NEIGHBOR_ADVERT); + if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { + zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ipst); /* - * Pick a source address for this solicitation, but restrict - * the selection to addresses assigned to the output - * interface. We do this because the destination will create - * a neighbor cache entry for the source address of this - * packet, so the source address needs to be a valid neighbor. + * It's possible for ipif_lookup_addr_zoneid_v6() to return + * ALL_ZONES if it cannot find a matching ipif for the address + * we are trying to use. In this case we err on the side of + * trying to send the packet by defaulting to the GLOBAL_ZONEID. 
*/ - src_ipif = ipif_select_source_v6(ill, target, B_TRUE, - IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); - if (src_ipif == NULL) { - ip1dbg(("nce_xmit: No source ipif for dst %s\n", - inet_ntop(AF_INET6, target, buf, sizeof (buf)))); - return (B_TRUE); - } - sender = &src_ipif->ipif_v6src_addr; + if (zoneid == ALL_ZONES) + zoneid = GLOBAL_ZONEID; } - /* - * We're either sending a probe or we have a source address. - */ - ASSERT((flag & NDP_PROBE) || src_ipif != NULL); - - maxplen = roundup(sizeof (nd_opt_hdr_t) + ND_MAX_HDW_LEN, 8); - len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + - maxplen; + plen = (sizeof (nd_opt_hdr_t) + hw_addr_len + 7) / 8; + len = IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t) + plen * 8; mp = allocb(len, BPRI_LO); if (mp == NULL) { - if (src_ipif != NULL) - ipif_refrele(src_ipif); + if (need_refrele) + ill_refrele(ill); return (B_TRUE); } + bzero((char *)mp->b_rptr, len); mp->b_wptr = mp->b_rptr + len; - ip6i = (ip6i_t *)mp->b_rptr; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6i->ip6i_nxt = IPPROTO_RAW; - ip6i->ip6i_flags = IP6I_HOPLIMIT; - if (flag & NDP_PROBE) - ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6 | IXAF_NO_HW_CKSUM; - ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); + ixas.ixa_ifindex = ill->ill_phyint->phyint_ifindex; + ixas.ixa_ipst = ipst; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_zoneid = zoneid; + + ip6h = (ip6_t *)mp->b_rptr; ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; - ip6h->ip6_src = *sender; + ixas.ixa_multicast_ttl = ip6h->ip6_hops; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; - opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + - sizeof (nd_neighbor_advert_t)); - - if (type == ND_NEIGHBOR_SOLICIT) { + if 
(hw_addr_len != 0) { + opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + + sizeof (nd_neighbor_advert_t)); + } else { + opt = NULL; + } + if (operation == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; - if (!(flag & NDP_PROBE)) + if (opt != NULL && !(flag & NDP_PROBE)) { + /* + * Note that we don't send out SLLA for ND probes + * per RFC 4862, even though we do send out the src + * haddr for IPv4 DAD probes, even though both IPv4 + * and IPv6 go out with the unspecified/INADDR_ANY + * src IP addr. + */ opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; + } + ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ @@ -2470,7 +2235,9 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; ASSERT(!(flag & NDP_PROBE)); - opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + if (opt != NULL) + opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; + ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; @@ -2480,231 +2247,223 @@ nce_xmit(ill_t *ill, uint8_t type, boolean_t use_nd_lla, na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; } - hw_addr = NULL; if (!(flag & NDP_PROBE)) { - /* - * Use our source address to find the hardware address to put - * in the packet, so that the hardware address and IP address - * will match up -- even if that hardware address doesn't - * match the ill we actually transmit the packet through. - */ - if (IS_IPMP(src_ipif->ipif_ill)) { - hwaddr_ill = ipmp_ipif_hold_bound_ill(src_ipif); - if (hwaddr_ill == NULL) { - ip1dbg(("nce_xmit: no bound ill!\n")); - ipif_refrele(src_ipif); - freemsg(mp); - return (B_TRUE); - } - } else { - hwaddr_ill = src_ipif->ipif_ill; - ill_refhold(hwaddr_ill); /* for symmetry */ - } - - plen = roundup(sizeof (nd_opt_hdr_t) + - hwaddr_ill->ill_nd_lla_len, 8); - - hw_addr = use_nd_lla ? 
hwaddr_ill->ill_nd_lla : - hwaddr_ill->ill_phys_addr; - if (hw_addr != NULL) { + if (hw_addr != NULL && opt != NULL) { /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)(plen / 8); - bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); + opt->nd_opt_len = (uint8_t)plen; + bcopy(hw_addr, &opt[1], hw_addr_len); } - - ill_refrele(hwaddr_ill); + } + if (opt != NULL && opt->nd_opt_type == 0) { + /* If there's no link layer address option, then strip it. */ + len -= plen * 8; + mp->b_wptr = mp->b_rptr + len; + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); } - if (hw_addr == NULL) - plen = 0; - - /* Fix up the length of the packet now that plen is known */ - len -= (maxplen - plen); - mp->b_wptr = mp->b_rptr + len; - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); - - icmp6->icmp6_type = type; + icmp6->icmp6_type = (uint8_t)operation; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp - * checksum field. The checksum is calculated in ip_wput_v6. + * checksum field. The checksum is calculated in ip_output.c. */ icmp6->icmp6_cksum = ip6h->ip6_plen; - /* - * Before we toss the src_ipif, look up the zoneid to pass to - * ip_output_v6(). This is to ensure unicast ND_NEIGHBOR_ADVERT - * packets to be routed correctly by IP (we cannot guarantee that the - * global zone has an interface route to the destination). - */ - if (src_ipif != NULL) { - if ((zoneid = src_ipif->ipif_zoneid) == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - ipif_refrele(src_ipif); - } - - ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); + if (need_refrele) + ill_refrele(ill); return (B_FALSE); } /* - * Make a link layer address (does not include the SAP) from an nce. - * To form the link layer address, use the last four bytes of ipv6 - * address passed in and the fixed offset stored in nce. + * Used to set ND_UNREACHBLE before ncec_delete sets it NCE_F_CONDEMNED. 
+ * The datapath uses this as an indication that there + * is a problem (as opposed to a NCE that was just + * reclaimed due to lack of memory. + * Note that static ARP entries never become unreachable. */ -static void -nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr) -{ - uchar_t *mask, *to; - ill_t *ill = nce->nce_ill; - int len; - - if (ill->ill_net_type == IRE_IF_NORESOLVER) - return; - ASSERT(nce->nce_res_mp != NULL); - ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); - ASSERT(nce->nce_flags & NCE_F_MAPPING); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); - ASSERT(addr != NULL); - bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), - addrpos, ill->ill_nd_lla_len); - len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start, - IPV6_ADDR_LEN); - mask = (uchar_t *)&nce->nce_extract_mask; - mask += (IPV6_ADDR_LEN - len); - addr += (IPV6_ADDR_LEN - len); - to = addrpos + nce->nce_ll_extract_start; - while (len-- > 0) - *to++ |= *mask++ & *addr++; -} - -mblk_t * -nce_udreq_alloc(ill_t *ill) +void +nce_make_unreachable(ncec_t *ncec) { - mblk_t *template_mp = NULL; - dl_unitdata_req_t *dlur; - int sap_length; - - ASSERT(ill->ill_isv6); - - sap_length = ill->ill_sap_length; - template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) + - ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ); - if (template_mp == NULL) - return (NULL); - - dlur = (dl_unitdata_req_t *)template_mp->b_rptr; - dlur->dl_priority.dl_min = 0; - dlur->dl_priority.dl_max = 0; - dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len; - dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); - - /* Copy in the SAP value. */ - NCE_LL_SAP_COPY(ill, template_mp); - - return (template_mp); + mutex_enter(&ncec->ncec_lock); + ncec->ncec_state = ND_UNREACHABLE; + mutex_exit(&ncec->ncec_lock); } /* - * NDP retransmit timer. + * NCE retransmit timer. Common to IPv4 and IPv6. * This timer goes off when: - * a. It is time to retransmit NS for resolver. + * a. 
It is time to retransmit a resolution for resolver. * b. It is time to send reachability probes. */ void -ndp_timer(void *arg) +nce_timer(void *arg) { - nce_t *nce = arg; - ill_t *ill = nce->nce_ill; + ncec_t *ncec = arg; + ill_t *ill = ncec->ncec_ill, *src_ill; char addrbuf[INET6_ADDRSTRLEN]; boolean_t dropped = B_FALSE; - ip_stack_t *ipst = ill->ill_ipst; + ip_stack_t *ipst = ncec->ncec_ipst; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + in_addr_t sender4 = INADDR_ANY; + in6_addr_t sender6 = ipv6_all_zeros; /* - * The timer has to be cancelled by ndp_delete before doing the final + * The timer has to be cancelled by ncec_delete before doing the final * refrele. So the NCE is guaranteed to exist when the timer runs * until it clears the timeout_id. Before clearing the timeout_id - * bump up the refcnt so that we can continue to use the nce + * bump up the refcnt so that we can continue to use the ncec */ - ASSERT(nce != NULL); - - mutex_enter(&nce->nce_lock); - NCE_REFHOLD_LOCKED(nce); - nce->nce_timeout_id = 0; + ASSERT(ncec != NULL); + mutex_enter(&ncec->ncec_lock); + ncec_refhold_locked(ncec); + ncec->ncec_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + + src_ill = nce_resolve_src(ncec, &sender6); + /* if we could not find a sender address, return */ + if (src_ill == NULL) { + if (!isv6) { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, sender4); + ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET, + &sender4, addrbuf, sizeof (addrbuf)))); + } else { + ip1dbg(("no src ill for %s\n", inet_ntop(AF_INET6, + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + } + nce_restart_timer(ncec, ill->ill_reachable_retrans_time); + ncec_refrele(ncec); + return; + } + if (!isv6) + IN6_V4MAPPED_TO_IPADDR(&sender6, sender4); + mutex_enter(&ncec->ncec_lock); /* - * Check the reachability state first. + * Check the reachability state. 
*/ - switch (nce->nce_state) { + switch (ncec->ncec_state) { case ND_DELAY: - nce->nce_state = ND_PROBE; - mutex_exit(&nce->nce_lock); - (void) nce_xmit_solicit(nce, B_FALSE, &ipv6_all_zeros, - NDP_UNICAST); + ASSERT(ncec->ncec_lladdr != NULL); + ncec->ncec_state = ND_PROBE; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + if (isv6) { + mutex_exit(&ncec->ncec_lock); + (void) ndp_xmit(src_ill, ND_NEIGHBOR_SOLICIT, + src_ill->ill_phys_addr, + src_ill->ill_phys_addr_length, + &sender6, &ncec->ncec_addr, + NDP_UNICAST); + } else { + (void) arp_request(ncec, sender4, src_ill); + mutex_exit(&ncec->ncec_lock); + } if (ip_debug > 3) { /* ip2dbg */ - pr_addr_dbg("ndp_timer: state for %s changed " - "to PROBE\n", AF_INET6, &nce->nce_addr); + pr_addr_dbg("nce_timer: state for %s changed " + "to PROBE\n", AF_INET6, &ncec->ncec_addr); } - NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); - NCE_REFRELE(nce); - return; + nce_restart_timer(ncec, ill->ill_reachable_retrans_time); + break; case ND_PROBE: /* must be retransmit timer */ - nce->nce_pcnt--; - ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && - nce->nce_pcnt >= -1); - if (nce->nce_pcnt > 0) { + ASSERT(ncec->ncec_pcnt >= -1); + if (ncec->ncec_pcnt > 0) { /* - * As per RFC2461, the nce gets deleted after + * As per RFC2461, the ncec gets deleted after * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. * Note that the first unicast solicitation is sent * during the DELAY state. */ - ip2dbg(("ndp_timer: pcount=%x dst %s\n", - nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, - addrbuf, sizeof (addrbuf)))); - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_solicit(nce, B_FALSE, - &ipv6_all_zeros, - (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE : - NDP_UNICAST); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + ip2dbg(("nce_timer: pcount=%x dst %s\n", + ncec->ncec_pcnt, + inet_ntop((isv6? 
AF_INET6 : AF_INET), + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + if (NCE_PUBLISH(ncec)) { + mutex_exit(&ncec->ncec_lock); + /* + * send out a probe; note that src_ill + * is ignored by nce_dad() for all + * DAD message types other than IPv6 + * unicast probes + */ + nce_dad(ncec, src_ill, B_TRUE); + } else { + ASSERT(src_ill != NULL); + ncec->ncec_pcnt--; + if (isv6) { + mutex_exit(&ncec->ncec_lock); + (void) ndp_xmit(src_ill, + ND_NEIGHBOR_SOLICIT, + src_ill->ill_phys_addr, + src_ill->ill_phys_addr_length, + &sender6, &ncec->ncec_addr, + NDP_UNICAST); + } else { + /* + * since the nce is REACHABLE, + * the ARP request will be sent out + * as a link-layer unicast. + */ + (void) arp_request(ncec, sender4, + src_ill); + mutex_exit(&ncec->ncec_lock); + } + nce_restart_timer(ncec, + ill->ill_reachable_retrans_time); } - NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); - } else if (nce->nce_pcnt < 0) { - /* No hope, delete the nce */ - nce->nce_state = ND_UNREACHABLE; - mutex_exit(&nce->nce_lock); + } else if (ncec->ncec_pcnt < 0) { + /* No hope, delete the ncec */ + /* Tell datapath it went bad */ + ncec->ncec_state = ND_UNREACHABLE; + mutex_exit(&ncec->ncec_lock); if (ip_debug > 2) { /* ip1dbg */ - pr_addr_dbg("ndp_timer: Delete IRE for" - " dst %s\n", AF_INET6, &nce->nce_addr); + pr_addr_dbg("nce_timer: Delete NCE for" + " dst %s\n", (isv6? AF_INET6: AF_INET), + &ncec->ncec_addr); } - ndp_delete(nce); - } else if (!(nce->nce_flags & NCE_F_PERMANENT)) { - /* Wait RetransTimer, before deleting the entry */ - ip2dbg(("ndp_timer: pcount=%x dst %s\n", - nce->nce_pcnt, inet_ntop(AF_INET6, - &nce->nce_addr, addrbuf, sizeof (addrbuf)))); - mutex_exit(&nce->nce_lock); + /* if static ARP can't delete. */ + if ((ncec->ncec_flags & NCE_F_STATIC) == 0) + ncec_delete(ncec); + + } else if (!NCE_PUBLISH(ncec)) { + /* + * Probe count is 0 for a dynamic entry (one that we + * ourselves are not publishing). 
We should never get + * here if NONUD was requested, hence the ASSERT below. + */ + ASSERT((ncec->ncec_flags & NCE_F_NONUD) == 0); + ip2dbg(("nce_timer: pcount=%x dst %s\n", + ncec->ncec_pcnt, inet_ntop(AF_INET6, + &ncec->ncec_addr, addrbuf, sizeof (addrbuf)))); + ncec->ncec_pcnt--; + mutex_exit(&ncec->ncec_lock); /* Wait one interval before killing */ - NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); + nce_restart_timer(ncec, + ill->ill_reachable_retrans_time); } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { ipif_t *ipif; + ipaddr_t ncec_addr; /* * We're done probing, and we can now declare this * address to be usable. Let IP know that it's ok to * use. */ - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); - ipif = ip_ndp_lookup_addr_v6(&nce->nce_addr, - nce->nce_ill); + ncec->ncec_state = ND_REACHABLE; + ncec->ncec_flags &= ~NCE_F_UNVERIFIED; + mutex_exit(&ncec->ncec_lock); + if (isv6) { + ipif = ipif_lookup_addr_exact_v6( + &ncec->ncec_addr, ill, ipst); + } else { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, + ncec_addr); + ipif = ipif_lookup_addr_exact(ncec_addr, ill, + ipst); + } if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; @@ -2725,17 +2484,28 @@ ndp_timer(void *arg) ipif->ipif_addr_ready = 1; ipif_refrele(ipif); } + if (!isv6 && arp_no_defense) + break; /* Begin defending our new address */ - nce->nce_unsolicit_count = 0; - dropped = nce_xmit_advert(nce, B_FALSE, - &ipv6_all_hosts_mcast, 0); - if (dropped) { - nce->nce_unsolicit_count = 1; - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_unsolicit_interval); - } else if (ipst->ips_ip_ndp_defense_interval != 0) { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_defense_interval); + if (ncec->ncec_unsolicit_count > 0) { + ncec->ncec_unsolicit_count--; + if (isv6) { + dropped = ndp_announce(ncec); + } else { + dropped = arp_announce(ncec); + } + + if (dropped) + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = + ddi_get_lbolt(); + } + if 
(ncec->ncec_unsolicit_count > 0) { + nce_restart_timer(ncec, + ANNOUNCE_INTERVAL(isv6)); + } else if (DEFENSE_INTERVAL(isv6) != 0) { + nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); } } else { /* @@ -2744,76 +2514,93 @@ ndp_timer(void *arg) * doing anything, but switch to reachable state so * that the restart will work. */ - nce->nce_state = ND_REACHABLE; - mutex_exit(&nce->nce_lock); + ncec->ncec_state = ND_REACHABLE; + mutex_exit(&ncec->ncec_lock); } - NCE_REFRELE(nce); - return; + break; case ND_INCOMPLETE: { - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *mp, *datamp, *nextmp, **prevmpp; + mblk_t *mp, *nextmp; + mblk_t **prevmpp; /* - * Per case (2) in the nce_queue_mp() comments, scan nce_qd_mp - * for any IPMP probe packets, and toss 'em. IPMP probe - * packets will always be at the head of nce_qd_mp and always - * have an ip6i_t header, so we can stop at the first queued - * ND packet without an ip6i_t. + * Per case (2) in the nce_queue_mp() comments, scan ncec_qd_mp + * for any IPMP probe packets, and toss them. IPMP probe + * packets will always be at the head of ncec_qd_mp, so that + * we can stop at the first queued ND packet that is + * not a probe packet. */ - prevmpp = &nce->nce_qd_mp; - for (mp = nce->nce_qd_mp; mp != NULL; mp = nextmp) { + prevmpp = &ncec->ncec_qd_mp; + for (mp = ncec->ncec_qd_mp; mp != NULL; mp = nextmp) { nextmp = mp->b_next; - datamp = (DB_TYPE(mp) == M_CTL) ? mp->b_cont : mp; - ip6h = (ip6_t *)datamp->b_rptr; - if (ip6h->ip6_nxt != IPPROTO_RAW) - break; - ip6i = (ip6i_t *)ip6h; - if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) { + if (IS_UNDER_IPMP(ill) && ncec->ncec_nprobes > 0) { inet_freemsg(mp); + ncec->ncec_nprobes--; *prevmpp = nextmp; } else { prevmpp = &mp->b_next; } } - ip_ndp_resolve(nce); - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + + /* + * Must be resolver's retransmit timer. 
+ */ + mutex_exit(&ncec->ncec_lock); + ip_ndp_resolve(ncec); break; } case ND_REACHABLE: - if (((nce->nce_flags & NCE_F_UNSOL_ADV) && - nce->nce_unsolicit_count != 0) || - ((nce->nce_flags & NCE_F_PERMANENT) && - ipst->ips_ip_ndp_defense_interval != 0)) { - if (nce->nce_unsolicit_count > 0) - nce->nce_unsolicit_count--; - mutex_exit(&nce->nce_lock); - dropped = nce_xmit_advert(nce, B_FALSE, - &ipv6_all_hosts_mcast, 0); + if (((ncec->ncec_flags & NCE_F_UNSOL_ADV) && + ncec->ncec_unsolicit_count != 0) || + (NCE_PUBLISH(ncec) && DEFENSE_INTERVAL(isv6) != 0)) { + if (ncec->ncec_unsolicit_count > 0) { + ncec->ncec_unsolicit_count--; + mutex_exit(&ncec->ncec_lock); + /* + * When we get to zero announcements left, + * switch to address defense + */ + } else { + boolean_t rate_limit; + + mutex_exit(&ncec->ncec_lock); + rate_limit = ill_defend_rate_limit(ill, ncec); + if (rate_limit) { + nce_restart_timer(ncec, + DEFENSE_INTERVAL(isv6)); + break; + } + } + if (isv6) { + dropped = ndp_announce(ncec); + } else { + dropped = arp_announce(ncec); + } + mutex_enter(&ncec->ncec_lock); if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_unsolicit_count++; - mutex_exit(&nce->nce_lock); + ncec->ncec_unsolicit_count++; + } else { + ncec->ncec_last_time_defended = + ddi_get_lbolt(); } - if (nce->nce_unsolicit_count != 0) { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_unsolicit_interval); + mutex_exit(&ncec->ncec_lock); + if (ncec->ncec_unsolicit_count != 0) { + nce_restart_timer(ncec, + ANNOUNCE_INTERVAL(isv6)); } else { - NDP_RESTART_TIMER(nce, - ipst->ips_ip_ndp_defense_interval); + nce_restart_timer(ncec, DEFENSE_INTERVAL(isv6)); } } else { - mutex_exit(&nce->nce_lock); + mutex_exit(&ncec->ncec_lock); } - NCE_REFRELE(nce); break; default: - mutex_exit(&nce->nce_lock); - NCE_REFRELE(nce); + mutex_exit(&ncec->ncec_lock); break; } +done: + ncec_refrele(ncec); + ill_refrele(src_ill); } /* @@ -2821,31 +2608,21 @@ ndp_timer(void *arg) * Copy SAP from ill. 
*/ static void -nce_set_ll(nce_t *nce, uchar_t *ll_addr) +nce_set_ll(ncec_t *ncec, uchar_t *ll_addr) { - ill_t *ill = nce->nce_ill; - uchar_t *woffset; + ill_t *ill = ncec->ncec_ill; ASSERT(ll_addr != NULL); - /* Always called before fast_path_probe */ - ASSERT(nce->nce_fp_mp == NULL); - if (ill->ill_sap_length != 0) { - /* - * Copy the SAP type specified in the - * request into the xmit template. - */ - NCE_LL_SAP_COPY(ill, nce->nce_res_mp); - } if (ill->ill_phys_addr_length > 0) { /* * The bcopy() below used to be called for the physical address * length rather than the link layer address length. For * ethernet and many other media, the phys_addr and lla are * identical. - * However, with xresolv interfaces being introduced, the - * phys_addr and lla are no longer the same, and the physical - * address may not have any useful meaning, so we use the lla - * for IPv6 address resolution and destination addressing. + * + * The phys_addr and lla may not be the same for devices that + * support DL_IPV6_LINK_LAYER_ADDR, though there are currently + * no known instances of these. * * For PPP or other interfaces with a zero length * physical address, don't do anything here. @@ -2854,22 +2631,18 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr) * Using the lla for them would change the way they operate. * Doing nothing in such cases preserves expected behavior. 
*/ - woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); - bcopy(ll_addr, woffset, ill->ill_nd_lla_len); + bcopy(ll_addr, ncec->ncec_lladdr, ill->ill_nd_lla_len); } } -static boolean_t -nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) +boolean_t +nce_cmp_ll_addr(const ncec_t *ncec, const uchar_t *ll_addr, + uint32_t ll_addr_len) { - ill_t *ill = nce->nce_ill; - uchar_t *ll_offset; - - ASSERT(nce->nce_res_mp != NULL); + ASSERT(ncec->ncec_lladdr != NULL); if (ll_addr == NULL) return (B_FALSE); - ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); - if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0) + if (bcmp(ll_addr, ncec->ncec_lladdr, ll_addr_len) != 0) return (B_TRUE); return (B_FALSE); } @@ -2878,15 +2651,16 @@ nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) * Updates the link layer address or the reachability state of * a cache entry. Reset probe counter if needed. */ -static void -nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) +void +nce_update(ncec_t *ncec, uint16_t new_state, uchar_t *new_ll_addr) { - ill_t *ill = nce->nce_ill; + ill_t *ill = ncec->ncec_ill; boolean_t need_stop_timer = B_FALSE; boolean_t need_fastpath_update = B_FALSE; + nce_t *nce = NULL; + timeout_id_t tid; - ASSERT(MUTEX_HELD(&nce->nce_lock)); - ASSERT(nce->nce_ipversion == IPV6_VERSION); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); /* * If this interface does not do NUD, there is no point * in allowing an update to the cache entry. Although @@ -2896,184 +2670,251 @@ nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) * Non-Resolvers will always be created as REACHABLE. 
*/ if (new_state != ND_UNCHANGED) { - if ((nce->nce_flags & NCE_F_NONUD) && - (nce->nce_state != ND_INCOMPLETE)) + if ((ncec->ncec_flags & NCE_F_NONUD) && + (ncec->ncec_state != ND_INCOMPLETE)) return; ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); need_stop_timer = B_TRUE; if (new_state == ND_REACHABLE) - nce->nce_last = TICK_TO_MSEC(lbolt64); + ncec->ncec_last = TICK_TO_MSEC(lbolt64); else { /* We force NUD in this case */ - nce->nce_last = 0; + ncec->ncec_last = 0; } - nce->nce_state = new_state; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; + ncec->ncec_state = new_state; + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + ASSERT(ncec->ncec_lladdr != NULL || new_state == ND_INITIAL || + new_state == ND_INCOMPLETE); + } + if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; } /* - * In case of fast path we need to free the the fastpath - * M_DATA and do another probe. Otherwise we can just + * Re-trigger fastpath probe and * overwrite the DL_UNITDATA_REQ data, noting we'll lose * whatever packets that happens to be transmitting at the time. 
*/ if (new_ll_addr != NULL) { - ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) + - ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr); - bcopy(new_ll_addr, nce->nce_res_mp->b_rptr + - NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len); - if (nce->nce_fp_mp != NULL) { - freemsg(nce->nce_fp_mp); - nce->nce_fp_mp = NULL; - } + bcopy(new_ll_addr, ncec->ncec_lladdr, + ill->ill_phys_addr_length); need_fastpath_update = B_TRUE; } - mutex_exit(&nce->nce_lock); - if (need_stop_timer) { - (void) untimeout(nce->nce_timeout_id); - nce->nce_timeout_id = 0; + mutex_exit(&ncec->ncec_lock); + if (need_stop_timer || (ncec->ncec_flags & NCE_F_STATIC)) { + if (tid != 0) + (void) untimeout(tid); } - if (need_fastpath_update) - nce_fastpath(nce); - mutex_enter(&nce->nce_lock); + if (need_fastpath_update) { + /* + * Delete any existing existing dlur_mp and fp_mp information. + * For IPMP interfaces, all underlying ill's must be checked + * and purged. + */ + nce_fastpath_list_delete(ncec->ncec_ill, ncec, NULL); + /* + * add the new dlur_mp and fp_mp + */ + nce = nce_fastpath(ncec, B_TRUE, NULL); + if (nce != NULL) + nce_refrele(nce); + } + mutex_enter(&ncec->ncec_lock); } -void -nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) +static void +nce_queue_mp_common(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; mblk_t **mpp, *tmp; - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { - if (++count > nce->nce_ill->ill_max_buf) { - tmp = nce->nce_qd_mp->b_next; - nce->nce_qd_mp->b_next = NULL; - nce->nce_qd_mp->b_prev = NULL; - freemsg(nce->nce_qd_mp); - nce->nce_qd_mp = tmp; + for (mpp = &ncec->ncec_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { + if (++count > ncec->ncec_ill->ill_max_buf) { + tmp = ncec->ncec_qd_mp->b_next; + ncec->ncec_qd_mp->b_next = NULL; + /* + * if we never create data addrs on the under_ill + * does this matter? 
+ */ + BUMP_MIB(ncec->ncec_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", ncec->ncec_qd_mp, + ncec->ncec_ill); + freemsg(ncec->ncec_qd_mp); + ncec->ncec_qd_mp = tmp; } } if (head_insert) { - mp->b_next = nce->nce_qd_mp; - nce->nce_qd_mp = mp; + ncec->ncec_nprobes++; + mp->b_next = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = mp; } else { *mpp = mp; } } -static void -nce_queue_mp(nce_t *nce, mblk_t *mp) +/* + * nce_queue_mp will queue the packet into the ncec_qd_mp. The packet will be + * queued at the head or tail of the queue based on the input argument + * 'head_insert'. The caller should specify this argument as B_TRUE if this + * packet is an IPMP probe packet, in which case the following happens: + * + * 1. Insert it at the head of the ncec_qd_mp list. Consider the normal + * (non-ipmp_probe) load-speading case where the source address of the ND + * packet is not tied to ncec_ill. If the ill bound to the source address + * cannot receive, the response to the ND packet will not be received. + * However, if ND packets for ncec_ill's probes are queued behind that ND + * packet, those probes will also fail to be sent, and thus in.mpathd will + * erroneously conclude that ncec_ill has also failed. + * + * 2. Drop the ipmp_probe packet in ndp_timer() if the ND did not succeed on + * the first attempt. This ensures that ND problems do not manifest as + * probe RTT spikes. + * + * We achieve this by inserting ipmp_probe() packets at the head of the + * nce_queue. + * + * The ncec for the probe target is created with ncec_ill set to the ipmp_ill, + * but the caller needs to set head_insert to B_TRUE if this is a probe packet. 
+ */ +void +nce_queue_mp(ncec_t *ncec, mblk_t *mp, boolean_t head_insert) { - boolean_t head_insert = B_FALSE; - ip6_t *ip6h; - ip6i_t *ip6i; - mblk_t *data_mp; + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + nce_queue_mp_common(ncec, mp, head_insert); +} - ASSERT(MUTEX_HELD(&nce->nce_lock)); +/* + * Called when address resolution failed due to a timeout. + * Send an ICMP unreachable in response to all queued packets. + */ +void +ndp_resolv_failed(ncec_t *ncec) +{ + mblk_t *mp, *nxt_mp; + char buf[INET6_ADDRSTRLEN]; + ill_t *ill = ncec->ncec_ill; + ip_recv_attr_t iras; - if (mp->b_datap->db_type == M_CTL) - data_mp = mp->b_cont; - else - data_mp = mp; - ip6h = (ip6_t *)data_mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - /* - * This message should have been pulled up already in - * ip_wput_v6. We can't do pullups here because the message - * could be from the nce_qd_mp which could have b_next/b_prev - * non-NULL. - */ - ip6i = (ip6i_t *)ip6h; - ASSERT(MBLKL(data_mp) >= sizeof (ip6i_t) + IPV6_HDR_LEN); + bzero(&iras, sizeof (iras)); + iras.ira_flags = 0; + /* + * we are setting the ira_rill to the ipmp_ill (instead of + * the actual ill on which the packet was received), but this + * is ok because we don't actually need the real ira_rill. + * to send the icmp unreachable to the sender. + */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + + ip1dbg(("ndp_resolv_failed: dst %s\n", + inet_ntop(AF_INET6, (char *)&ncec->ncec_addr, buf, sizeof (buf)))); + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); + while (mp != NULL) { + nxt_mp = mp->b_next; + mp->b_next = NULL; - /* - * If this packet is marked IP6I_IPMP_PROBE, then we need to: - * - * 1. Insert it at the head of the nce_qd_mp list. 
Consider - * the normal (non-probe) load-speading case where the - * source address of the ND packet is not tied to nce_ill. - * If the ill bound to the source address cannot receive, - * the response to the ND packet will not be received. - * However, if ND packets for nce_ill's probes are queued - * behind that ND packet, those probes will also fail to - * be sent, and thus in.mpathd will erroneously conclude - * that nce_ill has also failed. - * - * 2. Drop the probe packet in ndp_timer() if the ND did - * not succeed on the first attempt. This ensures that - * ND problems do not manifest as probe RTT spikes. - */ - if (ip6i->ip6i_flags & IP6I_IPMP_PROBE) - head_insert = B_TRUE; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - address unreachable", + mp, ill); + icmp_unreachable_v6(mp, + ICMP6_DST_UNREACH_ADDR, B_FALSE, &iras); + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); + mp = nxt_mp; } - nce_queue_mp_common(nce, mp, head_insert); + ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ } /* - * Called when address resolution failed due to a timeout. - * Send an ICMP unreachable in response to all queued packets. + * Handle the completion of NDP and ARP resolution. 
*/ void -nce_resolv_failed(nce_t *nce) +nce_resolv_ok(ncec_t *ncec) { - mblk_t *mp, *nxt_mp, *first_mp; - char buf[INET6_ADDRSTRLEN]; - ip6_t *ip6h; - zoneid_t zoneid = GLOBAL_ZONEID; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + mblk_t *mp; + uint_t pkt_len; + iaflags_t ixaflags = IXAF_NO_TRACE; + nce_t *nce; + ill_t *ill = ncec->ncec_ill; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + ip_stack_t *ipst = ill->ill_ipst; + + if (IS_IPMP(ncec->ncec_ill)) { + nce_resolv_ipmp_ok(ncec); + return; + } + /* non IPMP case */ + + mutex_enter(&ncec->ncec_lock); + ASSERT(ncec->ncec_nprobes == 0); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + mutex_exit(&ncec->ncec_lock); - ip1dbg(("nce_resolv_failed: dst %s\n", - inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf)))); - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); while (mp != NULL) { + mblk_t *nxt_mp; + + if (ill->ill_isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixaflags |= IXAF_IS_IPV4; + pkt_len = ntohs(ipha->ipha_length); + } nxt_mp = mp->b_next; mp->b_next = NULL; - mp->b_prev = NULL; - - first_mp = mp; - if (mp->b_datap->db_type == M_CTL) { - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - zoneid = io->ipsec_out_zoneid; - ASSERT(zoneid != ALL_ZONES); - mp = mp->b_cont; - mp->b_next = NULL; - mp->b_prev = NULL; - } - - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i; + /* + * IXAF_NO_DEV_FLOW_CTL information for TCP packets is no + * longer available, but it's ok to drop this flag because TCP + * has its own flow-control in effect, so TCP packets + * are not likely to get here when flow-control is in effect. 
+ */ + mutex_enter(&ill->ill_lock); + nce = nce_lookup(ill, &ncec->ncec_addr); + mutex_exit(&ill->ill_lock); + + if (nce == NULL) { + if (isv6) { + BUMP_MIB(&ipst->ips_ip6_mib, + ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip_mib, + ipIfStatsOutDiscards); + } + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, NULL); + freemsg(mp); + } else { /* - * This message should have been pulled up already - * in ip_wput_v6. ip_hdr_complete_v6 assumes that - * the header is pulled up. + * We don't know the zoneid, but + * ip_xmit does not care since IXAF_NO_TRACE + * is set. (We traced the packet the first + * time through ip_xmit.) */ - ip6i = (ip6i_t *)ip6h; - ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= - sizeof (ip6i_t) + IPV6_HDR_LEN); - mp->b_rptr += sizeof (ip6i_t); + (void) ip_xmit(mp, nce, ixaflags, pkt_len, 0, + ALL_ZONES, 0, NULL); + nce_refrele(nce); } - /* - * Ignore failure since icmp_unreachable_v6 will silently - * drop packets with an unspecified source address. - */ - (void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst); - icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp, - ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst); mp = nxt_mp; } - nce_cb_dispatch(nce); + + ncec_cb_dispatch(ncec); /* complete callbacks */ } /* - * Called by SIOCSNDP* ioctl to add/change an nce entry + * Called by SIOCSNDP* ioctl to add/change an ncec entry * and the corresponding attributes. * Disallow states other than ND_REACHABLE or ND_STALE. 
*/ @@ -3082,31 +2923,28 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) { sin6_t *sin6; in6_addr_t *addr; + ncec_t *ncec; nce_t *nce; - int err; + int err = 0; uint16_t new_flags = 0; uint16_t old_flags = 0; int inflags = lnr->lnr_flags; ip_stack_t *ipst = ill->ill_ipst; + boolean_t do_postprocess = B_FALSE; ASSERT(ill->ill_isv6); if ((lnr->lnr_state_create != ND_REACHABLE) && (lnr->lnr_state_create != ND_STALE)) return (EINVAL); - if (lnr->lnr_hdw_len > ND_MAX_HDW_LEN) - return (EINVAL); - sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - /* We know it can not be mapping so just look in the hash table */ - nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); - /* See comment in ndp_query() regarding IS_IPMP(ill) usage */ - nce = nce_lookup_addr(ill, IS_IPMP(ill), addr, nce); + ASSERT(!IS_UNDER_IPMP(ill)); + nce = nce_lookup_addr(ill, addr); if (nce != NULL) - new_flags = nce->nce_flags; + new_flags = nce->nce_common->ncec_flags; switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { case NDF_ISROUTER_ON: @@ -3118,7 +2956,7 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) - NCE_REFRELE(nce); + nce_refrele(nce); return (EINVAL); } @@ -3132,17 +2970,15 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) - NCE_REFRELE(nce); + nce_refrele(nce); return (EINVAL); } if (nce == NULL) { - err = ndp_add_v6(ill, + err = nce_add_v6(ill, (uchar_t *)lnr->lnr_hdw_addr, + ill->ill_phys_addr_length, addr, - &ipv6_all_ones, - &ipv6_all_zeros, - 0, new_flags, lnr->lnr_state_create, &nce); @@ -3150,269 +2986,354 @@ ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); return (err); + } else { + do_postprocess = B_TRUE; } } - old_flags = nce->nce_flags; + 
ncec = nce->nce_common; + old_flags = ncec->ncec_flags; if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { - /* - * Router turned to host, delete all ires. - * XXX Just delete the entry, but we need to add too. - */ - nce->nce_flags &= ~NCE_F_ISROUTER; + ncec_router_to_host(ncec); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - ndp_delete(nce); - NCE_REFRELE(nce); + if (do_postprocess) + err = nce_add_v6_postprocess(nce); + nce_refrele(nce); return (0); } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - mutex_enter(&nce->nce_lock); - nce->nce_flags = new_flags; - mutex_exit(&nce->nce_lock); + if (do_postprocess) + err = nce_add_v6_postprocess(nce); + /* + * err cannot be anything other than 0 because we don't support + * proxy arp of static addresses. + */ + ASSERT(err == 0); + + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags = new_flags; + mutex_exit(&ncec->ncec_lock); /* * Note that we ignore the state at this point, which * should be either STALE or REACHABLE. Instead we let * the link layer address passed in to determine the state * much like incoming packets. */ - nce_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); - NCE_REFRELE(nce); + nce_process(ncec, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); + nce_refrele(nce); return (0); } /* - * If the device driver supports it, we make nce_fp_mp to have - * an M_DATA prepend. Otherwise nce_fp_mp will be null. - * The caller ensures there is hold on nce for this function. - * Note that since ill_fastpath_probe() copies the mblk there is - * no need for the hold beyond this function. + * Create an nce_t structure for ill using the ncec->ncec_lladdr to set up + * the nce_dlur_mp. If ill != ncec->ncec_ill, then the ips_ill_g_lock must + * be held to ensure that they are in the same group. 
*/ -void -nce_fastpath(nce_t *nce) +static nce_t * +nce_fastpath_create(ill_t *ill, ncec_t *ncec) { - ill_t *ill = nce->nce_ill; - int res; - ASSERT(ill != NULL); - ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE); + nce_t *nce; - if (nce->nce_fp_mp != NULL) { - /* Already contains fastpath info */ - return; - } - if (nce->nce_res_mp != NULL) { - nce_fastpath_list_add(nce); - res = ill_fastpath_probe(ill, nce->nce_res_mp); - /* - * EAGAIN is an indication of a transient error - * i.e. allocation failure etc. leave the nce in the list it - * will be updated when another probe happens for another ire - * if not it will be taken out of the list when the ire is - * deleted. - */ + nce = nce_ill_lookup_then_add(ill, ncec); + + if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) + return (nce); - if (res != 0 && res != EAGAIN) - nce_fastpath_list_delete(nce); + /* + * hold the ncec_lock to synchronize with nce_update() so that, + * at the end of this function, the contents of nce_dlur_mp are + * consistent with ncec->ncec_lladdr, even though some intermediate + * packet may have been sent out with a mangled address, which would + * only be a transient condition. + */ + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_lladdr != NULL) { + bcopy(ncec->ncec_lladdr, nce->nce_dlur_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill), ill->ill_phys_addr_length); + } else { + nce->nce_dlur_mp = ill_dlur_gen(NULL, 0, ill->ill_sap, + ill->ill_sap_length); } + mutex_exit(&ncec->ncec_lock); + return (nce); } /* - * Drain the list of nce's waiting for fastpath response. + * we make nce_fp_mp to have an M_DATA prepend. + * The caller ensures there is hold on ncec for this function. + * Note that since ill_fastpath_probe() copies the mblk there is + * no need to hold the nce or ncec beyond this function. 
+ * + * If the caller has passed in a non-null ncec_nce to nce_faspath() that + * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill + * and will be returned back by this function, so that no extra nce_refrele + * is required for the caller. The calls from nce_add_common() use this + * method. All other callers (that pass in NULL ncec_nce) will have to do a + * nce_refrele of the returned nce (when it is non-null). */ -void -nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *), - void *arg) +nce_t * +nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce) { + nce_t *nce; + ill_t *ill = ncec->ncec_ill; - nce_t *next_nce; - nce_t *current_nce; - nce_t *first_nce; - nce_t *prev_nce = NULL; + ASSERT(ill != NULL); + + if (IS_IPMP(ill) && trigger_fp_req) { + trigger_fp_req = B_FALSE; + ipmp_ncec_fastpath(ncec, ill); - mutex_enter(&ill->ill_lock); - first_nce = current_nce = (nce_t *)ill->ill_fastpath_list; - while (current_nce != (nce_t *)&ill->ill_fastpath_list) { - next_nce = current_nce->nce_fastpath; - /* - * Take it off the list if we're flushing, or if the callback - * routine tells us to do so. Otherwise, leave the nce in the - * fastpath list to handle any pending response from the lower - * layer. We can't drain the list when the callback routine - * comparison failed, because the response is asynchronous in - * nature, and may not arrive in the same order as the list - * insertion. - */ - if (func == NULL || func(current_nce, arg)) { - current_nce->nce_fastpath = NULL; - if (current_nce == first_nce) - ill->ill_fastpath_list = first_nce = next_nce; - else - prev_nce->nce_fastpath = next_nce; - } else { - /* previous element that is still in the list */ - prev_nce = current_nce; - } - current_nce = next_nce; } - mutex_exit(&ill->ill_lock); + /* + * If the caller already has the nce corresponding to the ill, use + * that one. Otherwise we have to lookup/add the nce. 
Calls from + * nce_add_common() fall in the former category, and have just done + * the nce lookup/add that can be reused. + */ + if (ncec_nce == NULL) + nce = nce_fastpath_create(ill, ncec); + else + nce = ncec_nce; + + if (nce == NULL || IS_LOOPBACK(nce->nce_ill) || IS_VNI(nce->nce_ill)) + return (nce); + + if (trigger_fp_req) + nce_fastpath_trigger(nce); + return (nce); } /* - * Add nce to the nce fastpath list. + * Trigger fastpath on nce. No locks may be held. */ -void -nce_fastpath_list_add(nce_t *nce) +static void +nce_fastpath_trigger(nce_t *nce) { - ill_t *ill; + int res; + ill_t *ill = nce->nce_ill; + ncec_t *ncec = nce->nce_common; - ill = nce->nce_ill; + res = ill_fastpath_probe(ill, nce->nce_dlur_mp); + /* + * EAGAIN is an indication of a transient error + * i.e. allocation failure etc. leave the ncec in the list it + * will be updated when another probe happens for another ire + * if not it will be taken out of the list when the ire is + * deleted. + */ + if (res != 0 && res != EAGAIN && res != ENOTSUP) + nce_fastpath_list_delete(ill, ncec, NULL); +} - mutex_enter(&ill->ill_lock); - mutex_enter(&nce->nce_lock); +/* + * Add ncec to the nce fastpath list on ill. + */ +static nce_t * +nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce = NULL; + ASSERT(MUTEX_HELD(&ill->ill_lock)); /* - * if nce has not been deleted and + * Atomically ensure that the ill is not CONDEMNED and is not going + * down, before adding the NCE. + */ + if (ill->ill_state_flags & ILL_CONDEMNED) + return (NULL); + mutex_enter(&ncec->ncec_lock); + /* + * if ncec has not been deleted and * is not already in the list add it. 
*/ - if (!(nce->nce_flags & NCE_F_CONDEMNED) && - (nce->nce_fastpath == NULL)) { - nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list; - ill->ill_fastpath_list = nce; + if (!NCE_ISCONDEMNED(ncec)) { + nce = nce_lookup(ill, &ncec->ncec_addr); + if (nce != NULL) + goto done; + nce = nce_add(ill, ncec); } +done: + mutex_exit(&ncec->ncec_lock); + return (nce); +} - mutex_exit(&nce->nce_lock); +nce_t * +nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce; + + mutex_enter(&ill->ill_lock); + nce = nce_ill_lookup_then_add_locked(ill, ncec); mutex_exit(&ill->ill_lock); + return (nce); } + /* - * remove nce from the nce fastpath list. + * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted + * nce is added to the 'dead' list, and the caller must nce_refrele() the + * entry after all locks have been dropped. */ void -nce_fastpath_list_delete(nce_t *nce) +nce_fastpath_list_delete(ill_t *ill, ncec_t *ncec, list_t *dead) { - nce_t *nce_ptr; - - ill_t *ill; + nce_t *nce; - ill = nce->nce_ill; ASSERT(ill != NULL); - mutex_enter(&ill->ill_lock); - if (nce->nce_fastpath == NULL) - goto done; - - ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list); + /* first clean out any nce pointers in the under_ills */ + if (IS_IPMP(ill)) + ipmp_ncec_flush_nce(ncec); - if (ill->ill_fastpath_list == nce) { - ill->ill_fastpath_list = nce->nce_fastpath; - } else { - nce_ptr = ill->ill_fastpath_list; - while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) { - if (nce_ptr->nce_fastpath == nce) { - nce_ptr->nce_fastpath = nce->nce_fastpath; - break; - } - nce_ptr = nce_ptr->nce_fastpath; + /* now the ill itself */ + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; + nce = list_next(&ill->ill_nce, nce)) { + if (nce->nce_common == ncec) { + nce_refhold(nce); + nce_delete(nce); + break; } } - - nce->nce_fastpath = NULL; -done: mutex_exit(&ill->ill_lock); + if (nce != NULL) { + if (dead == NULL) + nce_refrele(nce); + else + 
list_insert_tail(dead, nce); + } } /* - * Update all NCE's that are not in fastpath mode and - * have an nce_fp_mp that matches mp. mp->b_cont contains - * the fastpath header. - * - * Returns TRUE if entry should be dequeued, or FALSE otherwise. + * when the fastpath response does not fit in the datab + * associated with the existing nce_fp_mp, we delete and + * add the nce to retrigger fastpath based on the information + * in the ncec_t. */ -boolean_t -ndp_fastpath_update(nce_t *nce, void *arg) +static nce_t * +nce_delete_then_add(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + nce_t *newnce = NULL; + + ip0dbg(("nce_delete_then_add nce %p ill %s\n", + (void *)nce, ill->ill_name)); + mutex_enter(&ill->ill_lock); + mutex_enter(&nce->nce_common->ncec_lock); + nce_delete(nce); + /* + * Make sure that ncec is not condemned before adding. We hold the + * ill_lock and ncec_lock to synchronize with ncec_delete() and + * ipmp_ncec_flush_nce() + */ + if (!NCE_ISCONDEMNED(nce->nce_common)) + newnce = nce_add(ill, nce->nce_common); + mutex_exit(&nce->nce_common->ncec_lock); + mutex_exit(&ill->ill_lock); + nce_refrele(nce); + return (newnce); /* could be null if nomem */ +} + +typedef struct nce_fp_match_s { + nce_t *nce_fp_match_res; + mblk_t *nce_fp_match_ack_mp; +} nce_fp_match_t; + +/* ARGSUSED */ +static int +nce_fastpath_match_dlur(ill_t *ill, nce_t *nce, void *arg) { - mblk_t *mp, *fp_mp; + nce_fp_match_t *nce_fp_marg = arg; + ncec_t *ncec = nce->nce_common; + mblk_t *mp = nce_fp_marg->nce_fp_match_ack_mp; uchar_t *mp_rptr, *ud_mp_rptr; - mblk_t *ud_mp = nce->nce_res_mp; + mblk_t *ud_mp = nce->nce_dlur_mp; ptrdiff_t cmplen; - if (nce->nce_flags & NCE_F_MAPPING) - return (B_TRUE); - if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL)) - return (B_TRUE); - - ip2dbg(("ndp_fastpath_update: trying\n")); - mp = (mblk_t *)arg; + /* + * mp is the mp associated with the fastpath ack. + * ud_mp is the outstanding DL_UNITDATA_REQ on the nce_t + * under consideration. 
If the contents match, then the + * fastpath ack is used to update the nce. + */ + if (ud_mp == NULL) + return (0); /* MH_WALK_CONTINUE */ mp_rptr = mp->b_rptr; cmplen = mp->b_wptr - mp_rptr; ASSERT(cmplen >= 0); + ud_mp_rptr = ud_mp->b_rptr; /* - * The nce is locked here to prevent any other threads - * from accessing and changing nce_res_mp when the IPv6 address - * becomes resolved to an lla while we're in the middle - * of looking at and comparing the hardware address (lla). - * It is also locked to prevent multiple threads in nce_fastpath_update - * from examining nce_res_mp atthe same time. + * The ncec is locked here to prevent any other threads from accessing + * and changing nce_dlur_mp when the address becomes resolved to an + * lla while we're in the middle of looking at and comparing the + * hardware address (lla). It is also locked to prevent multiple + * threads in nce_fastpath() from examining nce_dlur_mp at the same + * time. */ - mutex_enter(&nce->nce_lock); + mutex_enter(&ncec->ncec_lock); if (ud_mp->b_wptr - ud_mp_rptr != cmplen || - bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) { - mutex_exit(&nce->nce_lock); - /* - * Don't take the ire off the fastpath list yet, - * since the response may come later. - */ - return (B_FALSE); - } - /* Matched - install mp as the fastpath mp */ - ip1dbg(("ndp_fastpath_update: match\n")); - fp_mp = dupb(mp->b_cont); - if (fp_mp != NULL) { - nce->nce_fp_mp = fp_mp; + bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) == 0) { + nce_fp_marg->nce_fp_match_res = nce; + mutex_exit(&ncec->ncec_lock); + nce_refhold(nce); + return (1); /* MH_WALK_TERMINATE */ } - mutex_exit(&nce->nce_lock); - return (B_TRUE); + mutex_exit(&ncec->ncec_lock); + return (0); /* MH_WALK_CONTINUE */ } /* - * This function handles the DL_NOTE_FASTPATH_FLUSH notification from - * driver. Note that it assumes IP is exclusive... + * Update all NCE's that are not in fastpath mode and + * have an nce_fp_mp that matches mp. 
mp->b_cont contains + * the fastpath header. + * + * Returns TRUE if entry should be dequeued, or FALSE otherwise. */ -/* ARGSUSED */ void -ndp_fastpath_flush(nce_t *nce, char *arg) +nce_fastpath_update(ill_t *ill, mblk_t *mp) { - if (nce->nce_flags & NCE_F_MAPPING) - return; - /* No fastpath info? */ - if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL) + nce_fp_match_t nce_fp_marg; + nce_t *nce; + mblk_t *nce_fp_mp, *fp_mp; + + nce_fp_marg.nce_fp_match_res = NULL; + nce_fp_marg.nce_fp_match_ack_mp = mp; + + nce_walk(ill, nce_fastpath_match_dlur, &nce_fp_marg); + + if ((nce = nce_fp_marg.nce_fp_match_res) == NULL) return; - if (nce->nce_ipversion == IPV4_VERSION && - nce->nce_flags & NCE_F_BCAST) { - /* - * IPv4 BROADCAST entries: - * We can't delete the nce since it is difficult to - * recreate these without going through the - * ipif down/up dance. - * - * All access to nce->nce_fp_mp in the case of these - * is protected by nce_lock. - */ - mutex_enter(&nce->nce_lock); - if (nce->nce_fp_mp != NULL) { - freeb(nce->nce_fp_mp); - nce->nce_fp_mp = NULL; - mutex_exit(&nce->nce_lock); - nce_fastpath(nce); - } else { + mutex_enter(&nce->nce_lock); + nce_fp_mp = nce->nce_fp_mp; + + if (nce_fp_mp != NULL) { + fp_mp = mp->b_cont; + if (nce_fp_mp->b_rptr + MBLKL(fp_mp) > + nce_fp_mp->b_datap->db_lim) { mutex_exit(&nce->nce_lock); + nce = nce_delete_then_add(nce); + if (nce == NULL) { + return; + } + mutex_enter(&nce->nce_lock); + nce_fp_mp = nce->nce_fp_mp; } + } + + /* Matched - install mp as the fastpath mp */ + if (nce_fp_mp == NULL) { + fp_mp = dupb(mp->b_cont); + nce->nce_fp_mp = fp_mp; } else { - /* Just delete the NCE... */ - ndp_delete(nce); + fp_mp = mp->b_cont; + bcopy(fp_mp->b_rptr, nce_fp_mp->b_rptr, MBLKL(fp_mp)); + nce->nce_fp_mp->b_wptr = nce->nce_fp_mp->b_rptr + + MBLKL(fp_mp); } + mutex_exit(&nce->nce_lock); + nce_refrele(nce); } /* @@ -3451,74 +3372,103 @@ ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) } /* - * ndp_walk function. 
+ * ncec_walk function. * Free a fraction of the NCE cache entries. - * A fraction of zero means to not free any in that category. + * + * A possible optimization here would be to use ncec_last where possible, and + * delete the least-frequently used entry, which would require more complex + * computation as we walk through the ncec's (e.g., track ncec entries by + * order of ncec_last and/or maintain state) */ -void -ndp_cache_reclaim(nce_t *nce, char *arg) +static void +ncec_cache_reclaim(ncec_t *ncec, char *arg) { - nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg; - uint_t rand; + ip_stack_t *ipst = ncec->ncec_ipst; + uint_t fraction = *(uint_t *)arg; + uint_t rand; - if (nce->nce_flags & NCE_F_PERMANENT) + if ((ncec->ncec_flags & + (NCE_F_MYADDR | NCE_F_STATIC | NCE_F_BCAST)) != 0) { return; + } rand = (uint_t)lbolt + - NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE); - if (ncr->ncr_host != 0 && - (rand/ncr->ncr_host)*ncr->ncr_host == rand) { - ndp_delete(nce); - return; + NCE_ADDR_HASH_V6(ncec->ncec_addr, NCE_TABLE_SIZE); + if ((rand/fraction)*fraction == rand) { + IP_STAT(ipst, ip_nce_reclaim_deleted); + ncec_delete(ncec); } } /* - * ndp_walk function. - * Count the number of NCEs that can be deleted. - * These would be hosts but not routers. + * kmem_cache callback to free up memory. + * + * For now we just delete a fixed fraction. */ -void -ndp_cache_count(nce_t *nce, char *arg) +static void +ip_nce_reclaim_stack(ip_stack_t *ipst) { - ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg; + uint_t fraction = ipst->ips_ip_nce_reclaim_fraction; - if (nce->nce_flags & NCE_F_PERMANENT) - return; + IP_STAT(ipst, ip_nce_reclaim_calls); - ncc->ncc_total++; - if (!(nce->nce_flags & NCE_F_ISROUTER)) - ncc->ncc_host++; + ncec_walk(NULL, (pfi_t)ncec_cache_reclaim, (uchar_t *)&fraction, ipst); + + /* + * Walk all CONNs that can have a reference on an ire, ncec or dce. + * Get them to update any stale references to drop any refholds they + * have. 
+ */ + ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst); +} + +/* + * Called by the memory allocator subsystem directly, when the system + * is running low on memory. + */ +/* ARGSUSED */ +void +ip_nce_reclaim(void *args) +{ + netstack_handle_t nh; + netstack_t *ns; + + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ip_nce_reclaim_stack(ns->netstack_ip); + netstack_rele(ns); + } + netstack_next_fini(&nh); } #ifdef DEBUG void -nce_trace_ref(nce_t *nce) +ncec_trace_ref(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (nce->nce_trace_disable) + if (ncec->ncec_trace_disable) return; - if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) { - nce->nce_trace_disable = B_TRUE; - nce_trace_cleanup(nce); + if (!th_trace_ref(ncec, ncec->ncec_ipst)) { + ncec->ncec_trace_disable = B_TRUE; + ncec_trace_cleanup(ncec); } } void -nce_untrace_ref(nce_t *nce) +ncec_untrace_ref(ncec_t *ncec) { - ASSERT(MUTEX_HELD(&nce->nce_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); - if (!nce->nce_trace_disable) - th_trace_unref(nce); + if (!ncec->ncec_trace_disable) + th_trace_unref(ncec); } static void -nce_trace_cleanup(const nce_t *nce) +ncec_trace_cleanup(const ncec_t *ncec) { - th_trace_cleanup(nce, nce->nce_trace_disable); + th_trace_cleanup(ncec, ncec->ncec_trace_disable); } #endif @@ -3527,64 +3477,159 @@ nce_trace_cleanup(const nce_t *nce) * Send an ICMP unreachable in response to all queued packets. 
*/ void -arp_resolv_failed(nce_t *nce) +arp_resolv_failed(ncec_t *ncec) { - mblk_t *mp, *nxt_mp, *first_mp; + mblk_t *mp, *nxt_mp; char buf[INET6_ADDRSTRLEN]; - zoneid_t zoneid = GLOBAL_ZONEID; struct in_addr ipv4addr; - ip_stack_t *ipst = nce->nce_ill->ill_ipst; + ill_t *ill = ncec->ncec_ill; + ip_stack_t *ipst = ncec->ncec_ipst; + ip_recv_attr_t iras; - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr); + bzero(&iras, sizeof (iras)); + iras.ira_flags = IRAF_IS_IPV4; + /* + * we are setting the ira_rill to the ipmp_ill (instead of + * the actual ill on which the packet was received), but this + * is ok because we don't actually need the real ira_rill. + * to send the icmp unreachable to the sender. + */ + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ipv4addr); ip3dbg(("arp_resolv_failed: dst %s\n", inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); - mutex_enter(&nce->nce_lock); - mp = nce->nce_qd_mp; - nce->nce_qd_mp = NULL; - mutex_exit(&nce->nce_lock); - + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); while (mp != NULL) { nxt_mp = mp->b_next; mp->b_next = NULL; - mp->b_prev = NULL; - first_mp = mp; - /* - * Send icmp unreachable messages - * to the hosts. 
- */ - (void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst); - ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n")); - icmp_unreachable(nce->nce_ill->ill_wq, first_mp, - ICMP_HOST_UNREACHABLE, zoneid, ipst); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - address unreachable", + mp, ill); + if (ipst->ips_ip_arp_icmp_error) { + ip3dbg(("arp_resolv_failed: " + "Calling icmp_unreachable\n")); + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); + } else { + freemsg(mp); + } + ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE)); mp = nxt_mp; } + ncec_cb_dispatch(ncec); /* finish off waiting callbacks */ } +/* + * if ill is an under_ill, translate it to the ipmp_ill and add the + * nce on the ipmp_ill. Two nce_t entries (one on the ipmp_ill, and + * one on the underlying in_ill) will be created for the + * ncec_t in this case. The ncec_t itself will be created on the ipmp_ill. + */ int -ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, - nce_t **newnce, nce_t *src_nce) +nce_lookup_then_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { int err; - nce_t *nce; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; + nce_t *nce, *upper_nce = NULL; + ill_t *in_ill = ill, *under = NULL; + boolean_t need_ill_refrele = B_FALSE; + + if (flags & NCE_F_MCAST) { + /* + * hw_addr will be figured out in nce_set_multicast_v4; + * caller needs to pass in the cast_ill for ipmp + */ + ASSERT(hw_addr == NULL); + ASSERT(!IS_IPMP(ill)); + err = nce_set_multicast_v4(ill, addr, flags, newnce); + return (err); + } + + if (IS_UNDER_IPMP(ill) && !(flags & NCE_F_MYADDR)) { + ill = ipmp_ill_hold_ipmp_ill(ill); + if (ill == NULL) + return (ENXIO); + need_ill_refrele = B_TRUE; + } + if ((flags & NCE_F_BCAST) != 0) { + /* + * IPv4 broadcast ncec: compute the hwaddr. 
+ */ + if (IS_IPMP(ill)) { + under = ipmp_ill_get_xmit_ill(ill, B_FALSE); + if (under == NULL) { + if (need_ill_refrele) + ill_refrele(ill); + return (ENETDOWN); + } + hw_addr = under->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(under); + hw_addr_len = under->ill_phys_addr_length; + } else { + hw_addr = ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill), + hw_addr_len = ill->ill_phys_addr_length; + } + } mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); - /* - * NOTE: IPv4 never matches across the illgrp since the NCE's we're - * looking up have fastpath headers that are inherently per-ill. - */ - nce = nce_lookup_addr(ill, B_FALSE, &addr6, nce); + nce = nce_lookup_addr(ill, &addr6); if (nce == NULL) { - err = ndp_add_v4(ill, addr, flags, newnce, src_nce); + err = nce_add_v4(ill, hw_addr, hw_addr_len, addr, flags, + state, &nce); } else { - *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + if (err == 0) + err = nce_add_v4_postprocess(nce); + + if (in_ill != ill && nce != NULL) { + nce_t *under_nce; + + /* + * in_ill was the under_ill. Try to create the under_nce. + * Hold the ill_g_lock to prevent changes to group membership + * until we are done. 
+ */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(in_ill, ill)) { + under_nce = nce_fastpath_create(in_ill, + nce->nce_common); + upper_nce = nce; + if ((nce = under_nce) == NULL) + err = EINVAL; + } + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(nce->nce_common)) + nce_fastpath_trigger(under_nce); + } + if (nce != NULL) { + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + } + + if (under != NULL) + ill_refrele(under); + + if (upper_nce != NULL) + nce_refrele(upper_nce); + + if (need_ill_refrele) + ill_refrele(ill); + return (err); } @@ -3592,102 +3637,860 @@ ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, * NDP Cache Entry creation routine for IPv4. * Mapped entries are handled in arp. * This routine must always be called with ndp4->ndp_g_lock held. - * Prior to return, nce_refcnt is incremented. + * Prior to return, ncec_refcnt is incremented. + * + * IPMP notes: the ncec for non-local (i.e., !NCE_MYADDR(ncec) addresses + * are always added pointing at the ipmp_ill. Thus, when the ill passed + * to nce_add_v4 is an under_ill (i.e., IS_UNDER_IPMP(ill)) two nce_t + * entries will be created, both pointing at the same ncec_t. The nce_t + * entries will have their nce_ill set to the ipmp_ill and the under_ill + * respectively, with the ncec_t having its ncec_ill pointing at the ipmp_ill. + * Local addresses are always created on the ill passed to nce_add_v4. 
*/ -static int -ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, - nce_t **newnce, nce_t *src_nce) +int +nce_add_v4(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in_addr_t *addr, uint16_t flags, uint16_t state, nce_t **newnce) { - static nce_t nce_nil; - nce_t *nce; - mblk_t *mp; - mblk_t *template = NULL; - nce_t **ncep; - ip_stack_t *ipst = ill->ill_ipst; - uint16_t state = ND_INITIAL; int err; + boolean_t is_multicast = (flags & NCE_F_MCAST); + struct in6_addr addr6; + nce_t *nce; - ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock)); + ASSERT(MUTEX_HELD(&ill->ill_ipst->ips_ndp4->ndp_g_lock)); ASSERT(!ill->ill_isv6); - ASSERT((flags & NCE_F_MAPPING) == 0); + ASSERT(!IN_MULTICAST(htonl(*addr)) || is_multicast); + + IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); + err = nce_add_common(ill, hw_addr, hw_addr_len, &addr6, flags, state, + &nce); + ASSERT(newnce != NULL); + *newnce = nce; + return (err); +} + +/* + * Post-processing routine to be executed after nce_add_v4(). This function + * triggers fastpath (if appropriate) and DAD on the newly added nce entry + * and must be called without any locks held. + * + * Always returns 0, but we return an int to keep this symmetric with the + * IPv6 counter-part. + */ +int +nce_add_v4_postprocess(nce_t *nce) +{ + ncec_t *ncec = nce->nce_common; + uint16_t flags = ncec->ncec_flags; + boolean_t ndp_need_dad = B_FALSE; + boolean_t dropped; + clock_t delay; + ip_stack_t *ipst = ncec->ncec_ill->ill_ipst; + uchar_t *hw_addr = ncec->ncec_lladdr; + boolean_t trigger_fastpath = B_TRUE; - if (ill->ill_resolver_mp == NULL) - return (EINVAL); /* - * Allocate the mblk to hold the nce. + * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then + * we call nce_fastpath as soon as the ncec is resolved in nce_process. 
+ * We call nce_fastpath from nce_update if the link layer address of + * the peer changes from nce_update */ - mp = allocb(sizeof (nce_t), BPRI_MED); - if (mp == NULL) - return (ENOMEM); + if (NCE_PUBLISH(ncec) || !NCE_ISREACHABLE(ncec) || (hw_addr == NULL && + ncec->ncec_ill->ill_net_type != IRE_IF_NORESOLVER)) + trigger_fastpath = B_FALSE; - nce = (nce_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&nce[1]; - *nce = nce_nil; - nce->nce_ill = ill; - nce->nce_ipversion = IPV4_VERSION; - nce->nce_flags = flags; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - nce->nce_rcnt = ill->ill_xmit_count; - IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr); - nce->nce_mask = ipv6_all_ones; - nce->nce_extract_mask = ipv6_all_zeros; - nce->nce_ll_extract_start = 0; - nce->nce_qd_mp = NULL; - nce->nce_mp = mp; - /* This one is for nce getting created */ - nce->nce_refcnt = 1; - mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); - ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); + if (trigger_fastpath) + nce_fastpath_trigger(nce); + + if (NCE_PUBLISH(ncec) && ncec->ncec_state == ND_PROBE) { + /* + * Either the caller (by passing in ND_PROBE) + * or nce_add_common() (by the internally computed state + * based on ncec_addr and ill_net_type) has determined + * that this unicast entry needs DAD. Trigger DAD. + */ + ndp_need_dad = B_TRUE; + } else if (flags & NCE_F_UNSOL_ADV) { + /* + * We account for the transmit below by assigning one + * less than the ndd variable. Subsequent decrements + * are done in nce_timer. 
+ */ + mutex_enter(&ncec->ncec_lock); + ncec->ncec_unsolicit_count = + ipst->ips_ip_arp_publish_count - 1; + mutex_exit(&ncec->ncec_lock); + dropped = arp_announce(ncec); + mutex_enter(&ncec->ncec_lock); + if (dropped) + ncec->ncec_unsolicit_count++; + else + ncec->ncec_last_time_defended = ddi_get_lbolt(); + if (ncec->ncec_unsolicit_count != 0) { + nce_start_timer(ncec, + ipst->ips_ip_arp_publish_interval); + } + mutex_exit(&ncec->ncec_lock); + } - nce->nce_trace_disable = B_FALSE; + /* + * If ncec_xmit_interval is 0, user has configured us to send the first + * probe right away. Do so, and set up for the subsequent probes. + */ + if (ndp_need_dad) { + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_pcnt == 0) { + /* + * DAD probes and announce can be + * administratively disabled by setting the + * probe_count to zero. Restart the timer in + * this case to mark the ipif as ready. + */ + ncec->ncec_unsolicit_count = 0; + mutex_exit(&ncec->ncec_lock); + nce_restart_timer(ncec, 0); + } else { + mutex_exit(&ncec->ncec_lock); + delay = ((ncec->ncec_flags & NCE_F_FAST) ? + ipst->ips_arp_probe_delay : + ipst->ips_arp_fastprobe_delay); + nce_dad(ncec, NULL, (delay == 0 ? B_TRUE : B_FALSE)); + } + } + return (0); +} - if (src_nce != NULL) { +/* + * ncec_walk routine to update all entries that have a given destination or + * gateway address and cached link layer (MAC) address. This is used when ARP + * informs us that a network-to-link-layer mapping may have changed. 
+ */ +void +nce_update_hw_changed(ncec_t *ncec, void *arg) +{ + nce_hw_map_t *hwm = arg; + ipaddr_t ncec_addr; + + if (ncec->ncec_state != ND_REACHABLE) + return; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ncec_addr); + if (ncec_addr != hwm->hwm_addr) + return; + + mutex_enter(&ncec->ncec_lock); + if (hwm->hwm_flags != 0) + ncec->ncec_flags = hwm->hwm_flags; + nce_update(ncec, ND_STALE, hwm->hwm_hwaddr); + mutex_exit(&ncec->ncec_lock); +} + +void +ncec_refhold(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + (ncec)->ncec_refcnt++; + ASSERT((ncec)->ncec_refcnt != 0); +#ifdef DEBUG + ncec_trace_ref(ncec); +#endif + mutex_exit(&(ncec)->ncec_lock); +} + +void +ncec_refhold_notr(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + (ncec)->ncec_refcnt++; + ASSERT((ncec)->ncec_refcnt != 0); + mutex_exit(&(ncec)->ncec_lock); +} + +static void +ncec_refhold_locked(ncec_t *ncec) +{ + ASSERT(MUTEX_HELD(&(ncec)->ncec_lock)); + (ncec)->ncec_refcnt++; +#ifdef DEBUG + ncec_trace_ref(ncec); +#endif +} + +/* ncec_inactive destroys the mutex thus no mutex_exit is needed */ +void +ncec_refrele(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); +#ifdef DEBUG + ncec_untrace_ref(ncec); +#endif + ASSERT((ncec)->ncec_refcnt != 0); + if (--(ncec)->ncec_refcnt == 0) { + ncec_inactive(ncec); + } else { + mutex_exit(&(ncec)->ncec_lock); + } +} + +void +ncec_refrele_notr(ncec_t *ncec) +{ + mutex_enter(&(ncec)->ncec_lock); + ASSERT((ncec)->ncec_refcnt != 0); + if (--(ncec)->ncec_refcnt == 0) { + ncec_inactive(ncec); + } else { + mutex_exit(&(ncec)->ncec_lock); + } +} + +/* + * Common to IPv4 and IPv6. 
+ */ +void +nce_restart_timer(ncec_t *ncec, uint_t ms) +{ + timeout_id_t tid; + + ASSERT(!MUTEX_HELD(&(ncec)->ncec_lock)); + + /* First cancel any running timer */ + mutex_enter(&ncec->ncec_lock); + tid = ncec->ncec_timeout_id; + ncec->ncec_timeout_id = 0; + if (tid != 0) { + mutex_exit(&ncec->ncec_lock); + (void) untimeout(tid); + mutex_enter(&ncec->ncec_lock); + } + + /* Restart timer */ + nce_start_timer(ncec, ms); + mutex_exit(&ncec->ncec_lock); +} + +static void +nce_start_timer(ncec_t *ncec, uint_t ms) +{ + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + /* + * Don't start the timer if the ncec has been deleted, or if the timer + * is already running + */ + if (!NCE_ISCONDEMNED(ncec) && ncec->ncec_timeout_id == 0) { + ncec->ncec_timeout_id = timeout(nce_timer, ncec, + MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); + } +} + +int +nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst, + uint16_t flags, nce_t **newnce) +{ + uchar_t *hw_addr; + int err = 0; + ip_stack_t *ipst = ill->ill_ipst; + in6_addr_t dst6; + nce_t *nce; + + ASSERT(!ill->ill_isv6); + + IN6_IPADDR_TO_V4MAPPED(*dst, &dst6); + mutex_enter(&ipst->ips_ndp4->ndp_g_lock); + if ((nce = nce_lookup_addr(ill, &dst6)) != NULL) { + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + goto done; + } + if (ill->ill_net_type == IRE_IF_RESOLVER) { + /* + * For IRE_IF_RESOLVER a hardware mapping can be + * generated, for IRE_IF_NORESOLVER, resolution cookie + * in the ill is copied in nce_add_v4(). + */ + hw_addr = kmem_alloc(ill->ill_phys_addr_length, KM_NOSLEEP); + if (hw_addr == NULL) { + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + return (ENOMEM); + } + ip_mcast_mapping(ill, (uchar_t *)dst, hw_addr); + } else { /* - * src_nce has been provided by the caller. 
The only - * caller who provides a non-null, non-broadcast - * src_nce is from ip_newroute() which must pass in - * a ND_REACHABLE src_nce (this condition is verified - * via an ASSERT for the save_ire->ire_nce in ip_newroute()) + * IRE_IF_NORESOLVER type simply copies the resolution + * cookie passed in. So no hw_addr is needed. */ - mutex_enter(&src_nce->nce_lock); - state = src_nce->nce_state; - if ((src_nce->nce_flags & NCE_F_CONDEMNED) || - (ipst->ips_ndp4->ndp_g_hw_change > 0)) { + hw_addr = NULL; + } + ASSERT(flags & NCE_F_MCAST); + ASSERT(flags & NCE_F_NONUD); + /* nce_state will be computed by nce_add_common() */ + err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags, + ND_UNCHANGED, &nce); + mutex_exit(&ipst->ips_ndp4->ndp_g_lock); + if (err == 0) + err = nce_add_v4_postprocess(nce); + if (hw_addr != NULL) + kmem_free(hw_addr, ill->ill_phys_addr_length); + if (err != 0) { + ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err)); + return (err); + } +done: + if (newnce != NULL) + *newnce = nce; + else + nce_refrele(nce); + return (0); +} + +/* + * This is used when scanning for "old" (least recently broadcast) NCEs. We + * don't want to have to walk the list for every single one, so we gather up + * batches at a time. + */ +#define NCE_RESCHED_LIST_LEN 8 + +typedef struct { + ill_t *ncert_ill; + uint_t ncert_num; + ncec_t *ncert_nces[NCE_RESCHED_LIST_LEN]; +} nce_resched_t; + +/* + * Pick the longest waiting NCEs for defense. + */ +/* ARGSUSED */ +static int +ncec_reschedule(ill_t *ill, nce_t *nce, void *arg) +{ + nce_resched_t *ncert = arg; + ncec_t **ncecs; + ncec_t **ncec_max; + ncec_t *ncec_temp; + ncec_t *ncec = nce->nce_common; + + ASSERT(ncec->ncec_ill == ncert->ncert_ill); + /* + * Only reachable entries that are ready for announcement are eligible. 
+ */ + if (!NCE_MYADDR(ncec) || ncec->ncec_state != ND_REACHABLE) + return (0); + if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) { + ncec_refhold(ncec); + ncert->ncert_nces[ncert->ncert_num++] = ncec; + } else { + ncecs = ncert->ncert_nces; + ncec_max = ncecs + NCE_RESCHED_LIST_LEN; + ncec_refhold(ncec); + for (; ncecs < ncec_max; ncecs++) { + ASSERT(ncec != NULL); + if ((*ncecs)->ncec_last_time_defended > + ncec->ncec_last_time_defended) { + ncec_temp = *ncecs; + *ncecs = ncec; + ncec = ncec_temp; + } + } + ncec_refrele(ncec); + } + return (0); +} + +/* + * Reschedule the ARP defense of any long-waiting NCEs. It's assumed that this + * doesn't happen very often (if at all), and thus it needn't be highly + * optimized. (Note, though, that it's actually O(N) complexity, because the + * outer loop is bounded by a constant rather than by the length of the list.) + */ +static void +nce_ill_reschedule(ill_t *ill, nce_resched_t *ncert) +{ + ncec_t *ncec; + ip_stack_t *ipst = ill->ill_ipst; + uint_t i, defend_rate; + + i = ill->ill_defend_count; + ill->ill_defend_count = 0; + if (ill->ill_isv6) + defend_rate = ipst->ips_ndp_defend_rate; + else + defend_rate = ipst->ips_arp_defend_rate; + /* If none could be sitting around, then don't reschedule */ + if (i < defend_rate) { + DTRACE_PROBE1(reschedule_none, ill_t *, ill); + return; + } + ncert->ncert_ill = ill; + while (ill->ill_defend_count < defend_rate) { + nce_walk_common(ill, ncec_reschedule, ncert); + for (i = 0; i < ncert->ncert_num; i++) { + + ncec = ncert->ncert_nces[i]; + mutex_enter(&ncec->ncec_lock); + ncec->ncec_flags |= NCE_F_DELAYED; + mutex_exit(&ncec->ncec_lock); /* - * src_nce has been deleted, or - * ip_arp_news is in the middle of - * flushing entries in the the nce. - * Fail the add, since we don't know - * if it is safe to copy the contents of - * src_nce + * we plan to schedule this ncec, so incr the + * defend_count in anticipation. 
*/ - DTRACE_PROBE2(nce__bad__src__nce, - nce_t *, src_nce, ill_t *, ill); - mutex_exit(&src_nce->nce_lock); - err = EINVAL; - goto err_ret; + if (++ill->ill_defend_count >= defend_rate) + break; } - template = copyb(src_nce->nce_res_mp); - mutex_exit(&src_nce->nce_lock); - if (template == NULL) { - err = ENOMEM; - goto err_ret; + if (ncert->ncert_num < NCE_RESCHED_LIST_LEN) + break; + } +} + +/* + * Check if the current rate-limiting parameters permit the sending + * of another address defense announcement for both IPv4 and IPv6. + * Returns B_TRUE if rate-limiting is in effect (i.e., send is not + * permitted), and B_FALSE otherwise. The `defend_rate' parameter + * determines how many address defense announcements are permitted + * in any `defense_perio' interval. + */ +static boolean_t +ill_defend_rate_limit(ill_t *ill, ncec_t *ncec) +{ + clock_t now = ddi_get_lbolt(); + ip_stack_t *ipst = ill->ill_ipst; + clock_t start = ill->ill_defend_start; + uint32_t elapsed, defend_period, defend_rate; + nce_resched_t ncert; + boolean_t ret; + int i; + + if (ill->ill_isv6) { + defend_period = ipst->ips_ndp_defend_period; + defend_rate = ipst->ips_ndp_defend_rate; + } else { + defend_period = ipst->ips_arp_defend_period; + defend_rate = ipst->ips_arp_defend_rate; + } + if (defend_rate == 0) + return (B_TRUE); + bzero(&ncert, sizeof (ncert)); + mutex_enter(&ill->ill_lock); + if (start > 0) { + elapsed = now - start; + if (elapsed > SEC_TO_TICK(defend_period)) { + ill->ill_defend_start = now; + /* + * nce_ill_reschedule will attempt to + * prevent starvation by reschduling the + * oldest entries, which are marked with + * the NCE_F_DELAYED flag. + */ + nce_ill_reschedule(ill, &ncert); + } + } else { + ill->ill_defend_start = now; + } + ASSERT(ill->ill_defend_count <= defend_rate); + mutex_enter(&ncec->ncec_lock); + if (ncec->ncec_flags & NCE_F_DELAYED) { + /* + * This ncec was rescheduled as one of the really old + * entries needing on-going defense. 
The + * ill_defend_count was already incremented in + * nce_ill_reschedule. Go ahead and send the announce. + */ + ncec->ncec_flags &= ~NCE_F_DELAYED; + mutex_exit(&ncec->ncec_lock); + ret = B_FALSE; + goto done; + } + mutex_exit(&ncec->ncec_lock); + if (ill->ill_defend_count < defend_rate) + ill->ill_defend_count++; + if (ill->ill_defend_count == defend_rate) { + /* + * we are no longer allowed to send unbidden defense + * messages. Wait for rescheduling. + */ + ret = B_TRUE; + } else { + ret = B_FALSE; + } +done: + mutex_exit(&ill->ill_lock); + /* + * After all the locks have been dropped we can restart nce timer, + * and refrele the delayed ncecs + */ + for (i = 0; i < ncert.ncert_num; i++) { + clock_t xmit_interval; + ncec_t *tmp; + + tmp = ncert.ncert_nces[i]; + xmit_interval = nce_fuzz_interval(tmp->ncec_xmit_interval, + B_FALSE); + nce_restart_timer(tmp, xmit_interval); + ncec_refrele(tmp); + } + return (ret); +} + +boolean_t +ndp_announce(ncec_t *ncec) +{ + return (ndp_xmit(ncec->ncec_ill, ND_NEIGHBOR_ADVERT, ncec->ncec_lladdr, + ncec->ncec_lladdr_length, &ncec->ncec_addr, &ipv6_all_hosts_mcast, + nce_advert_flags(ncec))); +} + +ill_t * +nce_resolve_src(ncec_t *ncec, in6_addr_t *src) +{ + mblk_t *mp; + in6_addr_t src6; + ipaddr_t src4; + ill_t *ill = ncec->ncec_ill; + ill_t *src_ill = NULL; + ipif_t *ipif = NULL; + boolean_t is_myaddr = NCE_MYADDR(ncec); + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + + ASSERT(src != NULL); + ASSERT(IN6_IS_ADDR_UNSPECIFIED(src)); + src6 = *src; + if (is_myaddr) { + src6 = ncec->ncec_addr; + if (!isv6) + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, src4); + } else { + /* + * try to find one from the outgoing packet. 
+ */ + mutex_enter(&ncec->ncec_lock); + mp = ncec->ncec_qd_mp; + if (mp != NULL) { + if (isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + src6 = ip6h->ip6_src; + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + src4 = ipha->ipha_src; + IN6_IPADDR_TO_V4MAPPED(src4, &src6); + } + } + mutex_exit(&ncec->ncec_lock); + } + + /* + * For outgoing packets, if the src of outgoing packet is one + * of the assigned interface addresses use it, otherwise we + * will pick the source address below. + * For local addresses (is_myaddr) doing DAD, NDP announce + * messages are mcast. So we use the (IPMP) cast_ill or the + * (non-IPMP) ncec_ill for these message types. The only case + * of unicast DAD messages are for IPv6 ND probes, for which + * we find the ipif_bound_ill corresponding to the ncec_addr. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&src6) || is_myaddr) { + if (isv6) { + ipif = ipif_lookup_addr_nondup_v6(&src6, ill, ALL_ZONES, + ill->ill_ipst); + } else { + ipif = ipif_lookup_addr_nondup(src4, ill, ALL_ZONES, + ill->ill_ipst); + } + + /* + * If no relevant ipif can be found, then it's not one of our + * addresses. Reset to :: and try to find a src for the NS or + * ARP request using ipif_select_source_v[4,6] below. + * If an ipif can be found, but it's not yet done with + * DAD verification, and we are not being invoked for + * DAD (i.e., !is_myaddr), then just postpone this + * transmission until later. + */ + if (ipif == NULL) { + src6 = ipv6_all_zeros; + src4 = INADDR_ANY; + } else if (!ipif->ipif_addr_ready && !is_myaddr) { + DTRACE_PROBE2(nce__resolve__ipif__not__ready, + ncec_t *, ncec, ipif_t *, ipif); + ipif_refrele(ipif); + return (NULL); } - } else if (flags & NCE_F_BCAST) { + } + + if (IN6_IS_ADDR_UNSPECIFIED(&src6) && !is_myaddr) { /* - * broadcast nce. + * Pick a source address for this solicitation, but + * restrict the selection to addresses assigned to the + * output interface. 
We do this because the destination will + * create a neighbor cache entry for the source address of + * this packet, so the source address had better be a valid + * neighbor. */ - template = copyb(ill->ill_bcast_mp); + if (isv6) { + ipif = ipif_select_source_v6(ill, &ncec->ncec_addr, + B_TRUE, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, + B_FALSE, NULL); + } else { + ipaddr_t nce_addr; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, nce_addr); + ipif = ipif_select_source_v4(ill, nce_addr, ALL_ZONES, + B_FALSE, NULL); + } + if (ipif == NULL && IS_IPMP(ill)) { + ill_t *send_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE); + + if (send_ill != NULL) { + if (isv6) { + ipif = ipif_select_source_v6(send_ill, + &ncec->ncec_addr, B_TRUE, + IPV6_PREFER_SRC_DEFAULT, ALL_ZONES, + B_FALSE, NULL); + } else { + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, + src4); + ipif = ipif_select_source_v4(send_ill, + src4, ALL_ZONES, B_TRUE, NULL); + } + ill_refrele(send_ill); + } + } + + if (ipif == NULL) { + char buf[INET6_ADDRSTRLEN]; + + ip1dbg(("nce_resolve_src: No source ipif for dst %s\n", + inet_ntop((isv6 ? AF_INET6 : AF_INET), + (char *)&ncec->ncec_addr, buf, sizeof (buf)))); + DTRACE_PROBE1(nce__resolve__no__ipif, ncec_t *, ncec); + return (NULL); + } + src6 = ipif->ipif_v6lcl_addr; + } + *src = src6; + if (ipif != NULL) { + src_ill = ipif->ipif_ill; + if (IS_IPMP(src_ill)) + src_ill = ipmp_ipif_hold_bound_ill(ipif); + else + ill_refhold(src_ill); + ipif_refrele(ipif); + DTRACE_PROBE2(nce__resolve__src__ill, ncec_t *, ncec, + ill_t *, src_ill); + } + return (src_ill); +} + +void +ip_nce_lookup_and_update(ipaddr_t *addr, ipif_t *ipif, ip_stack_t *ipst, + uchar_t *hwaddr, int hwaddr_len, int flags) +{ + ill_t *ill; + ncec_t *ncec; + nce_t *nce; + uint16_t new_state; + + ill = (ipif ? 
ipif->ipif_ill : NULL); + if (ill != NULL) { + /* + * only one ncec is possible + */ + nce = nce_lookup_v4(ill, addr); + if (nce != NULL) { + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + if (NCE_ISREACHABLE(ncec)) + new_state = ND_UNCHANGED; + else + new_state = ND_STALE; + ncec->ncec_flags = flags; + nce_update(ncec, new_state, hwaddr); + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); + return; + } + } else { + /* + * ill is wildcard; clean up all ncec's and ire's + * that match on addr. + */ + nce_hw_map_t hwm; + + hwm.hwm_addr = *addr; + hwm.hwm_hwlen = hwaddr_len; + hwm.hwm_hwaddr = hwaddr; + hwm.hwm_flags = flags; + + ncec_walk_common(ipst->ips_ndp4, NULL, + (pfi_t)nce_update_hw_changed, (uchar_t *)&hwm, B_TRUE); + } +} + +/* + * Common function to add ncec entries. + * we always add the ncec with ncec_ill == ill, and always create + * nce_t on ncec_ill. A dlpi fastpath message may be triggered if the + * ncec is !reachable. + * + * When the caller passes in an nce_state of ND_UNCHANGED, + * nce_add_common() will determine the state of the created nce based + * on the ill_net_type and nce_flags used. Otherwise, the nce will + * be created with state set to the passed in nce_state. 
+ */ +static int +nce_add_common(ill_t *ill, uchar_t *hw_addr, uint_t hw_addr_len, + const in6_addr_t *addr, uint16_t flags, uint16_t nce_state, nce_t **retnce) +{ + static ncec_t nce_nil; + uchar_t *template = NULL; + int err; + ncec_t *ncec; + ncec_t **ncep; + ip_stack_t *ipst = ill->ill_ipst; + uint16_t state; + boolean_t fastprobe = B_FALSE; + struct ndp_g_s *ndp; + nce_t *nce = NULL; + mblk_t *dlur_mp = NULL; + + if (ill->ill_isv6) + ndp = ill->ill_ipst->ips_ndp6; + else + ndp = ill->ill_ipst->ips_ndp4; + + *retnce = NULL; + + ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); + + if (IN6_IS_ADDR_UNSPECIFIED(addr)) { + ip0dbg(("nce_add_common: no addr\n")); + return (EINVAL); + } + if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { + ip0dbg(("nce_add_common: flags = %x\n", (int)flags)); + return (EINVAL); + } + + if (ill->ill_isv6) { + ncep = ((ncec_t **)NCE_HASH_PTR_V6(ipst, *addr)); + } else { + ipaddr_t v4addr; + + IN6_V4MAPPED_TO_IPADDR(addr, v4addr); + ncep = ((ncec_t **)NCE_HASH_PTR_V4(ipst, v4addr)); + } + + /* + * The caller has ensured that there is no nce on ill, but there could + * still be an nce_common_t for the address, so that we find exisiting + * ncec_t strucutures first, and atomically add a new nce_t if + * one is found. The ndp_g_lock ensures that we don't cross threads + * with an ncec_delete(). Unlike ncec_lookup_illgrp() we do not + * compare for matches across the illgrp because this function is + * called via nce_lookup_then_add_v* -> nce_add_v* -> nce_add_common, + * with the nce_lookup_then_add_v* passing in the ipmp_ill where + * appropriate. + */ + ncec = *ncep; + for (; ncec != NULL; ncec = ncec->ncec_next) { + if (ncec->ncec_ill == ill) { + if (IN6_ARE_ADDR_EQUAL(&ncec->ncec_addr, addr)) { + *retnce = nce_ill_lookup_then_add(ill, ncec); + if (*retnce != NULL) + break; + } + } + } + if (*retnce != NULL) { + /* + * We should never find *retnce to be MYADDR, since the caller + * may then incorrectly restart a DAD timer that's already + * running. 
+ */ + ASSERT(!NCE_MYADDR(ncec)); + /* caller must trigger fastpath on nce */ + return (0); + } + ncec = kmem_cache_alloc(ncec_cache, KM_NOSLEEP); + if (ncec == NULL) + return (ENOMEM); + *ncec = nce_nil; + ncec->ncec_ill = ill; + ncec->ncec_ipversion = (ill->ill_isv6 ? IPV6_VERSION : IPV4_VERSION); + ncec->ncec_flags = flags; + ncec->ncec_ipst = ipst; /* No netstack_hold */ + + if (!ill->ill_isv6) { + ipaddr_t addr4; + + /* + * DAD probe interval and probe count are set based on + * fast/slow probe settings. If the underlying link doesn't + * have reliably up/down notifications or if we're working + * with IPv4 169.254.0.0/16 Link Local Address space, then + * don't use the fast timers. Otherwise, use them. + */ + ASSERT(IN6_IS_ADDR_V4MAPPED(addr)); + IN6_V4MAPPED_TO_IPADDR(addr, addr4); + if (ill->ill_note_link && !IS_IPV4_LL_SPACE(&addr4)) + fastprobe = B_TRUE; + if (fastprobe) { + ncec->ncec_xmit_interval = + ipst->ips_arp_fastprobe_interval; + ncec->ncec_pcnt = + ipst->ips_arp_fastprobe_count; + ncec->ncec_flags |= NCE_F_FAST; + } else { + ncec->ncec_xmit_interval = + ipst->ips_arp_probe_interval; + ncec->ncec_pcnt = + ipst->ips_arp_probe_count; + } + if (NCE_PUBLISH(ncec)) { + ncec->ncec_unsolicit_count = + ipst->ips_ip_arp_publish_count; + } + } else { + /* + * probe interval is constant: ILL_PROBE_INTERVAL + * probe count is constant: ND_MAX_UNICAST_SOLICIT + */ + ncec->ncec_pcnt = ND_MAX_UNICAST_SOLICIT; + if (NCE_PUBLISH(ncec)) { + ncec->ncec_unsolicit_count = + ipst->ips_ip_ndp_unsolicit_count; + } + } + ncec->ncec_rcnt = ill->ill_xmit_count; + ncec->ncec_addr = *addr; + ncec->ncec_qd_mp = NULL; + ncec->ncec_refcnt = 1; /* for ncec getting created */ + mutex_init(&ncec->ncec_lock, NULL, MUTEX_DEFAULT, NULL); + ncec->ncec_trace_disable = B_FALSE; + + /* + * ncec_lladdr holds link layer address + */ + if (hw_addr_len > 0) { + template = kmem_alloc(hw_addr_len, KM_NOSLEEP); if (template == NULL) { err = ENOMEM; goto err_ret; } + ncec->ncec_lladdr = 
template; + ncec->ncec_lladdr_length = hw_addr_len; + bzero(ncec->ncec_lladdr, hw_addr_len); + } + if ((flags & NCE_F_BCAST) != 0) { state = ND_REACHABLE; + ASSERT(hw_addr_len > 0); + } else if (ill->ill_net_type == IRE_IF_RESOLVER) { + state = ND_INITIAL; } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { /* * NORESOLVER entries are always created in the REACHABLE * state. */ + state = ND_REACHABLE; if (ill->ill_phys_addr_length == IP_ADDR_LEN && ill->ill_mactype != DL_IPV4 && ill->ill_mactype != DL_6TO4) { @@ -3698,32 +4501,91 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, * that do their own resolution from IP to link-layer * address (e.g. IP over X.25). */ - template = ill_dlur_gen((uchar_t *)addr, - ill->ill_phys_addr_length, - ill->ill_sap, ill->ill_sap_length); - } else { - template = copyb(ill->ill_resolver_mp); + bcopy((uchar_t *)addr, + ncec->ncec_lladdr, ill->ill_phys_addr_length); } - if (template == NULL) { - err = ENOMEM; - goto err_ret; + if (ill->ill_phys_addr_length == IPV6_ADDR_LEN && + ill->ill_mactype != DL_IPV6) { + /* + * We create a nce_res_mp with the IP nexthop address + * as the destination address if the physical legnth + * is exactly 16 bytes for point-to-multipoint links + * that do their own resolution from IP to link-layer + * address. + */ + bcopy((uchar_t *)addr, + ncec->ncec_lladdr, ill->ill_phys_addr_length); } + /* + * Since NUD is not part of the base IPv4 protocol definition, + * IPv4 neighbor entries on NORESOLVER interfaces will never + * age, and are marked NCE_F_NONUD. + */ + if (!ill->ill_isv6) + ncec->ncec_flags |= NCE_F_NONUD; + } else if (ill->ill_net_type == IRE_LOOPBACK) { state = ND_REACHABLE; } - nce->nce_fp_mp = NULL; - nce->nce_res_mp = template; - nce->nce_state = state; + + if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) { + /* + * We are adding an ncec with a deterministic hw_addr, + * so the state can only be one of {REACHABLE, STALE, PROBE}. 
+ * + * if we are adding a unicast ncec for the local address + * it would be REACHABLE; we would be adding a ND_STALE entry + * for the requestor of an ARP_REQUEST/ND_SOLICIT. Our own + * addresses are added in PROBE to trigger DAD. + */ + if ((flags & (NCE_F_MCAST|NCE_F_BCAST)) || + ill->ill_net_type == IRE_IF_NORESOLVER) + state = ND_REACHABLE; + else if (!NCE_PUBLISH(ncec)) + state = ND_STALE; + else + state = ND_PROBE; + if (hw_addr != NULL) + nce_set_ll(ncec, hw_addr); + } + /* caller overrides internally computed state */ + if (nce_state != ND_UNCHANGED) + state = nce_state; + + if (state == ND_PROBE) + ncec->ncec_flags |= NCE_F_UNVERIFIED; + + ncec->ncec_state = state; + if (state == ND_REACHABLE) { - nce->nce_last = TICK_TO_MSEC(lbolt64); - nce->nce_init_time = TICK_TO_MSEC(lbolt64); + ncec->ncec_last = TICK_TO_MSEC(lbolt64); + ncec->ncec_init_time = TICK_TO_MSEC(lbolt64); } else { - nce->nce_last = 0; + ncec->ncec_last = 0; if (state == ND_INITIAL) - nce->nce_init_time = TICK_TO_MSEC(lbolt64); + ncec->ncec_init_time = TICK_TO_MSEC(lbolt64); + } + list_create(&ncec->ncec_cb, sizeof (ncec_cb_t), + offsetof(ncec_cb_t, ncec_cb_node)); + /* + * have all the memory allocations out of the way before taking locks + * and adding the nce. + */ + nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); + if (nce == NULL) { + err = ENOMEM; + goto err_ret; + } + if (ncec->ncec_lladdr != NULL || + ill->ill_net_type == IRE_IF_NORESOLVER) { + dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, + ill->ill_phys_addr_length, ill->ill_sap, + ill->ill_sap_length); + if (dlur_mp == NULL) { + err = ENOMEM; + goto err_ret; + } } - ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) || - (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE)); /* * Atomically ensure that the ill is not CONDEMNED, before * adding the NCE. 
@@ -3734,128 +4596,423 @@ ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, err = EINVAL; goto err_ret; } - if ((nce->nce_next = *ncep) != NULL) - nce->nce_next->nce_ptpn = &nce->nce_next; - *ncep = nce; - nce->nce_ptpn = ncep; - *newnce = nce; - /* This one is for nce being used by an active thread */ - NCE_REFHOLD(*newnce); + if (!NCE_MYADDR(ncec) && + (ill->ill_state_flags & ILL_DOWN_IN_PROGRESS)) { + mutex_exit(&ill->ill_lock); + DTRACE_PROBE1(nce__add__on__down__ill, ncec_t *, ncec); + err = EINVAL; + goto err_ret; + } + /* + * Acquire the ncec_lock even before adding the ncec to the list + * so that it cannot get deleted after the ncec is added, but + * before we add the nce. + */ + mutex_enter(&ncec->ncec_lock); + if ((ncec->ncec_next = *ncep) != NULL) + ncec->ncec_next->ncec_ptpn = &ncec->ncec_next; + *ncep = ncec; + ncec->ncec_ptpn = ncep; - /* Bump up the number of nce's referencing this ill */ + /* Bump up the number of ncec's referencing this ill */ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, - (char *), "nce", (void *), nce); - ill->ill_nce_cnt++; + (char *), "ncec", (void *), ncec); + ill->ill_ncec_cnt++; + /* + * Since we hold the ncec_lock at this time, the ncec cannot be + * condemned, and we can safely add the nce. + */ + *retnce = nce_add_impl(ill, ncec, nce, dlur_mp); + mutex_exit(&ncec->ncec_lock); mutex_exit(&ill->ill_lock); - DTRACE_PROBE1(ndp__add__v4, nce_t *, nce); + + /* caller must trigger fastpath on *retnce */ return (0); + err_ret: - freeb(mp); - freemsg(template); + if (ncec != NULL) + kmem_cache_free(ncec_cache, ncec); + if (nce != NULL) + kmem_cache_free(nce_cache, nce); + freemsg(dlur_mp); + if (template != NULL) + kmem_free(template, ill->ill_phys_addr_length); return (err); } /* - * ndp_walk routine to delete all entries that have a given destination or - * gateway address and cached link layer (MAC) address. This is used when ARP - * informs us that a network-to-link-layer mapping may have changed. 
+ * take a ref on the nce */ void -nce_delete_hw_changed(nce_t *nce, void *arg) +nce_refhold(nce_t *nce) { - nce_hw_map_t *hwm = arg; - mblk_t *mp; - dl_unitdata_req_t *dlu; - uchar_t *macaddr; - ill_t *ill; - int saplen; - ipaddr_t nce_addr; + mutex_enter(&nce->nce_lock); + nce->nce_refcnt++; + ASSERT((nce)->nce_refcnt != 0); + mutex_exit(&nce->nce_lock); +} - if (nce->nce_state != ND_REACHABLE) - return; +/* + * release a ref on the nce; In general, this + * cannot be called with locks held because nce_inactive + * may result in nce_inactive which will take the ill_lock, + * do ipif_ill_refrele_tail etc. Thus the one exception + * where this can be called with locks held is when the caller + * is certain that the nce_refcnt is sufficient to prevent + * the invocation of nce_inactive. + */ +void +nce_refrele(nce_t *nce) +{ + ASSERT((nce)->nce_refcnt != 0); + mutex_enter(&nce->nce_lock); + if (--nce->nce_refcnt == 0) + nce_inactive(nce); /* destroys the mutex */ + else + mutex_exit(&nce->nce_lock); +} - IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); - if (nce_addr != hwm->hwm_addr) - return; +/* + * free the nce after all refs have gone away. + */ +static void +nce_inactive(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + + ASSERT(nce->nce_refcnt == 0); + + ncec_refrele_notr(nce->nce_common); + nce->nce_common = NULL; + freemsg(nce->nce_fp_mp); + freemsg(nce->nce_dlur_mp); + + mutex_enter(&ill->ill_lock); + DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, + (char *), "nce", (void *), nce); + ill->ill_nce_cnt--; + nce->nce_ill = NULL; + /* + * If the number of ncec's associated with this ill have dropped + * to zero, check whether we need to restart any operation that + * is waiting for this to happen. + */ + if (ILL_DOWN_OK(ill)) { + /* ipif_ill_refrele_tail drops the ill_lock */ + ipif_ill_refrele_tail(ill); + } else { + mutex_exit(&ill->ill_lock); + } + + mutex_destroy(&nce->nce_lock); + kmem_cache_free(nce_cache, nce); +} + +/* + * Add an nce to the ill_nce list. 
+ */ +static nce_t * +nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp) +{ + bzero(nce, sizeof (*nce)); + mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); + nce->nce_common = ncec; + nce->nce_addr = ncec->ncec_addr; + nce->nce_ill = ill; + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, + (char *), "nce", (void *), nce); + ill->ill_nce_cnt++; + + nce->nce_refcnt = 1; /* for the thread */ + ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */ + nce->nce_dlur_mp = dlur_mp; + + /* add nce to the ill's fastpath list. */ + nce->nce_refcnt++; /* for the list */ + list_insert_head(&ill->ill_nce, nce); + return (nce); +} + +static nce_t * +nce_add(ill_t *ill, ncec_t *ncec) +{ + nce_t *nce; + mblk_t *dlur_mp = NULL; + + ASSERT(MUTEX_HELD(&ill->ill_lock)); + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + + nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP); + if (nce == NULL) + return (NULL); + if (ncec->ncec_lladdr != NULL || + ill->ill_net_type == IRE_IF_NORESOLVER) { + dlur_mp = ill_dlur_gen(ncec->ncec_lladdr, + ill->ill_phys_addr_length, ill->ill_sap, + ill->ill_sap_length); + if (dlur_mp == NULL) { + kmem_cache_free(nce_cache, nce); + return (NULL); + } + } + return (nce_add_impl(ill, ncec, nce, dlur_mp)); +} + +/* + * remove the nce from the ill_faspath list + */ +void +nce_delete(nce_t *nce) +{ + ill_t *ill = nce->nce_ill; + + ASSERT(MUTEX_HELD(&ill->ill_lock)); mutex_enter(&nce->nce_lock); - if ((mp = nce->nce_res_mp) == NULL) { + if (nce->nce_is_condemned) { + /* + * some other thread has removed this nce from the ill_nce list + */ mutex_exit(&nce->nce_lock); return; } - dlu = (dl_unitdata_req_t *)mp->b_rptr; - macaddr = (uchar_t *)(dlu + 1); - ill = nce->nce_ill; - if ((saplen = ill->ill_sap_length) > 0) - macaddr += saplen; - else - saplen = -saplen; + nce->nce_is_condemned = B_TRUE; + mutex_exit(&nce->nce_lock); + list_remove(&ill->ill_nce, nce); /* - * If the hardware address is unchanged, then leave this one alone. 
- * Note that saplen == abs(saplen) now. + * even though we are holding the ill_lock, it is ok to + * call nce_refrele here because we know that we should have + * at least 2 refs on the nce: one for the thread, and one + * for the list. The refrele below will release the one for + * the list. */ - if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen && - bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) { - mutex_exit(&nce->nce_lock); - return; + nce_refrele(nce); +} + +nce_t * +nce_lookup(ill_t *ill, const in6_addr_t *addr) +{ + nce_t *nce = NULL; + + ASSERT(ill != NULL); + ASSERT(MUTEX_HELD(&ill->ill_lock)); + + for (nce = list_head(&ill->ill_nce); nce != NULL; + nce = list_next(&ill->ill_nce, nce)) { + if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr)) + break; } - mutex_exit(&nce->nce_lock); - DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce); - ndp_delete(nce); + /* + * if we found the nce on the ill_nce list while holding + * the ill_lock, then it cannot be condemned yet. + */ + if (nce != NULL) { + ASSERT(!nce->nce_is_condemned); + nce_refhold(nce); + } + return (nce); } /* - * This function verifies whether a given IPv4 address is potentially known to - * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t, - * so that it can continue to look for hardware changes on that address. + * Walk the ill_nce list on ill. The callback function func() cannot perform + * any destructive actions. 
*/ -boolean_t -ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) +static void +nce_walk_common(ill_t *ill, pfi_t func, void *arg) { - nce_t *nce; - struct in_addr nceaddr; - ip_stack_t *ipst = ns->netstack_ip; + nce_t *nce = NULL, *nce_next; - if (addr == INADDR_ANY) - return (B_FALSE); + ASSERT(MUTEX_HELD(&ill->ill_lock)); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (func(ill, nce, arg) != 0) + break; + nce = nce_next; + } +} - mutex_enter(&ipst->ips_ndp4->ndp_g_lock); - nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr); - for (; nce != NULL; nce = nce->nce_next) { - /* Note that only v4 mapped entries are in the table. */ - IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); - if (addr == nceaddr.s_addr && - IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { - /* Single flag check; no lock needed */ - if (!(nce->nce_flags & NCE_F_CONDEMNED)) - break; +void +nce_walk(ill_t *ill, pfi_t func, void *arg) +{ + mutex_enter(&ill->ill_lock); + nce_walk_common(ill, func, arg); + mutex_exit(&ill->ill_lock); +} + +void +nce_flush(ill_t *ill, boolean_t flushall) +{ + nce_t *nce, *nce_next; + list_t dead; + + list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (!flushall && NCE_PUBLISH(nce->nce_common)) { + nce = nce_next; + continue; } + /* + * nce_delete requires that the caller should either not + * be holding locks, or should hold a ref to ensure that + * we wont hit ncec_inactive. So take a ref and clean up + * after the list is flushed. 
+ */ + nce_refhold(nce); + nce_delete(nce); + list_insert_tail(&dead, nce); + nce = nce_next; } - mutex_exit(&ipst->ips_ndp4->ndp_g_lock); - return (nce != NULL); + mutex_exit(&ill->ill_lock); + while ((nce = list_head(&dead)) != NULL) { + list_remove(&dead, nce); + nce_refrele(nce); + } + ASSERT(list_is_empty(&dead)); + list_destroy(&dead); } -/* - * Wrapper around ipif_lookup_addr_exact_v6() that allows ND to work properly - * with IPMP. Specifically, since neighbor discovery is always done on - * underlying interfaces (even for addresses owned by an IPMP interface), we - * need to check for `v6addrp' on both `ill' and on the IPMP meta-interface - * associated with `ill' (if it exists). - */ -static ipif_t * -ip_ndp_lookup_addr_v6(const in6_addr_t *v6addrp, ill_t *ill) +/* Return an interval that is anywhere in the [1 .. intv] range */ +static clock_t +nce_fuzz_interval(clock_t intv, boolean_t initial_time) +{ + clock_t rnd, frac; + + (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); + /* Note that clock_t is signed; must chop off bits */ + rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; + if (initial_time) { + if (intv <= 0) + intv = 1; + else + intv = (rnd % intv) + 1; + } else { + /* Compute 'frac' as 20% of the configured interval */ + if ((frac = intv / 5) <= 1) + frac = 2; + /* Set intv randomly in the range [intv-frac .. 
intv+frac] */ + if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) + intv = 1; + } + return (intv); +} + +void +nce_resolv_ipmp_ok(ncec_t *ncec) { - ipif_t *ipif; + mblk_t *mp; + uint_t pkt_len; + iaflags_t ixaflags = IXAF_NO_TRACE; + nce_t *under_nce; + ill_t *ill = ncec->ncec_ill; + boolean_t isv6 = (ncec->ncec_ipversion == IPV6_VERSION); + ipif_t *src_ipif = NULL; ip_stack_t *ipst = ill->ill_ipst; + ill_t *send_ill; + uint_t nprobes; - ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); - if (ipif == NULL && IS_UNDER_IPMP(ill)) { - if ((ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) { - ipif = ipif_lookup_addr_exact_v6(v6addrp, ill, ipst); - ill_refrele(ill); + ASSERT(IS_IPMP(ill)); + + mutex_enter(&ncec->ncec_lock); + nprobes = ncec->ncec_nprobes; + mp = ncec->ncec_qd_mp; + ncec->ncec_qd_mp = NULL; + ncec->ncec_nprobes = 0; + mutex_exit(&ncec->ncec_lock); + + while (mp != NULL) { + mblk_t *nxt_mp; + + nxt_mp = mp->b_next; + mp->b_next = NULL; + if (isv6) { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + src_ipif = ipif_lookup_addr_nondup_v6(&ip6h->ip6_src, + ill, ALL_ZONES, ipst); + } else { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ixaflags |= IXAF_IS_IPV4; + pkt_len = ntohs(ipha->ipha_length); + src_ipif = ipif_lookup_addr_nondup(ipha->ipha_src, + ill, ALL_ZONES, ipst); + } + + /* + * find a new nce based on an under_ill. The first IPMP probe + * packet gets queued, so we could still find a src_ipif that + * matches an IPMP test address. + */ + if (src_ipif == NULL || IS_IPMP(src_ipif->ipif_ill)) { + /* + * if src_ipif is null, this could be either a + * forwarded packet or a probe whose src got deleted. + * We identify the former case by looking for the + * ncec_nprobes: the first ncec_nprobes packets are + * probes; + */ + if (src_ipif == NULL && nprobes > 0) + goto drop_pkt; + + /* + * For forwarded packets, we use the ipmp rotor + * to find send_ill. 
+ */ + send_ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, + B_TRUE); + } else { + send_ill = src_ipif->ipif_ill; + ill_refhold(send_ill); + } + + DTRACE_PROBE4(nce__resolve__ipmp, (mblk_t *), mp, + (ncec_t *), ncec, (ipif_t *), + src_ipif, (ill_t *), send_ill); + + if (send_ill == NULL) { + if (src_ipif != NULL) + ipif_refrele(src_ipif); + goto drop_pkt; } + /* create an under_nce on send_ill */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (IS_IN_SAME_ILLGRP(send_ill, ncec->ncec_ill)) + under_nce = nce_fastpath_create(send_ill, ncec); + else + under_nce = NULL; + rw_exit(&ipst->ips_ill_g_lock); + if (under_nce != NULL && NCE_ISREACHABLE(ncec)) + nce_fastpath_trigger(under_nce); + + ill_refrele(send_ill); + if (src_ipif != NULL) + ipif_refrele(src_ipif); + + if (under_nce != NULL) { + (void) ip_xmit(mp, under_nce, ixaflags, pkt_len, 0, + ALL_ZONES, 0, NULL); + nce_refrele(under_nce); + if (nprobes > 0) + nprobes--; + mp = nxt_mp; + continue; + } +drop_pkt: + if (isv6) { + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + } + ip_drop_output("ipIfStatsOutDiscards - no under_ill", mp, NULL); + freemsg(mp); + if (nprobes > 0) + nprobes--; + mp = nxt_mp; } - return (ipif); + ncec_cb_dispatch(ncec); /* complete callbacks */ } diff --git a/usr/src/uts/common/inet/ip/ip_netinfo.c b/usr/src/uts/common/inet/ip/ip_netinfo.c index 8b97462d13..33e791adac 100644 --- a/usr/src/uts/common/inet/ip/ip_netinfo.c +++ b/usr/src/uts/common/inet/ip/ip_netinfo.c @@ -38,6 +38,7 @@ #include <sys/cmn_err.h> #include <netinet/in.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/mib2.h> #include <inet/ip.h> @@ -89,6 +90,20 @@ static phy_if_t ipv6_routeto(net_handle_t, struct sockaddr *, struct sockaddr *); static int ipv6_isvalidchecksum(net_handle_t, mblk_t *); +static int net_no_getmtu(net_handle_t, phy_if_t, lif_if_t); +static int net_no_getpmtuenabled(net_handle_t); +static lif_if_t 
net_no_lifgetnext(net_handle_t, phy_if_t, lif_if_t); +static int net_no_inject(net_handle_t, inject_t, net_inject_t *); +static phy_if_t net_no_routeto(net_handle_t, struct sockaddr *, + struct sockaddr *); +static int net_no_ispartialchecksum(net_handle_t, mblk_t *); +static int net_no_getlifaddr(net_handle_t, phy_if_t, lif_if_t, + size_t, net_ifaddr_t [], void *); +static int net_no_getlifzone(net_handle_t, phy_if_t, lif_if_t, + zoneid_t *); +static int net_no_getlifflags(net_handle_t, phy_if_t, lif_if_t, + uint64_t *); + /* Netinfo private functions */ static int ip_getifname_impl(phy_if_t, char *, const size_t, boolean_t, ip_stack_t *); @@ -111,7 +126,6 @@ static void ip_ni_queue_in_func(void *); static void ip_ni_queue_out_func(void *); static void ip_ni_queue_func_impl(injection_t *, boolean_t); - static net_protocol_t ipv4info = { NETINFO_VERSION, NHF_INET, @@ -149,6 +163,24 @@ static net_protocol_t ipv6info = { ipv6_isvalidchecksum }; +static net_protocol_t arp_netinfo = { + NETINFO_VERSION, + NHF_ARP, + ip_getifname, + net_no_getmtu, + net_no_getpmtuenabled, + net_no_getlifaddr, + net_no_getlifzone, + net_no_getlifflags, + ip_phygetnext, + ip_phylookup, + net_no_lifgetnext, + net_no_inject, + net_no_routeto, + net_no_ispartialchecksum, + ip_isvalidchecksum +}; + /* * The taskq eventq_queue_in is used to process the upside inject messages. * The taskq eventq_queue_out is used to process the downside inject messages. 
@@ -230,6 +262,9 @@ ip_net_init(ip_stack_t *ipst, netstack_t *ns) ipst->ips_ipv6_net_data = net_protocol_register(id, &ipv6info); ASSERT(ipst->ips_ipv6_net_data != NULL); + + ipst->ips_arp_net_data = net_protocol_register(id, &arp_netinfo); + ASSERT(ipst->ips_ipv6_net_data != NULL); } @@ -248,6 +283,11 @@ ip_net_destroy(ip_stack_t *ipst) if (net_protocol_unregister(ipst->ips_ipv6_net_data) == 0) ipst->ips_ipv6_net_data = NULL; } + + if (ipst->ips_arp_net_data != NULL) { + if (net_protocol_unregister(ipst->ips_arp_net_data) == 0) + ipst->ips_arp_net_data = NULL; + } } /* @@ -612,8 +652,7 @@ ip_getifname_impl(phy_if_t phy_ifdata, ASSERT(buffer != NULL); - ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, NULL, NULL, - NULL, NULL, ipst); + ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, ipst); if (ill == NULL) return (1); @@ -667,17 +706,17 @@ ip_getmtu_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, if (ipif == NULL) return (0); - mtu = ipif->ipif_mtu; + mtu = ipif->ipif_ill->ill_mtu; ipif_refrele(ipif); if (mtu == 0) { ill_t *ill; if ((ill = ill_lookup_on_ifindex((uint_t)phy_ifdata, isv6, - NULL, NULL, NULL, NULL, ipst)) == NULL) { + ipst)) == NULL) { return (0); } - mtu = ill->ill_max_frag; + mtu = ill->ill_mtu; ill_refrele(ill); } @@ -760,8 +799,7 @@ ip_phylookup_impl(const char *name, boolean_t isv6, ip_stack_t *ipst) phy_if_t phy; ill_t *ill; - ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, NULL, - NULL, NULL, NULL, ipst); + ill = ill_lookup_on_name((char *)name, B_FALSE, isv6, NULL, ipst); if (ill == NULL) return (0); @@ -813,8 +851,7 @@ ip_lifgetnext_impl(phy_if_t phy_ifdata, lif_if_t ifdata, boolean_t isv6, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(phy_ifdata, isv6, NULL, NULL, - NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(phy_ifdata, isv6, ipst); if (ill == NULL) return (0); @@ -898,14 +935,10 @@ static int ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, ip_stack_t *ipst) { - struct 
sockaddr_in6 *sin6; ddi_taskq_t *tq = NULL; void (* func)(void *); injection_t *inject; - ip6_t *ip6h; - ire_t *ire; mblk_t *mp; - zoneid_t zoneid; ASSERT(packet != NULL); ASSERT(packet->ni_packet != NULL); @@ -941,130 +974,44 @@ ip_inject_impl(inject_t style, net_inject_t *packet, boolean_t isv6, tq = eventq_queue_out; break; - case NI_DIRECT_OUT: - /* - * Note: - * For IPv4, the code path below will be greatly simplified - * with the delivery of surya - it will become a single - * function call to X. A follow on project is aimed to - * provide similar functionality for IPv6. - */ - mp = packet->ni_packet; - zoneid = - netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid); - - if (!isv6) { - struct sockaddr *sock; - - sock = (struct sockaddr *)&packet->ni_addr; - /* - * ipfil_sendpkt was provided by surya to ease the - * problems associated with sending out a packet. - * Currently this function only supports IPv4. - */ - switch (ipfil_sendpkt(sock, mp, packet->ni_physical, - zoneid)) { - case 0 : - case EINPROGRESS: - return (0); - case ECOMM : - case ENONET : - return (1); - default : - return (1); - } - /* NOTREACHED */ - - } - - ip6h = (ip6_t *)mp->b_rptr; - sin6 = (struct sockaddr_in6 *)&packet->ni_addr; - ASSERT(sin6->sin6_family == AF_INET6); - - ire = ire_route_lookup_v6(&sin6->sin6_addr, 0, 0, 0, - NULL, NULL, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + case NI_DIRECT_OUT: { + struct sockaddr *sock; - if (ire == NULL) { - ip2dbg(("ip_inject: ire_cache_lookup failed\n")); - freemsg(mp); - return (1); - } - - if (ire->ire_stq == NULL) { - /* Send to loopback destination. 
*/ - if (ire->ire_rfq == NULL) { - ip2dbg(("ip_inject: bad nexthop\n")); - ire_refrele(ire); - freemsg(mp); - return (1); - } - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ip6h, __dtrace_ipsr_ill_t *, - ire->ire_ipif->ipif_ill, ipha_t *, NULL, ip6_t *, - ip6h, int, 1); - ip_wput_local_v6(ire->ire_rfq, - ire->ire_ipif->ipif_ill, ip6h, mp, ire, 0, zoneid); - ire_refrele(ire); - return (0); - } - - mp->b_queue = ire->ire_stq; - - if (ire->ire_nce == NULL || - ire->ire_nce->nce_fp_mp == NULL && - ire->ire_nce->nce_res_mp == NULL) { - ip_newroute_v6(ire->ire_stq, mp, &sin6->sin6_addr, - &ip6h->ip6_src, NULL, zoneid, ipst); + mp = packet->ni_packet; - ire_refrele(ire); + sock = (struct sockaddr *)&packet->ni_addr; + /* + * ipfil_sendpkt was provided by surya to ease the + * problems associated with sending out a packet. + */ + switch (ipfil_sendpkt(sock, mp, packet->ni_physical, + netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid))) { + case 0 : + case EINPROGRESS: return (0); - } else { - /* prepend L2 header for IPv6 packets. */ - mblk_t *llmp; - - /* - * Lock IREs, see 6420438 - */ - mutex_enter(&ire->ire_lock); - llmp = ire->ire_nce->nce_fp_mp ? 
- ire->ire_nce->nce_fp_mp : - ire->ire_nce->nce_res_mp; - - if ((mp = dupb(llmp)) == NULL && - (mp = copyb(llmp)) == NULL) { - ip2dbg(("ip_inject: llhdr failed\n")); - mutex_exit(&ire->ire_lock); - ire_refrele(ire); - freemsg(mp); - return (1); - } - mutex_exit(&ire->ire_lock); - linkb(mp, packet->ni_packet); + case ECOMM : + case ENONET : + return (1); + default : + return (1); } - - mp->b_queue = ire->ire_stq; - - break; + /* NOTREACHED */ + } default: freemsg(packet->ni_packet); return (1); } - if (tq) { - inject->inj_ptr = ipst; - if (ddi_taskq_dispatch(tq, func, (void *)inject, - DDI_SLEEP) == DDI_FAILURE) { - ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n")); - freemsg(packet->ni_packet); - return (1); - } - } else { - putnext(ire->ire_stq, mp); - ire_refrele(ire); - } + ASSERT(tq != NULL); + inject->inj_ptr = ipst; + if (ddi_taskq_dispatch(tq, func, (void *)inject, + DDI_SLEEP) == DDI_FAILURE) { + ip2dbg(("ip_inject: ddi_taskq_dispatch failed\n")); + freemsg(packet->ni_packet); + return (1); + } return (0); } @@ -1121,64 +1068,57 @@ ip_routeto_impl(struct sockaddr *address, struct sockaddr *nexthop, struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)address; struct sockaddr_in *next = (struct sockaddr_in *)nexthop; struct sockaddr_in *sin = (struct sockaddr_in *)address; - ire_t *sire = NULL; ire_t *ire; - ill_t *ill; + ire_t *nexthop_ire; phy_if_t phy_if; zoneid_t zoneid; zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid); if (address->sa_family == AF_INET6) { - ire = ire_route_lookup_v6(&sin6->sin6_addr, NULL, - 0, 0, NULL, &sire, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + ire = ire_route_recursive_v6(&sin6->sin6_addr, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL, + NULL, NULL); } else { - ire = ire_route_lookup(sin->sin_addr.s_addr, 0, - 0, 0, NULL, &sire, zoneid, NULL, - MATCH_IRE_DSTONLY|MATCH_IRE_DEFAULT|MATCH_IRE_RECURSIVE, - ipst); + ire = 
ire_route_recursive_v4(sin->sin_addr.s_addr, 0, NULL, + zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, NULL, + NULL, NULL); } - - if (ire == NULL) - return (0); - + ASSERT(ire != NULL); /* * For some destinations, we have routes that are dead ends, so * return to indicate that no physical interface can be used to * reach the destination. */ - if ((ire->ire_flags & (RTF_REJECT | RTF_BLACKHOLE)) != 0) { - if (sire != NULL) - ire_refrele(sire); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); - return (0); + return (NULL); } - ill = ire_to_ill(ire); - if (ill == NULL) { - if (sire != NULL) - ire_refrele(sire); + nexthop_ire = ire_nexthop(ire); + if (nexthop_ire == NULL) { + ire_refrele(ire); + return (0); + } + if (nexthop_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire_refrele(nexthop_ire); ire_refrele(ire); return (0); } + ASSERT(nexthop_ire->ire_ill != NULL); + if (nexthop != NULL) { if (address->sa_family == AF_INET6) { - next->sin_addr.s_addr = sire ? sire->ire_gateway_addr : - sin->sin_addr.s_addr; + next6->sin6_addr = nexthop_ire->ire_addr_v6; } else { - next6->sin6_addr = sire ? 
sire->ire_gateway_addr_v6 : - sin6->sin6_addr; + next->sin_addr.s_addr = nexthop_ire->ire_addr; } } - ASSERT(ill != NULL); - phy_if = (phy_if_t)ill->ill_phyint->phyint_ifindex; - if (sire != NULL) - ire_refrele(sire); + phy_if = (phy_if_t)nexthop_ire->ire_ill->ill_phyint->phyint_ifindex; ire_refrele(ire); + ire_refrele(nexthop_ire); return (phy_if); } @@ -1477,8 +1417,7 @@ ip_getlifflags_impl(sa_family_t family, phy_if_t phy_ifdata, lif_if_t ifdata, ipif_t *ipif; ill_t *ill; - ill = ill_lookup_on_ifindex(phy_ifdata, - (family == AF_INET6), NULL, NULL, NULL, NULL, ipst); + ill = ill_lookup_on_ifindex(phy_ifdata, (family == AF_INET6), ipst); if (ill == NULL) return (-1); phyi = ill->ill_phyint; @@ -1538,59 +1477,43 @@ static void ip_ni_queue_func_impl(injection_t *inject, boolean_t out) { net_inject_t *packet; - conn_t *conn; ill_t *ill; ip_stack_t *ipst = (ip_stack_t *)inject->inj_ptr; + ip_xmit_attr_t ixas; ASSERT(inject != NULL); packet = &inject->inj_data; ASSERT(packet->ni_packet != NULL); - ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, - B_FALSE, NULL, NULL, NULL, NULL, ipst); - if (ill == NULL) { - kmem_free(inject, sizeof (*inject)); - return; - } - if (out == 0) { + ill = ill_lookup_on_ifindex((uint_t)packet->ni_physical, + inject->inj_isv6, ipst); + + if (ill == NULL) { + kmem_free(inject, sizeof (*inject)); + return; + } + if (inject->inj_isv6) { - ip_rput_v6(ill->ill_rq, packet->ni_packet); + ip_input_v6(ill, NULL, packet->ni_packet, NULL); } else { ip_input(ill, NULL, packet->ni_packet, NULL); } - kmem_free(inject, sizeof (*inject)); ill_refrele(ill); - return; - } - - /* - * Even though ipcl_conn_create requests that it be passed - * a different value for "TCP", in this case there may not - * be a TCP connection backing the packet and more than - * likely, non-TCP packets will go here too. 
- */ - conn = ipcl_conn_create(IPCL_IPCCONN, KM_NOSLEEP, ipst->ips_netstack); - if (conn != NULL) { + } else { + bzero(&ixas, sizeof (ixas)); + ixas.ixa_ifindex = packet->ni_physical; + ixas.ixa_ipst = ipst; if (inject->inj_isv6) { - conn->conn_af_isv6 = B_TRUE; - conn->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; - conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - ip_output_v6(conn, packet->ni_packet, ill->ill_wq, - IP_WPUT); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; } else { - conn->conn_af_isv6 = B_FALSE; - conn->conn_pkt_isv6 = B_FALSE; - conn->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - ip_output(conn, packet->ni_packet, ill->ill_wq, - IP_WPUT); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; } - - CONN_DEC_REF(conn); + (void) ip_output_simple(packet->ni_packet, &ixas); + ixa_cleanup(&ixas); } kmem_free(inject, sizeof (*inject)); - ill_refrele(ill); } /* @@ -1623,3 +1546,152 @@ done: kmem_free(info->hnei_event.hne_data, info->hnei_event.hne_datalen); kmem_free(arg, sizeof (hook_nic_event_int_t)); } + +/* + * Initialize ARP hook family and events + */ +void +arp_hook_init(ip_stack_t *ipst) +{ + HOOK_FAMILY_INIT(&ipst->ips_arproot, Hn_ARP); + if (net_family_register(ipst->ips_arp_net_data, &ipst->ips_arproot) + != 0) { + cmn_err(CE_NOTE, "arp_hook_init" + "net_family_register failed for arp"); + } + + HOOK_EVENT_INIT(&ipst->ips_arp_physical_in_event, NH_PHYSICAL_IN); + ipst->ips_arp_physical_in = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event); + if (ipst->ips_arp_physical_in == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/physical_in"); + } + + HOOK_EVENT_INIT(&ipst->ips_arp_physical_out_event, NH_PHYSICAL_OUT); + ipst->ips_arp_physical_out = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event); + if (ipst->ips_arp_physical_out == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/physical_out"); + } + + 
HOOK_EVENT_INIT(&ipst->ips_arp_nic_events, NH_NIC_EVENTS); + ipst->ips_arpnicevents = net_event_register(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events); + if (ipst->ips_arpnicevents == NULL) { + cmn_err(CE_NOTE, "arp_hook_init: " + "net_event_register failed for arp/nic_events"); + } +} + +void +arp_hook_destroy(ip_stack_t *ipst) +{ + if (ipst->ips_arpnicevents != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events) == 0) + ipst->ips_arpnicevents = NULL; + } + + if (ipst->ips_arp_physical_out != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event) == 0) + ipst->ips_arp_physical_out = NULL; + } + + if (ipst->ips_arp_physical_in != NULL) { + if (net_event_unregister(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event) == 0) + ipst->ips_arp_physical_in = NULL; + } + + (void) net_family_unregister(ipst->ips_arp_net_data, + &ipst->ips_arproot); +} + +void +arp_hook_shutdown(ip_stack_t *ipst) +{ + if (ipst->ips_arp_physical_in != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_in_event); + } + if (ipst->ips_arp_physical_out != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_physical_out_event); + } + if (ipst->ips_arpnicevents != NULL) { + (void) net_event_shutdown(ipst->ips_arp_net_data, + &ipst->ips_arp_nic_events); + } +} + +/* netinfo routines for the unsupported cases */ + +/* ARGSUSED */ +int +net_no_getmtu(net_handle_t handle, phy_if_t phy_ifdata, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getpmtuenabled(net_handle_t neti) +{ + return (-1); +} + +/* ARGSUSED */ +static lif_if_t +net_no_lifgetnext(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_inject(net_handle_t neti, inject_t style, net_inject_t *packet) +{ + return (-1); +} + +/* ARGSUSED */ +static phy_if_t +net_no_routeto(net_handle_t neti, struct 
sockaddr *address, + struct sockaddr *next) +{ + return ((phy_if_t)-1); +} + +/* ARGSUSED */ +static int +net_no_ispartialchecksum(net_handle_t neti, mblk_t *mp) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifaddr(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + size_t nelem, net_ifaddr_t type[], void *storage) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifzone(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + zoneid_t *zoneid) +{ + return (-1); +} + +/* ARGSUSED */ +static int +net_no_getlifflags(net_handle_t neti, phy_if_t phy_ifdata, lif_if_t ifdata, + uint64_t *flags) +{ + return (-1); +} diff --git a/usr/src/uts/common/inet/ip/ip_opt_data.c b/usr/src/uts/common/inet/ip/ip_opt_data.c deleted file mode 100644 index e86e59f67d..0000000000 --- a/usr/src/uts/common/inet/ip/ip_opt_data.c +++ /dev/null @@ -1,301 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License (the "License"). - * You may not use this file except in compliance with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. 
- */ - -#include <sys/types.h> -#include <sys/stream.h> -#define _SUN_TPI_VERSION 2 -#include <sys/tihdr.h> -#include <sys/socket.h> -#include <sys/xti_inet.h> - -#include <inet/common.h> -#include <netinet/ip6.h> -#include <inet/ip.h> - -#include <netinet/in.h> -#include <netinet/ip_mroute.h> -#include <inet/optcom.h> - - -extern int ip_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -extern int ip_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -extern int ip_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *dummy, cred_t *cr, mblk_t *first_mp); - -/* - * Table of all known options handled on a IP protocol stack. - * - * Note: Not all of these options are available through all protocol stacks - * For example, multicast options are not accessible in TCP over IP. - * The filtering for that happens in option table at transport level. - * Also, this table excludes any options processed exclusively at the - * transport protocol level. 
- */ -opdes_t ip_opt_arr[] = { - -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), - 0 }, - - -{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), - IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), - IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, - -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct in_addr), 0 /* INADDR_ANY */ }, - -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN), - sizeof (uchar_t), -1 /* not initialized */}, - -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_DEF_FN), - sizeof (uchar_t), -1 /* not initialized */ }, - -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq), -1 /* not initialized */ }, - -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq), -1 /* not initialized */ }, - -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq_source), -1 /* not initialized */ }, - -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ip_mreq_source), -1 /* not initialized */ }, - -{ IP_ADD_SOURCE_MEMBERSHIP, 
IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, - -{ IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, - -{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 - }, - -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, - -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, - sizeof (int), 0 }, - -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, - sizeof (int), 0 }, - -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT), - sizeof (ipsec_req_t), -1 /* not initialized */ }, - -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, - sizeof (in_addr_t), -1 /* not initialized */ }, - -{ MRT_INIT, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_NODEFAULT), sizeof (int), -1 /* not initialized */ }, - -{ MRT_DONE, IPPROTO_IP, 0, OA_X, OP_CONFIG, - (OP_NODEFAULT), 0, -1 /* not initialized */ }, - -{ MRT_ADD_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct vifctl), -1 /* not initialized */ }, - -{ MRT_DEL_VIF, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (vifi_t), -1 /* not initialized */ }, - -{ MRT_ADD_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct mfcctl), -1 /* not initialized */ }, - -{ MRT_DEL_MFC, IPPROTO_IP, 0, OA_X, OP_CONFIG, (OP_NODEFAULT), - sizeof (struct mfcctl), -1 /* not initialized */ }, - -{ MRT_VERSION, IPPROTO_IP, OA_R, OA_R, OP_NP, (OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, - -{ MRT_ASSERT, IPPROTO_IP, 0, OA_RW, OP_CONFIG, (OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, - -{ MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - 
(OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, - -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -{ IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, - -{ IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */}, - -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_NODEFAULT), - sizeof (struct ipv6_mreq), -1 /* not initialized */ }, - -{ IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), - sizeof (struct ipv6_mreq), -1 /* not initialized */ }, - -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, - -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 /* no ifindex */ }, - -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, - sizeof (int), 0 }, - -{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (struct in6_pktinfo), -1 /* not initialized */ }, -{ IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (sin6_t), -1 /* 
not initialized */ }, -{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_VARLEN|OP_NODEFAULT), 255*8, - -1 /* not initialized */ }, -{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_NODEFAULT|OP_VARLEN), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (struct ip6_mtuinfo), -1 }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), -1 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -/* Enable receipt of ancillary data */ -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, -{ IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (int), 0 }, - -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_NODEFAULT), - sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, - sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, - -{ MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, 
- (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_req), - -1 /* not initialized */ }, -{ MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -{ MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_NODEFAULT), sizeof (struct group_source_req), - -1 /* not initialized */ }, -}; - - -#define IP_OPT_ARR_CNT A_CNT(ip_opt_arr) - - -/* - * Initialize option database object for IP - * - * This object represents database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. - */ - -optdb_obj_t ip_opt_obj = { - ip_opt_default, /* IP default value function pointer */ - ip_opt_get, /* IP get function pointer */ - ip_opt_set, /* IP set function pointer */ - B_FALSE, /* IP is NOT a tpi provider */ - IP_OPT_ARR_CNT, /* IP option database count of entries */ - ip_opt_arr, /* IP option database */ - 0, /* 0 - not needed if not top tpi provider */ - (optlevel_t *)0 /* null - not needed if not top tpi provider */ -}; diff --git a/usr/src/uts/common/inet/ip/ip_output.c b/usr/src/uts/common/inet/ip/ip_output.c new file mode 100644 index 0000000000..a4940fd3e8 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_output.c @@ -0,0 +1,2554 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsubr.h> +#include <sys/dlpi.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include 
<netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> + +#include <sys/pattr.h> +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +#ifdef DEBUG +extern boolean_t skip_sctp_cksum; +#endif + +static int ip_verify_nce(mblk_t *, ip_xmit_attr_t *); +static int ip_verify_dce(mblk_t *, ip_xmit_attr_t *); +static boolean_t ip_verify_lso(ill_t *, ip_xmit_attr_t *); +static boolean_t ip_verify_zcopy(ill_t *, ip_xmit_attr_t *); +static void ip_output_simple_broadcast(ip_xmit_attr_t *, mblk_t *); + +/* + * There are two types of output functions for IP used for different + * purposes: + * - ip_output_simple() is when sending ICMP errors, TCP resets, etc when there + * is no context in the form of a conn_t. However, there is a + * ip_xmit_attr_t that the callers use to influence interface selection + * (needed for ICMP echo as well as IPv6 link-locals) and IPsec. + * + * - conn_ip_output() is used when sending packets with a conn_t and + * ip_set_destination has been called to cache information. In that case + * various socket options are recorded in the ip_xmit_attr_t and should + * be taken into account. + */ + +/* + * The caller *must* have called conn_connect() or ip_attr_connect() + * before calling conn_ip_output(). The caller needs to redo that each time + * the destination IP address or port changes, as well as each time there is + * a change to any socket option that would modify how packets are routed out + * of the box (e.g., SO_DONTROUTE, IP_NEXTHOP, IP_BOUND_IF). + * + * The ULP caller has to serialize the use of a single ip_xmit_attr_t. + * We assert for that here. 
+ */ +int +conn_ip_output(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + iaflags_t ixaflags = ixa->ixa_flags; + ire_t *ire; + nce_t *nce; + dce_t *dce; + ill_t *ill; + ip_stack_t *ipst = ixa->ixa_ipst; + int error; + + /* We defer ipIfStatsHCOutRequests until an error or we have an ill */ + + ASSERT(ixa->ixa_ire != NULL); + /* Note there is no ixa_nce when reject and blackhole routes */ + ASSERT(ixa->ixa_dce != NULL); /* Could be default dce */ + +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == NULL); + ixa->ixa_curthread = curthread; +#endif + + /* + * Even on labeled systems we can have a NULL ixa_tsl e.g., + * for IGMP/MLD traffic. + */ + + ire = ixa->ixa_ire; + + /* + * If the ULP says the (old) IRE resulted in reachability we + * record this before determine whether to use a new IRE. + * No locking for performance reasons. + */ + if (ixaflags & IXAF_REACH_CONF) + ire->ire_badcnt = 0; + + /* + * Has routing changed since we cached the results of the lookup? + * + * This check captures all of: + * - the cached ire being deleted (by means of the special + * IRE_GENERATION_CONDEMNED) + * - A potentially better ire being added (ire_generation being + * increased) + * - A deletion of the nexthop ire that was used when we did the + * lookup. + * - An addition of a potentially better nexthop ire. + * The last two are handled by walking and increasing the generation + * number on all dependant IREs in ire_flush_cache(). + * + * The check also handles all cases of RTF_REJECT and RTF_BLACKHOLE + * since we ensure that each time we set ixa_ire to such an IRE we + * make sure the ixa_ire_generation does not match (by using + * IRE_GENERATION_VERIFY). 
+ */ + if (ire->ire_generation != ixa->ixa_ire_generation) { + error = ip_verify_ire(mp, ixa); + if (error != 0) { + ip_drop_output("ipIfStatsOutDiscards - verify ire", + mp, NULL); + goto drop; + } + ire = ixa->ixa_ire; + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + ire->ire_ob_pkt_count++; + /* ixa_dce might be condemned; use default one */ + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, + &ipst->ips_dce_default->dce_ident)); + } + /* + * If the ncec changed then ip_verify_ire already set + * ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + * so we can recheck the interface mtu. + */ + + /* + * Note that ire->ire_generation could already have changed. + * We catch that next time we send a packet. + */ + } + + /* + * No need to lock access to ixa_nce since the ip_xmit_attr usage + * is single threaded. + */ + ASSERT(ixa->ixa_nce != NULL); + nce = ixa->ixa_nce; + if (nce->nce_is_condemned) { + error = ip_verify_nce(mp, ixa); + /* + * In case ZEROCOPY capability become not available, we + * copy the message and free the original one. We might + * be copying more data than needed but it doesn't hurt + * since such change rarely happens. 
+ */ + switch (error) { + case 0: + break; + case ENOTSUP: { /* ZEROCOPY */ + mblk_t *nmp; + + if ((nmp = copymsg(mp)) != NULL) { + freemsg(mp); + mp = nmp; + + break; + } + /* FALLTHROUGH */ + } + default: + ip_drop_output("ipIfStatsOutDiscards - verify nce", + mp, NULL); + goto drop; + } + ire = ixa->ixa_ire; + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + ire->ire_ob_pkt_count++; + /* ixa_dce might be condemned; use default one */ + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, + ixa, &ipst->ips_dce_default->dce_ident)); + } + ASSERT(ixa->ixa_nce != NULL); + nce = ixa->ixa_nce; + + /* + * Note that some other event could already have made + * the new nce condemned. We catch that next time we + * try to send a packet. + */ + } + /* + * If there is no per-destination dce_t then we have a reference to + * the default dce_t (which merely contains the dce_ipid). + * The generation check captures both the introduction of a + * per-destination dce_t (e.g., due to ICMP packet too big) and + * any change to the per-destination dce (including it becoming + * condemned by use of the special DCE_GENERATION_CONDEMNED). + */ + dce = ixa->ixa_dce; + + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if ((dce->dce_flags & DCEF_PMTU) && + (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval)) { + /* + * Older than 20 minutes. Drop the path MTU information. + * Since the path MTU changes as a result of this, twiddle + * ixa_dce_generation to make us go through the dce + * verification code in conn_ip_output. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + } + + if (dce->dce_generation != ixa->ixa_dce_generation) { + error = ip_verify_dce(mp, ixa); + if (error != 0) { + ip_drop_output("ipIfStatsOutDiscards - verify dce", + mp, NULL); + goto drop; + } + dce = ixa->ixa_dce; + + /* + * Note that some other event could already have made the + * new dce's generation number change. + * We catch that next time we try to send a packet. + */ + } + + ill = nce->nce_ill; + + /* + * An initial ixa_fragsize was set in ip_set_destination + * and we update it if any routing changes above. + * A change to ill_mtu with ifconfig will increase all dce_generation + * so that we will detect that with the generation check. + */ + + /* + * Caller needs to make sure IXAF_VERIFY_SRC is not set if + * conn_unspec_src. + */ + if ((ixaflags & IXAF_VERIFY_SOURCE) && + ixa->ixa_src_generation != ipst->ips_src_generation) { + /* Check if the IP source is still assigned to the host. */ + uint_t gen; + + if (!ip_verify_src(mp, ixa, &gen)) { + /* Don't send a packet with a source that isn't ours */ + error = EADDRNOTAVAIL; + ip_drop_output("ipIfStatsOutDiscards - invalid src", + mp, NULL); + goto drop; + } + /* The source is still valid - update the generation number */ + ixa->ixa_src_generation = gen; + } + + /* + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. 
+ */ + ire->ire_ob_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v* - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v* - if RTF_MULTIRT + * ire_send_noroute_v* - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v* - for IRE_MULTICAST + * ire_send_broadcast_v4 - for IRE_BROADCAST + * ire_send_wire_v* - for the rest. + */ +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + return ((ire->ire_sendfn)(ire, mp, mp->b_rptr, ixa, &dce->dce_ident)); + +drop: + if (ixaflags & IXAF_IS_IPV4) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); + } + freemsg(mp); +#ifdef DEBUG + ASSERT(ixa->ixa_curthread == curthread); + ixa->ixa_curthread = NULL; +#endif + return (error); +} + +/* + * Handle both IPv4 and IPv6. Sets the generation number + * to allow the caller to know when to call us again. + * Returns true if the source address in the packet is a valid source. + * We handle callers which try to send with a zero address (since we only + * get here if UNSPEC_SRC is not set). + */ +boolean_t +ip_verify_src(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* + * Need to grab the generation number before we check to + * avoid a race with a change to the set of local addresses. + * No lock needed since the thread which updates the set of local + * addresses use ipif/ill locks and exit those (hence a store memory + * barrier) before doing the atomic increase of ips_src_generation. 
+ */ + if (generationp != NULL) + *generationp = ipst->ips_src_generation; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + if (ipha->ipha_src == INADDR_ANY) + return (B_FALSE); + + return (ip_laddr_verify_v4(ipha->ipha_src, ixa->ixa_zoneid, + ipst, B_FALSE) != IPVL_BAD); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + uint_t scopeid; + + if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) + return (B_FALSE); + + if (ixa->ixa_flags & IXAF_SCOPEID_SET) + scopeid = ixa->ixa_scopeid; + else + scopeid = 0; + + return (ip_laddr_verify_v6(&ip6h->ip6_src, ixa->ixa_zoneid, + ipst, B_FALSE, scopeid) != IPVL_BAD); + } +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the IRE to use. + */ +int +ip_verify_ire(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + uint_t gen; + ire_t *ire; + nce_t *nce; + int error; + boolean_t multirt = B_FALSE; + + /* + * Redo ip_select_route. + * Need to grab generation number as part of the lookup to + * avoid race. + */ + error = 0; + ire = ip_select_route_pkt(mp, ixa, &gen, &error, &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + ire_refrele(ire); + return (error); + } + + if (ixa->ixa_ire != NULL) + ire_refrele_notr(ixa->ixa_ire); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = gen; + if (multirt) { + if (ixa->ixa_flags & IXAF_IS_IPV4) + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + else + ixa->ixa_postfragfn = ip_postfrag_multirt_v6; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + + /* + * Don't look for an nce for reject or blackhole. + * They have ire_generation set to IRE_GENERATION_VERIFY which + * makes conn_ip_output avoid references to ixa_nce. 
+ */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ASSERT(ixa->ixa_ire_generation == IRE_GENERATION_VERIFY); + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + return (0); + } + + /* The NCE could now be different */ + nce = ire_to_nce_pkt(ire, mp); + if (nce == NULL) { + /* + * Allocation failure. Make sure we redo ire/nce selection + * next time we send. + */ + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + return (ENOBUFS); + } + if (nce == ixa->ixa_nce) { + /* No change */ + nce_refrele(nce); + return (0); + } + + /* + * Since the path MTU might change as a result of this + * route change, we twiddle ixa_dce_generation to + * make conn_ip_output go through the ip_verify_dce code. + */ + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; + return (0); +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the NCE to use. + */ +static int +ip_verify_nce(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ire_t *ire = ixa->ixa_ire; + nce_t *nce; + int error = 0; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + + if (ire->ire_ipversion == IPV4_VERSION) + ipha = (ipha_t *)mp->b_rptr; + else + ip6h = (ip6_t *)mp->b_rptr; + + nce = ire_handle_condemned_nce(ixa->ixa_nce, ire, ipha, ip6h, B_TRUE); + if (nce == NULL) { + /* Try to find a better ire */ + return (ip_verify_ire(mp, ixa)); + } + + /* + * The hardware offloading capabilities, for example LSO, of the + * interface might have changed, so do sanity verification here. + */ + if (ixa->ixa_flags & IXAF_VERIFY_LSO) { + if (!ip_verify_lso(nce->nce_ill, ixa)) { + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_LSO, 0); + error = ENOTSUP; + } + } + + /* + * Verify ZEROCOPY capability of underlying ill. Notify the ULP with + * any ZEROCOPY changes. 
In case ZEROCOPY capability is not available + * any more, return error so that conn_ip_output() can take care of + * the ZEROCOPY message properly. It's safe to continue send the + * message when ZEROCOPY newly become available. + */ + if (ixa->ixa_flags & IXAF_VERIFY_ZCOPY) { + if (!ip_verify_zcopy(nce->nce_ill, ixa)) { + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_ZCOPY, 0); + if ((ixa->ixa_flags & IXAF_ZCOPY_CAPAB) == 0) + error = ENOTSUP; + } + } + + /* + * Since the path MTU might change as a result of this + * change, we twiddle ixa_dce_generation to + * make conn_ip_output go through the ip_verify_dce code. + */ + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = nce; + return (error); +} + +/* + * Handle both IPv4 and IPv6. Reverify/recalculate the DCE to use. + */ +static int +ip_verify_dce(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + dce_t *dce; + uint_t gen; + uint_t pmtu; + + dce = dce_lookup_pkt(mp, ixa, &gen); + ASSERT(dce != NULL); + + dce_refrele_notr(ixa->ixa_dce); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = gen; + + /* Extract the (path) mtu from the dce, ncec_ill etc */ + pmtu = ip_get_pmtu(ixa); + + /* + * Tell ULP about PMTU changes - increase or decrease - by returning + * an error if IXAF_VERIFY_PMTU is set. In such case, ULP should update + * both ixa_pmtu and ixa_fragsize appropriately. + * + * If ULP doesn't set that flag then we need to update ixa_fragsize + * since routing could have changed the ill after after ixa_fragsize + * was set previously in the conn_ip_output path or in + * ip_set_destination. + * + * In case of LSO, ixa_fragsize might be greater than ixa_pmtu. + * + * In the case of a path MTU increase we send the packet after the + * notify to the ULP. 
+ */ + if (ixa->ixa_flags & IXAF_VERIFY_PMTU) { + if (ixa->ixa_pmtu != pmtu) { + uint_t oldmtu = ixa->ixa_pmtu; + + DTRACE_PROBE2(verify_pmtu, uint32_t, pmtu, + uint32_t, ixa->ixa_pmtu); + ASSERT(ixa->ixa_notify != NULL); + ixa->ixa_notify(ixa->ixa_notify_cookie, ixa, + IXAN_PMTU, pmtu); + if (pmtu < oldmtu) + return (EMSGSIZE); + } + } else { + ixa->ixa_fragsize = pmtu; + } + return (0); +} + +/* + * Verify LSO usability. Keep the return value simple to indicate whether + * the LSO capability has changed. Handle both IPv4 and IPv6. + */ +static boolean_t +ip_verify_lso(ill_t *ill, ip_xmit_attr_t *ixa) +{ + ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; + ill_lso_capab_t *new_lsoc = ill->ill_lso_capab; + + if (ixa->ixa_flags & IXAF_LSO_CAPAB) { + /* + * Not unsable any more. + */ + if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || + (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || + (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + !ILL_LSO_TCP_IPV4_USABLE(ill) : + !ILL_LSO_TCP_IPV6_USABLE(ill))) { + ixa->ixa_flags &= ~IXAF_LSO_CAPAB; + + return (B_FALSE); + } + + /* + * Capability has changed, refresh the copy in ixa. + */ + if (lsoc->ill_lso_max != new_lsoc->ill_lso_max) { + *lsoc = *new_lsoc; + + return (B_FALSE); + } + } else { /* Was not usable */ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + ((ixa->ixa_flags & IXAF_IS_IPV4) ? + ILL_LSO_TCP_IPV4_USABLE(ill) : + ILL_LSO_TCP_IPV6_USABLE(ill))) { + *lsoc = *new_lsoc; + ixa->ixa_flags |= IXAF_LSO_CAPAB; + + return (B_FALSE); + } + } + + return (B_TRUE); +} + +/* + * Verify ZEROCOPY usability. Keep the return value simple to indicate whether + * the ZEROCOPY capability has changed. Handle both IPv4 and IPv6. + */ +static boolean_t +ip_verify_zcopy(ill_t *ill, ip_xmit_attr_t *ixa) +{ + if (ixa->ixa_flags & IXAF_ZCOPY_CAPAB) { + /* + * Not unsable any more. 
+ */ + if ((ixa->ixa_flags & IXAF_IPSEC_SECURE) || + (ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) || + (ixa->ixa_ire->ire_flags & RTF_MULTIRT) || + !ILL_ZCOPY_USABLE(ill)) { + ixa->ixa_flags &= ~IXAF_ZCOPY_CAPAB; + + return (B_FALSE); + } + } else { /* Was not usable */ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE) && + !(ixa->ixa_ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + !(ixa->ixa_ire->ire_flags & RTF_MULTIRT) && + ILL_ZCOPY_USABLE(ill)) { + ixa->ixa_flags |= IXAF_ZCOPY_CAPAB; + + return (B_FALSE); + } + } + + return (B_TRUE); +} + + +/* + * When there is no conn_t context, this will send a packet. + * The caller must *not* have called conn_connect() or ip_attr_connect() + * before calling ip_output_simple(). + * Handles IPv4 and IPv6. Returns zero or an errno such as ENETUNREACH. + * Honors IXAF_SET_SOURCE. + * + * We acquire the ire and after calling ire_sendfn we release + * the hold on the ire. Ditto for the nce and dce. + * + * This assumes that the caller has set the following in ip_xmit_attr_t: + * ixa_tsl, ixa_zoneid, and ixa_ipst must always be set. + * If ixa_ifindex is non-zero it means send out that ill. (If it is + * an upper IPMP ill we load balance across the group; if a lower we send + * on that lower ill without load balancing.) + * IXAF_IS_IPV4 must be set correctly. + * If IXAF_IPSEC_SECURE is set then the ixa_ipsec_* fields must be set. + * If IXAF_NO_IPSEC is set we'd skip IPsec policy lookup. + * If neither of those two are set we do an IPsec policy lookup. + * + * We handle setting things like + * ixa_pktlen + * ixa_ip_hdr_length + * ixa->ixa_protocol + * + * The caller may set ixa_xmit_hint, which is used for ECMP selection and + * transmit ring selecting in GLD. + * + * The caller must do an ixa_cleanup() to release any IPsec references + * after we return. 
+ */ +int +ip_output_simple(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ts_label_t *effective_tsl = NULL; + int err; + + ASSERT(ixa->ixa_ipst != NULL); + + if (is_system_labeled()) { + ip_stack_t *ipst = ixa->ixa_ipst; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + err = tsol_check_label_v4(ixa->ixa_tsl, ixa->ixa_zoneid, + &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, + &effective_tsl); + } else { + err = tsol_check_label_v6(ixa->ixa_tsl, ixa->ixa_zoneid, + &mp, CONN_MAC_DEFAULT, B_FALSE, ixa->ixa_ipst, + &effective_tsl); + } + if (err != 0) { + ip2dbg(("tsol_check: label check failed (%d)\n", err)); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tsol_check_label", mp, NULL); + freemsg(mp); + return (err); + } + if (effective_tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } + } + + if (ixa->ixa_flags & IXAF_IS_IPV4) + return (ip_output_simple_v4(mp, ixa)); + else + return (ip_output_simple_v6(mp, ixa)); +} + +int +ip_output_simple_v4(mblk_t *mp, ip_xmit_attr_t *ixa) +{ + ipha_t *ipha; + ipaddr_t firsthop; /* In IP header */ + ipaddr_t dst; /* End of source route, or ipha_dst if none */ + ire_t *ire; + ipaddr_t setsrc; /* RTF_SETSRC */ + int error; + ill_t *ill = NULL; + dce_t *dce = NULL; + nce_t *nce; + iaflags_t ixaflags = ixa->ixa_flags; + ip_stack_t *ipst = ixa->ixa_ipst; + boolean_t repeat = B_FALSE; + boolean_t multirt = B_FALSE; + + ipha = (ipha_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + + /* + * Even on labeled systems we can have a NULL ixa_tsl e.g., + * for IGMP/MLD traffic. 
+ */ + + /* Caller already set flags */ + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); + + ASSERT(ixa->ixa_nce == NULL); + + ixa->ixa_pktlen = ntohs(ipha->ipha_length); + ASSERT(ixa->ixa_pktlen == msgdsize(mp)); + ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha); + ixa->ixa_protocol = ipha->ipha_protocol; + + /* + * Assumes that source routed packets have already been massaged by + * the ULP (ip_massage_options) and as a result ipha_dst is the next + * hop in the source route. The final destination is used for IPsec + * policy and DCE lookup. + */ + firsthop = ipha->ipha_dst; + dst = ip_get_dst(ipha); + +repeat_ire: + error = 0; + setsrc = INADDR_ANY; + ire = ip_select_route_v4(firsthop, ixa, NULL, &setsrc, &error, + &multirt); + ASSERT(ire != NULL); /* IRE_NOROUTE if none found */ + if (error != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - select route", mp, NULL); + freemsg(mp); + goto done; + } + + if (ire->ire_flags & (RTF_BLACKHOLE|RTF_REJECT)) { + /* ire_ill might be NULL hence need to skip some code */ + if (ixaflags & IXAF_SET_SOURCE) + ipha->ipha_src = htonl(INADDR_LOOPBACK); + ixa->ixa_fragsize = IP_MAXPACKET; + ill = NULL; + nce = NULL; + ire->ire_ob_pkt_count++; + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + /* No dce yet; use default one */ + error = (ire->ire_sendfn)(ire, mp, ipha, ixa, + &ipst->ips_dce_default->dce_ident); + goto done; + } + + /* Note that ipha_dst is only used for IRE_MULTICAST */ + nce = ire_to_nce(ire, ipha->ipha_dst, NULL); + if (nce == NULL) { + /* Allocation failure? 
*/ + ip_drop_output("ire_to_nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + if (nce->nce_is_condemned) { + nce_t *nce1; + + nce1 = ire_handle_condemned_nce(nce, ire, ipha, NULL, B_TRUE); + nce_refrele(nce); + if (nce1 == NULL) { + if (!repeat) { + /* Try finding a better IRE */ + repeat = B_TRUE; + ire_refrele(ire); + goto repeat_ire; + } + /* Tried twice - drop packet */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("No nce", mp, ill); + freemsg(mp); + error = ENOBUFS; + goto done; + } + nce = nce1; + } + + /* + * For multicast with multirt we have a flag passed back from + * ire_lookup_multi_ill_v4 since we don't have an IRE for each + * possible multicast address. + * We also need a flag for multicast since we can't check + * whether RTF_MULTIRT is set in ixa_ire for multicast. + */ + if (multirt) { + ixa->ixa_postfragfn = ip_postfrag_multirt_v4; + ixa->ixa_flags |= IXAF_MULTIRT_MULTICAST; + } else { + ixa->ixa_postfragfn = ire->ire_postfragfn; + ixa->ixa_flags &= ~IXAF_MULTIRT_MULTICAST; + } + ASSERT(ixa->ixa_nce == NULL); + ixa->ixa_nce = nce; + + /* + * Check for a dce_t with a path mtu. + */ + dce = dce_lookup_v4(dst, ipst, NULL); + ASSERT(dce != NULL); + + if (!(ixaflags & IXAF_PMTU_DISCOVERY)) { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else if (dce->dce_flags & DCEF_PMTU) { + /* + * To avoid a periodic timer to increase the path MTU we + * look at dce_last_change_time each time we send a packet. + */ + if (TICK_TO_SEC(lbolt64) - dce->dce_last_change_time > + ipst->ips_ip_pathmtu_interval) { + /* + * Older than 20 minutes. Drop the path MTU information. 
+ */ + mutex_enter(&dce->dce_lock); + dce->dce_flags &= ~(DCEF_PMTU|DCEF_TOO_SMALL_PMTU); + dce->dce_last_change_time = TICK_TO_SEC(lbolt64); + mutex_exit(&dce->dce_lock); + dce_increment_generation(dce); + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } else { + uint_t fragsize; + + fragsize = ip_get_base_mtu(nce->nce_ill, ire); + if (fragsize > dce->dce_pmtu) + fragsize = dce->dce_pmtu; + ixa->ixa_fragsize = fragsize; + } + } else { + ixa->ixa_fragsize = ip_get_base_mtu(nce->nce_ill, ire); + } + + /* + * We use use ire_nexthop_ill (and not ncec_ill) to avoid the under ipmp + * interface for source address selection. + */ + ill = ire_nexthop_ill(ire); + + if (ixaflags & IXAF_SET_SOURCE) { + ipaddr_t src; + + /* + * We use the final destination to get + * correct selection for source routed packets + */ + + /* If unreachable we have no ill but need some source */ + if (ill == NULL) { + src = htonl(INADDR_LOOPBACK); + error = 0; + } else { + error = ip_select_source_v4(ill, setsrc, dst, + ixa->ixa_multicast_ifaddr, ixa->ixa_zoneid, ipst, + &src, NULL, NULL); + } + if (error != 0) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no source", + mp, ill); + freemsg(mp); + goto done; + } + ipha->ipha_src = src; + } else if (ixaflags & IXAF_VERIFY_SOURCE) { + /* Check if the IP source is assigned to the host. */ + if (!ip_verify_src(mp, ixa, NULL)) { + /* Don't send a packet with a source that isn't ours */ + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - invalid source", + mp, ill); + freemsg(mp); + error = EADDRNOTAVAIL; + goto done; + } + } + + + /* + * Check against global IPsec policy to set the AH/ESP attributes. + * IPsec will set IXAF_IPSEC_* and ixa_ipsec_* as appropriate. 
+ */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, ipha, NULL, NULL, ixa); + if (mp == NULL) { + /* MIB and ip_drop_packet already done */ + return (EHOSTUNREACH); /* IPsec policy failure */ + } + } + + if (ill != NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + } else { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + } + + /* + * We update the statistics on the most specific IRE i.e., the first + * one we found. + * We don't have an IRE when we fragment, hence ire_ob_pkt_count + * can only count the use prior to fragmentation. However the MIB + * counters on the ill will be incremented in post fragmentation. + */ + ire->ire_ob_pkt_count++; + + /* + * Based on ire_type and ire_flags call one of: + * ire_send_local_v4 - for IRE_LOCAL and IRE_LOOPBACK + * ire_send_multirt_v4 - if RTF_MULTIRT + * ire_send_noroute_v4 - if RTF_REJECT or RTF_BLACHOLE + * ire_send_multicast_v4 - for IRE_MULTICAST + * ire_send_broadcast_v4 - for IRE_BROADCAST + * ire_send_wire_v4 - for the rest. + */ + error = (ire->ire_sendfn)(ire, mp, ipha, ixa, &dce->dce_ident); +done: + ire_refrele(ire); + if (dce != NULL) + dce_refrele(dce); + if (ill != NULL) + ill_refrele(ill); + if (ixa->ixa_nce != NULL) + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = NULL; + return (error); +} + +/* + * ire_sendfn() functions. + * These functions use the following xmit_attr: + * - ixa_fragsize - read to determine whether or not to fragment + * - IXAF_IPSEC_SECURE - to determine whether or not to invoke IPsec + * - ixa_ipsec_* are used inside IPsec + * - IXAF_SET_SOURCE - replace IP source in broadcast case. + * - IXAF_LOOPBACK_COPY - for multicast and broadcast + */ + + +/* + * ire_sendfn for IRE_LOCAL and IRE_LOOPBACK + * + * The checks for restrict_interzone_loopback are done in ire_route_recursive. 
+ */ +/* ARGSUSED4 */ +int +ire_send_local_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + uint_t pktlen = ixa->ixa_pktlen; + + /* + * No fragmentation, no nce, no application of IPsec, + * and no ipha_ident assignment. + * + * Note different order between IP provider and FW_HOOKS than in + * send_wire case. + */ + + /* + * DTrace this as ip:::send. A packet blocked by FW_HOOKS will fire the + * send probe, but not the receive probe. + */ + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 1); + + if (HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) { + int error; + + DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_out_event, + ipst->ips_ipv4firewall_loopback_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); + if (mp == NULL) + return (error); + + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ipha = (ipha_t *)mp->b_rptr; + pktlen = ntohs(ipha->ipha_length); + } + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, ipst); + } + + /* Handle lo0 stats */ + ipst->ips_loopback_packets++; + + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + iras.ira_pktlen = pktlen; + + if (!IS_SIMPLE_IPH(ipha)) { + ip_output_local_options(ipha, ipst); + iras.ira_flags |= IRAF_IPV4_OPTIONS; + } + + if (HOOKS4_INTERESTED_LOOPBACK_IN(ipst)) { + int error; + + DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_in_event, + ipst->ips_ipv4firewall_loopback_in, + ill, NULL, ipha, mp, mp, 0, ipst, error); + + DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); + if (mp == NULL) { + ira_cleanup(&iras, B_FALSE); + return (error); + } + /* + * Even if the destination was changed by the filter we use the + * forwarding decision that was made based on the address + * in ip_output/ip_set_destination. + */ + /* Length could be different */ + ipha = (ipha_t *)mp->b_rptr; + pktlen = iras.ira_pktlen = ntohs(ipha->ipha_length); + } + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, NULL, + int, 1); + + ire->ire_ib_pkt_count++; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, pktlen); + + /* Destined to ire_zoneid - use that for fanout */ + iras.ira_zoneid = ire->ire_zoneid; + + if (is_system_labeled()) { + iras.ira_flags |= IRAF_SYSTEM_LABELED; + + /* + * This updates ira_cred, ira_tsl and ira_free_flags based + * on the label. 
We don't expect this to ever fail for + * loopback packets, so we silently drop the packet should it + * fail. + */ + if (!tsol_get_pkt_label(mp, IPV4_VERSION, &iras)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("tsol_get_pkt_label", mp, ill); + freemsg(mp); + return (0); + } + ASSERT(iras.ira_tsl != NULL); + + /* tsol_get_pkt_label sometimes does pullupmsg */ + ipha = (ipha_t *)mp->b_rptr; + } + + ip_fanout_v4(mp, ipha, &iras); + + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (0); +} + +/* + * ire_sendfn for IRE_BROADCAST + * If the broadcast address is present on multiple ills and ixa_ifindex + * isn't set, then we generate + * a separate datagram (potentially with different source address) for + * those ills. In any case, only one copy is looped back to ip_input_v4. + */ +int +ire_send_broadcast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + irb_t *irb = ire->ire_bucket; + ire_t *ire1; + mblk_t *mp1; + ipha_t *ipha1; + iaflags_t ixaflags = ixa->ixa_flags; + nce_t *nce1, *nce_orig; + + /* + * Unless ire_send_multirt_v4 already set a ttl, force the + * ttl to a smallish value. + */ + if (!(ixa->ixa_flags & IXAF_NO_TTL_CHANGE)) { + /* + * To avoid broadcast storms, we usually set the TTL to 1 for + * broadcasts. This can + * be overridden stack-wide through the ip_broadcast_ttl + * ndd tunable, or on a per-connection basis through the + * IP_BROADCAST_TTL socket option. + * + * If SO_DONTROUTE/IXAF_DONTROUTE is set, then ire_send_wire_v4 + * will force ttl to one after we've set this. + */ + if (ixaflags & IXAF_BROADCAST_TTL_SET) + ipha->ipha_ttl = ixa->ixa_broadcast_ttl; + else + ipha->ipha_ttl = ipst->ips_ip_broadcast_ttl; + } + /* + * Make sure we get a loopback copy (after IPsec and frag) + * Skip hardware checksum so that loopback copy is checksumed. 
+ */ + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + + /* Do we need to potentially generate multiple copies? */ + if (irb->irb_ire_cnt == 1 || ixa->ixa_ifindex != 0) + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); + + /* + * Loop over all IRE_BROADCAST in the bucket (might only be one). + * Note that everything in the bucket has the same destination address. + */ + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* We do the main IRE after the end of the loop */ + if (ire1 == ire) + continue; + + /* + * Only IREs for the same IP address should be in the same + * bucket. + * But could have IRE_HOSTs in the case of CGTP. + * If we find any multirt routes we bail out of the loop + * and just do the single packet at the end; ip_postfrag_multirt + * will duplicate the packet. + */ + ASSERT(ire1->ire_addr == ire->ire_addr); + if (!(ire1->ire_type & IRE_BROADCAST)) + continue; + + if (IRE_IS_CONDEMNED(ire1)) + continue; + + if (ixa->ixa_zoneid != ALL_ZONES && + ire->ire_zoneid != ire1->ire_zoneid) + continue; + + ASSERT(ire->ire_ill != ire1->ire_ill && ire1->ire_ill != NULL); + + if (ire1->ire_flags & RTF_MULTIRT) + break; + + /* + * For IPMP we only send for the ipmp_ill. arp_nce_init() will + * ensure that this goes out on the cast_ill. + */ + if (IS_UNDER_IPMP(ire1->ire_ill)) + continue; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", + mp, ire1->ire_ill); + continue; + } + + ipha1 = (ipha_t *)mp1->b_rptr; + if (ixa->ixa_flags & IXAF_SET_SOURCE) { + /* + * Need to pick a different source address for each + * interface. If we have a global IPsec policy and + * no per-socket policy then we punt to + * ip_output_simple_v4 using a separate ip_xmit_attr_t. 
+ */ + if (ixaflags & IXAF_IPSEC_GLOBAL_POLICY) { + ip_output_simple_broadcast(ixa, mp1); + continue; + } + /* Pick a new source address for each interface */ + if (ip_select_source_v4(ire1->ire_ill, INADDR_ANY, + ipha1->ipha_dst, INADDR_ANY, ixa->ixa_zoneid, ipst, + &ipha1->ipha_src, NULL, NULL) != 0) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - select " + "broadcast source", mp1, ire1->ire_ill); + freemsg(mp1); + continue; + } + /* + * Check against global IPsec policy to set the AH/ESP + * attributes. IPsec will set IXAF_IPSEC_* and + * ixa_ipsec_* as appropriate. + */ + if (!(ixaflags & (IXAF_NO_IPSEC|IXAF_IPSEC_SECURE))) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp1 = ip_output_attach_policy(mp1, ipha, NULL, + NULL, ixa); + if (mp1 == NULL) { + /* + * MIB and ip_drop_packet already + * done + */ + continue; + } + } + } + /* Make sure we have an NCE on this ill */ + nce1 = arp_nce_init(ire1->ire_ill, ire1->ire_addr, + ire1->ire_type); + if (nce1 == NULL) { + BUMP_MIB(ire1->ire_ill->ill_ip_mib, + ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - broadcast nce", + mp1, ire1->ire_ill); + freemsg(mp1); + continue; + } + nce_orig = ixa->ixa_nce; + ixa->ixa_nce = nce1; + + ire_refhold(ire1); + /* + * Ignore any errors here. We just collect the errno for + * the main ire below + */ + (void) ire_send_wire_v4(ire1, mp1, ipha1, ixa, identp); + ire_refrele(ire1); + + ixa->ixa_nce = nce_orig; + nce_refrele(nce1); + + ixa->ixa_flags &= ~IXAF_LOOPBACK_COPY; + } + irb_refrele(irb); + /* Finally, the main one */ + + /* + * For IPMP we only send broadcasts on the ipmp_ill. + */ + if (IS_UNDER_IPMP(ire->ire_ill)) { + freemsg(mp); + return (0); + } + + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * Send a packet using a different source address and different + * IPsec policy. 
+ */ +static void +ip_output_simple_broadcast(ip_xmit_attr_t *ixa, mblk_t *mp) +{ + ip_xmit_attr_t ixas; + + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + ixas.ixa_zoneid = ixa->ixa_zoneid; + ixas.ixa_ifindex = 0; + ixas.ixa_ipst = ixa->ixa_ipst; + ixas.ixa_cred = ixa->ixa_cred; + ixas.ixa_cpid = ixa->ixa_cpid; + ixas.ixa_tsl = ixa->ixa_tsl; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); +} + + +static void +multirt_check_v4(ire_t *ire, ipha_t *ipha, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + + /* Limit the TTL on multirt packets */ + if (ire->ire_type & IRE_MULTICAST) { + if (ipha->ipha_ttl > 1) { + ip2dbg(("ire_send_multirt_v4: forcing multicast " + "multirt TTL to 1 (was %d), dst 0x%08x\n", + ipha->ipha_ttl, ntohl(ire->ire_addr))); + ipha->ipha_ttl = 1; + } + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } else if ((ipst->ips_ip_multirt_ttl > 0) && + (ipha->ipha_ttl > ipst->ips_ip_multirt_ttl)) { + ipha->ipha_ttl = ipst->ips_ip_multirt_ttl; + /* + * Need to ensure we don't increase the ttl should we go through + * ire_send_broadcast or multicast. + */ + ixa->ixa_flags |= IXAF_NO_TTL_CHANGE; + } +} + +/* + * ire_sendfn for IRE_MULTICAST + */ +int +ire_send_multicast_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + ip_stack_t *ipst = ixa->ixa_ipst; + ill_t *ill = ire->ire_ill; + iaflags_t ixaflags = ixa->ixa_flags; + + /* + * The IRE_MULTICAST is the same whether or not multirt is in use. + * Hence we need special-case code. + */ + if (ixaflags & IXAF_MULTIRT_MULTICAST) + multirt_check_v4(ire, ipha, ixa); + + /* + * Check if anything in ip_input_v4 wants a copy of the transmitted + * packet (after IPsec and fragmentation) + * + * 1. 
Multicast routers always need a copy unless SO_DONTROUTE is set + * RSVP and the rsvp daemon is an example of a + * protocol and user level process that + * handles it's own routing. Hence, it uses the + * SO_DONTROUTE option to accomplish this. + * 2. If the sender has set IP_MULTICAST_LOOP, then we just + * check whether there are any receivers for the group on the ill + * (ignoring the zoneid). + * 3. If IP_MULTICAST_LOOP is not set, then we check if there are + * any members in other shared-IP zones. + * If such members exist, then we indicate that the sending zone + * shouldn't get a loopback copy to preserve the IP_MULTICAST_LOOP + * behavior. + * + * When we loopback we skip hardware checksum to make sure loopback + * copy is checksumed. + * + * Note that ire_ill is the upper in the case of IPMP. + */ + ixa->ixa_flags &= ~(IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM); + if (ipst->ips_ip_g_mrouter && ill->ill_mrouter_cnt > 0 && + !(ixaflags & IXAF_DONTROUTE)) { + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ixaflags & IXAF_MULTICAST_LOOP) { + /* + * If this zone or any other zone has members then loopback + * a copy. + */ + if (ill_hasmembers_v4(ill, ipha->ipha_dst)) + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } else if (ipst->ips_netstack->netstack_numzones > 1) { + /* + * This zone should not have a copy. But there are some other + * zones which might have members. 
+ */ + if (ill_hasmembers_otherzones_v4(ill, ipha->ipha_dst, + ixa->ixa_zoneid)) { + ixa->ixa_flags |= IXAF_NO_LOOP_ZONEID_SET; + ixa->ixa_no_loop_zoneid = ixa->ixa_zoneid; + ixa->ixa_flags |= IXAF_LOOPBACK_COPY | IXAF_NO_HW_CKSUM; + } + } + + /* + * Unless ire_send_multirt_v4 or icmp_output_hdrincl already set a ttl, + * force the ttl to the IP_MULTICAST_TTL value + */ + if (!(ixaflags & IXAF_NO_TTL_CHANGE)) { + ipha->ipha_ttl = ixa->ixa_multicast_ttl; + } + + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_MULTIRT + */ +int +ire_send_multirt_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ipha_t *ipha = (ipha_t *)iph_arg; + + multirt_check_v4(ire, ipha, ixa); + + if (ire->ire_type & IRE_MULTICAST) + return (ire_send_multicast_v4(ire, mp, ipha, ixa, identp)); + else if (ire->ire_type & IRE_BROADCAST) + return (ire_send_broadcast_v4(ire, mp, ipha, ixa, identp)); + else + return (ire_send_wire_v4(ire, mp, ipha, ixa, identp)); +} + +/* + * ire_sendfn for IREs with RTF_REJECT/RTF_BLACKHOLE, including IRE_NOROUTE + */ +int +ire_send_noroute_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ipha_t *ipha = (ipha_t *)iph_arg; + ill_t *ill; + ip_recv_attr_t iras; + boolean_t dummy; + + /* We assign an IP ident for nice errors */ + ipha->ipha_ident = atomic_add_32_nv(identp, 1); + + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutNoRoutes); + + if (ire->ire_type & IRE_NOROUTE) { + /* A lack of a route as opposed to RTF_REJECT|BLACKHOLE */ + ip_rts_change(RTM_MISS, ipha->ipha_dst, 0, 0, 0, 0, 0, 0, + RTA_DST, ipst); + } + + if (ire->ire_flags & RTF_BLACKHOLE) { + ip_drop_output("ipIfStatsOutNoRoutes RTF_BLACKHOLE", mp, NULL); + freemsg(mp); + /* No error even for local senders - silent blackhole */ + return (0); + } + ip_drop_output("ipIfStatsOutNoRoutes RTF_REJECT", mp, NULL); + + /* + * We need an ill_t for the 
ip_recv_attr_t even though this packet + * was never received and icmp_unreachable doesn't currently use + * ira_ill. + */ + ill = ill_lookup_on_name("lo0", B_FALSE, + !(ixa->ixa_flags & IRAF_IS_IPV4), &dummy, ipst); + if (ill == NULL) { + freemsg(mp); + return (EHOSTUNREACH); + } + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + if (ip_source_routed(ipha, ipst)) { + icmp_unreachable(mp, ICMP_SOURCE_ROUTE_FAILED, &iras); + } else { + icmp_unreachable(mp, ICMP_HOST_UNREACHABLE, &iras); + } + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + ill_refrele(ill); + return (EHOSTUNREACH); +} + +/* + * Calculate a checksum ignoring any hardware capabilities + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. + */ +static boolean_t +ip_output_sw_cksum_v4(mblk_t *mp, ipha_t *ipha, ip_xmit_attr_t *ixa) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + ipaddr_t dst = ipha->ipha_dst; + ipaddr_t src = ipha->ipha_src; + + /* Just in case it contained garbage */ + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + + /* + * Calculate ULP checksum + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. 
+ */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else { + goto ip_hdr_cksum; + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); + + cksum = IP_CSUM(mp, ip_hdr_length, cksum); + /* + * For UDP/IPv4 a zero means that the packets wasn't checksummed. + * Change to 0xffff + */ + if (protocol == IPPROTO_UDP && cksum == 0) + *cksump = ~cksum; + else + *cksump = cksum; + + IP_STAT(ipst, ip_out_sw_cksum); + IP_STAT_UPDATE(ipst, ip_out_sw_cksum_bytes, pktlen); + +ip_hdr_cksum: + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + return (B_TRUE); +} + +/* + * Calculate the ULP checksum - try to use hardware. + * In the case of MULTIRT, broadcast or multicast the + * IXAF_NO_HW_CKSUM is set in which case we use software. + * + * If the hardware supports IP header checksum offload; then clear the + * contents of IP header checksum field as expected by NIC. + * Do this only if we offloaded either full or partial sum. + * + * Returns B_FALSE if the packet was too short for the checksum. Caller + * should free and do stats. 
+ */ +static boolean_t +ip_output_cksum_v4(iaflags_t ixaflags, mblk_t *mp, ipha_t *ipha, + ip_xmit_attr_t *ixa, ill_t *ill) +{ + uint_t pktlen = ixa->ixa_pktlen; + uint16_t *cksump; + uint16_t hck_flags; + uint32_t cksum; + uint8_t protocol = ixa->ixa_protocol; + uint16_t ip_hdr_length = ixa->ixa_ip_hdr_length; + + if ((ixaflags & IXAF_NO_HW_CKSUM) || !ILL_HCKSUM_CAPABLE(ill) || + !dohwcksum) { + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); + } + + /* + * Calculate ULP checksum. Note that we don't use cksump and cksum + * if the ill has FULL support. + */ + if (protocol == IPPROTO_TCP) { + cksump = IPH_TCPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_TCP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_UDP) { + cksump = IPH_UDPH_CHECKSUMP(ipha, ip_hdr_length); + cksum = IP_UDP_CSUM_COMP; /* Pseudo-header cksum */ + } else if (protocol == IPPROTO_SCTP) { + sctp_hdr_t *sctph; + + ASSERT(MBLKL(mp) >= (ip_hdr_length + sizeof (*sctph))); + sctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_length); + /* + * Zero out the checksum field to ensure proper + * checksum calculation. + */ + sctph->sh_chksum = 0; +#ifdef DEBUG + if (!skip_sctp_cksum) +#endif + sctph->sh_chksum = sctp_cksum(mp, ip_hdr_length); + goto ip_hdr_cksum; + } else { + ip_hdr_cksum: + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + return (B_TRUE); + } + + /* ULP puts the checksum field is in the first mblk */ + ASSERT(((uchar_t *)cksump) + sizeof (uint16_t) <= mp->b_wptr); + + /* + * Underlying interface supports hardware checksum offload for + * the payload; leave the payload checksum for the hardware to + * calculate. N.B: We only need to set up checksum info on the + * first mblk. 
+ */ + hck_flags = ill->ill_hcksum_capab->ill_hcksum_txflags; + + DB_CKSUMFLAGS(mp) &= ~HCK_FLAGS; + if (hck_flags & HCKSUM_INET_FULL_V4) { + /* + * Hardware calculates pseudo-header, header and the + * payload checksums, so clear the checksum field in + * the protocol header. + */ + *cksump = 0; + DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; + + ipha->ipha_hdr_checksum = 0; + if (hck_flags & HCKSUM_IPHDRCKSUM) { + DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; + } else { + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return (B_TRUE); + } + if ((hck_flags) & HCKSUM_INET_PARTIAL) { + ipaddr_t dst = ipha->ipha_dst; + ipaddr_t src = ipha->ipha_src; + /* + * Partial checksum offload has been enabled. Fill + * the checksum field in the protocol header with the + * pseudo-header checksum value. + * + * We accumulate the pseudo header checksum in cksum. + * This is pretty hairy code, so watch close. One + * thing to keep in mind is that UDP and TCP have + * stored their respective datagram lengths in their + * checksum fields. This lines things up real nice. + */ + cksum += (dst >> 16) + (dst & 0xFFFF) + + (src >> 16) + (src & 0xFFFF); + cksum += *(cksump); + cksum = (cksum & 0xFFFF) + (cksum >> 16); + *(cksump) = (cksum & 0xFFFF) + (cksum >> 16); + + /* + * Offsets are relative to beginning of IP header. + */ + DB_CKSUMSTART(mp) = ip_hdr_length; + DB_CKSUMSTUFF(mp) = (uint8_t *)cksump - (uint8_t *)ipha; + DB_CKSUMEND(mp) = pktlen; + DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; + + ipha->ipha_hdr_checksum = 0; + if (hck_flags & HCKSUM_IPHDRCKSUM) { + DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; + } else { + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return (B_TRUE); + } + /* Hardware capabilities include neither full nor partial IPv4 */ + return (ip_output_sw_cksum_v4(mp, ipha, ixa)); +} + +/* + * ire_sendfn for offlink and onlink destinations. + * Also called from the multicast, broadcast, multirt send functions. + * + * Assumes that the caller has a hold on the ire. 
+ * + * This function doesn't care if the IRE just became condemned since that + * can happen at any time. + */ +/* ARGSUSED */ +int +ire_send_wire_v4(ire_t *ire, mblk_t *mp, void *iph_arg, + ip_xmit_attr_t *ixa, uint32_t *identp) +{ + ip_stack_t *ipst = ixa->ixa_ipst; + ipha_t *ipha = (ipha_t *)iph_arg; + iaflags_t ixaflags = ixa->ixa_flags; + ill_t *ill; + + ASSERT(ixa->ixa_nce != NULL); + ill = ixa->ixa_nce->nce_ill; + + if (ixaflags & IXAF_DONTROUTE) + ipha->ipha_ttl = 1; + + /* + * Assign an ident value for this packet. There could be other + * threads targeting the same destination, so we have to arrange + * for a atomic increment. Note that we use a 32-bit atomic add + * because it has better performance than its 16-bit sibling. + * + * Normally ixa_extra_ident is 0, but in the case of LSO it will + * be the number of TCP segments that the driver/hardware will + * extraly construct. + * + * If running in cluster mode and if the source address + * belongs to a replicated service then vector through + * cl_inet_ipident vector to allocate ip identifier + * NOTE: This is a contract private interface with the + * clustering group. + */ + if (cl_inet_ipident != NULL) { + ipaddr_t src = ipha->ipha_src; + ipaddr_t dst = ipha->ipha_dst; + netstackid_t stack_id = ipst->ips_netstack->netstack_stackid; + + ASSERT(cl_inet_isclusterwide != NULL); + if ((*cl_inet_isclusterwide)(stack_id, IPPROTO_IP, + AF_INET, (uint8_t *)(uintptr_t)src, NULL)) { + /* + * Note: not correct with LSO since we can't allocate + * ixa_extra_ident+1 consecutive values. 
+ */ + ipha->ipha_ident = (*cl_inet_ipident)(stack_id, + IPPROTO_IP, AF_INET, (uint8_t *)(uintptr_t)src, + (uint8_t *)(uintptr_t)dst, NULL); + } else { + ipha->ipha_ident = atomic_add_32_nv(identp, + ixa->ixa_extra_ident + 1); + } + } else { + ipha->ipha_ident = atomic_add_32_nv(identp, + ixa->ixa_extra_ident + 1); + } +#ifndef _BIG_ENDIAN + ipha->ipha_ident = htons(ipha->ipha_ident); +#endif + + /* + * This might set b_band, thus the IPsec and fragmentation + * code in IP ensures that b_band is updated in the first mblk. + */ + if (IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { + /* ip_process translates an IS_UNDER_IPMP */ + mp = ip_process(IPP_LOCAL_OUT, mp, ill, ill); + if (mp == NULL) { + /* ip_drop_packet and MIB done */ + return (0); /* Might just be delayed */ + } + } + + /* + * Verify any IPv4 options. + * + * The presense of IP options also forces the network stack to + * calculate the checksum in software. This is because: + * + * Wrap around: certain partial-checksum NICs (eri, ce) limit + * the size of "start offset" width to 6-bit. This effectively + * sets the largest value of the offset to 64-bytes, starting + * from the MAC header. When the cumulative MAC and IP headers + * exceed such limit, the offset will wrap around. This causes + * the checksum to be calculated at the wrong place. + * + * IPv4 source routing: none of the full-checksum capable NICs + * is capable of correctly handling the IPv4 source-routing + * option for purposes of calculating the pseudo-header; the + * actual destination is different from the destination in the + * header which is that of the next-hop. (This case may not be + * true for NICs which can parse IPv6 extension headers, but + * we choose to simplify the implementation by not offloading + * checksum when they are present.) 
+ */ + if (!IS_SIMPLE_IPH(ipha)) { + ixaflags = ixa->ixa_flags |= IXAF_NO_HW_CKSUM; + /* An IS_UNDER_IPMP ill is ok here */ + if (ip_output_options(mp, ipha, ixa, ill)) { + /* Packet has been consumed and ICMP error sent */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (EINVAL); + } + } + + /* + * To handle IPsec/iptun's labeling needs we need to tag packets + * while we still have ixa_tsl + */ + if (is_system_labeled() && ixa->ixa_tsl != NULL && + (ill->ill_mactype == DL_6TO4 || ill->ill_mactype == DL_IPV4 || + ill->ill_mactype == DL_IPV6)) { + cred_t *newcr; + + newcr = copycred_from_tslabel(ixa->ixa_cred, ixa->ixa_tsl, + KM_NOSLEEP); + if (newcr == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - newcr", + mp, ill); + freemsg(mp); + return (ENOBUFS); + } + mblk_setcred(mp, newcr, NOPID); + crfree(newcr); /* mblk_setcred did its own crhold */ + } + + if (ixa->ixa_pktlen > ixa->ixa_fragsize || + (ixaflags & IXAF_IPSEC_SECURE)) { + uint32_t pktlen; + + pktlen = ixa->ixa_pktlen; + if (ixaflags & IXAF_IPSEC_SECURE) + pktlen += ipsec_out_extra_length(ixa); + + if (pktlen > IP_MAXPACKET) + return (EMSGSIZE); + + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* + * Compute ULP checksum and IP header checksum + * using software + */ + if (!ip_output_sw_cksum_v4(mp, ipha, ixa)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } else { + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + + /* + * If this packet would generate a icmp_frag_needed + * message, we need to handle it before we do the IPsec + * processing. Otherwise, we need to strip the IPsec + * headers before we send up the message to the ULPs + * which becomes messy and difficult. + * + * We check using IXAF_DONTFRAG. 
The DF bit in the header + * is not inspected - it will be copied to any generated + * fragments. + */ + if ((pktlen > ixa->ixa_fragsize) && + (ixaflags & IXAF_DONTFRAG)) { + /* Generate ICMP and return error */ + ip_recv_attr_t iras; + + DTRACE_PROBE4(ip4__fragsize__fail, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + bzero(&iras, sizeof (iras)); + /* Map ixa to ira including IPsec policies */ + ipsec_out_to_in(ixa, ill, &iras); + + ip_drop_output("ICMP_FRAG_NEEDED", mp, ill); + icmp_frag_needed(mp, ixa->ixa_fragsize, &iras); + /* We moved any IPsec refs from ixa to iras */ + ira_cleanup(&iras, B_FALSE); + return (EMSGSIZE); + } + DTRACE_PROBE4(ip4__fragsize__ok, uint_t, pktlen, + uint_t, ixa->ixa_fragsize, uint_t, ixa->ixa_pktlen, + uint_t, ixa->ixa_pmtu); + + if (ixaflags & IXAF_IPSEC_SECURE) { + /* + * Pass in sufficient information so that + * IPsec can determine whether to fragment, and + * which function to call after fragmentation. 
+ */ + return (ipsec_out_process(mp, ixa)); + } + return (ip_fragment_v4(mp, ixa->ixa_nce, ixaflags, + ixa->ixa_pktlen, ixa->ixa_fragsize, ixa->ixa_xmit_hint, + ixa->ixa_zoneid, ixa->ixa_no_loop_zoneid, + ixa->ixa_postfragfn, &ixa->ixa_cookie)); + } + if (ixaflags & IXAF_SET_ULP_CKSUM) { + /* Compute ULP checksum and IP header checksum */ + /* An IS_UNDER_IPMP ill is ok here */ + if (!ip_output_cksum_v4(ixaflags, mp, ipha, ixa, ill)) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + freemsg(mp); + return (EINVAL); + } + } else { + /* Calculate IPv4 header checksum */ + ipha->ipha_hdr_checksum = 0; + ipha->ipha_hdr_checksum = ip_csum_hdr(ipha); + } + return ((ixa->ixa_postfragfn)(mp, ixa->ixa_nce, ixaflags, + ixa->ixa_pktlen, ixa->ixa_xmit_hint, ixa->ixa_zoneid, + ixa->ixa_no_loop_zoneid, &ixa->ixa_cookie)); +} + +/* + * Send mp into ip_input + * Common for IPv4 and IPv6 + */ +void +ip_postfrag_loopback(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, zoneid_t nolzid) +{ + rtc_t rtc; + ill_t *ill = nce->nce_ill; + ip_recv_attr_t iras; /* NOTE: No bzero for performance */ + ncec_t *ncec; + + ncec = nce->nce_common; + iras.ira_flags = IRAF_VERIFY_IP_CKSUM | IRAF_VERIFY_ULP_CKSUM | + IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; + if (ncec->ncec_flags & NCE_F_BCAST) + iras.ira_flags |= IRAF_L2DST_BROADCAST; + else if (ncec->ncec_flags & NCE_F_MCAST) + iras.ira_flags |= IRAF_L2DST_MULTICAST; + + iras.ira_free_flags = 0; + iras.ira_cred = NULL; + iras.ira_cpid = NOPID; + iras.ira_tsl = NULL; + iras.ira_zoneid = ALL_ZONES; + iras.ira_pktlen = pkt_len; + UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCInOctets, iras.ira_pktlen); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCInReceives); + + if (ixaflags & IXAF_IS_IPV4) + iras.ira_flags |= IRAF_IS_IPV4; + + iras.ira_ill = iras.ira_rill = ill; + iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex; + iras.ira_rifindex = iras.ira_ruifindex; + iras.ira_mhip = NULL; + + iras.ira_flags 
|= ixaflags & IAF_MASK; + iras.ira_no_loop_zoneid = nolzid; + + /* Broadcast and multicast doesn't care about the squeue */ + iras.ira_sqp = NULL; + + rtc.rtc_ire = NULL; + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + rtc.rtc_ipaddr = INADDR_ANY; + + (*ill->ill_inputfn)(mp, ipha, &ipha->ipha_dst, &iras, &rtc); + if (rtc.rtc_ire != NULL) { + ASSERT(rtc.rtc_ipaddr != INADDR_ANY); + ire_refrele(rtc.rtc_ire); + } + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + rtc.rtc_ip6addr = ipv6_all_zeros; + + (*ill->ill_inputfn)(mp, ip6h, &ip6h->ip6_dst, &iras, &rtc); + if (rtc.rtc_ire != NULL) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&rtc.rtc_ip6addr)); + ire_refrele(rtc.rtc_ire); + } + } + /* Any references to clean up? No hold on ira */ + if (iras.ira_flags & (IRAF_IPSEC_SECURE|IRAF_SYSTEM_LABELED)) + ira_cleanup(&iras, B_FALSE); +} + +/* + * Post fragmentation function for IRE_MULTICAST and IRE_BROADCAST which + * looks at the IXAF_LOOPBACK_COPY flag. + * Common for IPv4 and IPv6. + * + * If the loopback copy fails (due to no memory) but we send the packet out + * on the wire we return no failure. Only in the case we supress the wire + * sending do we take the loopback failure into account. + * + * Note that we do not perform DTRACE_IP7 and FW_HOOKS for the looped back copy. + * Those operations are performed on this packet in ip_xmit() and it would + * be odd to do it twice for the same packet. + */ +int +ip_postfrag_loopcheck(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + ill_t *ill = nce->nce_ill; + int error = 0; + + /* + * Check for IXAF_LOOPBACK_COPY - send a copy to ip as if the driver + * had looped it back + */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. 
*/ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * If TTL = 0 then only do the loopback to this host i.e. we are + * done. We are also done if this was the + * loopback interface since it is sufficient + * to loopback one copy of a multicast packet. + */ + if (ixaflags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + if (ipha->ipha_ttl == 0) { + ip_drop_output("multicast ipha_ttl not sent to wire", + mp, ill); + freemsg(mp); + return (error); + } + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + if (ip6h->ip6_hops == 0) { + ip_drop_output("multicast ipha_ttl not sent to wire", + mp, ill); + freemsg(mp); + return (error); + } + } + if (nce->nce_ill->ill_wq == NULL) { + /* Loopback interface */ + ip_drop_output("multicast on lo0 not sent to wire", mp, ill); + freemsg(mp); + return (error); + } + + return (ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, + ixacookie)); +} + +/* + * Post fragmentation function for RTF_MULTIRT routes. + * Since IRE_BROADCASTs can have RTF_MULTIRT, this function + * checks IXAF_LOOPBACK_COPY. + * + * If no packet is sent due to failures then we return an errno, but if at + * least one succeeded we return zero. 
+ */ +int +ip_postfrag_multirt_v4(mblk_t *mp, nce_t *nce, iaflags_t ixaflags, + uint_t pkt_len, uint32_t xmit_hint, zoneid_t szone, zoneid_t nolzid, + uintptr_t *ixacookie) +{ + irb_t *irb; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ire_t *ire; + ire_t *ire1; + mblk_t *mp1; + nce_t *nce1; + ill_t *ill = nce->nce_ill; + ill_t *ill1; + ip_stack_t *ipst = ill->ill_ipst; + int error = 0; + int num_sent = 0; + int err; + uint_t ire_type; + ipaddr_t nexthop; + + ASSERT(ixaflags & IXAF_IS_IPV4); + + /* Check for IXAF_LOOPBACK_COPY */ + if (ixaflags & IXAF_LOOPBACK_COPY) { + mblk_t *mp1; + + mp1 = copymsg(mp); + if (mp1 == NULL) { + /* Failed to deliver the loopback copy. */ + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill); + error = ENOBUFS; + } else { + ip_postfrag_loopback(mp1, nce, ixaflags, pkt_len, + nolzid); + } + } + + /* + * Loop over RTF_MULTIRT for ipha_dst in the same bucket. Send + * a copy to each one. + * Use the nce (nexthop) and ipha_dst to find the ire. + * + * MULTIRT is not designed to work with shared-IP zones thus we don't + * need to pass a zoneid or a label to the IRE lookup. 
+ */ + if (V4_PART_OF_V6(nce->nce_addr) == ipha->ipha_dst) { + /* Broadcast and multicast case */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, 0, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, 0, ipst, NULL); + } else { + ipaddr_t v4addr = V4_PART_OF_V6(nce->nce_addr); + + /* Unicast case */ + ire = ire_ftable_lookup_v4(ipha->ipha_dst, 0, v4addr, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_GW, 0, ipst, NULL); + } + + if (ire == NULL || + (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + !(ire->ire_flags & RTF_MULTIRT)) { + /* Drop */ + ip_drop_output("ip_postfrag_multirt didn't find route", + mp, nce->nce_ill); + if (ire != NULL) + ire_refrele(ire); + return (ENETUNREACH); + } + + irb = ire->ire_bucket; + irb_refhold(irb); + for (ire1 = irb->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { + /* + * For broadcast we can have a mixture of IRE_BROADCAST and + * IRE_HOST due to the manually added IRE_HOSTs that are used + * to trigger the creation of the special CGTP broadcast routes. + * Thus we have to skip if ire_type doesn't match the original. + */ + if (IRE_IS_CONDEMNED(ire1) || + !(ire1->ire_flags & RTF_MULTIRT) || + ire1->ire_type != ire->ire_type) + continue; + + /* Do the ire argument one after the loop */ + if (ire1 == ire) + continue; + + ill1 = ire_nexthop_ill(ire1); + if (ill1 == NULL) { + /* + * This ire might not have been picked by + * ire_route_recursive, in which case ire_dep might + * not have been setup yet. + * We kick ire_route_recursive to try to resolve + * starting at ire1. 
+ */ + ire_t *ire2; + + ire2 = ire_route_recursive_impl_v4(ire1, + ire1->ire_addr, ire1->ire_type, ire1->ire_ill, + ire1->ire_zoneid, NULL, MATCH_IRE_DSTONLY, + B_TRUE, 0, ipst, NULL, NULL, NULL); + if (ire2 != NULL) + ire_refrele(ire2); + ill1 = ire_nexthop_ill(ire1); + } + + if (ill1 == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no ill", + mp, ill); + error = ENETUNREACH; + continue; + } + + /* Pick the addr and type to use for arp_nce_init */ + if (nce->nce_common->ncec_flags & NCE_F_BCAST) { + ire_type = IRE_BROADCAST; + nexthop = ire1->ire_gateway_addr; + } else if (nce->nce_common->ncec_flags & NCE_F_MCAST) { + ire_type = IRE_MULTICAST; + nexthop = ipha->ipha_dst; + } else { + ire_type = ire1->ire_type; /* Doesn't matter */ + nexthop = ire1->ire_gateway_addr; + } + + /* If IPMP meta or under, then we just drop */ + if (ill1->ill_grp != NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - IPMP", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + + nce1 = arp_nce_init(ill1, nexthop, ire_type); + if (nce1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards - no nce", + mp, ill1); + ill_refrele(ill1); + error = ENETUNREACH; + continue; + } + mp1 = copymsg(mp); + if (mp1 == NULL) { + BUMP_MIB(ill1->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", mp, ill1); + nce_refrele(nce1); + ill_refrele(ill1); + error = ENOBUFS; + continue; + } + /* Preserve HW checksum for this copy */ + DB_CKSUMSTART(mp1) = DB_CKSUMSTART(mp); + DB_CKSUMSTUFF(mp1) = DB_CKSUMSTUFF(mp); + DB_CKSUMEND(mp1) = DB_CKSUMEND(mp); + DB_CKSUMFLAGS(mp1) = DB_CKSUMFLAGS(mp); + DB_LSOMSS(mp1) = DB_LSOMSS(mp); + + ire1->ire_ob_pkt_count++; + err = ip_xmit(mp1, nce1, ixaflags, pkt_len, xmit_hint, szone, + 0, ixacookie); + if (err == 0) + num_sent++; + else + error = err; + nce_refrele(nce1); + 
ill_refrele(ill1); + } + irb_refrele(irb); + ire_refrele(ire); + /* Finally, the main one */ + err = ip_xmit(mp, nce, ixaflags, pkt_len, xmit_hint, szone, 0, + ixacookie); + if (err == 0) + num_sent++; + else + error = err; + if (num_sent > 0) + return (0); + else + return (error); +} + +/* + * Verify local connectivity. This check is called by ULP fusion code. + * The generation number on an IRE_LOCAL or IRE_LOOPBACK only changes if + * the interface is brought down and back up. So we simply fail the local + * process. The caller, TCP Fusion, should unfuse the connection. + */ +boolean_t +ip_output_verify_local(ip_xmit_attr_t *ixa) +{ + ire_t *ire = ixa->ixa_ire; + + if (!(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) + return (B_FALSE); + + return (ixa->ixa_ire->ire_generation == ixa->ixa_ire_generation); +} + +/* + * Local process for ULP loopback, TCP Fusion. Handle both IPv4 and IPv6. + * + * The caller must call ip_output_verify_local() first. This function handles + * IPobs, FW_HOOKS, and/or IPsec cases sequentially. + */ +mblk_t * +ip_output_process_local(mblk_t *mp, ip_xmit_attr_t *ixa, boolean_t hooks_out, + boolean_t hooks_in, conn_t *peer_connp) +{ + ill_t *ill = ixa->ixa_ire->ire_ill; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + ip_stack_t *ipst = ixa->ixa_ipst; + iaflags_t ixaflags = ixa->ixa_flags; + ip_recv_attr_t iras; + int error; + + ASSERT(mp != NULL); + + if (ixaflags & IXAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip4_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ixa->ixa_ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, + ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, + NULL, int, 1); + + /* FW_HOOKS: LOOPBACK_OUT */ + if (hooks_out) { + DTRACE_PROBE4(ip4__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_out_event, + ipst->ips_ipv4firewall_loopback_out, + NULL, ill, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + /* FW_HOOKS: LOOPBACK_IN */ + if (hooks_in) { + DTRACE_PROBE4(ip4__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ipha_t *, ipha, mblk_t *, mp); + FW_HOOKS(ipst->ips_ip4_loopback_in_event, + ipst->ips_ipv4firewall_loopback_in, + ill, NULL, ipha, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, ip6_t *, + NULL, int, 1); + + /* Inbound IPsec polocies */ + if (peer_connp != NULL) { + /* Map ixa to ira including IPsec policies. */ + ipsec_out_to_in(ixa, ill, &iras); + mp = ipsec_check_inbound_policy(mp, peer_connp, ipha, + NULL, &iras); + } + } else { + ip6h = (ip6_t *)mp->b_rptr; + + /* + * If a callback is enabled then we need to know the + * source and destination zoneids for the packet. We already + * have those handy. 
+ */ + if (ipst->ips_ip6_observe.he_interested) { + zoneid_t szone, dzone; + zoneid_t stackzoneid; + + stackzoneid = netstackid_to_zoneid( + ipst->ips_netstack->netstack_stackid); + + if (stackzoneid == GLOBAL_ZONEID) { + /* Shared-IP zone */ + dzone = ixa->ixa_ire->ire_zoneid; + szone = ixa->ixa_zoneid; + } else { + szone = dzone = stackzoneid; + } + ipobs_hook(mp, IPOBS_HOOK_LOCAL, szone, dzone, ill, + ipst); + } + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, + ip6h, int, 1); + + /* FW_HOOKS: LOOPBACK_OUT */ + if (hooks_out) { + DTRACE_PROBE4(ip6__loopback__out__start, ill_t *, NULL, + ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); + FW_HOOKS6(ipst->ips_ip6_loopback_out_event, + ipst->ips_ipv6firewall_loopback_out, + NULL, ill, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + /* FW_HOOKS: LOOPBACK_IN */ + if (hooks_in) { + DTRACE_PROBE4(ip6__loopback__in__start, ill_t *, ill, + ill_t *, NULL, ip6_t *, ip6h, mblk_t *, mp); + FW_HOOKS6(ipst->ips_ip6_loopback_in_event, + ipst->ips_ipv6firewall_loopback_in, + ill, NULL, ip6h, mp, mp, 0, ipst, error); + DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp); + } + if (mp == NULL) + return (NULL); + + DTRACE_IP7(receive, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, ip6_t *, + ip6h, int, 1); + + /* Inbound IPsec polocies */ + if (peer_connp != NULL) { + /* Map ixa to ira including IPsec policies. 
*/ + ipsec_out_to_in(ixa, ill, &iras); + mp = ipsec_check_inbound_policy(mp, peer_connp, NULL, + ip6h, &iras); + } + } + + if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", NULL, ill); + } + + return (mp); +} diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 70c8bd2ea1..228c7581a3 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -81,24 +81,33 @@ static size_t rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp); static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, - ipaddr_t author, const ipif_t *ipif, mblk_t *mp, uint_t, const tsol_gc_t *); + ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *); static int rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp, sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error); static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif); static int rts_getmetrics(ire_t *ire, rt_metrics_t *metrics); -static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, - sa_family_t af); +static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, + const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af); static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); -static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); +static ire_t *ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, + ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire, + ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp); +static ire_t *ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t 
*gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp); /* * Send `mp' to all eligible routing queues. A queue is ineligible if: * * 1. SO_USELOOPBACK is off and it is not the originating queue. - * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. - * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'. + * 3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'. * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. */ void @@ -110,7 +119,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, /* * Since we don't have an ill_t here, RTSQ_DEFAULT must already be - * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point. */ ASSERT(!(flags & RTSQ_DEFAULT)); @@ -119,7 +128,6 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, for (; connp != NULL; connp = next_connp) { next_connp = connp->conn_next; - /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to @@ -139,28 +147,27 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, if (!(flags & RTSQ_NORMAL)) continue; } - /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing * socket, we check if there is room upstream for a copy of the * message. 
*/ - if ((o_connp == connp) && connp->conn_loopback == 0) { + if ((o_connp == connp) && connp->conn_useloopback == 0) { connp = connp->conn_next; continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); /* Pass to rts_input */ - if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))|| - (!IPCL_IS_NONSTR(connp) && - canputnext(CONNP_TO_RQ(connp)))) { + if (IPCL_IS_NONSTR(connp) ? !connp->conn_flow_cntrld : + canputnext(connp->conn_rq)) { mp1 = dupmsg(mp); if (mp1 == NULL) mp1 = copymsg(mp); + /* Note that we pass a NULL ira to rts_input */ if (mp1 != NULL) - (connp->conn_recv)(connp, mp1, NULL); + (connp->conn_recv)(connp, mp1, NULL, NULL); } mutex_enter(&ipst->ips_rts_clients->connf_lock); @@ -176,7 +183,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, * Takes an ire and sends an ack to all the routing sockets. This * routine is used * - when a route is created/deleted through the ioctl interface. - * - when ire_expire deletes a stale redirect + * - when a stale redirect is deleted */ void ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) @@ -192,6 +199,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) ASSERT(ire->ire_ipversion == IPV4_VERSION || ire->ire_ipversion == IPV6_VERSION); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + if (ire->ire_flags & RTF_SETSRC) rtm_addrs |= RTA_SRC; @@ -202,8 +211,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask, - ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp, - 0, NULL); + ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL, + mp, NULL); break; case IPV6_VERSION: af = AF_INET6; @@ -215,8 +224,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) mutex_exit(&ire->ire_lock); rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6, &ire->ire_mask_v6, &gw_addr_v6, - &ire->ire_src_addr_v6, &ipv6_all_zeros, 
&ipv6_all_zeros, - NULL, mp, 0, NULL); + &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros, + &ipv6_all_zeros, NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; @@ -230,13 +239,6 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } -/* ARGSUSED */ -static void -ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy) -{ - (void) ip_rts_request(q, mp, msg_getcred(mp, NULL)); -} - /* * This is a call from the RTS module * indicating that this is a Routing Socket @@ -248,7 +250,7 @@ ip_rts_register(conn_t *connp) { ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - connp->conn_loopback = 1; + connp->conn_useloopback = 1; ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); } @@ -269,18 +271,9 @@ ip_rts_unregister(conn_t *connp) * * In general, this function does not consume the message supplied but rather * sends the message upstream with an appropriate UNIX errno. - * - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is ip_rts_request_retry. While the request is enqueud in the - * ipsq the ioctl could be aborted and the conn close. To ensure that we don't - * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is - * released at the completion of the rts ioctl at the end of this function - * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and - * conn close occurs in conn_ioctl_cleanup. 
*/ int -ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) +ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr) { rt_msghdr_t *rtm = NULL; in6_addr_t dst_addr_v6; @@ -289,9 +282,12 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) in6_addr_t net_mask_v6; in6_addr_t author_v6; in6_addr_t if_addr_v6; - mblk_t *mp1, *ioc_mp = mp; + mblk_t *mp1; ire_t *ire = NULL; - ire_t *sire = NULL; + ire_t *ifire = NULL; + ipaddr_t v4setsrc; + in6_addr_t v6setsrc = ipv6_all_zeros; + tsol_ire_gw_secattr_t *gwattr = NULL; int error = 0; int match_flags = MATCH_IRE_DSTONLY; int match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW; @@ -302,9 +298,6 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ipaddr_t src_addr; ipaddr_t net_mask; ushort_t index; - ipif_t *ipif = NULL; - ipif_t *tmp_ipif = NULL; - IOCP iocp = (IOCP)mp->b_rptr; boolean_t gcgrp_xtraref = B_FALSE; tsol_gcgrp_addr_t ga; tsol_rtsecattr_t rtsecattr; @@ -314,42 +307,11 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ts_label_t *tsl = NULL; zoneid_t zoneid; ip_stack_t *ipst; - - ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp))); + ill_t *ill = NULL; zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; - ASSERT(mp->b_cont != NULL); - /* ioc_mp holds mp */ - mp = mp->b_cont; - - /* - * The Routing Socket data starts on - * next block. If there is no next block - * this is an indication from routing module - * that it is a routing socket stream queue. - * We need to support that for compatibility with SDP since - * it has a contract private interface to use IP_IOC_RTS_REQUEST. - */ - if (mp->b_cont == NULL) { - /* - * This is a message from SDP - * indicating that this is a Routing Socket - * Stream. Insert this conn_t in routing - * socket client list. 
- */ - connp->conn_loopback = 1; - ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); - goto done; - } - mp1 = dupmsg(mp->b_cont); - if (mp1 == NULL) { - error = ENOBUFS; - goto done; - } - mp = mp1; - if (mp->b_cont != NULL && !pullupmsg(mp, -1)) { freemsg(mp); error = EINVAL; @@ -446,20 +408,13 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) */ ASSERT(af == AF_INET || af == AF_INET6); + /* Handle RTA_IFP */ if (index != 0) { - ill_t *ill; + ipif_t *ipif; lookup: - /* - * IPC must be refheld somewhere in ip_wput_nondata or - * ip_wput_ioctl etc... and cleaned up if ioctl is killed. - * If ILL_CHANGING the request is queued in the ipsq. - */ - ill = ill_lookup_on_ifindex(index, af == AF_INET6, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error, - ipst); + ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst); if (ill == NULL) { - if (error != EINPROGRESS) - error = EINVAL; + error = EINVAL; goto done; } @@ -474,13 +429,13 @@ lookup: switch (rtm->rtm_type) { case RTM_CHANGE: case RTM_DELETE: - ill_refrele(ill); error = EINVAL; goto done; case RTM_ADD: index = ipmp_ill_get_ipmp_ifindex(ill); ill_refrele(ill); if (index == 0) { + ill = NULL; /* already refrele'd */ error = EINVAL; goto done; } @@ -488,9 +443,18 @@ lookup: } } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); match_flags |= MATCH_IRE_ILL; + /* + * This provides the same zoneid as in Solaris 10 + * that -ifp picks the zoneid from the first ipif on the ill. + * But it might not be useful since the first ipif will always + * have the same zoneid as the ill. 
+ */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + zoneid = ipif->ipif_zoneid; + ipif_refrele(ipif); + } } /* @@ -545,6 +509,8 @@ lookup: switch (af) { case AF_INET: if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -556,20 +522,11 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow IFF_UP ones */ - tmp_ipif = ipif_lookup_addr(src_addr, NULL, - ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } } else { @@ -584,14 +541,15 @@ lookup: } error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr, - rtm->rtm_flags, ipif, &ire, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, - rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + rtm->rtm_flags, ill, &ire, B_FALSE, + rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; case AF_INET6: if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -603,28 +561,17 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow UP ones. 
*/ - tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6, - NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v6(&src_addr_v6, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, &src_addr_v6, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); + ill, &ire, rtsap, ipst, zoneid); break; } /* @@ -637,10 +584,9 @@ lookup: } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, NULL, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + ill, &ire, rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; } if (error != 0) @@ -666,13 +612,13 @@ lookup: switch (af) { case AF_INET: error = ip_rt_delete(dst_addr, net_mask, gw_addr, - found_addrs, rtm->rtm_flags, ipif, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + found_addrs, rtm->rtm_flags, ill, B_FALSE, + ipst, zoneid); break; case AF_INET6: error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6, - &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + &gw_addr_v6, found_addrs, rtm->rtm_flags, ill, + ipst, zoneid); break; } break; @@ -680,8 +626,7 @@ lookup: case RTM_CHANGE: /* * In the case of RTM_GET, the forwarding table should be - * searched recursively with default being matched if the - * specific route doesn't exist. Also, if a gateway was + * searched recursively. Also, if a gateway was * specified then the gateway address must also be matched. 
* * In the case of RTM_CHANGE, the gateway address (if supplied) @@ -706,9 +651,7 @@ lookup: } if (rtm->rtm_type == RTM_GET) { - match_flags |= - (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR); + match_flags |= MATCH_IRE_SECATTR; match_flags_local |= MATCH_IRE_SECATTR; if ((found_addrs & RTA_GATEWAY) != 0) match_flags |= MATCH_IRE_GW; @@ -749,57 +692,34 @@ lookup: * IRE_LOCAL entry. * * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL - * entry, then look in the forwarding table. + * entry, then look for any other type of IRE. */ switch (af) { case AF_INET: if (net_mask == IP_HOST_MASK) { - ire = ire_ctable_lookup(dst_addr, gw_addr, + ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr, IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid, - tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. - */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, &dst_addr, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + tsl, match_flags_local, 0, ipst, NULL); } if (ire == NULL) { - ire = ire_ftable_lookup(dst_addr, net_mask, - gw_addr, 0, ipif, &sire, zoneid, 0, - tsl, match_flags, ipst); + ire = ire_lookup_v4(dst_addr, net_mask, + gw_addr, ill, zoneid, tsl, match_flags, + ipst, &ifire, &v4setsrc, &gwattr); + IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc); } break; case AF_INET6: if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) { - ire = ire_ctable_lookup_v6(&dst_addr_v6, + ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL, &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL, - zoneid, tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. 
- */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, (void *)&dst_addr_v6, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + zoneid, tsl, match_flags_local, 0, ipst, + NULL); } if (ire == NULL) { - ire = ire_ftable_lookup_v6(&dst_addr_v6, - &net_mask_v6, &gw_addr_v6, 0, ipif, &sire, - zoneid, 0, tsl, match_flags, ipst); + ire = ire_lookup_v6(&dst_addr_v6, + &net_mask_v6, &gw_addr_v6, ill, zoneid, + tsl, match_flags, ipst, &ifire, &v6setsrc, + &gwattr); } break; } @@ -810,10 +730,21 @@ lookup: error = ESRCH; goto done; } + /* + * Want to return failure if we get an IRE_NOROUTE from + * ire_route_recursive + */ + if (ire->ire_type & IRE_NOROUTE) { + ire_refrele(ire); + ire = NULL; + error = ESRCH; + goto done; + } + /* we know the IRE before we come here */ switch (rtm->rtm_type) { case RTM_GET: - mp1 = rts_rtmget(mp, ire, sire, af); + mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af); if (mp1 == NULL) { error = ENOBUFS; goto done; @@ -843,7 +774,6 @@ lookup: */ switch (af) { case AF_INET: - ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); if ((found_addrs & RTA_GATEWAY) != 0 && (ire->ire_gateway_addr != gw_addr)) { ire->ire_gateway_addr = gw_addr; @@ -863,9 +793,10 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && - (ire->ire_src_addr != src_addr)) { - + (ire->ire_setsrc_addr != src_addr)) { if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -880,50 +811,47 @@ lookup: goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr( - src_addr, NULL, ALL_ZONES, - WR(q), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr = + src_addr; } else { ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr = + INADDR_ANY; } - ire->ire_src_addr = src_addr; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } + ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE); break; case AF_INET6: - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); mutex_enter(&ire->ire_lock); if ((found_addrs & RTA_GATEWAY) != 0 && !IN6_ARE_ADDR_EQUAL( &ire->ire_gateway_addr_v6, &gw_addr_v6)) { ire->ire_gateway_addr_v6 = gw_addr_v6; } + mutex_exit(&ire->ire_lock); if (rtsap != NULL) { ga.ga_af = AF_INET6; + mutex_enter(&ire->ire_lock); ga.ga_addr = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); gcgrp = gcgrp_lookup(&ga, B_TRUE); if (gcgrp == NULL) { @@ -935,10 +863,11 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && !IN6_ARE_ADDR_EQUAL( - &ire->ire_src_addr_v6, &src_addr_v6)) { - + &ire->ire_setsrc_addr_v6, &src_addr_v6)) { if (!IN6_IS_ADDR_UNSPECIFIED( &src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -949,54 +878,44 @@ lookup: */ if (IN6_IS_ADDR_LOOPBACK( &src_addr_v6)) { - mutex_exit( - &ire->ire_lock); error = EINVAL; goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr_v6( - &src_addr_v6, NULL, - ALL_ZONES, - CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - mutex_exit( - &ire->ire_lock); - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - mutex_exit( - &ire->ire_lock); - error = EINVAL; + type = ip_type_v6(&src_addr_v6, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } + mutex_enter(&ire->ire_lock); ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + src_addr_v6; + mutex_exit(&ire->ire_lock); } else { + mutex_enter(&ire->ire_lock); ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + ipv6_all_zeros; + mutex_exit(&ire->ire_lock); } - ire->ire_src_addr_v6 = src_addr_v6; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } - mutex_exit(&ire->ire_lock); + ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE); break; } if (rtsap != NULL) { - in_addr_t ga_addr4; - ASSERT(gcgrp != NULL); /* @@ -1010,7 +929,7 @@ lookup: gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref); if (gc == NULL || (error = tsol_ire_init_gwattr(ire, - ire->ire_ipversion, gc, NULL)) != 0) { + ire->ire_ipversion, gc)) != 0) { if (gc != NULL) { GC_REFRELE(gc); } else { @@ -1019,21 +938,6 @@ lookup: } goto done; } - - /* - * Now delete any existing gateway IRE caches - * as well as all caches using the gateway, - * and allow them to be created on demand - * through ip_newroute{_v6}. 
- */ - IN6_V4MAPPED_TO_IPADDR(&ga.ga_addr, ga_addr4); - if (af == AF_INET) { - ire_clookup_delete_cache_gw( - ga_addr4, ALL_ZONES, ipst); - } else { - ire_clookup_delete_cache_gw_v6( - &ga.ga_addr, ALL_ZONES, ipst); - } } rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx); break; @@ -1046,21 +950,14 @@ lookup: done: if (ire != NULL) ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - if (ipif != NULL) - ipif_refrele(ipif); - if (tmp_ipif != NULL) - ipif_refrele(tmp_ipif); + if (ifire != NULL) + ire_refrele(ifire); + if (ill != NULL) + ill_refrele(ill); if (gcgrp_xtraref) GCGRP_REFRELE(gcgrp); - if (error == EINPROGRESS) { - if (rtm != NULL) - freemsg(mp); - return (error); - } if (rtm != NULL) { ASSERT(mp->b_wptr <= mp->b_datap->db_lim); if (error != 0) { @@ -1074,12 +971,190 @@ done: } rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } + return (error); +} + +/* + * Helper function that can do recursive lookups including when + * MATCH_IRE_GW and/or MATCH_IRE_MASK is set. + */ +static ire_t * +ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, + int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp, + tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v4setsrcp = INADDR_ANY; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr, + ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(ire->ire_setsrc_addr 
!= INADDR_ANY); + *v4setsrcp = ire->ire_setsrc_addr; + v4setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = NULL; + } + + /* Look for an interface ire recursively based on the gateway */ + dst_addr = ire->ire_gateway_addr; + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } else { + ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + +static ire_t * +ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v6setsrcp = ipv6_all_zeros; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + in6_addr_t dst; + + ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6, + gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0, + ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *v6setsrcp = ire->ire_setsrc_addr_v6; + v6setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = 
NULL; + } + + mutex_enter(&ire->ire_lock); + dst = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl, + match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, NULL); + } else { + ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + + +/* + * Handle IP_IOC_RTS_REQUEST ioctls + */ +int +ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +{ + conn_t *connp = Q_TO_CONN(q); + IOCP iocp = (IOCP)mp->b_rptr; + mblk_t *mp1, *ioc_mp = mp; + int error = 0; + ip_stack_t *ipst; + ipst = connp->conn_netstack->netstack_ip; + + ASSERT(mp->b_cont != NULL); + /* ioc_mp holds mp */ + mp = mp->b_cont; + + /* + * The Routing Socket data starts on + * next block. If there is no next block + * this is an indication from routing module + * that it is a routing socket stream queue. + * We need to support that for compatibility with SDP since + * it has a contract private interface to use IP_IOC_RTS_REQUEST. + * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this. + */ + if (mp->b_cont == NULL) { + /* + * This is a message from SDP + * indicating that this is a Routing Socket + * Stream. Insert this conn_t in routing + * socket client list. + */ + connp->conn_useloopback = 1; + ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); + goto done; + } + mp1 = dupmsg(mp->b_cont); + if (mp1 == NULL) { + error = ENOBUFS; + goto done; + } + mp = mp1; + + error = ip_rts_request_common(mp, connp, ioc_cr); +done: iocp->ioc_error = error; ioc_mp->b_datap->db_type = M_IOCACK; if (iocp->ioc_error != 0) iocp->ioc_count = 0; - (connp->conn_recv)(connp, ioc_mp, NULL); + /* Note that we pass a NULL ira to rts_input */ + (connp->conn_recv)(connp, ioc_mp, NULL, NULL); /* conn was refheld in ip_wput_ioctl. 
*/ CONN_OPER_PENDING_DONE(connp); @@ -1087,12 +1162,6 @@ done: return (error); } -int -ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) -{ - return (ip_rts_request_common(q, mp, Q_TO_CONN(q), ioc_cr)); -} - /* * Build a reply to the RTM_GET request contained in the given message block * using the retrieved IRE of the destination address, the parent IRE (if it @@ -1102,26 +1171,34 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) * otherwise NULL is returned. */ static mblk_t * -rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) +rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc, + tsol_ire_gw_secattr_t *attrp, sa_family_t af) { rt_msghdr_t *rtm; rt_msghdr_t *new_rtm; mblk_t *new_mp; int rtm_addrs; int rtm_flags; - in6_addr_t gw_addr_v6; - tsol_ire_gw_secattr_t *attrp = NULL; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - int sacnt = 0; + ill_t *ill; + ipif_t *ipif = NULL; + ipaddr_t brdaddr; /* IFF_POINTOPOINT destination */ + ipaddr_t ifaddr; + in6_addr_t brdaddr6; /* IFF_POINTOPOINT destination */ + in6_addr_t ifaddr6; + ipaddr_t v4setsrc; - ASSERT(ire->ire_ipif != NULL); rtm = (rt_msghdr_t *)mp->b_rptr; - if (sire != NULL && sire->ire_gw_secattr != NULL) - attrp = sire->ire_gw_secattr; - else if (ire->ire_gw_secattr != NULL) - attrp = ire->ire_gw_secattr; + /* + * Find the ill used to send packets. This will be NULL in case + * of a reject or blackhole. 
+ */ + if (ifire != NULL) + ill = ire_nexthop_ill(ifire); + else + ill = ire_nexthop_ill(ire); if (attrp != NULL) { mutex_enter(&attrp->igsa_lock); @@ -1129,29 +1206,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); } - ASSERT(sacnt == 0 || gc != NULL); /* * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK. @@ -1162,16 +1219,36 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) * point-to-point. */ rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK); - if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) { rtm_addrs |= (RTA_IFP | RTA_IFA); - if (ire->ire_ipif->ipif_flags & IPIF_POINTOPOINT) - rtm_addrs |= RTA_BRD; + /* + * We associate an IRE with an ILL, hence we don't exactly + * know what might make sense for RTA_IFA and RTA_BRD. We + * pick the first ipif on the ill. + */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + if (ipif->ipif_isv6) + ifaddr6 = ipif->ipif_v6lcl_addr; + else + ifaddr = ipif->ipif_lcl_addr; + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + rtm_addrs |= RTA_BRD; + if (ipif->ipif_isv6) + brdaddr6 = ipif->ipif_v6pp_dst_addr; + else + brdaddr = ipif->ipif_pp_dst_addr; + } + ipif_refrele(ipif); + } } - new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, sacnt); + new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 
1 : 0); if (new_mp == NULL) { if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); + if (ill != NULL) + ill_refrele(ill); return (NULL); } @@ -1187,49 +1264,24 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) ASSERT(af == AF_INET || af == AF_INET6); switch (af) { case AF_INET: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, - ire->ire_mask, ire->ire_src_addr, ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, 0, ire->ire_ipif, - new_mp, sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, sire->ire_addr, - sire->ire_mask, sire->ire_gateway_addr, - (sire->ire_flags & RTF_SETSRC) ? - sire->ire_src_addr : ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, - 0, ire->ire_ipif, new_mp, sacnt, gc); - } + IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc); + if (v4setsrc != INADDR_ANY) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, + ire->ire_mask, ire->ire_gateway_addr, v4setsrc, + brdaddr, 0, ifaddr, ill, new_mp, gc); break; case AF_INET6: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, - &ire->ire_mask_v6, &ire->ire_src_addr_v6, - &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, - &ipv6_all_zeros, ire->ire_ipif, new_mp, - sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - mutex_enter(&sire->ire_lock); - gw_addr_v6 = sire->ire_gateway_addr_v6; - mutex_exit(&sire->ire_lock); - rts_fill_msg_v6(RTM_GET, rtm_addrs, &sire->ire_addr_v6, - &sire->ire_mask_v6, &gw_addr_v6, - (sire->ire_flags & RTF_SETSRC) ? 
- &sire->ire_src_addr_v6 : &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ire->ire_ipif, new_mp, sacnt, gc); - } + if (!IN6_IS_ADDR_UNSPECIFIED(setsrc)) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, + &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, + setsrc, &brdaddr6, &ipv6_all_zeros, + &ifaddr6, ill, new_mp, gc); break; } @@ -1259,11 +1311,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) new_rtm->rtm_use = rtm->rtm_use; new_rtm->rtm_addrs = rtm_addrs; new_rtm->rtm_flags = rtm_flags; - if (sire == NULL) - new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); - else - new_rtm->rtm_inits = rts_getmetrics(sire, &new_rtm->rtm_rmx); - + new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); + if (ill != NULL) + ill_refrele(ill); return (new_mp); } @@ -1273,10 +1323,11 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif) { - if_data->ifi_type = ipif->ipif_type; /* ethernet, tokenring, etc */ + if_data->ifi_type = ipif->ipif_ill->ill_type; + /* ethernet, tokenring, etc */ if_data->ifi_addrlen = 0; /* media address length */ if_data->ifi_hdrlen = 0; /* media header length */ - if_data->ifi_mtu = ipif->ipif_mtu; /* maximum transmission unit */ + if_data->ifi_mtu = ipif->ipif_ill->ill_mtu; /* mtu */ if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */ if_data->ifi_baudrate = 0; /* linespeed */ @@ -1302,18 +1353,19 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) { clock_t rtt; clock_t rtt_sd; - ipif_t *ipif; + ill_t *ill; ifrt_t *ifrt; mblk_t *mp; in6_addr_t gw_addr_v6; + /* Need to add back some metrics to the IRE? */ /* - * Bypass obtaining the lock and searching ipif_saved_ire_mp in the + * Bypass obtaining the lock and searching ill_saved_ire_mp in the * common case of no metrics. 
*/ if (which == 0) return; - ire->ire_uinfo.iulp_set = B_TRUE; + ire->ire_metrics.iulp_set = B_TRUE; /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's @@ -1330,42 +1382,41 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) */ mutex_enter(&ire->ire_lock); if (which & RTV_MTU) - ire->ire_max_frag = metrics->rmx_mtu; + ire->ire_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ire->ire_uinfo.iulp_rtt = rtt; + ire->ire_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) - ire->ire_uinfo.iulp_ssthresh = metrics->rmx_ssthresh; + ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh; if (which & RTV_RTTVAR) - ire->ire_uinfo.iulp_rtt_sd = rtt_sd; + ire->ire_metrics.iulp_rtt_sd = rtt_sd; if (which & RTV_SPIPE) - ire->ire_uinfo.iulp_spipe = metrics->rmx_sendpipe; + ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ire->ire_uinfo.iulp_rpipe = metrics->rmx_recvpipe; + ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe; mutex_exit(&ire->ire_lock); /* - * Search through the ifrt_t chain hanging off the IPIF in order to + * Search through the ifrt_t chain hanging off the ILL in order to * reflect the metric change there. 
*/ - ipif = ire->ire_ipif; - if (ipif == NULL) + ill = ire->ire_ill; + if (ill == NULL) return; - ASSERT((ipif->ipif_isv6 && ire->ire_ipversion == IPV6_VERSION) || - ((!ipif->ipif_isv6 && ire->ire_ipversion == IPV4_VERSION))); - if (ipif->ipif_isv6) { + ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) || + ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION))); + if (ill->ill_isv6) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { + mutex_enter(&ill->ill_saved_ire_lock); + for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { /* - * On a given ipif, the triple of address, gateway and mask is - * unique for each saved IRE (in the case of ordinary interface - * routes, the gateway address is all-zeroes). + * On a given ill, the tuple of address, gateway, mask, + * ire_type and zoneid unique for each saved IRE. 
*/ ifrt = (ifrt_t *)mp->b_rptr; - if (ipif->ipif_isv6) { + if (ill->ill_isv6) { if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, &ire->ire_addr_v6) || !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, @@ -1379,23 +1430,36 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) ifrt->ifrt_mask != ire->ire_mask) continue; } + if (ifrt->ifrt_zoneid != ire->ire_zoneid || + ifrt->ifrt_type != ire->ire_type) + continue; + if (which & RTV_MTU) - ifrt->ifrt_max_frag = metrics->rmx_mtu; + ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ifrt->ifrt_iulp_info.iulp_rtt = rtt; + ifrt->ifrt_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) { - ifrt->ifrt_iulp_info.iulp_ssthresh = + ifrt->ifrt_metrics.iulp_ssthresh = metrics->rmx_ssthresh; } if (which & RTV_RTTVAR) - ifrt->ifrt_iulp_info.iulp_rtt_sd = metrics->rmx_rttvar; + ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar; if (which & RTV_SPIPE) - ifrt->ifrt_iulp_info.iulp_spipe = metrics->rmx_sendpipe; + ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ifrt->ifrt_iulp_info.iulp_rpipe = metrics->rmx_recvpipe; + ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe; break; } - mutex_exit(&ipif->ipif_saved_ire_lock); + mutex_exit(&ill->ill_saved_ire_lock); + + /* + * Update any IRE_IF_CLONE hanging created from this IRE_IF so they + * get any new iulp_mtu. + * We do that by deleting them; ire_create_if_clone will pick + * up the new metrics. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); } /* @@ -1407,27 +1471,69 @@ rts_getmetrics(ire_t *ire, rt_metrics_t *metrics) int metrics_set = 0; bzero(metrics, sizeof (rt_metrics_t)); + /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as * microseconds. 
*/ - metrics->rmx_rtt = ire->ire_uinfo.iulp_rtt * 1000; + metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000; metrics_set |= RTV_RTT; - metrics->rmx_mtu = ire->ire_max_frag; + metrics->rmx_mtu = ire->ire_metrics.iulp_mtu; metrics_set |= RTV_MTU; - metrics->rmx_ssthresh = ire->ire_uinfo.iulp_ssthresh; + metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh; metrics_set |= RTV_SSTHRESH; - metrics->rmx_rttvar = ire->ire_uinfo.iulp_rtt_sd * 1000; + metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000; metrics_set |= RTV_RTTVAR; - metrics->rmx_sendpipe = ire->ire_uinfo.iulp_spipe; + metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe; metrics_set |= RTV_SPIPE; - metrics->rmx_recvpipe = ire->ire_uinfo.iulp_rpipe; + metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe; metrics_set |= RTV_RPIPE; return (metrics_set); } /* + * Given two sets of metrics (src and dst), use the dst values if they are + * set. If a dst value is not set but the src value is set, then we use + * the src value. + * dst is updated with the new values. + * This is used to merge information from a dce_t and ire_metrics, where the + * dce values takes precedence. 
+ */ +void +rts_merge_metrics(iulp_t *dst, const iulp_t *src) +{ + if (!src->iulp_set) + return; + + if (dst->iulp_ssthresh == 0) + dst->iulp_ssthresh = src->iulp_ssthresh; + if (dst->iulp_rtt == 0) + dst->iulp_rtt = src->iulp_rtt; + if (dst->iulp_rtt_sd == 0) + dst->iulp_rtt_sd = src->iulp_rtt_sd; + if (dst->iulp_spipe == 0) + dst->iulp_spipe = src->iulp_spipe; + if (dst->iulp_rpipe == 0) + dst->iulp_rpipe = src->iulp_rpipe; + if (dst->iulp_rtomax == 0) + dst->iulp_rtomax = src->iulp_rtomax; + if (dst->iulp_sack == 0) + dst->iulp_sack = src->iulp_sack; + if (dst->iulp_tstamp_ok == 0) + dst->iulp_tstamp_ok = src->iulp_tstamp_ok; + if (dst->iulp_wscale_ok == 0) + dst->iulp_wscale_ok = src->iulp_wscale_ok; + if (dst->iulp_ecn_ok == 0) + dst->iulp_ecn_ok = src->iulp_ecn_ok; + if (dst->iulp_pmtud_ok == 0) + dst->iulp_pmtud_ok = src->iulp_pmtud_ok; + if (dst->iulp_mtu == 0) + dst->iulp_mtu = src->iulp_mtu; +} + + +/* * Takes a pointer to a routing message and extracts necessary info by looking * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers * passed (all of which must be valid). @@ -1552,7 +1658,8 @@ rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author, - const ipif_t *ipif, mblk_t *mp, uint_t sacnt, const tsol_gc_t *gc) + ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *gc) { rt_msghdr_t *rtm; sin_t *sin; @@ -1561,7 +1668,6 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, int i; ASSERT(mp != NULL); - ASSERT(sacnt == 0 || gc != NULL); /* * First find the type of the message * and its length. @@ -1571,7 +1677,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, * Now find the size of the data * that follows the message header. 
*/ - data_size = rts_data_msg_size(rtm_addrs, AF_INET, sacnt); + data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0); rtm = (rt_msghdr_t *)mp->b_rptr; mp->b_wptr = &mp->b_rptr[header_size]; @@ -1596,9 +1702,13 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, cp += sizeof (sin_t); break; case RTA_IFP: - cp += ill_dls_info((struct sockaddr_dl *)cp, ipif); + cp += ill_dls_info((struct sockaddr_dl *)cp, ill); break; case RTA_IFA: + sin->sin_addr.s_addr = ifaddr; + sin->sin_family = AF_INET; + cp += sizeof (sin_t); + break; case RTA_SRC: sin->sin_addr.s_addr = src_addr; sin->sin_family = AF_INET; @@ -1625,24 +1735,20 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, rtm_ext_t *rtm_ext; struct rtsa_s *rp_dst; tsol_rtsecattr_t *rsap; - int i; ASSERT(gc->gc_grp != NULL); ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock)); - ASSERT(sacnt > 0); rtm_ext = (rtm_ext_t *)cp; rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR; - rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt); + rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1); rsap = (tsol_rtsecattr_t *)(rtm_ext + 1); - rsap->rtsa_cnt = sacnt; + rsap->rtsa_cnt = 1; rp_dst = rsap->rtsa_attr; - for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) { - ASSERT(gc->gc_db != NULL); - bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); - } + ASSERT(gc->gc_db != NULL); + bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); cp = (uchar_t *)rp_dst; } @@ -1659,6 +1765,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, /* * Allocates and initializes a routing socket message. + * Note that sacnt is either zero or one. 
*/ mblk_t * rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt) @@ -1755,7 +1862,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0, - author, NULL, mp, 0, NULL); + author, 0, NULL, mp, NULL); rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_flags = flags; rtm->rtm_errno = error; @@ -1784,12 +1891,12 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only when the physical interface - * is changing state. + * This message should be generated only + * when the physical device is changing + * state. */ if (ipif->ipif_id != 0) return; - if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1797,14 +1904,15 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) return; rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, &ipv6_all_zeros, ipif, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, + ipif->ipif_ill, mp, NULL); } else { af = AF_INET; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); if (mp == NULL) return; - rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, ipif, mp, - 0, NULL); + rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0, + ipif->ipif_ill, mp, NULL); } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; @@ -1843,6 +1951,12 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) sa_family_t af; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + /* + * Let conn_ixa caching know that source address selection + * changed + */ + ip_update_source_selection(ipst); + if (ipif->ipif_isv6) af = AF_INET6; else @@ -1875,15 +1989,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) 
case AF_INET: rts_fill_msg(ncmd, rtm_addrs, 0, ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr, - ipif->ipif_pp_dst_addr, 0, ipif, mp, - 0, NULL); + ipif->ipif_pp_dst_addr, 0, + ipif->ipif_lcl_addr, ipif->ipif_ill, + mp, NULL); break; case AF_INET6: rts_fill_msg_v6(ncmd, rtm_addrs, &ipv6_all_zeros, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ipif, mp, 0, NULL); + &ipif->ipif_v6lcl_addr, ipif->ipif_ill, + mp, NULL); break; } ifam = (ifa_msghdr_t *)mp->b_rptr; @@ -1904,14 +2020,15 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) case AF_INET: rts_fill_msg(cmd, rtm_addrs, ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0, - 0, 0, 0, NULL, mp, 0, NULL); + 0, 0, 0, 0, NULL, mp, NULL); break; case AF_INET6: rts_fill_msg_v6(cmd, rtm_addrs, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, NULL, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, + NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; diff --git a/usr/src/uts/common/inet/ip/ip_sadb.c b/usr/src/uts/common/inet/ip/ip_sadb.c index 35b822902a..e099d04427 100644 --- a/usr/src/uts/common/inet/ip/ip_sadb.c +++ b/usr/src/uts/common/inet/ip/ip_sadb.c @@ -36,7 +36,6 @@ #include <inet/ip6.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> #include <inet/ipdrop.h> @@ -57,35 +56,21 @@ ipsec_match_outbound_ids(ipsec_latch_t *ipl, ipsa_t *sa) ipsid_equal(ipl->ipl_remote_cid, sa->ipsa_dst_cid); } -/* cr1 is packet cred; cr2 is SA credential */ +/* l1 is packet label; l2 is SA label */ boolean_t -ipsec_label_match(cred_t *cr1, cred_t *cr2) +ipsec_label_match(ts_label_t *l1, ts_label_t *l2) { - ts_label_t *l1, *l2; - if (!is_system_labeled()) return (B_TRUE); /* - * Check for NULL creds. Unlabeled SA always matches; + * Check for NULL label. 
Unlabeled SA (l2) always matches; * unlabeled user with labeled SA always fails */ - if (cr2 == NULL) + if (l2 == NULL) return (B_TRUE); - if (cr1 == NULL) - return (B_FALSE); - - /* If we reach here, we have two passed-in creds. */ - ASSERT(cr2 != NULL && cr1 != NULL); - - /* Check for NULL labels. Two is good, one is bad, zero is good. */ - l1 = crgetlabel(cr1); - l2 = crgetlabel(cr2); if (l1 == NULL) - return (l2 == NULL); - - if (l2 == NULL) return (B_FALSE); /* Simple IPsec MLS policy: labels must be equal */ @@ -109,32 +94,32 @@ ipsec_label_match(cred_t *cr1, cred_t *cr2) * The SA ptr I return will have its reference count incremented by one. */ ipsa_t * -ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, - uint32_t *dst, sa_family_t af, uint8_t protocol, cred_t *cr) +ipsec_getassocbyconn(isaf_t *bucket, ip_xmit_attr_t *ixa, uint32_t *src, + uint32_t *dst, sa_family_t af, uint8_t protocol, ts_label_t *tsl) { ipsa_t *retval, *candidate; ipsec_action_t *candact; boolean_t need_unique; - boolean_t tunnel_mode = io->ipsec_out_tunnel; + boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL); uint64_t unique_id; uint32_t old_flags, excludeflags; - ipsec_policy_t *pp = io->ipsec_out_policy; - ipsec_action_t *actlist = io->ipsec_out_act; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; + ipsec_action_t *actlist = ixa->ixa_ipsec_action; ipsec_action_t *act; - ipsec_latch_t *ipl = io->ipsec_out_latch; + ipsec_latch_t *ipl = ixa->ixa_ipsec_latch; ipsa_ref_t *ipr = NULL; - sa_family_t inaf = io->ipsec_out_inaf; - uint32_t *insrc = io->ipsec_out_insrc; - uint32_t *indst = io->ipsec_out_indst; - uint8_t insrcpfx = io->ipsec_out_insrcpfx; - uint8_t indstpfx = io->ipsec_out_indstpfx; + sa_family_t inaf = ixa->ixa_ipsec_inaf; + uint32_t *insrc = ixa->ixa_ipsec_insrc; + uint32_t *indst = ixa->ixa_ipsec_indst; + uint8_t insrcpfx = ixa->ixa_ipsec_insrcpfx; + uint8_t indstpfx = ixa->ixa_ipsec_indstpfx; ASSERT(MUTEX_HELD(&bucket->isaf_lock)); /* - * Caller 
must set ipsec_out_t structure such that we know + * Caller must set ip_xmit_attr_t structure such that we know * whether this is tunnel mode or transport mode based on - * io->ipsec_out_tunnel. If this flag is set, we assume that + * IXAF_IPSEC_TUNNEL. If this flag is set, we assume that * there are valid inner src and destination addresses to compare. */ @@ -145,7 +130,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, if (ipl != NULL) { ASSERT((protocol == IPPROTO_AH) || (protocol == IPPROTO_ESP)); - ipr = &ipl->ipl_ref[protocol - IPPROTO_ESP]; + ipr = &ixa->ixa_ipsec_ref[protocol - IPPROTO_ESP]; retval = ipr->ipsr_sa; @@ -169,7 +154,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, ASSERT(actlist != NULL); need_unique = actlist->ipa_want_unique; - unique_id = SA_FORM_UNIQUE_ID(io); + unique_id = SA_FORM_UNIQUE_ID(ixa); /* * Precompute mask for SA flags comparison: If we need a @@ -332,7 +317,7 @@ ipsec_getassocbyconn(isaf_t *bucket, ipsec_out_t *io, uint32_t *src, /* * Do labels match? 
*/ - if (!ipsec_label_match(cr, retval->ipsa_cred)) + if (!ipsec_label_match(tsl, retval->ipsa_tsl)) goto next_ipsa; /* @@ -451,10 +436,9 @@ next_ipsa: ipsec_latch_ids(ipl, retval->ipsa_src_cid, retval->ipsa_dst_cid); } - if (!ipl->ipl_out_action_latched) { + if (ixa->ixa_ipsec_action == NULL) { IPACT_REFHOLD(act); - ipl->ipl_out_action = act; - ipl->ipl_out_action_latched = B_TRUE; + ixa->ixa_ipsec_action = act; } } @@ -471,7 +455,7 @@ next_ipsa: retval->ipsa_flags |= IPSA_F_UNIQUE; retval->ipsa_unique_id = unique_id; retval->ipsa_unique_mask = SA_UNIQUE_MASK( - io->ipsec_out_src_port, io->ipsec_out_dst_port, + ixa->ixa_ipsec_src_port, ixa->ixa_ipsec_dst_port, protocol, 0); } @@ -581,45 +565,41 @@ ipsec_getassocbyspi(isaf_t *bucket, uint32_t spi, uint32_t *src, uint32_t *dst, } boolean_t -ipsec_outbound_sa(mblk_t *mp, uint_t proto) +ipsec_outbound_sa(mblk_t *data_mp, ip_xmit_attr_t *ixa, uint_t proto) { - mblk_t *data_mp; - ipsec_out_t *io; ipaddr_t dst; uint32_t *dst_ptr, *src_ptr; isaf_t *bucket; ipsa_t *assoc; - ip6_pkt_t ipp; + ip_pkt_t ipp; in6_addr_t dst6; ipsa_t **sa; sadbp_t *sadbp; sadb_t *sp; sa_family_t af; - cred_t *cr; - netstack_t *ns; + ip_stack_t *ipst = ixa->ixa_ipst; + netstack_t *ns = ipst->ips_netstack; - data_mp = mp->b_cont; - io = (ipsec_out_t *)mp->b_rptr; - ns = io->ipsec_out_ns; + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); if (proto == IPPROTO_ESP) { ipsecesp_stack_t *espstack; espstack = ns->netstack_ipsecesp; - sa = &io->ipsec_out_esp_sa; + sa = &ixa->ixa_ipsec_esp_sa; sadbp = &espstack->esp_sadb; } else { ipsecah_stack_t *ahstack; ASSERT(proto == IPPROTO_AH); ahstack = ns->netstack_ipsecah; - sa = &io->ipsec_out_ah_sa; + sa = &ixa->ixa_ipsec_ah_sa; sadbp = &ahstack->ah_sadb; } ASSERT(*sa == NULL); - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha_t *ipha = (ipha_t *)data_mp->b_rptr; ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); @@ -651,11 +631,9 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto) dst_ptr = (uint32_t 
*)&dst6; } - cr = msg_getcred(data_mp, NULL); - mutex_enter(&bucket->isaf_lock); - assoc = ipsec_getassocbyconn(bucket, io, src_ptr, dst_ptr, af, - proto, cr); + assoc = ipsec_getassocbyconn(bucket, ixa, src_ptr, dst_ptr, af, + proto, ixa->ixa_tsl); mutex_exit(&bucket->isaf_lock); if (assoc == NULL) @@ -674,17 +652,16 @@ ipsec_outbound_sa(mblk_t *mp, uint_t proto) /* * Inbound IPsec SA selection. + * Can return a pulled up mblk. + * When it returns non-NULL ahp is updated */ - -ah_t * -ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) +mblk_t * +ipsec_inbound_ah_sa(mblk_t *mp, ip_recv_attr_t *ira, ah_t **ahp) { - mblk_t *ipsec_in; ipha_t *ipha; ipsa_t *assoc; ah_t *ah; isaf_t *hptr; - ipsec_in_t *ii; boolean_t isv6; ip6_t *ip6h; int ah_offset; @@ -692,20 +669,13 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) int pullup_len; sadb_t *sp; sa_family_t af; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; IP_AH_BUMP_STAT(ipss, in_requests); - ASSERT(mp->b_datap->db_type == M_CTL); - - ipsec_in = mp; - ii = (ipsec_in_t *)ipsec_in->b_rptr; - mp = mp->b_cont; - - ASSERT(mp->b_datap->db_type == M_DATA); - - isv6 = !ii->ipsec_in_v4; + isv6 = !(ira->ira_flags & IRAF_IS_IPV4); if (isv6) { ip6h = (ip6_t *)mp->b_rptr; ah_offset = ipsec_ah_get_hdr_size_v6(mp, B_TRUE); @@ -729,7 +699,7 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) SL_WARN | SL_ERROR, "ipsec_inbound_ah_sa: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ipss->ipsec_dropper); return (NULL); @@ -763,11 +733,11 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) assoc->ipsa_state == IPSA_STATE_ACTIVE_ELSEWHERE) { IP_AH_BUMP_STAT(ipss, lookup_failure); IP_AH_BUMP_STAT(ipss, in_discards); - ipsecah_in_assocfailure(ipsec_in, 0, + ipsecah_in_assocfailure(mp, 0, SL_ERROR | SL_CONSOLE | 
SL_WARN, "ipsec_inbound_ah_sa: No association found for " "spi 0x%x, dst addr %s\n", - ah->ah_spi, dst_ptr, af, ahstack); + ah->ah_spi, dst_ptr, af, ira); if (assoc != NULL) { IPSA_REFRELE(assoc); } @@ -775,33 +745,44 @@ ipsec_inbound_ah_sa(mblk_t *mp, netstack_t *ns) } if (assoc->ipsa_state == IPSA_STATE_LARVAL && - sadb_set_lpkt(assoc, ipsec_in, ns)) { + sadb_set_lpkt(assoc, mp, ira)) { /* Not fully baked; swap the packet under a rock until then */ IPSA_REFRELE(assoc); return (NULL); } + /* Are the IPsec fields initialized at all? */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ira->ira_ipsec_action = NULL; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; + } + /* * Save a reference to the association so that it can * be retrieved after execution. We free any AH SA reference * already there (innermost SA "wins". The reference to * the SA will also be used later when doing the policy checks. */ - - if (ii->ipsec_in_ah_sa != NULL) { - IPSA_REFRELE(ii->ipsec_in_ah_sa); + if (ira->ira_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_ah_sa); } - ii->ipsec_in_ah_sa = assoc; + ira->ira_flags |= IRAF_IPSEC_SECURE; + ira->ira_ipsec_ah_sa = assoc; - return (ah); + *ahp = ah; + return (mp); } -esph_t * -ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) +/* + * Can return a pulled up mblk. 
+ * When it returns non-NULL esphp is updated + */ +mblk_t * +ipsec_inbound_esp_sa(mblk_t *data_mp, ip_recv_attr_t *ira, esph_t **esphp) { - mblk_t *data_mp, *placeholder; + mblk_t *placeholder; uint32_t *src_ptr, *dst_ptr; - ipsec_in_t *ii; ipha_t *ipha; ip6_t *ip6h; esph_t *esph; @@ -811,19 +792,13 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) sa_family_t af; boolean_t isv6; sadb_t *sp; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; IP_ESP_BUMP_STAT(ipss, in_requests); - ASSERT(ipsec_in_mp->b_datap->db_type == M_CTL); - - /* We have IPSEC_IN already! */ - ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - data_mp = ipsec_in_mp->b_cont; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - isv6 = !ii->ipsec_in_v4; + isv6 = !(ira->ira_flags & IRAF_IS_IPV4); if (isv6) { ip6h = (ip6_t *)data_mp->b_rptr; } else { @@ -841,17 +816,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) * actual packet length. */ if (data_mp->b_datap->db_ref > 1 || - (data_mp->b_wptr - data_mp->b_rptr) < - (isv6 ? (ntohs(ip6h->ip6_plen) + sizeof (ip6_t)) - : ntohs(ipha->ipha_length))) { + (data_mp->b_wptr - data_mp->b_rptr) < ira->ira_pktlen) { placeholder = msgpullup(data_mp, -1); if (placeholder == NULL) { IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &ipss->ipsec_dropper); return (NULL); @@ -859,7 +828,6 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) /* Reset packet with new pulled up mblk. */ freemsg(data_mp); data_mp = placeholder; - ipsec_in_mp->b_cont = data_mp; } } @@ -904,11 +872,11 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) /* This is a loggable error! AUDIT ME! 
*/ IP_ESP_BUMP_STAT(ipss, lookup_failure); IP_ESP_BUMP_STAT(ipss, in_discards); - ipsecesp_in_assocfailure(ipsec_in_mp, 0, + ipsecesp_in_assocfailure(data_mp, 0, SL_ERROR | SL_CONSOLE | SL_WARN, "ipsec_inbound_esp_sa: No association found for " "spi 0x%x, dst addr %s\n", - esph->esph_spi, dst_ptr, af, espstack); + esph->esph_spi, dst_ptr, af, ira); if (ipsa != NULL) { IPSA_REFRELE(ipsa); } @@ -916,22 +884,31 @@ ipsec_inbound_esp_sa(mblk_t *ipsec_in_mp, netstack_t *ns) } if (ipsa->ipsa_state == IPSA_STATE_LARVAL && - sadb_set_lpkt(ipsa, ipsec_in_mp, ns)) { + sadb_set_lpkt(ipsa, data_mp, ira)) { /* Not fully baked; swap the packet under a rock until then */ IPSA_REFRELE(ipsa); return (NULL); } + /* Are the IPsec fields initialized at all? */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ira->ira_ipsec_action = NULL; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; + } + /* * Save a reference to the association so that it can * be retrieved after execution. We free any AH SA reference * already there (innermost SA "wins". The reference to * the SA will also be used later when doing the policy checks. 
*/ - if (ii->ipsec_in_esp_sa != NULL) { - IPSA_REFRELE(ii->ipsec_in_esp_sa); + if (ira->ira_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_esp_sa); } - ii->ipsec_in_esp_sa = ipsa; + ira->ira_flags |= IRAF_IPSEC_SECURE; + ira->ira_ipsec_esp_sa = ipsa; - return (esph); + *esphp = esph; + return (data_mp); } diff --git a/usr/src/uts/common/inet/ip/ip_srcid.c b/usr/src/uts/common/inet/ip/ip_srcid.c index 949508a796..f6507d6413 100644 --- a/usr/src/uts/common/inet/ip/ip_srcid.c +++ b/usr/src/uts/common/inet/ip/ip_srcid.c @@ -101,11 +101,7 @@ #include <netinet/ip_mroute.h> #include <inet/ipclassifier.h> -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> -#include <inet/sadb.h> #include <sys/kmem.h> -#include <inet/ipsec_impl.h> static uint_t srcid_nextid(ip_stack_t *); static srcid_map_t **srcid_lookup_addr(const in6_addr_t *addr, @@ -239,7 +235,7 @@ ip_srcid_find_id(uint_t id, in6_addr_t *addr, zoneid_t zoneid, rw_enter(&ipst->ips_srcid_lock, RW_READER); smpp = srcid_lookup_id(id, ipst); smp = *smpp; - if (smp == NULL || smp->sm_zoneid != zoneid) { + if (smp == NULL || (smp->sm_zoneid != zoneid && zoneid != ALL_ZONES)) { /* Not preset */ ip1dbg(("ip_srcid_find_id: unknown %u or in wrong zone\n", id)); *addr = ipv6_all_zeros; @@ -290,7 +286,7 @@ srcid_lookup_addr(const in6_addr_t *addr, zoneid_t zoneid, ip_stack_t *ipst) smpp = &ipst->ips_srcid_head; while (*smpp != NULL) { if (IN6_ARE_ADDR_EQUAL(&(*smpp)->sm_addr, addr) && - zoneid == (*smpp)->sm_zoneid) + (zoneid == (*smpp)->sm_zoneid || zoneid == ALL_ZONES)) return (smpp); smpp = &(*smpp)->sm_next; } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 45683ec967..31fa14b4af 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -52,16 +52,12 @@ * asynchronous and the reference protects the connection from being destroyed * before its processing is finished). 
* - * send and receive functions are currently used for TCP only. The send function - * determines the IP entry point for the packet once it leaves TCP to be sent to - * the destination address. The receive function is used by IP when the packet - * should be passed for TCP processing. When a new connection is created these - * are set to ip_output() and tcp_input() respectively. During the lifetime of - * the connection the send and receive functions may change depending on the - * changes in the connection state. For example, Once the connection is bound to - * an addresse, the receive function for this connection is set to - * tcp_conn_request(). This allows incoming SYNs to go directly into the - * listener SYN processing function without going to tcp_input() first. + * conn_recv is used to pass up packets to the ULP. + * For TCP conn_recv changes. It is tcp_input_listener_unbound initially for + * a listener, and changes to tcp_input_listener as the listener has picked a + * good squeue. For other cases it is set to tcp_input_data. + * + * conn_recvicmp is used to pass up ICMP errors to the ULP. * * Classifier uses several hash tables: * @@ -91,8 +87,8 @@ * Connection Lookup: * ------------------ * - * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, zoneid, ip_stack) - * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, zoneid, ip_stack) + * conn_t *ipcl_classify_v4(mp, protocol, hdr_len, ira, ip_stack) + * conn_t *ipcl_classify_v6(mp, protocol, hdr_len, ira, ip_stack) * * Finds connection for an incoming IPv4 or IPv6 packet. Returns NULL if * it can't find any associated connection. If the connection is found, its @@ -107,9 +103,12 @@ * hdr_len: The size of IP header. It is used to find TCP or UDP header in * the packet. * - * zoneid: The zone in which the returned connection must be; the zoneid - * corresponding to the ire_zoneid on the IRE located for the - * packet's destination address. 
+ * ira->ira_zoneid: The zone in which the returned connection must be; the + * zoneid corresponding to the ire_zoneid on the IRE located for + * the packet's destination address. + * + * ira->ira_flags: Contains the IRAF_TX_MAC_EXEMPTABLE and + * IRAF_TX_SHARED_ADDR flags * * For TCP connections, the lookup order is as follows: * 5-tuple {src, dst, protocol, local port, remote port} @@ -156,7 +155,7 @@ * any. In any event, the receiving socket must have SO_MAC_EXEMPT set and the * receiver's label must dominate the sender's default label. * - * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack); + * conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack); * conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, * ip_stack); * @@ -184,34 +183,26 @@ * Table Updates * ------------- * - * int ipcl_conn_insert(connp, protocol, src, dst, ports) - * int ipcl_conn_insert_v6(connp, protocol, src, dst, ports, ifindex) + * int ipcl_conn_insert(connp); + * int ipcl_conn_insert_v4(connp); + * int ipcl_conn_insert_v6(connp); * * Insert 'connp' in the ipcl_conn_fanout. * Arguements : * connp conn_t to be inserted - * protocol connection protocol - * src source address - * dst destination address - * ports local and remote port - * ifindex interface index for IPv6 connections * * Return value : * 0 if connp was inserted * EADDRINUSE if the connection with the same tuple * already exists. * - * int ipcl_bind_insert(connp, protocol, src, lport); - * int ipcl_bind_insert_v6(connp, protocol, src, lport); + * int ipcl_bind_insert(connp); + * int ipcl_bind_insert_v4(connp); + * int ipcl_bind_insert_v6(connp); * * Insert 'connp' in ipcl_bind_fanout. 
* Arguements : * connp conn_t to be inserted - * protocol connection protocol - * src source address connection wants - * to bind to - * lport local port connection wants to - * bind to * * * void ipcl_hash_remove(connp); @@ -261,6 +252,8 @@ #include <netinet/icmp6.h> #include <inet/ip.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/ip_ndp.h> #include <inet/ip_impl.h> @@ -280,19 +273,6 @@ #include <sys/tsol/tnet.h> #include <sys/sockio.h> -#ifdef DEBUG -#define IPCL_DEBUG -#else -#undef IPCL_DEBUG -#endif - -#ifdef IPCL_DEBUG -int ipcl_debug_level = 0; -#define IPCL_DEBUG_LVL(level, args) \ - if (ipcl_debug_level & level) { printf args; } -#else -#define IPCL_DEBUG_LVL(level, args) {; } -#endif /* Old value for compatibility. Setable in /etc/system */ uint_t tcp_conn_hash_size = 0; @@ -336,10 +316,8 @@ typedef union itc_s { struct kmem_cache *tcp_conn_cache; struct kmem_cache *ip_conn_cache; -struct kmem_cache *ip_helper_stream_cache; extern struct kmem_cache *sctp_conn_cache; extern struct kmem_cache *tcp_sack_info_cache; -extern struct kmem_cache *tcp_iphc_cache; struct kmem_cache *udp_conn_cache; struct kmem_cache *rawip_conn_cache; struct kmem_cache *rts_conn_cache; @@ -362,34 +340,6 @@ static void rawip_conn_destructor(void *, void *); static int rts_conn_constructor(void *, void *, int); static void rts_conn_destructor(void *, void *); -static int ip_helper_stream_constructor(void *, void *, int); -static void ip_helper_stream_destructor(void *, void *); - -boolean_t ip_use_helper_cache = B_TRUE; - -/* - * Hook functions to enable cluster networking - * On non-clustered systems these vectors must always be NULL. 
- */ -extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, void *); -extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, void *); - -#ifdef IPCL_DEBUG -#define INET_NTOA_BUFSIZE 18 - -static char * -inet_ntoa_r(uint32_t in, char *b) -{ - unsigned char *p; - - p = (unsigned char *)∈ - (void) sprintf(b, "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); - return (b); -} -#endif - /* * Global (for all stack instances) init routine */ @@ -420,15 +370,6 @@ ipcl_g_init(void) sizeof (itc_t) + sizeof (rts_t), CACHE_ALIGN_SIZE, rts_conn_constructor, rts_conn_destructor, NULL, NULL, NULL, 0); - - if (ip_use_helper_cache) { - ip_helper_stream_cache = kmem_cache_create - ("ip_helper_stream_cache", sizeof (ip_helper_stream_info_t), - CACHE_ALIGN_SIZE, ip_helper_stream_constructor, - ip_helper_stream_destructor, NULL, NULL, NULL, 0); - } else { - ip_helper_stream_cache = NULL; - } } /* @@ -493,10 +434,10 @@ ipcl_init(ip_stack_t *ipst) MUTEX_DEFAULT, NULL); } - ipst->ips_ipcl_proto_fanout = kmem_zalloc(IPPROTO_MAX * + ipst->ips_ipcl_proto_fanout_v4 = kmem_zalloc(IPPROTO_MAX * sizeof (connf_t), KM_SLEEP); for (i = 0; i < IPPROTO_MAX; i++) { - mutex_init(&ipst->ips_ipcl_proto_fanout[i].connf_lock, NULL, + mutex_init(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock, NULL, MUTEX_DEFAULT, NULL); } @@ -576,11 +517,12 @@ ipcl_destroy(ip_stack_t *ipst) ipst->ips_ipcl_bind_fanout = NULL; for (i = 0; i < IPPROTO_MAX; i++) { - ASSERT(ipst->ips_ipcl_proto_fanout[i].connf_head == NULL); - mutex_destroy(&ipst->ips_ipcl_proto_fanout[i].connf_lock); + ASSERT(ipst->ips_ipcl_proto_fanout_v4[i].connf_head == NULL); + mutex_destroy(&ipst->ips_ipcl_proto_fanout_v4[i].connf_lock); } - kmem_free(ipst->ips_ipcl_proto_fanout, IPPROTO_MAX * sizeof (connf_t)); - ipst->ips_ipcl_proto_fanout = NULL; + kmem_free(ipst->ips_ipcl_proto_fanout_v4, + IPPROTO_MAX * sizeof (connf_t)); + ipst->ips_ipcl_proto_fanout_v4 = NULL; for (i = 0; i < IPPROTO_MAX; 
i++) { ASSERT(ipst->ips_ipcl_proto_fanout_v6[i].connf_head == NULL); @@ -636,7 +578,6 @@ conn_t * ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) { conn_t *connp; - sctp_stack_t *sctps; struct kmem_cache *conn_cache; switch (type) { @@ -644,10 +585,10 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) if ((connp = kmem_cache_alloc(sctp_conn_cache, sleep)) == NULL) return (NULL); sctp_conn_init(connp); - sctps = ns->netstack_sctp; - SCTP_G_Q_REFHOLD(sctps); netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; + ipcl_globalhash_insert(connp); return (connp); case IPCL_TCPCONN: @@ -681,6 +622,7 @@ ipcl_conn_create(uint32_t type, int sleep, netstack_t *ns) connp->conn_ref = 1; netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; ipcl_globalhash_insert(connp); return (connp); } @@ -693,61 +635,61 @@ ipcl_conn_destroy(conn_t *connp) ASSERT(!MUTEX_HELD(&connp->conn_lock)); ASSERT(connp->conn_ref == 0); - ASSERT(connp->conn_ire_cache == NULL); DTRACE_PROBE1(conn__destroy, conn_t *, connp); - if (connp->conn_effective_cred != NULL) { - crfree(connp->conn_effective_cred); - connp->conn_effective_cred = NULL; - } - if (connp->conn_cred != NULL) { crfree(connp->conn_cred); connp->conn_cred = NULL; } + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + } + ip_pkt_free(&connp->conn_xmit_ipp); + ipcl_globalhash_remove(connp); - /* FIXME: add separate tcp_conn_free()? 
*/ + if (connp->conn_latch != NULL) { + IPLATCH_REFRELE(connp->conn_latch); + connp->conn_latch = NULL; + } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_policy != NULL) { + IPPH_REFRELE(connp->conn_policy, ns); + connp->conn_policy = NULL; + } + + if (connp->conn_ipsec_opt_mp != NULL) { + freemsg(connp->conn_ipsec_opt_mp); + connp->conn_ipsec_opt_mp = NULL; + } + if (connp->conn_flags & IPCL_TCPCONN) { - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps; - - ASSERT(tcp != NULL); - tcps = tcp->tcp_tcps; - if (tcps != NULL) { - if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, ns); - connp->conn_latch = NULL; - } - if (connp->conn_policy != NULL) { - IPPH_REFRELE(connp->conn_policy, ns); - connp->conn_policy = NULL; - } - tcp->tcp_tcps = NULL; - TCPS_REFRELE(tcps); - } + tcp_t *tcp = connp->conn_tcp; tcp_free(tcp); mp = tcp->tcp_timercache; - tcp->tcp_cred = NULL; + + tcp->tcp_tcps = NULL; if (tcp->tcp_sack_info != NULL) { bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t)); kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info); } - if (tcp->tcp_iphc != NULL) { - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc_len = 0; - } - ASSERT(tcp->tcp_iphc_len == 0); /* * tcp_rsrv_mp can be NULL if tcp_get_conn() fails to allocate @@ -759,17 +701,15 @@ ipcl_conn_destroy(conn_t *connp) mutex_destroy(&tcp->tcp_rsrv_mp_lock); } - ASSERT(connp->conn_latch == NULL); - ASSERT(connp->conn_policy == NULL); - + ipcl_conn_cleanup(connp); + connp->conn_flags = IPCL_TCPCONN; if (ns != NULL) { ASSERT(tcp->tcp_tcps == NULL); connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; 
netstack_rele(ns); } - ipcl_conn_cleanup(connp); - connp->conn_flags = IPCL_TCPCONN; bzero(tcp, sizeof (tcp_t)); tcp->tcp_timercache = mp; @@ -777,18 +717,6 @@ ipcl_conn_destroy(conn_t *connp) kmem_cache_free(tcp_conn_cache, connp); return; } - if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); - connp->conn_latch = NULL; - } - if (connp->conn_policy != NULL) { - IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); - connp->conn_policy = NULL; - } - if (connp->conn_ipsec_opt_mp != NULL) { - freemsg(connp->conn_ipsec_opt_mp); - connp->conn_ipsec_opt_mp = NULL; - } if (connp->conn_flags & IPCL_SCTPCONN) { ASSERT(ns != NULL); @@ -796,21 +724,21 @@ ipcl_conn_destroy(conn_t *connp) return; } + ipcl_conn_cleanup(connp); if (ns != NULL) { connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; netstack_rele(ns); } - ipcl_conn_cleanup(connp); - /* leave conn_priv aka conn_udp, conn_icmp, etc in place. */ if (connp->conn_flags & IPCL_UDPCONN) { connp->conn_flags = IPCL_UDPCONN; kmem_cache_free(udp_conn_cache, connp); } else if (connp->conn_flags & IPCL_RAWIPCONN) { - connp->conn_flags = IPCL_RAWIPCONN; - connp->conn_ulp = IPPROTO_ICMP; + connp->conn_proto = IPPROTO_ICMP; + connp->conn_ixa->ixa_protocol = connp->conn_proto; kmem_cache_free(rawip_conn_cache, connp); } else if (connp->conn_flags & IPCL_RTSCONN) { connp->conn_flags = IPCL_RTSCONN; @@ -826,7 +754,6 @@ ipcl_conn_destroy(conn_t *connp) /* * Running in cluster mode - deregister listener information */ - static void ipcl_conn_unlisten(conn_t *connp) { @@ -837,12 +764,12 @@ ipcl_conn_unlisten(conn_t *connp) sa_family_t addr_family; uint8_t *laddrp; - if (connp->conn_pkt_isv6) { + if (connp->conn_ipversion == IPV6_VERSION) { addr_family = AF_INET6; - laddrp = (uint8_t *)&connp->conn_bound_source_v6; + laddrp = (uint8_t *)&connp->conn_bound_addr_v6; } else { addr_family = AF_INET; - laddrp = (uint8_t *)&connp->conn_bound_source; + laddrp = (uint8_t 
*)&connp->conn_bound_addr_v4; } (*cl_inet_unlisten)(connp->conn_netstack->netstack_stackid, IPPROTO_TCP, addr_family, laddrp, connp->conn_lport, NULL); @@ -859,8 +786,6 @@ ipcl_conn_unlisten(conn_t *connp) connf_t *connfp = (connp)->conn_fanout; \ ASSERT(!MUTEX_HELD(&((connp)->conn_lock))); \ if (connfp != NULL) { \ - IPCL_DEBUG_LVL(4, ("IPCL_HASH_REMOVE: connp %p", \ - (void *)(connp))); \ mutex_enter(&connfp->connf_lock); \ if ((connp)->conn_next != NULL) \ (connp)->conn_next->conn_prev = \ @@ -884,7 +809,11 @@ ipcl_conn_unlisten(conn_t *connp) void ipcl_hash_remove(conn_t *connp) { + uint8_t protocol = connp->conn_proto; + IPCL_HASH_REMOVE(connp); + if (protocol == IPPROTO_RSVP) + ill_set_inputfn_all(connp->conn_netstack->netstack_ip); } /* @@ -937,8 +866,6 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) } #define IPCL_HASH_INSERT_CONNECTED(connfp, connp) { \ - IPCL_DEBUG_LVL(8, ("IPCL_HASH_INSERT_CONNECTED: connfp %p " \ - "connp %p", (void *)(connfp), (void *)(connp))); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); \ @@ -947,13 +874,11 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) #define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ conn_t *pconnp = NULL, *nconnp; \ - IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_BOUND: connfp %p " \ - "connp %p", (void *)connfp, (void *)(connp))); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ nconnp = (connfp)->connf_head; \ while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_srcv6)) { \ + !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ pconnp = nconnp; \ nconnp = nconnp->conn_next; \ } \ @@ -977,16 +902,14 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) #define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ conn_t **list, *prev, *next; \ boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_srcv6); \ - IPCL_DEBUG_LVL(32, ("IPCL_HASH_INSERT_WILDCARD: connfp %p " \ - "connp %p", (void 
*)(connfp), (void *)(connp))); \ + IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ IPCL_HASH_REMOVE((connp)); \ mutex_enter(&(connfp)->connf_lock); \ list = &(connfp)->connf_head; \ prev = NULL; \ while ((next = *list) != NULL) { \ if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_srcv6) && \ + IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ connp->conn_zoneid == next->conn_zoneid) { \ (connp)->conn_next = next; \ if (prev != NULL) \ @@ -1012,44 +935,13 @@ ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) IPCL_HASH_INSERT_WILDCARD(connfp, connp); } -void -ipcl_proto_insert(conn_t *connp, uint8_t protocol) -{ - connf_t *connfp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); - ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - protocol == IPPROTO_AH || protocol == IPPROTO_ESP); - - connp->conn_ulp = protocol; - - /* Insert it in the protocol hash */ - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; - IPCL_HASH_INSERT_WILDCARD(connfp, connp); -} - -void -ipcl_proto_insert_v6(conn_t *connp, uint8_t protocol) -{ - connf_t *connfp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); - ASSERT((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - protocol == IPPROTO_AH || protocol == IPPROTO_ESP); - - connp->conn_ulp = protocol; - - /* Insert it in the Bind Hash */ - connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; - IPCL_HASH_INSERT_WILDCARD(connfp, connp); -} - /* * Because the classifier is used to classify inbound packets, the destination * address is meant to be our local tunnel address (tunnel source), and the * source the remote tunnel address (tunnel destination). + * + * Note that conn_proto can't be used for fanout since the upper protocol + * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. 
*/ conn_t * ipcl_iptun_classify_v4(ipaddr_t *src, ipaddr_t *dst, ip_stack_t *ipst) @@ -1128,13 +1020,13 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) oconnp = oconnp->conn_next) { if (oconnp->conn_lport == lport && oconnp->conn_zoneid == connp->conn_zoneid && - oconnp->conn_af_isv6 == connp->conn_af_isv6 && - ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_srcv6)) || - IN6_ARE_ADDR_EQUAL(&oconnp->conn_srcv6, - &connp->conn_srcv6))) { + oconnp->conn_family == connp->conn_family && + ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_IS_ADDR_UNSPECIFIED(&oconnp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&oconnp->conn_laddr_v6)) || + IN6_ARE_ADDR_EQUAL(&oconnp->conn_laddr_v6, + &connp->conn_laddr_v6))) { break; } } @@ -1142,10 +1034,10 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) if (oconnp != NULL) return (EADDRNOTAVAIL); - if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_remv6)) { - if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_srcv6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } else { IPCL_HASH_INSERT_BOUND(connfp, connp); @@ -1157,17 +1049,18 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) } static int -ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst, - ip_stack_t *ipst) +ipcl_iptun_hash_insert(conn_t *connp, ip_stack_t *ipst) { connf_t *connfp; conn_t *tconnp; + ipaddr_t laddr = connp->conn_laddr_v4; + ipaddr_t faddr = connp->conn_faddr_v4; - connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(src, dst)]; + 
connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH(laddr, faddr)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_IPTUN_MATCH(tconnp, src, dst)) { + if (IPCL_IPTUN_MATCH(tconnp, laddr, faddr)) { /* A tunnel is already bound to these addresses. */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1179,17 +1072,18 @@ ipcl_iptun_hash_insert(conn_t *connp, ipaddr_t src, ipaddr_t dst, } static int -ipcl_iptun_hash_insert_v6(conn_t *connp, const in6_addr_t *src, - const in6_addr_t *dst, ip_stack_t *ipst) +ipcl_iptun_hash_insert_v6(conn_t *connp, ip_stack_t *ipst) { connf_t *connfp; conn_t *tconnp; + in6_addr_t *laddr = &connp->conn_laddr_v6; + in6_addr_t *faddr = &connp->conn_faddr_v6; - connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(src, dst)]; + connfp = &ipst->ips_ipcl_iptun_fanout[IPCL_IPTUN_HASH_V6(laddr, faddr)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_IPTUN_MATCH_V6(tconnp, src, dst)) { + if (IPCL_IPTUN_MATCH_V6(tconnp, laddr, faddr)) { /* A tunnel is already bound to these addresses. 
*/ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1213,12 +1107,12 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) connf_t *connfp; conn_t *tconn; - connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[connp->conn_proto]; mutex_enter(&connfp->connf_lock); for (tconn = connfp->connf_head; tconn != NULL; tconn = tconn->conn_next) { /* We don't allow v4 fallback for v6 raw socket */ - if (connp->conn_af_isv6 != tconn->conn_af_isv6) + if (connp->conn_family != tconn->conn_family) continue; /* If neither is exempt, then there's no conflict */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && @@ -1228,9 +1122,9 @@ check_exempt_conflict_v4(conn_t *connp, ip_stack_t *ipst) if (connp->conn_zoneid == tconn->conn_zoneid) continue; /* If both are bound to different specific addrs, ok */ - if (connp->conn_src != INADDR_ANY && - tconn->conn_src != INADDR_ANY && - connp->conn_src != tconn->conn_src) + if (connp->conn_laddr_v4 != INADDR_ANY && + tconn->conn_laddr_v4 != INADDR_ANY && + connp->conn_laddr_v4 != tconn->conn_laddr_v4) continue; /* These two conflict; fail */ break; @@ -1245,12 +1139,12 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) connf_t *connfp; conn_t *tconn; - connfp = &ipst->ips_ipcl_proto_fanout[connp->conn_ulp]; + connfp = &ipst->ips_ipcl_proto_fanout_v6[connp->conn_proto]; mutex_enter(&connfp->connf_lock); for (tconn = connfp->connf_head; tconn != NULL; tconn = tconn->conn_next) { /* We don't allow v4 fallback for v6 raw socket */ - if (connp->conn_af_isv6 != tconn->conn_af_isv6) + if (connp->conn_family != tconn->conn_family) continue; /* If neither is exempt, then there's no conflict */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) && @@ -1260,9 +1154,10 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) if (connp->conn_zoneid == tconn->conn_zoneid) continue; /* If both are bound to different addrs, ok */ - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) && - 
!IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_srcv6) && - !IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, &tconn->conn_srcv6)) + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&tconn->conn_laddr_v6) && + !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &tconn->conn_laddr_v6)) continue; /* These two conflict; fail */ break; @@ -1273,28 +1168,29 @@ check_exempt_conflict_v6(conn_t *connp, ip_stack_t *ipst) /* * (v4, v6) bind hash insertion routines + * The caller has already setup the conn (conn_proto, conn_laddr_v6, conn_lport) */ + +int +ipcl_bind_insert(conn_t *connp) +{ + if (connp->conn_ipversion == IPV6_VERSION) + return (ipcl_bind_insert_v6(connp)); + else + return (ipcl_bind_insert_v4(connp)); +} + int -ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) +ipcl_bind_insert_v4(conn_t *connp) { connf_t *connfp; -#ifdef IPCL_DEBUG - char buf[INET_NTOA_BUFSIZE]; -#endif int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp); - - IPCL_DEBUG_LVL(64, ("ipcl_bind_insert: connp %p, src = %s, " - "port = %d\n", (void *)connp, inet_ntoa_r(src, buf), lport)); - - connp->conn_ulp = protocol; - IN6_IPADDR_TO_V4MAPPED(src, &connp->conn_srcv6); - connp->conn_lport = lport; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert(connp, src, INADDR_ANY, ipst)); + return (ipcl_iptun_hash_insert(connp, ipst)); switch (protocol) { default: @@ -1304,45 +1200,40 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) /* FALLTHROUGH */ case IPPROTO_UDP: if (protocol == IPPROTO_UDP) { - IPCL_DEBUG_LVL(64, - ("ipcl_bind_insert: connp %p - udp\n", - (void *)connp)); connfp = &ipst->ips_ipcl_udp_fanout[ IPCL_UDP_HASH(lport, ipst)]; } else { - IPCL_DEBUG_LVL(64, - ("ipcl_bind_insert: connp %p - protocol\n", - (void *)connp)); - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = 
&ipst->ips_ipcl_proto_fanout_v4[protocol]; } - if (connp->conn_rem != INADDR_ANY) { + if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (connp->conn_src != INADDR_ANY) { + } else if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } + if (protocol == IPPROTO_RSVP) + ill_set_inputfn_all(ipst); break; case IPPROTO_TCP: - /* Insert it in the Bind Hash */ ASSERT(connp->conn_zoneid != ALL_ZONES); connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; - if (connp->conn_src != INADDR_ANY) { + if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); } if (cl_inet_listen != NULL) { - ASSERT(!connp->conn_pkt_isv6); + ASSERT(connp->conn_ipversion == IPV4_VERSION); connp->conn_flags |= IPCL_CL_LISTENER; (*cl_inet_listen)( connp->conn_netstack->netstack_stackid, IPPROTO_TCP, AF_INET, - (uint8_t *)&connp->conn_bound_source, lport, NULL); + (uint8_t *)&connp->conn_bound_addr_v4, lport, NULL); } break; @@ -1355,20 +1246,16 @@ ipcl_bind_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, uint16_t lport) } int -ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, - uint16_t lport) +ipcl_bind_insert_v6(conn_t *connp) { connf_t *connfp; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - ASSERT(connp != NULL); connp->conn_ulp = protocol; - connp->conn_srcv6 = *src; - connp->conn_lport = lport; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) { - return (ipcl_iptun_hash_insert_v6(connp, src, &ipv6_all_zeros, - ipst)); + return (ipcl_iptun_hash_insert_v6(connp, ipst)); } switch (protocol) { @@ -1379,21 +1266,15 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, /* FALLTHROUGH */ case IPPROTO_UDP: if (protocol == IPPROTO_UDP) { - IPCL_DEBUG_LVL(128, - 
("ipcl_bind_insert_v6: connp %p - udp\n", - (void *)connp)); connfp = &ipst->ips_ipcl_udp_fanout[ IPCL_UDP_HASH(lport, ipst)]; } else { - IPCL_DEBUG_LVL(128, - ("ipcl_bind_insert_v6: connp %p - protocol\n", - (void *)connp)); connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; } - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1401,13 +1282,11 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, break; case IPPROTO_TCP: - /* XXX - Need a separate table for IN6_IS_ADDR_UNSPECIFIED? */ - /* Insert it in the Bind Hash */ ASSERT(connp->conn_zoneid != ALL_ZONES); connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1416,13 +1295,13 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, sa_family_t addr_family; uint8_t *laddrp; - if (connp->conn_pkt_isv6) { + if (connp->conn_ipversion == IPV6_VERSION) { addr_family = AF_INET6; laddrp = - (uint8_t *)&connp->conn_bound_source_v6; + (uint8_t *)&connp->conn_bound_addr_v6; } else { addr_family = AF_INET; - laddrp = (uint8_t *)&connp->conn_bound_source; + laddrp = (uint8_t *)&connp->conn_bound_addr_v4; } connp->conn_flags |= IPCL_CL_LISTENER; (*cl_inet_listen)( @@ -1441,43 +1320,35 @@ ipcl_bind_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, /* * ipcl_conn_hash insertion routines. + * The caller has already set conn_proto and the addresses/ports in the conn_t. 
*/ + int -ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, - ipaddr_t rem, uint32_t ports) +ipcl_conn_insert(conn_t *connp) +{ + if (connp->conn_ipversion == IPV6_VERSION) + return (ipcl_conn_insert_v6(connp)); + else + return (ipcl_conn_insert_v4(connp)); +} + +int +ipcl_conn_insert_v4(conn_t *connp) { connf_t *connfp; - uint16_t *up; conn_t *tconnp; -#ifdef IPCL_DEBUG - char sbuf[INET_NTOA_BUFSIZE], rbuf[INET_NTOA_BUFSIZE]; -#endif - in_port_t lport; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - IPCL_DEBUG_LVL(256, ("ipcl_conn_insert: connp %p, src = %s, " - "dst = %s, ports = %x, protocol = %x", (void *)connp, - inet_ntoa_r(src, sbuf), inet_ntoa_r(rem, rbuf), - ports, protocol)); + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert(connp, src, rem, ipst)); + return (ipcl_iptun_hash_insert(connp, ipst)); switch (protocol) { case IPPROTO_TCP: - if (!(connp->conn_flags & IPCL_EAGER)) { - /* - * for a eager connection, i.e connections which - * have just been created, the initialization is - * already done in ip at conn_creation time, so - * we can skip the checks here. - */ - IPCL_CONN_INIT(connp, protocol, src, rem, ports); - } - /* - * For tcp, we check whether the connection tuple already + * For TCP, we check whether the connection tuple already * exists before allowing the connection to proceed. We * also allow indexing on the zoneid. This is to allow * multiple shared stack zones to have the same tcp @@ -1486,16 +1357,15 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, * doesn't have to be unique. 
*/ connfp = &ipst->ips_ipcl_conn_fanout[ - IPCL_CONN_HASH(connp->conn_rem, + IPCL_CONN_HASH(connp->conn_faddr_v4, connp->conn_ports, ipst)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if ((IPCL_CONN_MATCH(tconnp, connp->conn_ulp, - connp->conn_rem, connp->conn_src, - connp->conn_ports)) && - (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { - + if (IPCL_CONN_MATCH(tconnp, connp->conn_proto, + connp->conn_faddr_v4, connp->conn_laddr_v4, + connp->conn_ports) && + IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { /* Already have a conn. bail out */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1512,6 +1382,7 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, } ASSERT(connp->conn_recv != NULL); + ASSERT(connp->conn_recvicmp != NULL); IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); mutex_exit(&connfp->connf_lock); @@ -1523,7 +1394,6 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, * from the hash first. 
*/ IPCL_HASH_REMOVE(connp); - lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); ret = ipcl_sctp_hash_insert(connp, lport); break; @@ -1540,18 +1410,16 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, /* FALLTHROUGH */ case IPPROTO_UDP: - up = (uint16_t *)&ports; - IPCL_CONN_INIT(connp, protocol, src, rem, ports); if (protocol == IPPROTO_UDP) { connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(up[1], ipst)]; + IPCL_UDP_HASH(lport, ipst)]; } else { - connfp = &ipst->ips_ipcl_proto_fanout[protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[protocol]; } - if (connp->conn_rem != INADDR_ANY) { + if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (connp->conn_src != INADDR_ANY) { + } else if (connp->conn_laddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1563,25 +1431,21 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, } int -ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, - const in6_addr_t *rem, uint32_t ports, uint_t ifindex) +ipcl_conn_insert_v6(conn_t *connp) { connf_t *connfp; - uint16_t *up; conn_t *tconnp; - in_port_t lport; int ret = 0; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint16_t lport = connp->conn_lport; + uint8_t protocol = connp->conn_proto; + uint_t ifindex = connp->conn_bound_if; if (IPCL_IS_IPTUN(connp)) - return (ipcl_iptun_hash_insert_v6(connp, src, rem, ipst)); + return (ipcl_iptun_hash_insert_v6(connp, ipst)); switch (protocol) { case IPPROTO_TCP: - /* Just need to insert a conn struct */ - if (!(connp->conn_flags & IPCL_EAGER)) { - IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); - } /* * For tcp, we check whether the connection tuple already @@ -1593,17 +1457,18 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, * doesn't have to be unique. 
*/ connfp = &ipst->ips_ipcl_conn_fanout[ - IPCL_CONN_HASH_V6(connp->conn_remv6, connp->conn_ports, + IPCL_CONN_HASH_V6(connp->conn_faddr_v6, connp->conn_ports, ipst)]; mutex_enter(&connfp->connf_lock); for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_ulp, - connp->conn_remv6, connp->conn_srcv6, + /* NOTE: need to match zoneid. Bug in onnv-gate */ + if (IPCL_CONN_MATCH_V6(tconnp, connp->conn_proto, + connp->conn_faddr_v6, connp->conn_laddr_v6, connp->conn_ports) && - (tconnp->conn_tcp->tcp_bound_if == 0 || - tconnp->conn_tcp->tcp_bound_if == ifindex) && - (IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid))) { + (tconnp->conn_bound_if == 0 || + tconnp->conn_bound_if == ifindex) && + IPCL_ZONE_MATCH(tconnp, connp->conn_zoneid)) { /* Already have a conn. bail out */ mutex_exit(&connfp->connf_lock); return (EADDRINUSE); @@ -1624,7 +1489,6 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, case IPPROTO_SCTP: IPCL_HASH_REMOVE(connp); - lport = htons((uint16_t)(ntohl(ports) & 0xFFFF)); ret = ipcl_sctp_hash_insert(connp, lport); break; @@ -1634,18 +1498,16 @@ ipcl_conn_insert_v6(conn_t *connp, uint8_t protocol, const in6_addr_t *src, return (EADDRINUSE); /* FALLTHROUGH */ case IPPROTO_UDP: - up = (uint16_t *)&ports; - IPCL_CONN_INIT_V6(connp, protocol, *src, *rem, ports); if (protocol == IPPROTO_UDP) { connfp = &ipst->ips_ipcl_udp_fanout[ - IPCL_UDP_HASH(up[1], ipst)]; + IPCL_UDP_HASH(lport, ipst)]; } else { connfp = &ipst->ips_ipcl_proto_fanout_v6[protocol]; } - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6)) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); - } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6)) { + } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { IPCL_HASH_INSERT_BOUND(connfp, connp); } else { IPCL_HASH_INSERT_WILDCARD(connfp, connp); @@ -1667,8 +1529,8 @@ ipcl_conn_insert_v6(conn_t 
*connp, uint8_t protocol, const in6_addr_t *src, * zone, then label checks are omitted. */ conn_t * -ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, - ip_stack_t *ipst) +ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, + ip_recv_attr_t *ira, ip_stack_t *ipst) { ipha_t *ipha; connf_t *connfp, *bind_connfp; @@ -1677,8 +1539,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, uint32_t ports; conn_t *connp; uint16_t *up; - boolean_t shared_addr; - boolean_t unlabeled; + zoneid_t zoneid = ira->ira_zoneid; ipha = (ipha_t *)mp->b_rptr; up = (uint16_t *)((uchar_t *)ipha + hdr_len + TCP_PORTS_OFFSET); @@ -1692,11 +1553,14 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if ((IPCL_CONN_MATCH(connp, protocol, - ipha->ipha_src, ipha->ipha_dst, ports)) && - (IPCL_ZONE_MATCH(connp, zoneid))) { + if (IPCL_CONN_MATCH(connp, protocol, + ipha->ipha_src, ipha->ipha_dst, ports) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; - } } if (connp != NULL) { @@ -1713,48 +1577,19 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } mutex_exit(&connfp->connf_lock); - lport = up[1]; - unlabeled = B_FALSE; - /* Cred cannot be null on IPv4 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - unlabeled = (crgetlabel(cr)->tsl_flags & - TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. 
- */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; mutex_enter(&bind_connfp->connf_lock); for (connp = bind_connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_BIND_MATCH(connp, protocol, ipha->ipha_dst, - lport) && (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + lport) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } @@ -1762,16 +1597,17 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * If the matching connection is SLP on a private address, then * the label on the packet must match the local zone's label. * Otherwise, it must be in the label range defined by tnrh. - * This is ensured by tsol_receive_label. + * This is ensured by tsol_receive_local. + * + * Note that we don't check tsol_receive_local for + * the connected case. 
*/ - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, - shared_addr, connp)) { - DTRACE_PROBE3( - tx__ip__log__info__classify__tcp, - char *, - "connp(1) could not receive mp(2)", - conn_t *, connp, mblk_t *, mp); + ira, connp)) { + DTRACE_PROBE3(tx__ip__log__info__classify__tcp, + char *, "connp(1) could not receive mp(2)", + conn_t *, connp, mblk_t *, mp); connp = NULL; } @@ -1783,61 +1619,27 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } mutex_exit(&bind_connfp->connf_lock); - - IPCL_DEBUG_LVL(512, - ("ipcl_classify: couldn't classify mp = %p\n", - (void *)mp)); break; case IPPROTO_UDP: lport = up[1]; - unlabeled = B_FALSE; - /* Cred cannot be null on IPv4 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - unlabeled = (crgetlabel(cr)->tsl_flags & - TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. 
- */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } fport = up[0]; - IPCL_DEBUG_LVL(512, ("ipcl_udp_classify %x %x", lport, fport)); connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_UDP_MATCH(connp, lport, ipha->ipha_dst, fport, ipha->ipha_src) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ipha->ipha_dst, IPV4_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__udp, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -1854,9 +1656,7 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * We shouldn't come here for multicast/broadcast packets */ mutex_exit(&connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify: cant find udp conn_t for ports : %x %x", - lport, fport)); + break; case IPPROTO_ENCAP: @@ -1869,26 +1669,25 @@ ipcl_classify_v4(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, } conn_t * -ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, - ip_stack_t *ipst) +ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, + ip_recv_attr_t *ira, ip_stack_t *ipst) { ip6_t *ip6h; connf_t *connfp, *bind_connfp; uint16_t lport; uint16_t fport; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t ports; conn_t *connp; uint16_t *up; - boolean_t shared_addr; - boolean_t unlabeled; + zoneid_t zoneid = ira->ira_zoneid; ip6h = (ip6_t *)mp->b_rptr; switch (protocol) { case 
IPPROTO_TCP: - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - up = (uint16_t *)tcph->th_lport; + tcpha = (tcpha_t *)&mp->b_rptr[hdr_len]; + up = &tcpha->tha_lport; ports = *(uint32_t *)up; connfp = @@ -1897,11 +1696,14 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if ((IPCL_CONN_MATCH_V6(connp, protocol, - ip6h->ip6_src, ip6h->ip6_dst, ports)) && - (IPCL_ZONE_MATCH(connp, zoneid))) { + if (IPCL_CONN_MATCH_V6(connp, protocol, + ip6h->ip6_src, ip6h->ip6_dst, ports) && + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; - } } if (connp != NULL) { @@ -1920,37 +1722,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, mutex_exit(&connfp->connf_lock); lport = up[1]; - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. 
- */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - bind_connfp = &ipst->ips_ipcl_bind_fanout[IPCL_BIND_HASH(lport, ipst)]; mutex_enter(&bind_connfp->connf_lock); @@ -1958,15 +1729,17 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, connp = connp->conn_next) { if (IPCL_BIND_MATCH_V6(connp, protocol, ip6h->ip6_dst, lport) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__tcp6, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -1977,72 +1750,33 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, /* Have a listner at least */ CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: found listner " - "connp = %p\n", (void *)connp)); - return (connp); } mutex_exit(&bind_connfp->connf_lock); - - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: couldn't classify mp = %p\n", - (void *)mp)); break; case IPPROTO_UDP: up = (uint16_t *)&mp->b_rptr[hdr_len]; lport = up[1]; - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. 
- */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * - * If there is such a zone, we prefer to find a - * connection in it. Otherwise, we look for a - * MAC-exempt connection in any zone whose label - * dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; - } - fport = up[0]; - IPCL_DEBUG_LVL(512, ("ipcl_udp_classify_v6 %x %x", lport, - fport)); connfp = &ipst->ips_ipcl_udp_fanout[IPCL_UDP_HASH(lport, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { if (IPCL_UDP_MATCH_V6(connp, lport, ip6h->ip6_dst, fport, ip6h->ip6_src) && - (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && shared_addr && - (connp->conn_mac_mode != CONN_MAC_DEFAULT)))) + (connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR)))) break; } - if (connp != NULL && is_system_labeled() && + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && !tsol_receive_local(mp, &ip6h->ip6_dst, IPV6_VERSION, - shared_addr, connp)) { + ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__udp6, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -2059,9 +1793,6 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * We shouldn't come here for multicast/broadcast packets */ mutex_exit(&connfp->connf_lock); - IPCL_DEBUG_LVL(512, - ("ipcl_classify_v6: cant find udp conn_t for ports : %x %x", - lport, fport)); break; case IPPROTO_ENCAP: case IPPROTO_IPV6: @@ -2076,125 +1807,80 @@ ipcl_classify_v6(mblk_t *mp, uint8_t protocol, uint_t hdr_len, zoneid_t zoneid, * wrapper around 
ipcl_classify_(v4,v6) routines. */ conn_t * -ipcl_classify(mblk_t *mp, zoneid_t zoneid, ip_stack_t *ipst) +ipcl_classify(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - uint16_t hdr_len; - ipha_t *ipha; - uint8_t *nexthdrp; - - if (MBLKL(mp) < sizeof (ipha_t)) - return (NULL); - - switch (IPH_HDR_VERSION(mp->b_rptr)) { - case IPV4_VERSION: - ipha = (ipha_t *)mp->b_rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - return (ipcl_classify_v4(mp, ipha->ipha_protocol, hdr_len, - zoneid, ipst)); - case IPV6_VERSION: - if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, - &hdr_len, &nexthdrp)) - return (NULL); - - return (ipcl_classify_v6(mp, *nexthdrp, hdr_len, zoneid, ipst)); + if (ira->ira_flags & IRAF_IS_IPV4) { + return (ipcl_classify_v4(mp, ira->ira_protocol, + ira->ira_ip_hdr_length, ira, ipst)); + } else { + return (ipcl_classify_v6(mp, ira->ira_protocol, + ira->ira_ip_hdr_length, ira, ipst)); } - - return (NULL); } +/* + * Only used to classify SCTP RAW sockets + */ conn_t * -ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, - uint32_t ports, ipha_t *hdr, ip_stack_t *ipst) +ipcl_classify_raw(mblk_t *mp, uint8_t protocol, uint32_t ports, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, ip_stack_t *ipst) { connf_t *connfp; conn_t *connp; in_port_t lport; - int af; - boolean_t shared_addr; - boolean_t unlabeled; + int ipversion; const void *dst; + zoneid_t zoneid = ira->ira_zoneid; lport = ((uint16_t *)&ports)[1]; - - unlabeled = B_FALSE; - /* Cred can be null on IPv6 */ - if (is_system_labeled()) { - cred_t *cr = msg_getcred(mp, NULL); - - unlabeled = (cr != NULL && - crgetlabel(cr)->tsl_flags & TSLF_UNLABELED) != 0; - } - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since ALL_ZONES - * only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(protocol, lport); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. 
In that case, we assume it's SLP, and search for - * the zone based on the packet label. - * - * If there is such a zone, we prefer to find a connection in - * it. Otherwise, we look for a MAC-exempt connection in any - * zone whose label dominates the default label on the packet. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - else - unlabeled = B_FALSE; + if (ira->ira_flags & IRAF_IS_IPV4) { + dst = (const void *)&ipha->ipha_dst; + ipversion = IPV4_VERSION; + } else { + dst = (const void *)&ip6h->ip6_dst; + ipversion = IPV6_VERSION; } - af = IPH_HDR_VERSION(hdr); - dst = af == IPV4_VERSION ? (const void *)&hdr->ipha_dst : - (const void *)&((ip6_t *)hdr)->ip6_dst; connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(ntohs(lport), ipst)]; - mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { /* We don't allow v4 fallback for v6 raw socket. */ - if (af == (connp->conn_af_isv6 ? IPV4_VERSION : - IPV6_VERSION)) + if (ipversion != connp->conn_ipversion) continue; - if (connp->conn_fully_bound) { - if (af == IPV4_VERSION) { + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + if (ipversion == IPV4_VERSION) { if (!IPCL_CONN_MATCH(connp, protocol, - hdr->ipha_src, hdr->ipha_dst, ports)) + ipha->ipha_src, ipha->ipha_dst, ports)) continue; } else { if (!IPCL_CONN_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_src, - ((ip6_t *)hdr)->ip6_dst, ports)) + ip6h->ip6_src, ip6h->ip6_dst, ports)) continue; } } else { - if (af == IPV4_VERSION) { + if (ipversion == IPV4_VERSION) { if (!IPCL_BIND_MATCH(connp, protocol, - hdr->ipha_dst, lport)) + ipha->ipha_dst, lport)) continue; } else { if (!IPCL_BIND_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_dst, lport)) + ip6h->ip6_dst, lport)) continue; } } - if (IPCL_ZONE_MATCH(connp, zoneid) || - (unlabeled && - (connp->conn_mac_mode != CONN_MAC_DEFAULT) && - shared_addr)) + if (connp->conn_zoneid == 
zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (ira->ira_flags & IRAF_TX_MAC_EXEMPTABLE) && + (ira->ira_flags & IRAF_TX_SHARED_ADDR))) break; } - /* - * If the connection is fully-bound and connection-oriented (TCP or - * SCTP), then we've already validated the remote system's label. - * There's no need to do it again for every packet. - */ - if (connp != NULL && is_system_labeled() && (!connp->conn_fully_bound || - !(connp->conn_flags & (IPCL_TCP|IPCL_SCTPCONN))) && - !tsol_receive_local(mp, dst, af, shared_addr, connp)) { + + if (connp != NULL && (ira->ira_flags & IRAF_SYSTEM_LABELED) && + !tsol_receive_local(mp, dst, ipversion, ira, connp)) { DTRACE_PROBE3(tx__ip__log__info__classify__rawip, char *, "connp(1) could not receive mp(2)", conn_t *, connp, mblk_t *, mp); @@ -2205,22 +1891,22 @@ ipcl_classify_raw(mblk_t *mp, uint8_t protocol, zoneid_t zoneid, goto found; mutex_exit(&connfp->connf_lock); - /* Try to look for a wildcard match. */ + /* Try to look for a wildcard SCTP RAW socket match. */ connfp = &ipst->ips_ipcl_raw_fanout[IPCL_RAW_HASH(0, ipst)]; mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { /* We don't allow v4 fallback for v6 raw socket. */ - if ((af == (connp->conn_af_isv6 ? 
IPV4_VERSION : - IPV6_VERSION)) || !IPCL_ZONE_MATCH(connp, zoneid)) { + if (ipversion != connp->conn_ipversion) continue; - } - if (af == IPV4_VERSION) { - if (IPCL_RAW_MATCH(connp, protocol, hdr->ipha_dst)) + if (!IPCL_ZONE_MATCH(connp, zoneid)) + continue; + + if (ipversion == IPV4_VERSION) { + if (IPCL_RAW_MATCH(connp, protocol, ipha->ipha_dst)) break; } else { - if (IPCL_RAW_MATCH_V6(connp, protocol, - ((ip6_t *)hdr)->ip6_dst)) { + if (IPCL_RAW_MATCH_V6(connp, protocol, ip6h->ip6_dst)) { break; } } @@ -2253,11 +1939,23 @@ tcp_conn_constructor(void *buf, void *cdrarg, int kmflags) mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); cv_init(&connp->conn_sq_cv, NULL, CV_DEFAULT, NULL); - tcp->tcp_timercache = tcp_timermp_alloc(KM_NOSLEEP); + tcp->tcp_timercache = tcp_timermp_alloc(kmflags); + if (tcp->tcp_timercache == NULL) + return (ENOMEM); connp->conn_tcp = tcp; connp->conn_flags = IPCL_TCPCONN; - connp->conn_ulp = IPPROTO_TCP; + connp->conn_proto = IPPROTO_TCP; tcp->tcp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) { + tcp_timermp_free(tcp); + return (ENOMEM); + } + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2276,6 +1974,15 @@ tcp_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); cv_destroy(&connp->conn_sq_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2289,7 +1996,13 @@ ip_conn_constructor(void *buf, void *cdrarg, int kmflags) 
mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_flags = IPCL_IPCCONN; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2304,6 +2017,15 @@ ip_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_priv == NULL); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2321,8 +2043,15 @@ udp_conn_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_udp = udp; connp->conn_flags = IPCL_UDPCONN; - connp->conn_ulp = IPPROTO_UDP; + connp->conn_proto = IPPROTO_UDP; udp->udp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2339,6 +2068,15 @@ udp_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_udp == udp); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2356,8 +2094,15 @@ 
rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); connp->conn_icmp = icmp; connp->conn_flags = IPCL_RAWIPCONN; - connp->conn_ulp = IPPROTO_ICMP; + connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2374,6 +2119,15 @@ rawip_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_icmp == icmp); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } /* ARGSUSED */ @@ -2392,6 +2146,12 @@ rts_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_rts = rts; connp->conn_flags = IPCL_RTSCONN; rts->rts_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) + return (ENOMEM); + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2408,71 +2168,35 @@ rts_conn_destructor(void *buf, void *cdrarg) ASSERT(connp->conn_rts == rts); mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); -} + rw_destroy(&connp->conn_ilg_lock); -/* ARGSUSED */ -int -ip_helper_stream_constructor(void *buf, void *cdrarg, int kmflags) -{ - int error; - netstack_t *ns; - int ret; - tcp_stack_t *tcps; - ip_helper_stream_info_t *ip_helper_str; - ip_stack_t *ipst; - - ns = netstack_find_by_cred(kcred); - ASSERT(ns != 
NULL); - tcps = ns->netstack_tcp; - ipst = ns->netstack_ip; - ASSERT(tcps != NULL); - ip_helper_str = (ip_helper_stream_info_t *)buf; - - do { - error = ldi_open_by_name(DEV_IP, IP_HELPER_STR, kcred, - &ip_helper_str->iphs_handle, ipst->ips_ldi_ident); - } while (error == EINTR); - - if (error == 0) { - do { - error = ldi_ioctl( - ip_helper_str->iphs_handle, SIOCSQPTR, - (intptr_t)buf, FKIOCTL, kcred, &ret); - } while (error == EINTR); - - if (error != 0) { - (void) ldi_close( - ip_helper_str->iphs_handle, 0, kcred); - } + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); } - - netstack_rele(ipst->ips_netstack); - - return (error); } -/* ARGSUSED */ -static void -ip_helper_stream_destructor(void *buf, void *cdrarg) -{ - ip_helper_stream_info_t *ip_helper_str = (ip_helper_stream_info_t *)buf; - - ip_helper_str->iphs_rq->q_ptr = - ip_helper_str->iphs_wq->q_ptr = - ip_helper_str->iphs_minfo; - (void) ldi_close(ip_helper_str->iphs_handle, 0, kcred); -} - - /* * Called as part of ipcl_conn_destroy to assert and clear any pointers * in the conn_t. + * + * Below we list all the pointers in the conn_t as a documentation aid. + * The ones that we can not ASSERT to be NULL are #ifdef'ed out. + * If you add any pointers to the conn_t please add an ASSERT here + * and #ifdef it out if it can't be actually asserted to be NULL. + * In any case, we bzero most of the conn_t at the end of the function. 
*/ void ipcl_conn_cleanup(conn_t *connp) { - ASSERT(connp->conn_ire_cache == NULL); + ip_xmit_attr_t *ixa; + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_latch_in_policy == NULL); + ASSERT(connp->conn_latch_in_action == NULL); #ifdef notdef ASSERT(connp->conn_rq == NULL); ASSERT(connp->conn_wq == NULL); @@ -2485,18 +2209,6 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_fanout == NULL); ASSERT(connp->conn_next == NULL); ASSERT(connp->conn_prev == NULL); -#ifdef notdef - /* - * The ill and ipif pointers are not cleared before the conn_t - * goes away since they do not hold a reference on the ill/ipif. - * We should replace these pointers with ifindex/ipaddr_t to - * make the code less complex. - */ - ASSERT(connp->conn_outgoing_ill == NULL); - ASSERT(connp->conn_incoming_ill == NULL); - ASSERT(connp->conn_multicast_ipif == NULL); - ASSERT(connp->conn_multicast_ill == NULL); -#endif ASSERT(connp->conn_oper_pending_ill == NULL); ASSERT(connp->conn_ilg == NULL); ASSERT(connp->conn_drain_next == NULL); @@ -2506,10 +2218,19 @@ ipcl_conn_cleanup(conn_t *connp) ASSERT(connp->conn_idl == NULL); #endif ASSERT(connp->conn_ipsec_opt_mp == NULL); - ASSERT(connp->conn_effective_cred == NULL); +#ifdef notdef + /* conn_netstack is cleared by the caller; needed by ixa_cleanup */ ASSERT(connp->conn_netstack == NULL); +#endif ASSERT(connp->conn_helper_info == NULL); + ASSERT(connp->conn_ixa != NULL); + ixa = connp->conn_ixa; + ASSERT(ixa->ixa_refcnt == 1); + /* Need to preserve ixa_protocol */ + ixa_cleanup(ixa); + ixa->ixa_flags = 0; + /* Clear out the conn_t fields that are not preserved */ bzero(&connp->conn_start_clr, sizeof (conn_t) - @@ -2602,10 +2323,11 @@ ipcl_globalhash_remove(conn_t *connp) /* * Walk the list of all conn_t's in the system, calling the function provided - * with the specified argument for each. + * With the specified argument for each. * Applies to both IPv4 and IPv6. * - * IPCs may hold pointers to ipif/ill. 
To guard against stale pointers + * CONNs may hold pointers to ills (conn_dhcpinit_ill and + * conn_oper_pending_ill). To guard against stale pointers * ipcl_walk() is called to cleanup the conn_t's, typically when an interface is * unplumbed or removed. New conn_t's that are created while we are walking * may be missed by this walk, because they are not necessarily inserted @@ -2657,7 +2379,7 @@ ipcl_walk(pfv_t func, void *arg, ip_stack_t *ipst) * (peer tcp in ESTABLISHED state). */ conn_t * -ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, +ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcpha_t *tcpha, ip_stack_t *ipst) { uint32_t ports; @@ -2675,8 +2397,8 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, zone_chk = (ipha->ipha_src == htonl(INADDR_LOOPBACK) || ipha->ipha_dst == htonl(INADDR_LOOPBACK)); - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports, ipst)]; @@ -2707,7 +2429,7 @@ ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *connp, ipha_t *ipha, tcph_t *tcph, * (peer tcp in ESTABLISHED state). 
*/ conn_t * -ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, +ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcpha_t *tcpha, ip_stack_t *ipst) { uint32_t ports; @@ -2728,8 +2450,8 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, zone_chk = (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src) || IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)); - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH_V6(ip6h->ip6_dst, ports, ipst)]; @@ -2738,7 +2460,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { - /* We skip tcp_bound_if check here as this is loopback tcp */ + /* We skip conn_bound_if check here as this is loopback tcp */ if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, ip6h->ip6_dst, ip6h->ip6_src, ports) && tconnp->conn_tcp->tcp_state == TCPS_ESTABLISHED && @@ -2760,7 +2482,7 @@ ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *connp, ip6_t *ip6h, tcph_t *tcph, * Only checks for connected entries i.e. no INADDR_ANY checks. 
*/ conn_t * -ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, +ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcpha_t *tcpha, int min_state, ip_stack_t *ipst) { uint32_t ports; @@ -2769,8 +2491,8 @@ ipcl_tcp_lookup_reversed_ipv4(ipha_t *ipha, tcph_t *tcph, int min_state, conn_t *tconnp; pports = (uint16_t *)&ports; - bcopy(tcph->th_fport, &pports[0], sizeof (uint16_t)); - bcopy(tcph->th_lport, &pports[1], sizeof (uint16_t)); + pports[0] = tcpha->tha_fport; + pports[1] = tcpha->tha_lport; connfp = &ipst->ips_ipcl_conn_fanout[IPCL_CONN_HASH(ipha->ipha_dst, ports, ipst)]; @@ -2823,8 +2545,8 @@ ipcl_tcp_lookup_reversed_ipv6(ip6_t *ip6h, tcpha_t *tcpha, int min_state, if (IPCL_CONN_MATCH_V6(tconnp, IPPROTO_TCP, ip6h->ip6_dst, ip6h->ip6_src, ports) && tcp->tcp_state >= min_state && - (tcp->tcp_bound_if == 0 || - tcp->tcp_bound_if == ifindex)) { + (tconnp->conn_bound_if == 0 || + tconnp->conn_bound_if == ifindex)) { CONN_INC_REF(tconnp); mutex_exit(&connfp->connf_lock); @@ -2901,8 +2623,8 @@ ipcl_lookup_listener_v6(uint16_t lport, in6_addr_t *laddr, uint_t ifindex, tcp = connp->conn_tcp; if (IPCL_BIND_MATCH_V6(connp, IPPROTO_TCP, *laddr, lport) && IPCL_ZONE_MATCH(connp, zoneid) && - (tcp->tcp_bound_if == 0 || - tcp->tcp_bound_if == ifindex) && + (connp->conn_bound_if == 0 || + connp->conn_bound_if == ifindex) && tcp->tcp_listener == NULL) { CONN_INC_REF(connp); mutex_exit(&bind_connfp->connf_lock); diff --git a/usr/src/uts/common/inet/ip/ipdrop.c b/usr/src/uts/common/inet/ip/ipdrop.c index 6d08ec9d60..0f257d6cd2 100644 --- a/usr/src/uts/common/inet/ip/ipdrop.c +++ b/usr/src/uts/common/inet/ip/ipdrop.c @@ -29,11 +29,11 @@ #include <sys/sunddi.h> #include <sys/kstat.h> #include <sys/kmem.h> +#include <sys/sdt.h> #include <net/pfkeyv2.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> -#include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/ipdrop.h> @@ -246,16 +246,11 @@ ip_drop_unregister(ipdropper_t *ipd) * 
Actually drop a packet. Many things could happen here, but at the least, * the packet will be freemsg()ed. */ -/* ARGSUSED */ void -ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, - ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called) +ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *ill, + struct kstat_named *counter, ipdropper_t *who_called) { - mblk_t *ipsec_mp = NULL; - ipsec_in_t *ii = NULL; - ipsec_out_t *io = NULL; - ipsec_info_t *in; - uint8_t vers; + char *str; if (mp == NULL) { /* @@ -265,41 +260,7 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, return; } - if (DB_TYPE(mp) == M_CTL) { - in = (ipsec_info_t *)mp->b_rptr; - - if (in->ipsec_info_type == IPSEC_IN) - ii = (ipsec_in_t *)in; - else if (in->ipsec_info_type == IPSEC_OUT) - io = (ipsec_out_t *)in; - - /* See if this is an ICMP packet (check for v4/v6). */ - vers = (*mp->b_rptr) >> 4; - if (vers != IPV4_VERSION && vers != IPV6_VERSION) { - /* - * If not, it's some other sort of M_CTL to be freed. - * For now, treat it like an ordinary packet. - */ - ipsec_mp = mp; - mp = mp->b_cont; - } - } - - /* Reality checks */ - if (inbound && io != NULL) - cmn_err(CE_WARN, - "ip_drop_packet: inbound packet with IPSEC_OUT"); - - if (outbound_ire != NULL && ii != NULL) - cmn_err(CE_WARN, - "ip_drop_packet: outbound packet with IPSEC_IN"); - - /* At this point, mp always points to the data. */ - /* - * Can't make the assertion yet - It could be an inbound ICMP - * message, which is M_CTL but with data in it. - */ - /* ASSERT(mp->b_datap->db_type == M_DATA); */ + ASSERT(mp->b_datap->db_type == M_DATA); /* Increment the bean counter, if available. */ if (counter != NULL) { @@ -318,16 +279,22 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, break; /* Other types we can't handle for now. */ } - - /* TODO? Copy out kstat name for use in logging. */ } - /* TODO: log the packet details if logging is called for. 
*/ + if (counter != NULL) + str = counter->name; + else if (who_called != NULL) + str = who_called->ipd_name; + else + str = "Unspecified IPsec drop"; + + if (inbound) + ip_drop_input(str, mp, ill); + else + ip_drop_output(str, mp, ill); + /* TODO: queue the packet onto a snoop-friendly queue. */ - /* If I haven't queued the packet or some such nonsense, free it. */ - if (ipsec_mp != NULL) - freeb(ipsec_mp); /* * ASSERT this isn't a b_next linked mblk chain where a * chained dropper should be used instead @@ -335,3 +302,50 @@ ip_drop_packet(mblk_t *mp, boolean_t inbound, ill_t *arriving, ASSERT(mp->b_prev == NULL && mp->b_next == NULL); freemsg(mp); } + +/* + * This is just a convinient place for dtrace to see dropped packets + */ +/*ARGSUSED*/ +void +ip_drop_input(char *str, mblk_t *mp, ill_t *ill) +{ + if (mp == NULL) + return; + + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + DTRACE_IP7(drop__in, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, + ip6_t *, ip6h, int, 0); + } +} + +/*ARGSUSED*/ +void +ip_drop_output(char *str, mblk_t *mp, ill_t *ill) +{ + if (mp == NULL) + return; + + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ipha, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, + ip6_t *, NULL, int, 0); + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + DTRACE_IP7(drop__out, mblk_t *, mp, conn_t *, NULL, void_ip_t *, + ip6h, __dtrace_ipsr_ill_t *, ill, ipha_t *, NULL, + ip6_t *, ip6h, int, 0); + } +} diff --git a/usr/src/uts/common/inet/ip/ipmp.c b/usr/src/uts/common/inet/ip/ipmp.c index ea8b4a73bb..b89171ed2b 100644 --- a/usr/src/uts/common/inet/ip/ipmp.c +++ 
b/usr/src/uts/common/inet/ip/ipmp.c @@ -22,12 +22,12 @@ * Use is subject to license terms. */ -#include <inet/arp.h> #include <inet/ip.h> #include <inet/ip6.h> #include <inet/ip_if.h> #include <inet/ip_ire.h> #include <inet/ip_multi.h> +#include <inet/ip_ndp.h> #include <inet/ip_rts.h> #include <inet/mi.h> #include <net/if_types.h> @@ -52,20 +52,6 @@ #define IPMP_GRP_HASH_SIZE 64 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ -/* - * Templates for IPMP ARP messages. - */ -static const arie_t ipmp_aract_template = { - AR_IPMP_ACTIVATE, - sizeof (arie_t), /* Name offset */ - sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ -}; - -static const arie_t ipmp_ardeact_template = { - AR_IPMP_DEACTIVATE, - sizeof (arie_t), /* Name offset */ - sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ -}; /* * IPMP meta-interface kstats (based on those in PSARC/1997/198). @@ -497,7 +483,7 @@ ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) * An ill must strictly be using ARP and/or ND for address * resolution for it to be allowed into a group. */ - if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) + if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP)) return (ENOTSUP); /* @@ -752,7 +738,7 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) if (illg->ig_next_ill == NULL) illg->ig_next_ill = list_head(&illg->ig_actif); - if (ill_check_and_refhold(ill) == 0) { + if (ill_check_and_refhold(ill)) { rw_exit(&ipst->ips_ipmp_lock); return (ill); } @@ -763,17 +749,6 @@ ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) } /* - * Return a pointer to the nominated multicast ill in `illg', or NULL if one - * doesn't exist. Caller must be inside the IPSQ. - */ -ill_t * -ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) -{ - ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); - return (illg->ig_cast_ill); -} - -/* * Return a held pointer to the nominated multicast ill in `illg', or NULL if * one doesn't exist. Caller need not be inside the IPSQ. 
*/ @@ -785,7 +760,7 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) rw_enter(&ipst->ips_ipmp_lock, RW_READER); castill = illg->ig_cast_ill; - if (castill != NULL && ill_check_and_refhold(castill) == 0) { + if (castill != NULL && ill_check_and_refhold(castill)) { rw_exit(&ipst->ips_ipmp_lock); return (castill); } @@ -794,6 +769,20 @@ ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) } /* + * Callback routine for ncec_walk() that deletes `nce' if it is associated with + * the `(ill_t *)arg' and it is not one of the local addresses. Caller must be + * inside the IPSQ. + */ +static void +ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg) +{ + if ((ncec != NULL) && !NCE_MYADDR(ncec) && + ncec->ncec_ill == (ill_t *)arg) { + ncec_delete(ncec); + } +} + +/* * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, * any existing nomination is removed. Caller must be inside the IPSQ. */ @@ -820,6 +809,14 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) */ if (ipmp_ill->ill_dl_up) ill_leave_multicast(ipmp_ill); + + /* + * Delete any NCEs tied to the old nomination. We must do this + * last since ill_leave_multicast() may trigger IREs to be + * built using ig_cast_ill. + */ + ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill, + ocastill->ill_ipst); } /* @@ -829,16 +826,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) illg->ig_cast_ill = castill; rw_exit(&ipst->ips_ipmp_lock); - if (ocastill != NULL) { - /* - * Delete any IREs tied to the old nomination. We must do - * this after the new castill is set and has reached global - * visibility since the datapath has not been quiesced. - */ - ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, - ill_stq_cache_delete, ocastill, ocastill); - } - /* * Enable new nominated ill (if any). */ @@ -855,15 +842,6 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) if (ipmp_ill->ill_dl_up) ill_recover_multicast(ipmp_ill); } - - /* - * For IPv4, refresh our broadcast IREs. 
This needs to be done even - * if there's no new nomination since ill_refresh_bcast() still must - * update the IPMP meta-interface's broadcast IREs to point back at - * the IPMP meta-interface itself. - */ - if (!ipmp_ill->ill_isv6) - ill_refresh_bcast(ipmp_ill); } /* @@ -872,33 +850,33 @@ ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) * created IPMP ARP entry, or NULL on failure. */ ipmp_arpent_t * -ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) +ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp, + ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags) { - uchar_t *addrp; - area_t *area = (area_t *)mp->b_rptr; ipmp_arpent_t *entp, *oentp; ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); - ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); - if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) + if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len, + KM_NOSLEEP)) == NULL) return (NULL); - if ((mp = copyb(mp)) == NULL) { - kmem_free(entp, sizeof (ipmp_arpent_t)); - return (NULL); - } - - DB_TYPE(mp) = M_PROTO; - entp->ia_area_mp = mp; - entp->ia_proxyarp = proxyarp; - addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, - sizeof (ipaddr_t)); - bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); - + /* + * Delete any existing ARP entry for this address. + */ if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) ipmp_illgrp_destroy_arpent(illg, oentp); + /* + * Prepend the new entry. 
+ */ + entp->ia_ipaddr = ipaddr; + entp->ia_flags = flags; + entp->ia_lladdr_len = lladdr_len; + entp->ia_lladdr = (uchar_t *)&entp[1]; + bcopy(lladdr, entp->ia_lladdr, lladdr_len); + entp->ia_proxyarp = proxyarp; + entp->ia_notified = B_TRUE; list_insert_head(&illg->ig_arpent, entp); return (entp); } @@ -912,8 +890,7 @@ ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); list_remove(&illg->ig_arpent, entp); - freeb(entp->ia_area_mp); - kmem_free(entp, sizeof (ipmp_arpent_t)); + kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len); } /* @@ -957,10 +934,9 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) { ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; uint_t paddrlen = ipmp_ill->ill_phys_addr_length; - area_t *area; - mblk_t *area_mp; - uchar_t *physaddr; ipmp_arpent_t *entp; + ncec_t *ncec; + nce_t *nce; ASSERT(IAM_WRITER_ILL(ipmp_ill)); ASSERT(!ipmp_ill->ill_isv6); @@ -973,11 +949,7 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) continue; } - area = (area_t *)entp->ia_area_mp->b_rptr; ASSERT(paddrlen == ill->ill_phys_addr_length); - ASSERT(paddrlen == area->area_hw_addr_length); - physaddr = mi_offset_paramc(entp->ia_area_mp, - area->area_hw_addr_offset, paddrlen); /* * If this is a proxy ARP entry, we can skip notifying ARP if @@ -985,18 +957,25 @@ ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) * update the entry's hardware address before notifying ARP. 
*/ if (entp->ia_proxyarp) { - if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && - entp->ia_notified) + if (bcmp(ill->ill_phys_addr, entp->ia_lladdr, + paddrlen) == 0 && entp->ia_notified) continue; - bcopy(ill->ill_phys_addr, physaddr, paddrlen); + bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen); } - if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { - entp->ia_notified = B_FALSE; + (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr, + paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED, + &nce); + if (nce == NULL || !entp->ia_proxyarp) { + if (nce != NULL) + nce_refrele(nce); continue; } - - putnext(ipmp_ill->ill_rq, area_mp); + ncec = nce->nce_common; + mutex_enter(&ncec->ncec_lock); + nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr); + mutex_exit(&ncec->ncec_lock); + nce_refrele(nce); ipmp_illgrp_mark_arpent(illg, entp); if ((ill = list_next(&illg->ig_actif, ill)) == NULL) @@ -1061,16 +1040,16 @@ ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) ASSERT(IAM_WRITER_ILL(ipmp_ill)); /* - * Since ill_max_mtu can only change under ill_lock, we hold ill_lock + * Since ill_mtu can only change under ill_lock, we hold ill_lock * for each ill as we iterate through the list. Any changes to the - * ill_max_mtu will also trigger an update, so even if we missed it + * ill_mtu will also trigger an update, so even if we missed it * this time around, the update will catch it. */ ill = list_head(&illg->ig_if); for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { mutex_enter(&ill->ill_lock); - if (mtu == 0 || ill->ill_max_mtu < mtu) - mtu = ill->ill_max_mtu; + if (mtu == 0 || ill->ill_mtu < mtu) + mtu = ill->ill_mtu; mutex_exit(&ill->ill_lock); } @@ -1171,13 +1150,12 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) * This may seem odd, but it's consistent with the application view * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). 
*/ + update_conn_ill(ill, ill->ill_ipst); if (ill->ill_isv6) { - reset_conn_ill(ill); reset_mrt_ill(ill); } else { ipif = ill->ill_ipif; for (; ipif != NULL; ipif = ipif->ipif_next) { - reset_conn_ipif(ipif); reset_mrt_vif_ipif(ipif); } } @@ -1206,7 +1184,7 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) ipmp_ill->ill_flags |= ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } - ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + ipmp_illgrp_set_mtu(illg, ill->ill_mtu); } else { ASSERT(ipmp_ill->ill_phys_addr_length == ill->ill_phys_addr_length); @@ -1217,8 +1195,8 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } - if (illg->ig_mtu > ill->ill_max_mtu) - ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); + if (illg->ig_mtu > ill->ill_mtu) + ipmp_illgrp_set_mtu(illg, ill->ill_mtu); } rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); @@ -1232,12 +1210,6 @@ ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) */ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); - /* - * Merge any broadcast IREs, if need be. - */ - if (!ill->ill_isv6) - ill_refresh_bcast(ill); - ipmp_ill_refresh_active(ill); } @@ -1301,12 +1273,6 @@ ipmp_ill_leave_illgrp(ill_t *ill) rw_exit(&ipst->ips_ill_g_lock); /* - * Recreate any broadcast IREs that had been shared, if need be. - */ - if (!ill->ill_isv6) - ill_refresh_bcast(ill); - - /* * Re-establish multicast memberships that were previously being * handled by the IPMP meta-interface. 
*/ @@ -1456,10 +1422,8 @@ static boolean_t ipmp_ill_activate(ill_t *ill) { ipif_t *ipif; - mblk_t *actmp = NULL, *deactmp = NULL; mblk_t *linkupmp = NULL, *linkdownmp = NULL; ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; - const char *grifname = grp->gr_ifname; ipmp_illgrp_t *illg = ill->ill_grp; ill_t *maxill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); @@ -1478,20 +1442,6 @@ ipmp_ill_activate(ill_t *ill) goto fail; } - /* - * For IPv4, allocate the activate/deactivate messages, and tell ARP. - */ - if (!ill->ill_isv6) { - actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); - deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); - if (actmp == NULL || deactmp == NULL) - goto fail; - - ASSERT(ill->ill_ardeact_mp == NULL); - ill->ill_ardeact_mp = deactmp; - putnext(illg->ig_ipmp_ill->ill_rq, actmp); - } - if (list_is_empty(&illg->ig_actif)) { /* * Now that we have an active ill, nominate it for multicast @@ -1524,12 +1474,6 @@ ipmp_ill_activate(ill_t *ill) ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); } - - /* - * TODO: explore whether it's advantageous to flush IRE_CACHE - * bindings to force existing connections to be redistributed - * to the new ill. - */ } /* @@ -1542,7 +1486,7 @@ ipmp_ill_activate(ill_t *ill) rw_exit(&ipst->ips_ipmp_lock); /* - * Refresh ARP entries to use `ill', if need be. + * Refresh static/proxy ARP entries to use `ill', if need be. */ if (!ill->ill_isv6) ipmp_illgrp_refresh_arpent(illg); @@ -1557,8 +1501,6 @@ ipmp_ill_activate(ill_t *ill) } return (B_TRUE); fail: - freemsg(actmp); - freemsg(deactmp); freemsg(linkupmp); freemsg(linkdownmp); return (B_FALSE); @@ -1581,18 +1523,6 @@ ipmp_ill_deactivate(ill_t *ill) ASSERT(IS_UNDER_IPMP(ill)); /* - * Delete all IRE_CACHE entries for the group. 
(We cannot restrict - * ourselves to entries with ire_stq == ill since there may be other - * IREs that are backed by ACEs that are tied to this ill -- and thus - * when those ACEs are deleted, the IREs will be adrift without any - * AR_CN_ANNOUNCE notification from ARP.) - */ - if (ill->ill_isv6) - ire_walk_v6(ill_grp_cache_delete, ill, ALL_ZONES, ipst); - else - ire_walk_v4(ill_grp_cache_delete, ill, ALL_ZONES, ipst); - - /* * Pull the interface out of the active list. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); @@ -1609,6 +1539,12 @@ ipmp_ill_deactivate(ill_t *ill) ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); /* + * Delete all nce_t entries using this ill, so that the next attempt + * to send data traffic will revalidate cached nce's. + */ + nce_flush(ill, B_TRUE); + + /* * Unbind all of the ipifs bound to this ill, and save 'em in a list; * we'll rebind them after we tell the resolver the ill is no longer * active. We must do things in this order or the resolver could @@ -1620,18 +1556,10 @@ ipmp_ill_deactivate(ill_t *ill) ipif->ipif_bound_next = ubheadipif; ubheadipif = ipif; } - if (!ill->ill_isv6) { - /* - * Tell ARP `ill' is no longer active in the group. - */ - mp = ill->ill_ardeact_mp; - ill->ill_ardeact_mp = NULL; - ASSERT(mp != NULL); - putnext(illg->ig_ipmp_ill->ill_rq, mp); /* - * Refresh any ARP entries that had been using `ill'. + * Refresh static/proxy ARP entries that had been using `ill'. */ ipmp_illgrp_refresh_arpent(illg); } @@ -1649,6 +1577,20 @@ ipmp_ill_deactivate(ill_t *ill) ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); } + if (list_is_empty(&illg->ig_actif)) { + ill_t *ipmp_ill = illg->ig_ipmp_ill; + + ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill, + (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst); + } + + /* + * Remove any IRE_IF_CLONE for this ill since they might have + * an ire_nce_cache/nce_common which refers to another ill in the group. 
+ */ + ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, + ill, ill); + /* * Finally, mark the group link down, if necessary. */ @@ -1725,7 +1667,7 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) /* * If necessary, tell ARP/NDP about the new mapping. Note that - * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. + * ipif_resolver_up() cannot fail for IPv6 ills. */ if (act != Res_act_none) { if (ill->ill_isv6) { @@ -1756,15 +1698,12 @@ ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) static ipif_t * ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) { - ill_t *ipmp_ill; ipif_t *previpif; ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); - ipmp_ill = ill->ill_grp->ig_ipmp_ill; - /* * If necessary, find an ipif to unbind. */ @@ -1803,13 +1742,10 @@ ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) * If requested, notify the resolvers (provided we're bound). */ if (notifyres && ipif->ipif_bound) { - if (ill->ill_isv6) { + if (ill->ill_isv6) ipif_ndp_down(ipif); - } else { - ASSERT(ipif->ipif_arp_del_mp != NULL); - putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); - ipif->ipif_arp_del_mp = NULL; - } + else + (void) ipif_arp_down(ipif); } ipif->ipif_bound = B_FALSE; @@ -1845,8 +1781,8 @@ ipmp_ill_is_active(ill_t *ill) } /* - * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet - * IREs with a source address on `ill_arg'. + * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated + * with `ill_arg'. 
*/ static void ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) @@ -1856,27 +1792,18 @@ ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); - if (ire->ire_ipif->ipif_ill != ill) + if (ire->ire_ill != ill) return; - switch (ire->ire_type) { - case IRE_HOST: - case IRE_PREFIX: - case IRE_DEFAULT: - case IRE_CACHE: - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: + if (IRE_HIDDEN_TYPE(ire->ire_type)) { DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); - ire->ire_marks |= IRE_MARK_TESTHIDDEN; - break; - default: - break; + ire->ire_testhidden = B_TRUE; } } /* - * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source - * address on `ill_arg'. + * IRE walker callback: clear ire_testhidden if the IRE has a source address + * on `ill_arg'. */ static void ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) @@ -1886,9 +1813,9 @@ ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); - if (ire->ire_ipif->ipif_ill == ill) { + if (ire->ire_ill == ill) { DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); - ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; + ire->ire_testhidden = B_FALSE; } } @@ -1909,7 +1836,7 @@ ipmp_ill_hold_ipmp_ill(ill_t *ill) rw_enter(&ipst->ips_ipmp_lock, RW_READER); illg = ill->ill_grp; - if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill) == 0) { + if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) { rw_exit(&ipst->ips_ipmp_lock); return (illg->ig_ipmp_ill); } @@ -2135,7 +2062,7 @@ ipmp_ipif_hold_bound_ill(const ipif_t *ipif) rw_enter(&ipst->ips_ipmp_lock, RW_READER); boundill = ipif->ipif_bound_ill; - if (boundill != NULL && ill_check_and_refhold(boundill) == 0) { + if (boundill != NULL && ill_check_and_refhold(boundill)) { rw_exit(&ipst->ips_ipmp_lock); return (boundill); } @@ -2192,3 +2119,182 @@ ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) { return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags 
& IPIF_UP)); } + +/* + * Check if `mp' contains a probe packet by verifying if the IP source address + * is a test address on an underlying interface `ill'. Caller need not be inside + * the IPSQ. + */ +boolean_t +ipmp_packet_is_probe(mblk_t *mp, ill_t *ill) +{ + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(DB_TYPE(mp) != M_CTL); + + if (!IS_UNDER_IPMP(ill)) + return (B_FALSE); + + if (ill->ill_isv6) { + if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) && + ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL)) + return (B_TRUE); + } else { + if ((ipha->ipha_src != INADDR_ANY) && + ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL)) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Pick out an appropriate underlying interface for packet transmit. This + * function may be called from the data path, so we need to verify that the + * IPMP group associated with `ill' is non-null after holding the ill_g_lock. + * Caller need not be inside the IPSQ. + */ +ill_t * +ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast) +{ + ill_t *xmit_ill; + ip_stack_t *ipst = ill->ill_ipst; + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (ill->ill_grp == NULL) { + /* + * The interface was taken out of the group. 
Return ill itself, + * but take a ref so that callers will always be able to do + * ill_refrele(ill); + */ + rw_exit(&ipst->ips_ill_g_lock); + ill_refhold(ill); + return (ill); + } + if (!is_unicast) + xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp); + else + xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp); + rw_exit(&ipst->ips_ill_g_lock); + return (xmit_ill); +} + +/* + * Flush out any nce that points at `ncec' from an underlying interface + */ +void +ipmp_ncec_flush_nce(ncec_t *ncec) +{ + ill_t *ncec_ill = ncec->ncec_ill; + ill_t *ill; + ipmp_illgrp_t *illg; + ip_stack_t *ipst = ncec_ill->ill_ipst; + list_t dead; + nce_t *nce; + + if (!IS_IPMP(ncec_ill)) + return; + + illg = ncec_ill->ill_grp; + list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node)); + + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + ill = list_head(&illg->ig_if); + for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { + nce_fastpath_list_delete(ill, ncec, &dead); + } + rw_exit(&ipst->ips_ill_g_lock); + + /* + * we may now nce_refrele() all dead entries since all locks have been + * dropped. + */ + while ((nce = list_head(&dead)) != NULL) { + list_remove(&dead, nce); + nce_refrele(nce); + } + ASSERT(list_is_empty(&dead)); + list_destroy(&dead); +} + +/* + * For each interface in the IPMP group, if there are nce_t entries for the IP + * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath + * information must be updated to match the link-layer address information in + * `ncec'. + */ +void +ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill) +{ + ill_t *ill; + ipmp_illgrp_t *illg = ipmp_ill->ill_grp; + ip_stack_t *ipst = ipmp_ill->ill_ipst; + nce_t *nce, *nce_next; + list_t replace; + + ASSERT(IS_IPMP(ipmp_ill)); + + /* + * if ncec itself is not reachable, there is no use in creating nce_t + * entries on the underlying interfaces in the group. 
+ */ + if (!NCE_ISREACHABLE(ncec)) + return; + + list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node)); + rw_enter(&ipst->ips_ipmp_lock, RW_READER); + ill = list_head(&illg->ig_actif); + for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { + /* + * For each underlying interface, we first check if there is an + * nce_t for the address in ncec->ncec_addr. If one exists, + * we should trigger nce_fastpath for that nce_t. However, the + * catch is that we are holding the ips_ipmp_lock to prevent + * changes to the IPMP group membership, so that we cannot + * putnext() to the driver. So we nce_delete the + * list nce_t entries that need to be updated into the + * `replace' list, and then process the `replace' list + * after dropping the ips_ipmp_lock. + */ + mutex_enter(&ill->ill_lock); + for (nce = list_head(&ill->ill_nce); nce != NULL; ) { + nce_next = list_next(&ill->ill_nce, nce); + if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr, + &ncec->ncec_addr)) { + nce = nce_next; + continue; + } + nce_refhold(nce); + nce_delete(nce); + list_insert_tail(&replace, nce); + nce = nce_next; + } + mutex_exit(&ill->ill_lock); + } + rw_exit(&ipst->ips_ipmp_lock); + /* + * `replace' now has the list of nce's on which we should be triggering + * nce_fastpath(). We now retrigger fastpath by setting up the nce + * again. 
The code in nce_lookup_then_add_v* ensures that nce->nce_ill + * is still in the group for ncec->ncec_ill + */ + while ((nce = list_head(&replace)) != NULL) { + list_remove(&replace, nce); + if (ncec->ncec_ill->ill_isv6) { + (void) nce_lookup_then_add_v6(nce->nce_ill, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED, + NULL); + } else { + ipaddr_t ipaddr; + + IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr); + (void) nce_lookup_then_add_v4(nce->nce_ill, + ncec->ncec_lladdr, ncec->ncec_lladdr_length, + &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL); + } + nce_refrele(nce); + } + ASSERT(list_is_empty(&replace)); + list_destroy(&replace); +} diff --git a/usr/src/uts/common/inet/ip/ipsec_loader.c b/usr/src/uts/common/inet/ip/ipsec_loader.c index 6609146fd1..7f5c434359 100644 --- a/usr/src/uts/common/inet/ip/ipsec_loader.c +++ b/usr/src/uts/common/inet/ip/ipsec_loader.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -121,8 +121,6 @@ ipsec_loader(void *arg) } mutex_exit(&ipss->ipsec_loader_lock); - ip_ipsec_load_complete(ipss); - mutex_enter(&ipss->ipsec_loader_lock); if (!ipsec_failure) { CALLB_CPR_EXIT(&cprinfo); diff --git a/usr/src/uts/common/inet/ip/ipsecah.c b/usr/src/uts/common/inet/ip/ipsecah.c index c130dac490..a511b85ff4 100644 --- a/usr/src/uts/common/inet/ip/ipsecah.c +++ b/usr/src/uts/common/inet/ip/ipsecah.c @@ -54,6 +54,8 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/nd.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/sadb.h> @@ -62,7 +64,6 @@ #include <inet/ipdrop.h> #include <sys/taskq.h> #include <sys/policy.h> -#include <sys/iphada.h> #include <sys/strsun.h> #include <sys/crypto/common.h> @@ -132,32 +133,27 @@ static ipsecahparam_t lcl_param_arr[] = { #define AH_MSGSIZE(mp) ((mp)->b_cont != NULL ? msgdsize(mp) : MBLKL(mp)) -static ipsec_status_t ah_auth_out_done(mblk_t *); -static ipsec_status_t ah_auth_in_done(mblk_t *); +static mblk_t *ah_auth_out_done(mblk_t *, ip_xmit_attr_t *, ipsec_crypto_t *); +static mblk_t *ah_auth_in_done(mblk_t *, ip_recv_attr_t *, ipsec_crypto_t *); static mblk_t *ah_process_ip_options_v4(mblk_t *, ipsa_t *, int *, uint_t, boolean_t, ipsecah_stack_t *); static mblk_t *ah_process_ip_options_v6(mblk_t *, ipsa_t *, int *, uint_t, boolean_t, ipsecah_stack_t *); static void ah_getspi(mblk_t *, keysock_in_t *, ipsecah_stack_t *); -static ipsec_status_t ah_inbound_accelerated(mblk_t *, boolean_t, ipsa_t *, - uint32_t); -static ipsec_status_t ah_outbound_accelerated_v4(mblk_t *, ipsa_t *); -static ipsec_status_t ah_outbound_accelerated_v6(mblk_t *, ipsa_t *); -static ipsec_status_t ah_outbound(mblk_t *); +static void ah_inbound_restart(mblk_t *, ip_recv_attr_t *); + +static mblk_t *ah_outbound(mblk_t *, ip_xmit_attr_t *); +static void ah_outbound_finish(mblk_t *, ip_xmit_attr_t *); static int ipsecah_open(queue_t *, dev_t *, int, int, cred_t 
*); static int ipsecah_close(queue_t *); -static void ipsecah_rput(queue_t *, mblk_t *); static void ipsecah_wput(queue_t *, mblk_t *); static void ah_send_acquire(ipsacq_t *, mblk_t *, netstack_t *); static boolean_t ah_register_out(uint32_t, uint32_t, uint_t, ipsecah_stack_t *, - mblk_t *); + cred_t *); static void *ipsecah_stack_init(netstackid_t stackid, netstack_t *ns); static void ipsecah_stack_fini(netstackid_t stackid, void *arg); -extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, - void *); - /* Setable in /etc/system */ uint32_t ah_hash_size = IPSEC_DEFAULT_HASH_SIZE; @@ -168,7 +164,7 @@ static struct module_info info = { }; static struct qinit rinit = { - (pfi_t)ipsecah_rput, NULL, ipsecah_open, ipsecah_close, NULL, &info, + (pfi_t)putnext, NULL, ipsecah_open, ipsecah_close, NULL, &info, NULL }; @@ -215,9 +211,6 @@ ah_kstat_init(ipsecah_stack_t *ahstack, netstackid_t stackid) KI(acquire_requests); KI(bytes_expired); KI(out_discards); - KI(in_accelerated); - KI(out_accelerated); - KI(noaccel); KI(crypto_sync); KI(crypto_async); KI(crypto_failures); @@ -275,9 +268,9 @@ ah_ager(void *arg) hrtime_t begin = gethrtime(); sadb_ager(&ahstack->ah_sadb.s_v4, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns); + ahstack->ipsecah_reap_delay, ns); sadb_ager(&ahstack->ah_sadb.s_v6, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q, ahstack->ipsecah_reap_delay, ns); + ahstack->ipsecah_reap_delay, ns); ahstack->ah_event = sadb_retimeout(begin, ahstack->ah_pfkey_q, ah_ager, ahstack, @@ -474,7 +467,13 @@ ipsecah_stack_fini(netstackid_t stackid, void *arg) } /* - * AH module open routine. The module should be opened by keysock. + * AH module open routine, which is here for keysock plumbing. + * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old + * Days of export control, and fears that ESP would not be allowed + * to be shipped at all by default. 
Eventually, keysock should + * either access AH and ESP via modstubs or krtld dependencies, or + * perhaps be folded in with AH and ESP into a single IPsec/netsec + * module ("netsec" if PF_KEY provides more than AH/ESP keying tables). */ /* ARGSUSED */ static int @@ -497,57 +496,10 @@ ipsecah_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ahstack = ns->netstack_ipsecah; ASSERT(ahstack != NULL); - /* - * ASSUMPTIONS (because I'm MT_OCEXCL): - * - * * I'm being pushed on top of IP for all my opens (incl. #1). - * * Only ipsecah_open() can write into ah_sadb.s_ip_q. - * * Because of this, I can check lazily for ah_sadb.s_ip_q. - * - * If these assumptions are wrong, I'm in BIG trouble... - */ - q->q_ptr = ahstack; WR(q)->q_ptr = q->q_ptr; - if (ahstack->ah_sadb.s_ip_q == NULL) { - struct T_unbind_req *tur; - - ahstack->ah_sadb.s_ip_q = WR(q); - /* Allocate an unbind... */ - ahstack->ah_ip_unbind = allocb(sizeof (struct T_unbind_req), - BPRI_HI); - - /* - * Send down T_BIND_REQ to bind IPPROTO_AH. - * Handle the ACK here in AH. - */ - qprocson(q); - if (ahstack->ah_ip_unbind == NULL || - !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) { - if (ahstack->ah_ip_unbind != NULL) { - freeb(ahstack->ah_ip_unbind); - ahstack->ah_ip_unbind = NULL; - } - q->q_ptr = NULL; - qprocsoff(q); - netstack_rele(ahstack->ipsecah_netstack); - return (ENOMEM); - } - - ahstack->ah_ip_unbind->b_datap->db_type = M_PROTO; - tur = (struct T_unbind_req *)ahstack->ah_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } else { - qprocson(q); - } - - /* - * For now, there's not much I can do. I'll be getting a message - * passed down to me from keysock (in my wput), and a T_BIND_ACK - * up from IP (in my rput). 
- */ - + qprocson(q); return (0); } @@ -560,17 +512,6 @@ ipsecah_close(queue_t *q) ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr; /* - * If ah_sadb.s_ip_q is attached to this instance, send a - * T_UNBIND_REQ to IP for the instance before doing - * a qprocsoff(). - */ - if (WR(q) == ahstack->ah_sadb.s_ip_q && - ahstack->ah_ip_unbind != NULL) { - putnext(WR(q), ahstack->ah_ip_unbind); - ahstack->ah_ip_unbind = NULL; - } - - /* * Clean up q_ptr, if needed. */ qprocsoff(q); @@ -585,98 +526,16 @@ ipsecah_close(queue_t *q) (void) quntimeout(q, ahstack->ah_event); } - if (WR(q) == ahstack->ah_sadb.s_ip_q) { - /* - * If the ah_sadb.s_ip_q is attached to this instance, find - * another. The OCEXCL outer perimeter helps us here. - */ - - ahstack->ah_sadb.s_ip_q = NULL; - - /* - * Find a replacement queue for ah_sadb.s_ip_q. - */ - if (ahstack->ah_pfkey_q != NULL && - ahstack->ah_pfkey_q != RD(q)) { - /* - * See if we can use the pfkey_q. - */ - ahstack->ah_sadb.s_ip_q = WR(ahstack->ah_pfkey_q); - } - - if (ahstack->ah_sadb.s_ip_q == NULL || - !sadb_t_bind_req(ahstack->ah_sadb.s_ip_q, IPPROTO_AH)) { - ah1dbg(ahstack, - ("ipsecah: Can't reassign ah_sadb.s_ip_q.\n")); - ahstack->ah_sadb.s_ip_q = NULL; - } else { - ahstack->ah_ip_unbind = - allocb(sizeof (struct T_unbind_req), BPRI_HI); - - if (ahstack->ah_ip_unbind != NULL) { - struct T_unbind_req *tur; - - ahstack->ah_ip_unbind->b_datap->db_type = - M_PROTO; - tur = (struct T_unbind_req *) - ahstack->ah_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } - /* If it's NULL, I can't do much here. */ - } - } - netstack_rele(ahstack->ipsecah_netstack); return (0); } /* - * AH module read put routine. - */ -/* ARGSUSED */ -static void -ipsecah_rput(queue_t *q, mblk_t *mp) -{ - ipsecah_stack_t *ahstack = (ipsecah_stack_t *)q->q_ptr; - - ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */ - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* TPI message of some sort. 
*/ - switch (*((t_scalar_t *)mp->b_rptr)) { - case T_BIND_ACK: - /* We expect this. */ - ah3dbg(ahstack, - ("Thank you IP from AH for T_BIND_ACK\n")); - break; - case T_ERROR_ACK: - cmn_err(CE_WARN, - "ipsecah: AH received T_ERROR_ACK from IP."); - break; - case T_OK_ACK: - /* Probably from a (rarely sent) T_UNBIND_REQ. */ - break; - default: - ah1dbg(ahstack, ("Unknown M_{,PC}PROTO message.\n")); - } - freemsg(mp); - break; - default: - /* For now, passthru message. */ - ah2dbg(ahstack, ("AH got unknown mblk type %d.\n", - mp->b_datap->db_type)); - putnext(q, mp); - } -} - -/* * Construct an SADB_REGISTER message with the current algorithms. */ static boolean_t ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, - ipsecah_stack_t *ahstack, mblk_t *in_mp) + ipsecah_stack_t *ahstack, cred_t *cr) { mblk_t *mp; boolean_t rc = B_TRUE; @@ -691,7 +550,7 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, sadb_sens_t *sens; size_t sens_len = 0; sadb_ext_t *nextext; - cred_t *sens_cr = NULL; + ts_label_t *sens_tsl = NULL; /* Allocate the KEYSOCK_OUT. 
*/ mp = sadb_keysock_out(serial); @@ -700,11 +559,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, return (B_FALSE); } - if (is_system_labeled() && (in_mp != NULL)) { - sens_cr = msg_getcred(in_mp, NULL); - - if (sens_cr != NULL) { - sens_len = sadb_sens_len_from_cred(sens_cr); + if (is_system_labeled() && (cr != NULL)) { + sens_tsl = crgetlabel(cr); + if (sens_tsl != NULL) { + sens_len = sadb_sens_len_from_label(sens_tsl); allocsize += sens_len; } } @@ -786,10 +644,10 @@ ah_register_out(uint32_t sequence, uint32_t pid, uint_t serial, mutex_exit(&ipss->ipsec_alg_lock); - if (sens_cr != NULL) { + if (sens_tsl != NULL) { sens = (sadb_sens_t *)nextext; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - sens_cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + sens_tsl, sens_len); nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len); } @@ -847,40 +705,61 @@ ipsecah_algs_changed(netstack_t *ns) /* * Stub function that taskq_dispatch() invokes to take the mblk (in arg) - * and put() it into AH and STREAMS again. + * and send it into AH and IP again. */ static void inbound_task(void *arg) { - ah_t *ah; - mblk_t *mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - int ipsec_rc; - netstack_t *ns; - ipsecah_stack_t *ahstack; - - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns == NULL || ns != ii->ipsec_in_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + ip_recv_attr_t iras; + + async_mp = mp; + mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); freemsg(mp); - return; + goto done; } - ahstack = ns->netstack_ipsecah; + ah_inbound_restart(mp, &iras); +done: + ira_cleanup(&iras, B_TRUE); +} - ah2dbg(ahstack, ("in AH inbound_task")); +/* + * Restart ESP after the SA has been added. 
+ */ +static void +ah_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira) +{ + ah_t *ah; + netstack_t *ns; + ipsecah_stack_t *ahstack; + + ns = ira->ira_ill->ill_ipst->ips_netstack; + ahstack = ns->netstack_ipsecah; ASSERT(ahstack != NULL); - ah = ipsec_inbound_ah_sa(mp, ns); - if (ah != NULL) { - ASSERT(ii->ipsec_in_ah_sa != NULL); - ipsec_rc = ii->ipsec_in_ah_sa->ipsa_input_func(mp, ah); - if (ipsec_rc == IPSEC_STATUS_SUCCESS) - ip_fanout_proto_again(mp, NULL, NULL, NULL); + mp = ipsec_inbound_ah_sa(mp, ira, &ah); + if (mp == NULL) + return; + + ASSERT(ah != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_ah_sa != NULL); + + mp = ira->ira_ipsec_ah_sa->ipsa_input_func(mp, ah, ira); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. + */ + return; } - netstack_rele(ns); + ip_input_post_ipsec(mp, ira); } /* @@ -1051,60 +930,96 @@ ah_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, if (larval != NULL) lpkt = sadb_clear_lpkt(larval); - rc = sadb_common_add(ahstack->ah_sadb.s_ip_q, ahstack->ah_pfkey_q, mp, + rc = sadb_common_add(ahstack->ah_pfkey_q, mp, samsg, ksi, primary, secondary, larval, clone, is_inbound, diagnostic, ns, &ahstack->ah_sadb); + if (lpkt != NULL) { + if (rc == 0) { + rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt, + TQ_NOSLEEP); + } + if (rc != 0) { + lpkt = ip_recv_attr_free_mblk(lpkt); + ip_drop_packet(lpkt, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &ahstack->ah_dropper); + } + } + /* * How much more stack will I create with all of these - * ah_inbound_* and ah_outbound_*() calls? + * ah_outbound_*() calls? 
*/ - if (rc == 0 && lpkt != NULL) - rc = !taskq_dispatch(ah_taskq, inbound_task, lpkt, TQ_NOSLEEP); - - if (rc != 0) { - ip_drop_packet(lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &ahstack->ah_dropper); - } - + /* Handle the packets queued waiting for the SA */ while (acq_msgs != NULL) { - mblk_t *mp = acq_msgs; + mblk_t *asyncmp; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ill_t *ill; + asyncmp = acq_msgs; acq_msgs = acq_msgs->b_next; - mp->b_next = NULL; - if (rc == 0) { - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - - ASSERT(ahstack->ah_sadb.s_ip_q != NULL); - if (ipsec_outbound_sa(mp, IPPROTO_AH)) { - io->ipsec_out_ah_done = B_TRUE; - if (ah_outbound(mp) == IPSEC_STATUS_SUCCESS) { - ipha_t *ipha = (ipha_t *) - mp->b_cont->b_rptr; - if (sq.af == AF_INET) { - ip_wput_ipsec_out(NULL, mp, - ipha, NULL, NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - - ASSERT(sq.af == AF_INET6); - - ip_wput_ipsec_out_v6(NULL, - mp, ip6h, NULL, NULL); - } - } - continue; - } + asyncmp->b_next = NULL; + + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while iked was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. 
+ */ + data_mp = asyncmp->b_cont; + asyncmp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) { + AH_BUMP_STAT(ahstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, NULL, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &ahstack->ah_dropper); + } else if (rc != 0) { + ill = ixas.ixa_nce->nce_ill; + AH_BUMP_STAT(ahstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + } else { + ah_outbound_finish(data_mp, &ixas); } + ixa_cleanup(&ixas); + } + + return (rc); +} + + +/* + * Process one of the queued messages (from ipsacq_mp) once the SA + * has been added. + */ +static void +ah_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa) +{ + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) { AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_sadb_acquire_timeout), &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return; } - return (rc); + data_mp = ah_outbound(data_mp, ixa); + if (data_mp == NULL) + return; + + (void) ip_output_post_ipsec(data_mp, ixa); } /* @@ -1300,8 +1215,7 @@ ah_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, } return (sadb_purge_sa(mp, ksi, (sin->sin_family == AF_INET6) ? &ahstack->ah_sadb.s_v6 : - &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q, - ahstack->ah_sadb.s_ip_q)); + &ahstack->ah_sadb.s_v4, diagnostic, ahstack->ah_pfkey_q)); } return (sadb_delget_sa(mp, ksi, &ahstack->ah_sadb, diagnostic, @@ -1449,7 +1363,7 @@ ah_parse_pfkey(mblk_t *mp, ipsecah_stack_t *ahstack) * Keysock takes care of the PF_KEY bookkeeping for this. 
*/ if (ah_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid, - ksi->ks_in_serial, ahstack, mp)) { + ksi->ks_in_serial, ahstack, msg_getcred(mp, NULL))) { freemsg(mp); } else { /* @@ -1534,8 +1448,7 @@ ah_keysock_no_socket(mblk_t *mp, ipsecah_stack_t *ahstack) samsg->sadb_msg_errno = kse->ks_err_errno; samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg)); /* - * Use the write-side of the ah_pfkey_q, in case there is - * no ahstack->ah_sadb.s_ip_q. + * Use the write-side of the ah_pfkey_q */ sadb_in_acquire(samsg, &ahstack->ah_sadb, WR(ahstack->ah_pfkey_q), ahstack->ipsecah_netstack); @@ -1825,22 +1738,15 @@ ah_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound) * Called while holding the algorithm lock. */ static void -ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) +ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs, + netstack_t *ns) { sadb_comb_t *comb = (sadb_comb_t *)(prop + 1); - ipsec_out_t *io; ipsec_action_t *ap; ipsec_prot_t *prot; - ipsecah_stack_t *ahstack; - netstack_t *ns; - ipsec_stack_t *ipss; - - io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; - ns = io->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock)); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; @@ -1851,9 +1757,9 @@ ah_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) /* * Based upon algorithm properties, and what-not, prioritize a - * proposal, based on the ordering of the ah algorithms in the - * alternatives presented in the policy rule passed down - * through the ipsec_out_t and attached to the acquire record. + * proposal, based on the ordering of the AH algorithms in the + * alternatives in the policy rule or socket that was placed + * in the acquire record. 
*/ for (ap = acqrec->ipsacq_act; ap != NULL; @@ -1961,7 +1867,7 @@ ah_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns) /* Insert proposal here. */ prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len); - ah_insert_prop(prop, acqrec, combs); + ah_insert_prop(prop, acqrec, combs, ns); samsg->sadb_msg_len += prop->sadb_prop_len; msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len); @@ -2117,11 +2023,12 @@ ah_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecah_stack_t *ahstack) /* * IPv6 sends up the ICMP errors for validation and the removal of the AH * header. + * If succesful, the mp has been modified to not include the AH header so + * that the caller can fanout to the ULP's icmp error handler. */ -static ipsec_status_t -ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) +static mblk_t * +ah_icmp_error_v6(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack) { - mblk_t *mp; ip6_t *ip6h, *oip6h; uint16_t hdr_length, ah_length; uint8_t *nexthdrp; @@ -2132,14 +2039,6 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) uint8_t *post_ah_ptr; ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; - mp = ipsec_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_CTL); - - /* - * Change the type to M_DATA till we finish pullups. - */ - mp->b_datap->db_type = M_DATA; - /* * Eat the cost of a pullupmsg() for now. It makes the rest of this * code far less convoluted. 
@@ -2150,10 +2049,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) mp->b_rptr + hdr_length + sizeof (icmp6_t) + sizeof (ip6_t) + sizeof (ah_t) > mp->b_wptr) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } oip6h = (ip6_t *)mp->b_rptr; @@ -2161,10 +2060,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ip6h = (ip6_t *)(icmp6 + 1); if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_v6_hdrs), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } ah = (ah_t *)((uint8_t *)ip6h + hdr_length); @@ -2186,10 +2085,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ah->ah_spi, &oip6h->ip6_src, AF_INET6, ahstack->ipsecah_netstack); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } IPSA_REFRELE(assoc); @@ -2208,10 +2107,10 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) if (post_ah_ptr > mp->b_wptr) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - ah_length); @@ -2219,20 +2118,19 @@ ah_icmp_error_v6(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ovbcopy(post_ah_ptr, ah, (size_t)((uintptr_t)mp->b_wptr - (uintptr_t)post_ah_ptr)); mp->b_wptr -= ah_length; - /* Rewhack to be an ICMP error. 
*/ - mp->b_datap->db_type = M_CTL; - return (IPSEC_STATUS_SUCCESS); + return (mp); } /* * IP sends up the ICMP errors for validation and the removal of * the AH header. + * If succesful, the mp has been modified to not include the AH header so + * that the caller can fanout to the ULP's icmp error handler. */ -static ipsec_status_t -ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) +static mblk_t * +ah_icmp_error_v4(mblk_t *mp, ip_recv_attr_t *ira, ipsecah_stack_t *ahstack) { - mblk_t *mp; mblk_t *mp1; icmph_t *icmph; int iph_hdr_length; @@ -2248,14 +2146,6 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) uint8_t nexthdr; ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; - mp = ipsec_mp->b_cont; - ASSERT(mp->b_datap->db_type == M_CTL); - - /* - * Change the type to M_DATA till we finish pullups. - */ - mp->b_datap->db_type = M_DATA; - oipha = ipha = (ipha_t *)mp->b_rptr; iph_hdr_length = IPH_HDR_LENGTH(ipha); icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; @@ -2274,10 +2164,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) SL_WARN | SL_ERROR, "ICMP error: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_length), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; @@ -2304,10 +2194,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) ah->ah_spi, &oipha->ipha_src, AF_INET, ahstack->ipsecah_netstack); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } IPSA_REFRELE(assoc); @@ -2343,10 +2233,10 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) * We tried hard, give up now. 
*/ IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; ipha = (ipha_t *)&icmph[1]; @@ -2354,8 +2244,8 @@ ah_icmp_error_v4(mblk_t *ipsec_mp, ipsecah_stack_t *ahstack) done: /* * Remove the AH header and change the protocol. - * Don't update the spi fields in the ipsec_in - * message as we are called just to validate the + * Don't update the spi fields in the ip_recv_attr_t + * as we are called just to validate the * message attached to the ICMP message. * * If we never pulled up since all of the message @@ -2368,14 +2258,11 @@ done: if ((mp1 = allocb(alloc_size, BPRI_LO)) == NULL) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - /* ICMP errors are M_CTL messages */ - mp1->b_datap->db_type = M_CTL; - ipsec_mp->b_cont = mp1; bcopy(mp->b_rptr, mp1->b_rptr, alloc_size); mp1->b_wptr += alloc_size; @@ -2402,24 +2289,23 @@ done: ipha->ipha_hdr_checksum = 0; ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); - return (IPSEC_STATUS_SUCCESS); + return (mp1); } /* * IP calls this to validate the ICMP errors that * we got from the network. 
*/ -ipsec_status_t -ipsecah_icmp_error(mblk_t *mp) +mblk_t * +ipsecah_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - netstack_t *ns = ii->ipsec_in_ns; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - if (ii->ipsec_in_v4) - return (ah_icmp_error_v4(mp, ahstack)); + if (ira->ira_flags & IRAF_IS_IPV4) + return (ah_icmp_error_v4(data_mp, ira, ahstack)); else - return (ah_icmp_error_v6(mp, ahstack)); + return (ah_icmp_error_v6(data_mp, ira, ahstack)); } static int @@ -2546,7 +2432,7 @@ ah_fix_phdr_v6(ip6_t *ip6h, ip6_t *oip6h, boolean_t outbound, prev_nexthdr = (uint8_t *)&ip6h->ip6_nxt; nexthdr = oip6h->ip6_nxt; /* Assume IP has already stripped it */ - ASSERT(nexthdr != IPPROTO_FRAGMENT && nexthdr != IPPROTO_RAW); + ASSERT(nexthdr != IPPROTO_FRAGMENT); ah = NULL; dsthdr = NULL; for (;;) { @@ -2741,19 +2627,19 @@ ah_finish_up(ah_t *phdr_ah, ah_t *inbound_ah, ipsa_t *assoc, * argument is freed. 
*/ static void -ah_log_bad_auth(mblk_t *ipsec_in) +ah_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - mblk_t *mp = ipsec_in->b_cont->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - boolean_t isv4 = ii->ipsec_in_v4; - ipsa_t *assoc = ii->ipsec_in_ah_sa; - int af; - void *addr; - netstack_t *ns = ii->ipsec_in_ns; + boolean_t isv4 = (ira->ira_flags & IRAF_IS_IPV4); + ipsa_t *assoc = ira->ira_ipsec_ah_sa; + int af; + void *addr; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsec_stack_t *ipss = ns->netstack_ipsec; - mp->b_rptr -= ii->ipsec_in_skip_len; + ASSERT(mp->b_datap->db_type == M_DATA); + + mp->b_rptr -= ic->ic_skip_len; if (isv4) { ipha_t *ipha = (ipha_t *)mp->b_rptr; @@ -2776,110 +2662,163 @@ ah_log_bad_auth(mblk_t *ipsec_in) assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_bad_auth), &ahstack->ah_dropper); } /* * Kernel crypto framework callback invoked after completion of async - * crypto requests. + * crypto requests for outbound packets. */ static void -ah_kcf_callback(void *arg, int status) +ah_kcf_callback_outbound(void *arg, int status) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstackid_t stackid; - netstack_t *ns, *ns_arg; + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; ipsec_stack_t *ipss; ipsecah_stack_t *ahstack; - ipsec_out_t *io = (ipsec_out_t *)ii; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ipsec_crypto_t *ic; + ill_t *ill; - ASSERT(ipsec_mp->b_cont != NULL); + /* + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. 
+ */ + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); - if (is_inbound) { - stackid = ii->ipsec_in_stackid; - ns_arg = ii->ipsec_in_ns; + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) { + /* Disappeared on us - no ill/ipst for MIB */ + if (ixas.ixa_nce != NULL) { + ill = ixas.ixa_nce->nce_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + } + freemsg(data_mp); + goto done; + } + ns = ixas.ixa_ipst->ips_netstack; + ahstack = ns->netstack_ipsecah; + ipss = ns->netstack_ipsec; + ill = ixas.ixa_nce->nce_ill; + + if (status == CRYPTO_SUCCESS) { + data_mp = ah_auth_out_done(data_mp, &ixas, ic); + if (data_mp == NULL) + goto done; + + (void) ip_output_post_ipsec(data_mp, &ixas); } else { - stackid = io->ipsec_out_stackid; - ns_arg = io->ipsec_out_ns; + /* Outbound shouldn't see invalid MAC */ + ASSERT(status != CRYPTO_INVALID_MAC); + + ah1dbg(ahstack, + ("ah_kcf_callback_outbound: crypto failed with 0x%x\n", + status)); + AH_BUMP_STAT(ahstack, crypto_failures); + AH_BUMP_STAT(ahstack, out_discards); + + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_ah_crypto_failed), + &ahstack->ah_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); } +done: + ixa_cleanup(&ixas); + (void) ipsec_free_crypto_data(mp); +} + +/* + * Kernel crypto framework callback invoked after completion of async + * crypto requests for inbound packets. 
+ */ +static void +ah_kcf_callback_inbound(void *arg, int status) +{ + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; + ipsec_stack_t *ipss; + ipsecah_stack_t *ahstack; + mblk_t *data_mp; + ip_recv_attr_t iras; + ipsec_crypto_t *ic; + /* - * Verify that the netstack is still around; could have vanished - * while kEf was doing its work. + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. */ - ns = netstack_find_by_stackid(stackid); - if (ns == NULL || ns != ns_arg) { - /* Disappeared on us */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); - return; - } + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + goto done; + } + ns = iras.ira_ill->ill_ipst->ips_netstack; ahstack = ns->netstack_ipsecah; ipss = ns->netstack_ipsec; if (status == CRYPTO_SUCCESS) { - if (is_inbound) { - if (ah_auth_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL); - } else { - ipha_t *ipha; + data_mp = ah_auth_in_done(data_mp, &iras, ic); + if (data_mp == NULL) + goto done; - if (ah_auth_out_done(ipsec_mp) != - IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - - /* finish IPsec processing */ - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, - NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h, - NULL, NULL); - } - } + /* finish 
IPsec processing */ + ip_input_post_ipsec(data_mp, &iras); } else if (status == CRYPTO_INVALID_MAC) { - ah_log_bad_auth(ipsec_mp); + ah_log_bad_auth(data_mp, &iras, ic); } else { - ah1dbg(ahstack, ("ah_kcf_callback: crypto failed with 0x%x\n", + ah1dbg(ahstack, + ("ah_kcf_callback_inbound: crypto failed with 0x%x\n", status)); AH_BUMP_STAT(ahstack, crypto_failures); - if (is_inbound) - IP_AH_BUMP_STAT(ipss, in_discards); - else - AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL, + IP_AH_BUMP_STAT(ipss, in_discards); + ip_drop_packet(data_mp, B_TRUE, iras.ira_ill, DROPPER(ipss, ipds_ah_crypto_failed), &ahstack->ah_dropper); + BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards); } - netstack_rele(ns); +done: + ira_cleanup(&iras, B_TRUE); + (void) ipsec_free_crypto_data(mp); } /* * Invoked on kernel crypto failure during inbound and outbound processing. */ static void -ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, - ipsecah_stack_t *ahstack) +ah_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc, + ill_t *ill, ipsecah_stack_t *ahstack) { ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; ah1dbg(ahstack, ("crypto failed for %s AH with 0x%x\n", is_inbound ? "inbound" : "outbound", kef_rc)); - ip_drop_packet(mp, is_inbound, NULL, NULL, + ip_drop_packet(data_mp, is_inbound, ill, DROPPER(ipss, ipds_ah_crypto_failed), &ahstack->ah_dropper); AH_BUMP_STAT(ahstack, crypto_failures); @@ -2893,14 +2832,14 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, * Helper macros for the ah_submit_req_{inbound,outbound}() functions. 
*/ -#define AH_INIT_CALLREQ(_cr, _ipss) { \ - (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \ - if ((_ipss)->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == \ - IPSEC_ALGS_EXEC_ASYNC) \ - (_cr)->cr_flag |= CRYPTO_ALWAYS_QUEUE; \ - (_cr)->cr_callback_arg = ipsec_mp; \ - (_cr)->cr_callback_func = ah_kcf_callback; \ -} +/* + * A statement-equivalent macro, _cr MUST point to a modifiable + * crypto_call_req_t. + */ +#define AH_INIT_CALLREQ(_cr, _mp, _callback) \ + (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \ + (_cr)->cr_callback_arg = (_mp); \ + (_cr)->cr_callback_func = (_callback) #define AH_INIT_CRYPTO_DATA(data, msglen, mblk) { \ (data)->cd_format = CRYPTO_DATA_MBLK; \ @@ -2920,124 +2859,185 @@ ah_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, /* * Submit an inbound packet for processing by the crypto framework. */ -static ipsec_status_t -ah_submit_req_inbound(mblk_t *ipsec_mp, size_t skip_len, uint32_t ah_offset, - ipsa_t *assoc) +static mblk_t * +ah_submit_req_inbound(mblk_t *phdr_mp, ip_recv_attr_t *ira, + size_t skip_len, uint32_t ah_offset, ipsa_t *assoc) { int kef_rc; - mblk_t *phdr_mp; - crypto_call_req_t call_req; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; + mblk_t *mp; + crypto_call_req_t call_req, *callrp; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t ctx_tmpl; - netstack_t *ns = ii->ipsec_in_ns; - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecah_stack_t *ahstack; + ipsec_crypto_t *ic, icstack; + boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC); + + ahstack = ira->ira_ill->ill_ipst->ips_netstack->netstack_ipsecah; - phdr_mp = ipsec_mp->b_cont; ASSERT(phdr_mp != NULL); - ASSERT(ii->ipsec_in_type == IPSEC_IN); + ASSERT(phdr_mp->b_datap->db_type == M_DATA); + + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_recv_attr_to_mblk(ira)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + 
BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", phdr_mp, + ira->ira_ill); + freemsg(phdr_mp); + return (NULL); + } - /* - * In case kEF queues and calls back, make sure we have the - * netstackid_t for verification that the IP instance is still around - * in esp_kcf_callback(). - */ - ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid); + linkb(mp, phdr_mp); + callrp = &call_req; + AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_inbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. + */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } /* init arguments for the crypto framework */ - AH_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, AH_MSGSIZE(phdr_mp), + AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp), phdr_mp); - AH_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac, icv_len, + AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, (char *)phdr_mp->b_cont->b_rptr - skip_len + ah_offset + sizeof (ah_t)); - AH_INIT_CALLREQ(&call_req, ipss); - - ii->ipsec_in_skip_len = skip_len; + ic->ic_skip_len = skip_len; IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, ctx_tmpl); /* call KEF to do the MAC operation */ kef_rc = crypto_mac_verify(&assoc->ipsa_amech, - &ii->ipsec_in_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl, - &ii->ipsec_in_crypto_mac, &call_req); + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, ctx_tmpl, + &ic->ic_crypto_mac, callrp); switch (kef_rc) { case CRYPTO_SUCCESS: AH_BUMP_STAT(ahstack, crypto_sync); - return (ah_auth_in_done(ipsec_mp)); + phdr_mp = ah_auth_in_done(phdr_mp, ira, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (phdr_mp); case CRYPTO_QUEUED: - /* ah_kcf_callback() will be invoked on completion */ + /* ah_kcf_callback_inbound() will be invoked on completion */ AH_BUMP_STAT(ahstack, crypto_async); - return (IPSEC_STATUS_PENDING); 
+ return (NULL); case CRYPTO_INVALID_MAC: + /* Free mp after we are done with ic */ AH_BUMP_STAT(ahstack, crypto_sync); - ah_log_bad_auth(ipsec_mp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ah_log_bad_auth(phdr_mp, ira, ic); + /* phdr_mp was passed to ip_drop_packet */ + if (force) { + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (NULL); } - ah_crypto_failed(ipsec_mp, B_TRUE, kef_rc, ahstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + phdr_mp = ip_recv_attr_free_mblk(mp); + } + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ah_crypto_failed(phdr_mp, B_TRUE, kef_rc, ira->ira_ill, ahstack); + /* phdr_mp was passed to ip_drop_packet */ + return (NULL); } /* * Submit an outbound packet for processing by the crypto framework. */ -static ipsec_status_t -ah_submit_req_outbound(mblk_t *ipsec_mp, size_t skip_len, ipsa_t *assoc) +static mblk_t * +ah_submit_req_outbound(mblk_t *phdr_mp, ip_xmit_attr_t *ixa, + size_t skip_len, ipsa_t *assoc) { int kef_rc; - mblk_t *phdr_mp; - crypto_call_req_t call_req; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; + mblk_t *mp; + crypto_call_req_t call_req, *callrp; uint_t icv_len = assoc->ipsa_mac_len; - netstack_t *ns = io->ipsec_out_ns; - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecah_stack_t *ahstack; + ipsec_crypto_t *ic, icstack; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t force = (assoc->ipsa_flags & IPSA_F_ASYNC); - phdr_mp = ipsec_mp->b_cont; - ASSERT(phdr_mp != NULL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); + ahstack = ill->ill_ipst->ips_netstack->netstack_ipsecah; - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * ah_kcf_callback(). 
- */ - io->ipsec_out_stackid = ns->netstack_stackid; + ASSERT(phdr_mp != NULL); + ASSERT(phdr_mp->b_datap->db_type == M_DATA); + + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", phdr_mp, ill); + freemsg(phdr_mp); + return (NULL); + } + linkb(mp, phdr_mp); + callrp = &call_req; + AH_INIT_CALLREQ(callrp, mp, ah_kcf_callback_outbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. + */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } /* init arguments for the crypto framework */ - AH_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, AH_MSGSIZE(phdr_mp), + AH_INIT_CRYPTO_DATA(&ic->ic_crypto_data, AH_MSGSIZE(phdr_mp), phdr_mp); - AH_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac, icv_len, + AH_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, (char *)phdr_mp->b_wptr); - AH_INIT_CALLREQ(&call_req, ipss); + ic->ic_skip_len = skip_len; - io->ipsec_out_skip_len = skip_len; - - ASSERT(io->ipsec_out_ah_sa != NULL); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); /* call KEF to do the MAC operation */ - kef_rc = crypto_mac(&assoc->ipsa_amech, &io->ipsec_out_crypto_data, + kef_rc = crypto_mac(&assoc->ipsa_amech, &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, assoc->ipsa_authtmpl, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); switch (kef_rc) { case CRYPTO_SUCCESS: AH_BUMP_STAT(ahstack, crypto_sync); - return (ah_auth_out_done(ipsec_mp)); + phdr_mp = ah_auth_out_done(phdr_mp, ixa, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_xmit_attr_free_mblk(mp); + } + return (phdr_mp); case CRYPTO_QUEUED: - /* ah_kcf_callback() will be invoked on completion */ + /* ah_kcf_callback_outbound() will be invoked on completion */ 
AH_BUMP_STAT(ahstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); } - ah_crypto_failed(ipsec_mp, B_FALSE, kef_rc, ahstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + phdr_mp = ip_xmit_attr_free_mblk(mp); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ah_crypto_failed(phdr_mp, B_FALSE, kef_rc, NULL, ahstack); + /* phdr_mp was passed to ip_drop_packet */ + return (NULL); } /* @@ -3056,7 +3056,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip, uint_t ah_align_sz; uint_t ah_offset; int hdr_size; - ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; /* * Allocate space for the authentication data also. It is @@ -3135,9 +3134,6 @@ ah_process_ip_options_v6(mblk_t *mp, ipsa_t *assoc, int *length_to_skip, ah_offset = ah_fix_phdr_v6(ip6h, oip6h, outbound, B_FALSE); if (ah_offset == 0) { - ip_drop_packet(phdr_mp, !outbound, NULL, NULL, - DROPPER(ipss, ipds_ah_bad_v6_hdrs), - &ahstack->ah_dropper); return (NULL); } } @@ -3375,65 +3371,67 @@ ah_hdr: /* * Authenticate an outbound datagram. This function is called * whenever IP sends an outbound datagram that needs authentication. + * Returns a modified packet if done. Returns NULL if error or queued. + * If error return then ipIfStatsOutDiscards has been increased. */ -static ipsec_status_t -ah_outbound(mblk_t *ipsec_out) +static mblk_t * +ah_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - mblk_t *mp; mblk_t *phdr_mp; - ipsec_out_t *oi; ipsa_t *assoc; int length_to_skip; uint_t ah_align_sz; uint_t age_bytes; - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t need_refrele = B_FALSE; /* * Construct the chain of mblks * - * IPSEC_OUT->PSEUDO_HDR->DATA + * PSEUDO_HDR->DATA * * one by one. 
*/ - ASSERT(ipsec_out->b_datap->db_type == M_CTL); - - ASSERT(MBLKL(ipsec_out) >= sizeof (ipsec_info_t)); - - mp = ipsec_out->b_cont; - oi = (ipsec_out_t *)ipsec_out->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - AH_BUMP_STAT(ahstack, out_requests); - ASSERT(mp->b_datap->db_type == M_DATA); + ASSERT(data_mp->b_datap->db_type == M_DATA); - assoc = oi->ipsec_out_ah_sa; + assoc = ixa->ixa_ipsec_ah_sa; ASSERT(assoc != NULL); /* * Get the outer IP header in shape to escape this system.. */ - if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) { - int whack; - - mblk_setcred(mp, assoc->ipsa_ocred, NOPID); - if (oi->ipsec_out_v4) - whack = sadb_whack_label(&mp, assoc); - else - whack = sadb_whack_label_v6(&mp, assoc); - if (whack != 0) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, - NULL, DROPPER(ipss, ipds_ah_nomem), + if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) { + /* + * Need to update packet with any CIPSO option and update + * ixa_tsl to capture the new label. + * We allocate a separate ixa for that purpose. + */ + ixa = ip_xmit_attr_duplicate(ixa); + if (ixa == NULL) { + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); + } + need_refrele = B_TRUE; + + label_hold(assoc->ipsa_otsl); + ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl); + + data_mp = sadb_whack_label(data_mp, assoc, ixa, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); + if (data_mp == NULL) { + /* Packet dropped by sadb_whack_label */ + ixa_refrele(ixa); + return (NULL); } - ipsec_out->b_cont = mp; } /* @@ -3441,14 +3439,14 @@ ah_outbound(mblk_t *ipsec_out) * adding the AH header, ICV, and padding to the packet. 
*/ - if (oi->ipsec_out_v4) { - ipha_t *ipha = (ipha_t *)mp->b_rptr; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)data_mp->b_rptr; ah_align_sz = P2ALIGN(assoc->ipsa_mac_len + IPV4_PADDING_ALIGN - 1, IPV4_PADDING_ALIGN); age_bytes = ntohs(ipha->ipha_length) + sizeof (ah_t) + ah_align_sz; } else { - ip6_t *ip6h = (ip6_t *)mp->b_rptr; + ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; ah_align_sz = P2ALIGN(assoc->ipsa_mac_len + IPV6_PADDING_ALIGN - 1, IPV6_PADDING_ALIGN); age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) + @@ -3461,8 +3459,12 @@ ah_outbound(mblk_t *ipsec_out) "AH association 0x%x, dst %s had bytes expire.\n", ntohl(assoc->ipsa_spi), assoc->ipsa_dstaddr, AF_INET, ahstack->ipsecah_netstack); - freemsg(ipsec_out); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + freemsg(data_mp); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* @@ -3470,64 +3472,59 @@ ah_outbound(mblk_t *ipsec_out) * (AH is computing the checksum over the outer label). 
*/ - if (oi->ipsec_out_is_capab_ill) { - ah3dbg(ahstack, ("ah_outbound: pkt can be accelerated\n")); - if (oi->ipsec_out_v4) - return (ah_outbound_accelerated_v4(ipsec_out, assoc)); - else - return (ah_outbound_accelerated_v6(ipsec_out, assoc)); - } - AH_BUMP_STAT(ahstack, noaccel); - /* * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP + * [IP, ULP] => [IP, AH, ICV] -> ULP */ - if (oi->ipsec_out_v4) { - phdr_mp = ah_process_ip_options_v4(mp, assoc, &length_to_skip, - assoc->ipsa_mac_len, B_TRUE, ahstack); + if (ixa->ixa_flags & IXAF_IS_IPV4) { + phdr_mp = ah_process_ip_options_v4(data_mp, assoc, + &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack); } else { - phdr_mp = ah_process_ip_options_v6(mp, assoc, &length_to_skip, - assoc->ipsa_mac_len, B_TRUE, ahstack); + phdr_mp = ah_process_ip_options_v6(data_mp, assoc, + &length_to_skip, assoc->ipsa_mac_len, B_TRUE, ahstack); } if (phdr_mp == NULL) { AH_BUMP_STAT(ahstack, out_discards); - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ixa->ixa_nce->nce_ill, DROPPER(ipss, ipds_ah_bad_v4_opts), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } - ipsec_out->b_cont = phdr_mp; - phdr_mp->b_cont = mp; - mp->b_rptr += length_to_skip; + phdr_mp->b_cont = data_mp; + data_mp->b_rptr += length_to_skip; + data_mp = phdr_mp; /* - * At this point ipsec_out points to the IPSEC_OUT, new_mp - * points to an mblk containing the pseudo header (IP header, + * At this point data_mp points to + * an mblk containing the pseudo header (IP header, * AH header, and ICV with mutable fields zero'ed out). * mp points to the mblk containing the ULP data. The original - * IP header is kept before the ULP data in mp. + * IP header is kept before the ULP data in data_mp. 
*/ /* submit MAC request to KCF */ - return (ah_submit_req_outbound(ipsec_out, length_to_skip, assoc)); + data_mp = ah_submit_req_outbound(data_mp, ixa, length_to_skip, assoc); + if (need_refrele) + ixa_refrele(ixa); + return (data_mp); } -static ipsec_status_t -ah_inbound(mblk_t *ipsec_in_mp, void *arg) +static mblk_t * +ah_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira) { - mblk_t *data_mp = ipsec_in_mp->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - ah_t *ah = (ah_t *)arg; - ipsa_t *assoc = ii->ipsec_in_ah_sa; - int length_to_skip; - int ah_length; - mblk_t *phdr_mp; - uint32_t ah_offset; - netstack_t *ns = ii->ipsec_in_ns; + ah_t *ah = (ah_t *)arg; + ipsa_t *assoc = ira->ira_ipsec_ah_sa; + int length_to_skip; + int ah_length; + mblk_t *phdr_mp; + uint32_t ah_offset; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -3547,10 +3544,11 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) if (!sadb_replay_peek(assoc, ah->ah_replay)) { AH_BUMP_STAT(ahstack, replay_early_failures); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_early_replay), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -3561,19 +3559,6 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) ah_offset = (uchar_t *)ah - data_mp->b_rptr; /* - * Has this packet already been processed by a hardware - * IPsec accelerator? - */ - if (ii->ipsec_in_accelerated) { - ah3dbg(ahstack, - ("ah_inbound_v6: pkt processed by ill=%d isv6=%d\n", - ii->ipsec_in_ill_index, !ii->ipsec_in_v4)); - return (ah_inbound_accelerated(ipsec_in_mp, ii->ipsec_in_v4, - assoc, ah_offset)); - } - AH_BUMP_STAT(ahstack, noaccel); - - /* * We need to pullup until the ICV before we call * ah_process_ip_options_v6. 
*/ @@ -3590,18 +3575,19 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) SL_WARN | SL_ERROR, "ah_inbound: Small AH header\n"); IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } } /* * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP + * [IP, ULP] => [IP, AH, ICV] -> ULP */ - if (ii->ipsec_in_v4) { + if (ira->ira_flags & IRAF_IS_IPV4) { phdr_mp = ah_process_ip_options_v4(data_mp, assoc, &length_to_skip, assoc->ipsa_mac_len, B_FALSE, ahstack); } else { @@ -3611,483 +3597,33 @@ ah_inbound(mblk_t *ipsec_in_mp, void *arg) if (phdr_mp == NULL) { IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, - (ii->ipsec_in_v4 ? + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, + ((ira->ira_flags & IRAF_IS_IPV4) ? DROPPER(ipss, ipds_ah_bad_v4_opts) : DROPPER(ipss, ipds_ah_bad_v6_hdrs)), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } - ipsec_in_mp->b_cont = phdr_mp; phdr_mp->b_cont = data_mp; data_mp->b_rptr += length_to_skip; + data_mp = phdr_mp; /* submit request to KCF */ - return (ah_submit_req_inbound(ipsec_in_mp, length_to_skip, ah_offset, + return (ah_submit_req_inbound(data_mp, ira, length_to_skip, ah_offset, assoc)); } /* - * ah_inbound_accelerated: - * Called from ah_inbound() to process IPsec packets that have been - * accelerated by hardware. - * - * Basically does what ah_auth_in_done() with some changes since - * no pseudo-headers are involved, i.e. the passed message is a - * IPSEC_INFO->DATA. - * - * It is assumed that only packets that have been successfully - * processed by the adapter come here. - * - * 1. get algorithm structure corresponding to association - * 2. 
calculate pointers to authentication header and ICV - * 3. compare ICV in AH header with ICV in data attributes - * 3.1 if different: - * 3.1.1 generate error - * 3.1.2 discard message - * 3.2 if ICV matches: - * 3.2.1 check replay - * 3.2.2 remove AH header - * 3.2.3 age SA byte - * 3.2.4 send to IP - */ -ipsec_status_t -ah_inbound_accelerated(mblk_t *ipsec_in, boolean_t isv4, ipsa_t *assoc, - uint32_t ah_offset) -{ - mblk_t *mp; - ipha_t *ipha; - ah_t *ah; - ipsec_in_t *ii; - uint32_t icv_len; - uint32_t align_len; - uint32_t age_bytes; - ip6_t *ip6h; - uint8_t *in_icv; - mblk_t *hada_mp; - uint32_t next_hdr; - da_ipsec_t *hada; - kstat_named_t *counter; - ipsecah_stack_t *ahstack; - netstack_t *ns; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_in->b_rptr; - ns = ii->ipsec_in_ns; - ahstack = ns->netstack_ipsecah; - ipss = ns->netstack_ipsec; - - mp = ipsec_in->b_cont; - hada_mp = ii->ipsec_in_da; - ASSERT(hada_mp != NULL); - hada = (da_ipsec_t *)hada_mp->b_rptr; - - AH_BUMP_STAT(ahstack, in_accelerated); - - /* - * We only support one level of decapsulation in hardware, so - * nuke the pointer. - */ - ii->ipsec_in_da = NULL; - ii->ipsec_in_accelerated = B_FALSE; - - /* - * Extract ICV length from attributes M_CTL and sanity check - * its value. We allow the mblk to be smaller than da_ipsec_t - * for a small ICV, as long as the entire ICV fits within the mblk. - * Also ensures that the ICV length computed by Provider - * corresponds to the ICV length of the algorithm specified by the SA. 
- */ - icv_len = hada->da_icv_len; - if ((icv_len != assoc->ipsa_mac_len) || - (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) < - (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) { - ah0dbg(("ah_inbound_accelerated: " - "ICV len (%u) incorrect or mblk too small (%u)\n", - icv_len, (uint32_t)(MBLKL(hada_mp)))); - counter = DROPPER(ipss, ipds_ah_bad_length); - goto ah_in_discard; - } - ASSERT(icv_len != 0); - - /* compute the padded AH ICV len */ - if (isv4) { - ipha = (ipha_t *)mp->b_rptr; - align_len = (icv_len + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - } else { - ip6h = (ip6_t *)mp->b_rptr; - align_len = (icv_len + IPV6_PADDING_ALIGN - 1) & - -IPV6_PADDING_ALIGN; - } - - ah = (ah_t *)(mp->b_rptr + ah_offset); - in_icv = (uint8_t *)ah + sizeof (ah_t); - - /* compare ICV in AH header vs ICV computed by adapter */ - if (bcmp(hada->da_icv, in_icv, icv_len)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. Don't print to the console, block - * potential denial-of-service attack. - */ - AH_BUMP_STAT(ahstack, bad_auth); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "AH Authentication failed spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); - counter = DROPPER(ipss, ipds_ah_bad_auth); - goto ah_in_discard; - } - - ah3dbg(ahstack, ("AH succeeded, checking replay\n")); - AH_BUMP_STAT(ahstack, good_auth); - - if (!sadb_replay_check(assoc, ah->ah_replay)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. As of now we print out an event. - * Do not print the replay failure number, or else - * syslog cannot collate the error messages. Printing - * the replay number that failed (or printing to the - * console) opens a denial-of-service attack. 
- */ - AH_BUMP_STAT(ahstack, replay_failures); - ipsec_assocfailure(info.mi_idnum, 0, 0, - SL_ERROR | SL_WARN, - "Replay failed for AH spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, ahstack->ipsecah_netstack); - counter = DROPPER(ipss, ipds_ah_replay); - goto ah_in_discard; - } - - /* - * Remove AH header. We do this by copying everything before - * the AH header onto the AH header+ICV. - */ - /* overwrite AH with what was preceeding it (IP header) */ - next_hdr = ah->ah_nexthdr; - ovbcopy(mp->b_rptr, mp->b_rptr + sizeof (ah_t) + align_len, - ah_offset); - mp->b_rptr += sizeof (ah_t) + align_len; - if (isv4) { - /* adjust IP header next protocol */ - ipha = (ipha_t *)mp->b_rptr; - ipha->ipha_protocol = next_hdr; - - age_bytes = ipha->ipha_length; - - /* adjust length in IP header */ - ipha->ipha_length -= (sizeof (ah_t) + align_len); - - /* recalculate checksum */ - ipha->ipha_hdr_checksum = 0; - ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha); - } else { - /* adjust IP header next protocol */ - ip6h = (ip6_t *)mp->b_rptr; - ip6h->ip6_nxt = next_hdr; - - age_bytes = sizeof (ip6_t) + ntohs(ip6h->ip6_plen) + - sizeof (ah_t); - - /* adjust length in IP header */ - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - - (sizeof (ah_t) + align_len)); - } - - /* age SA */ - if (!ah_age_bytes(assoc, age_bytes, B_TRUE)) { - /* The ipsa has hit hard expiration, LOG and AUDIT. 
*/ - ipsec_assocfailure(info.mi_idnum, 0, 0, - SL_ERROR | SL_WARN, - "AH Association 0x%x, dst %s had bytes expire.\n", - assoc->ipsa_spi, assoc->ipsa_dstaddr, - AF_INET, ahstack->ipsecah_netstack); - AH_BUMP_STAT(ahstack, bytes_expired); - counter = DROPPER(ipss, ipds_ah_bytes_expire); - goto ah_in_discard; - } - - freeb(hada_mp); - return (IPSEC_STATUS_SUCCESS); - -ah_in_discard: - IP_AH_BUMP_STAT(ipss, in_discards); - freeb(hada_mp); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); -} - -/* - * ah_outbound_accelerated_v4: - * Called from ah_outbound_v4() and once it is determined that the - * packet is elligible for hardware acceleration. - * - * We proceed as follows: - * 1. allocate and initialize attributes mblk - * 2. mark IPSEC_OUT to indicate that pkt is accelerated - * 3. insert AH header - */ -static ipsec_status_t -ah_outbound_accelerated_v4(mblk_t *ipsec_mp, ipsa_t *assoc) -{ - mblk_t *mp, *new_mp; - ipsec_out_t *oi; - uint_t ah_data_sz; /* ICV length, algorithm dependent */ - uint_t ah_align_sz; /* ICV length + padding */ - uint32_t v_hlen_tos_len; /* from original IP header */ - ipha_t *oipha; /* original IP header */ - ipha_t *nipha; /* new IP header */ - uint_t option_length = 0; - uint_t new_hdr_len; /* new header length */ - uint_t iphdr_length; - ah_t *ah_hdr; /* ptr to AH header */ - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - - mp = ipsec_mp->b_cont; - - AH_BUMP_STAT(ahstack, out_accelerated); - - oipha = (ipha_t *)mp->b_rptr; - v_hlen_tos_len = ((uint32_t *)oipha)[0]; - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(oi->ipsec_out_accelerated == B_FALSE); - oi->ipsec_out_accelerated = B_TRUE; - - /* calculate authentication data length, i.e. 
ICV + padding */ - ah_data_sz = assoc->ipsa_mac_len; - ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - - /* - * Insert pseudo header: - * IPSEC_INFO -> [IP, ULP] => IPSEC_INFO -> [IP, AH, ICV] -> ULP - */ - - /* IP + AH + authentication + padding data length */ - new_hdr_len = IP_SIMPLE_HDR_LENGTH + sizeof (ah_t) + ah_align_sz; - if (V_HLEN != IP_SIMPLE_HDR_VERSION) { - option_length = oipha->ipha_version_and_hdr_length - - (uint8_t)((IP_VERSION << 4) + - IP_SIMPLE_HDR_LENGTH_IN_WORDS); - option_length <<= 2; - new_hdr_len += option_length; - } - - /* allocate pseudo-header mblk */ - if ((new_mp = allocb(new_hdr_len, BPRI_HI)) == NULL) { - /* IPsec kstats: bump bean counter here */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - new_mp->b_cont = mp; - ipsec_mp->b_cont = new_mp; - new_mp->b_wptr += new_hdr_len; - - /* copy original IP header to new header */ - bcopy(mp->b_rptr, new_mp->b_rptr, IP_SIMPLE_HDR_LENGTH + - option_length); - - /* update IP header */ - nipha = (ipha_t *)new_mp->b_rptr; - nipha->ipha_protocol = IPPROTO_AH; - iphdr_length = ntohs(nipha->ipha_length); - iphdr_length += sizeof (ah_t) + ah_align_sz; - nipha->ipha_length = htons(iphdr_length); - nipha->ipha_hdr_checksum = 0; - nipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(nipha); - - /* skip original IP header in mp */ - mp->b_rptr += IP_SIMPLE_HDR_LENGTH + option_length; - - /* initialize AH header */ - ah_hdr = (ah_t *)(new_mp->b_rptr + IP_SIMPLE_HDR_LENGTH + - option_length); - ah_hdr->ah_nexthdr = oipha->ipha_protocol; - if (!ah_finish_up(ah_hdr, NULL, assoc, ah_data_sz, ah_align_sz, - ahstack)) { - /* Only way this fails is if outbound replay counter wraps. 
*/ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_replay), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* - * ah_outbound_accelerated_v6: - * - * Called from ah_outbound_v6() once it is determined that the packet - * is eligible for hardware acceleration. - * - * We proceed as follows: - * 1. allocate and initialize attributes mblk - * 2. mark IPSEC_OUT to indicate that pkt is accelerated - * 3. insert AH header - */ -static ipsec_status_t -ah_outbound_accelerated_v6(mblk_t *ipsec_mp, ipsa_t *assoc) -{ - mblk_t *mp, *phdr_mp; - ipsec_out_t *oi; - uint_t ah_data_sz; /* ICV length, algorithm dependent */ - uint_t ah_align_sz; /* ICV length + padding */ - ip6_t *oip6h; /* original IP header */ - ip6_t *ip6h; /* new IP header */ - uint_t option_length = 0; - uint_t hdr_size; - uint_t ah_offset; - ah_t *ah_hdr; /* ptr to AH header */ - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - oi = (ipsec_out_t *)ipsec_mp->b_rptr; - ns = oi->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; - - mp = ipsec_mp->b_cont; - - AH_BUMP_STAT(ahstack, out_accelerated); - - oip6h = (ip6_t *)mp->b_rptr; - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(oi->ipsec_out_accelerated == B_FALSE); - oi->ipsec_out_accelerated = B_TRUE; - - /* calculate authentication data length, i.e. 
ICV + padding */ - ah_data_sz = assoc->ipsa_mac_len; - ah_align_sz = (ah_data_sz + IPV4_PADDING_ALIGN - 1) & - -IPV4_PADDING_ALIGN; - - ASSERT(ah_align_sz >= ah_data_sz); - - hdr_size = ipsec_ah_get_hdr_size_v6(mp, B_FALSE); - option_length = hdr_size - IPV6_HDR_LEN; - - /* This was not included in ipsec_ah_get_hdr_size_v6() */ - hdr_size += (sizeof (ah_t) + ah_align_sz); - - if ((phdr_mp = allocb(hdr_size, BPRI_HI)) == NULL) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - phdr_mp->b_wptr += hdr_size; - - /* - * Form the basic IP header first. We always assign every bit - * of the v6 basic header, so a separate bzero is unneeded. - */ - ip6h = (ip6_t *)phdr_mp->b_rptr; - ip6h->ip6_vcf = oip6h->ip6_vcf; - ip6h->ip6_hlim = oip6h->ip6_hlim; - ip6h->ip6_src = oip6h->ip6_src; - ip6h->ip6_dst = oip6h->ip6_dst; - /* - * Include the size of AH and authentication data. - * This is how our recipient would compute the - * authentication data. Look at what we do in the - * inbound case below. 
- */ - ip6h->ip6_plen = htons(ntohs(oip6h->ip6_plen) + sizeof (ah_t) + - ah_align_sz); - - /* - * Insert pseudo header: - * IPSEC_INFO -> [IP6, LLH, ULP] => - * IPSEC_INFO -> [IP, LLH, AH, ICV] -> ULP - */ - - if (option_length == 0) { - /* Form the AH header */ - ip6h->ip6_nxt = IPPROTO_AH; - ((ah_t *)(ip6h + 1))->ah_nexthdr = oip6h->ip6_nxt; - ah_offset = IPV6_HDR_LEN; - } else { - ip6h->ip6_nxt = oip6h->ip6_nxt; - /* option_length does not include the AH header's size */ - ah_offset = ah_fix_phdr_v6(ip6h, oip6h, B_TRUE, B_FALSE); - if (ah_offset == 0) { - freemsg(phdr_mp); - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_bad_v6_hdrs), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - } - - phdr_mp->b_cont = mp; - ipsec_mp->b_cont = phdr_mp; - - /* skip original IP header in mp */ - mp->b_rptr += IPV6_HDR_LEN + option_length; - - /* initialize AH header */ - ah_hdr = (ah_t *)(phdr_mp->b_rptr + IPV6_HDR_LEN + option_length); - ah_hdr->ah_nexthdr = oip6h->ip6_nxt; - - if (!ah_finish_up(((ah_t *)((uint8_t *)ip6h + ah_offset)), NULL, - assoc, ah_data_sz, ah_align_sz, ahstack)) { - /* Only way this fails is if outbound replay counter wraps. */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_replay), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* * Invoked after processing of an inbound packet by the * kernel crypto framework. Called by ah_submit_req() for a sync request, * or by the kcf callback for an async request. - * Returns IPSEC_STATUS_SUCCESS on success, IPSEC_STATUS_FAILED on failure. - * On failure, the mblk chain ipsec_in is freed by this function. + * Returns NULL if the mblk chain is consumed. 
*/ -static ipsec_status_t -ah_auth_in_done(mblk_t *ipsec_in) +static mblk_t * +ah_auth_in_done(mblk_t *phdr_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - mblk_t *phdr_mp; ipha_t *ipha; uint_t ah_offset = 0; mblk_t *mp; @@ -4096,41 +3632,36 @@ ah_auth_in_done(mblk_t *ipsec_in) uint32_t length; uint32_t *dest32; uint8_t *dest; - ipsec_in_t *ii; boolean_t isv4; ip6_t *ip6h; uint_t icv_len; ipsa_t *assoc; kstat_named_t *counter; - netstack_t *ns; - ipsecah_stack_t *ahstack; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_in->b_rptr; - ns = ii->ipsec_in_ns; - ahstack = ns->netstack_ipsecah; - ipss = ns->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; - isv4 = ii->ipsec_in_v4; - assoc = ii->ipsec_in_ah_sa; - icv_len = (uint_t)ii->ipsec_in_crypto_mac.cd_raw.iov_len; + isv4 = (ira->ira_flags & IRAF_IS_IPV4); + assoc = ira->ira_ipsec_ah_sa; + icv_len = (uint_t)ic->ic_crypto_mac.cd_raw.iov_len; - phdr_mp = ipsec_in->b_cont; if (phdr_mp == NULL) { - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } mp = phdr_mp->b_cont; if (mp == NULL) { - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } - mp->b_rptr -= ii->ipsec_in_skip_len; + mp->b_rptr -= ic->ic_skip_len; ah_set_usetime(assoc, B_TRUE); @@ -4256,8 +3787,7 @@ ah_auth_in_done(mblk_t *ipsec_in) while (*nexthdr != IPPROTO_AH) { whereptr += hdrlen; /* Assume IP has already stripped it */ - ASSERT(*nexthdr != IPPROTO_FRAGMENT && - *nexthdr != IPPROTO_RAW); + ASSERT(*nexthdr != IPPROTO_FRAGMENT); 
switch (*nexthdr) { case IPPROTO_HOPOPTS: hbhhdr = (ip6_hbh_t *)whereptr; @@ -4292,20 +3822,18 @@ ah_auth_in_done(mblk_t *ipsec_in) while (--dest >= mp->b_rptr) *dest = *(dest - newpos); } - ipsec_in->b_cont = mp; - phdr_mp->b_cont = NULL; - /* - * If a db_credp exists in phdr_mp, it must also exist in mp. - */ - ASSERT(DB_CRED(phdr_mp) == NULL || - msg_getcred(mp, NULL) != NULL); freeb(phdr_mp); /* * If SA is labelled, use its label, else inherit the label */ - if (is_system_labeled() && (assoc->ipsa_cred != NULL)) { - mblk_setcred(mp, assoc->ipsa_cred, NOPID); + if (is_system_labeled() && (assoc->ipsa_tsl != NULL)) { + if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) { + ip_drop_packet(mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); + } } if (assoc->ipsa_state == IPSA_STATE_IDLE) { @@ -4313,17 +3841,18 @@ ah_auth_in_done(mblk_t *ipsec_in) * Cluster buffering case. Tell caller that we're * handling the packet. */ - sadb_buf_pkt(assoc, ipsec_in, ns); - return (IPSEC_STATUS_PENDING); + sadb_buf_pkt(assoc, mp, ira); + return (NULL); } - return (IPSEC_STATUS_SUCCESS); + return (mp); ah_in_discard: IP_AH_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, + ip_drop_packet(phdr_mp, B_TRUE, ira->ira_ill, counter, &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -4332,49 +3861,37 @@ ah_in_discard: * executed syncrhonously, or by the KEF callback for a request * executed asynchronously. 
*/ -static ipsec_status_t -ah_auth_out_done(mblk_t *ipsec_out) +static mblk_t * +ah_auth_out_done(mblk_t *phdr_mp, ip_xmit_attr_t *ixa, ipsec_crypto_t *ic) { - mblk_t *phdr_mp; mblk_t *mp; int align_len; uint32_t hdrs_length; uchar_t *ptr; uint32_t length; boolean_t isv4; - ipsec_out_t *io; size_t icv_len; - netstack_t *ns; - ipsec_stack_t *ipss; - ipsecah_stack_t *ahstack; - - io = (ipsec_out_t *)ipsec_out->b_rptr; - ns = io->ipsec_out_ns; - ipss = ns->netstack_ipsec; - ahstack = ns->netstack_ipsecah; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; - isv4 = io->ipsec_out_v4; - icv_len = io->ipsec_out_crypto_mac.cd_raw.iov_len; - - phdr_mp = ipsec_out->b_cont; - if (phdr_mp == NULL) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_ah_nomem), - &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); - } + isv4 = (ixa->ixa_flags & IXAF_IS_IPV4); + icv_len = ic->ic_crypto_mac.cd_raw.iov_len; mp = phdr_mp->b_cont; if (mp == NULL) { - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, + ip_drop_packet(phdr_mp, B_FALSE, ill, DROPPER(ipss, ipds_ah_nomem), &ahstack->ah_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (NULL); } - mp->b_rptr -= io->ipsec_out_skip_len; + mp->b_rptr -= ic->ic_skip_len; - ASSERT(io->ipsec_out_ah_sa != NULL); - ah_set_usetime(io->ipsec_out_ah_sa, B_FALSE); + ASSERT(ixa->ixa_flags & IXAF_IPSEC_SECURE); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); + ah_set_usetime(ixa->ixa_ipsec_ah_sa, B_FALSE); if (isv4) { ipha_t *ipha; @@ -4454,7 +3971,7 @@ ah_auth_out_done(mblk_t *ipsec_out) freeb(mp); } - return (IPSEC_STATUS_SUCCESS); + return (phdr_mp); } /* Refactor me */ @@ -4464,16 +3981,18 @@ ah_auth_out_done(mblk_t *ipsec_out) */ void ipsecah_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt, - uint32_t spi, void *addr, int af, ipsecah_stack_t 
*ahstack) + uint32_t spi, void *addr, int af, ip_recv_attr_t *ira) { - ipsec_stack_t *ipss = ahstack->ipsecah_netstack->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecah_stack_t *ahstack = ns->netstack_ipsecah; + ipsec_stack_t *ipss = ns->netstack_ipsec; if (ahstack->ipsecah_log_unknown_spi) { ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi, addr, af, ahstack->ipsecah_netstack); } - ip_drop_packet(mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_ah_no_sa), &ahstack->ah_dropper); } diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index 089e23e937..8af449384f 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -53,6 +53,8 @@ #include <inet/ip.h> #include <inet/ip_impl.h> #include <inet/ip6.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/sadb.h> #include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> @@ -67,8 +69,6 @@ #include <sys/taskq.h> #include <sys/note.h> -#include <sys/iphada.h> - #include <sys/tsol/tnet.h> /* @@ -130,26 +130,23 @@ static ipsecespparam_t lcl_param_arr[] = { static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *); static int ipsecesp_close(queue_t *); -static void ipsecesp_rput(queue_t *, mblk_t *); static void ipsecesp_wput(queue_t *, mblk_t *); static void *ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns); static void ipsecesp_stack_fini(netstackid_t stackid, void *arg); static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *); static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *); -static ipsec_status_t esp_outbound_accelerated(mblk_t *, uint_t); -static ipsec_status_t esp_inbound_accelerated(mblk_t *, mblk_t *, - boolean_t, ipsa_t *); +static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *); +static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *); static boolean_t esp_register_out(uint32_t, uint32_t, uint_t, - 
ipsecesp_stack_t *, mblk_t *); + ipsecesp_stack_t *, cred_t *); static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t, kstat_named_t **, ipsecesp_stack_t *); -static ipsec_status_t esp_submit_req_inbound(mblk_t *, ipsa_t *, uint_t); -static ipsec_status_t esp_submit_req_outbound(mblk_t *, ipsa_t *, uchar_t *, - uint_t); -extern void (*cl_inet_getspi)(netstackid_t, uint8_t, uint8_t *, size_t, - void *); +static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *, + ipsa_t *, uint_t); +static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *, + ipsa_t *, uchar_t *, uint_t); /* Setable in /etc/system */ uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE; @@ -159,7 +156,7 @@ static struct module_info info = { }; static struct qinit rinit = { - (pfi_t)ipsecesp_rput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info, + (pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info, NULL }; @@ -201,9 +198,6 @@ typedef struct esp_kstats_s { kstat_named_t esp_stat_acquire_requests; kstat_named_t esp_stat_bytes_expired; kstat_named_t esp_stat_out_discards; - kstat_named_t esp_stat_in_accelerated; - kstat_named_t esp_stat_out_accelerated; - kstat_named_t esp_stat_noaccel; kstat_named_t esp_stat_crypto_sync; kstat_named_t esp_stat_crypto_async; kstat_named_t esp_stat_crypto_failures; @@ -266,9 +260,6 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) KI(acquire_requests); KI(bytes_expired); KI(out_discards); - KI(in_accelerated); - KI(out_accelerated); - KI(noaccel); KI(crypto_sync); KI(crypto_async); KI(crypto_failures); @@ -384,9 +375,9 @@ esp_ager(void *arg) hrtime_t begin = gethrtime(); sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q, - espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns); + espstack->ipsecesp_reap_delay, ns); sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q, - espstack->esp_sadb.s_ip_q, espstack->ipsecesp_reap_delay, ns); + espstack->ipsecesp_reap_delay, ns); espstack->esp_event = 
sadb_retimeout(begin, espstack->esp_pfkey_q, esp_ager, espstack, @@ -583,7 +574,13 @@ ipsecesp_stack_fini(netstackid_t stackid, void *arg) } /* - * ESP module open routine. + * ESP module open routine, which is here for keysock plumbing. + * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old + * Days of export control, and fears that ESP would not be allowed + * to be shipped at all by default. Eventually, keysock should + * either access AH and ESP via modstubs or krtld dependencies, or + * perhaps be folded in with AH and ESP into a single IPsec/netsec + * module ("netsec" if PF_KEY provides more than AH/ESP keying tables). */ /* ARGSUSED */ static int @@ -606,56 +603,10 @@ ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) espstack = ns->netstack_ipsecesp; ASSERT(espstack != NULL); - /* - * ASSUMPTIONS (because I'm MT_OCEXCL): - * - * * I'm being pushed on top of IP for all my opens (incl. #1). - * * Only ipsecesp_open() can write into esp_sadb.s_ip_q. - * * Because of this, I can check lazily for esp_sadb.s_ip_q. - * - * If these assumptions are wrong, I'm in BIG trouble... - */ - q->q_ptr = espstack; WR(q)->q_ptr = q->q_ptr; - if (espstack->esp_sadb.s_ip_q == NULL) { - struct T_unbind_req *tur; - - espstack->esp_sadb.s_ip_q = WR(q); - /* Allocate an unbind... */ - espstack->esp_ip_unbind = allocb(sizeof (struct T_unbind_req), - BPRI_HI); - - /* - * Send down T_BIND_REQ to bind IPPROTO_ESP. - * Handle the ACK here in ESP. 
- */ - qprocson(q); - if (espstack->esp_ip_unbind == NULL || - !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) { - if (espstack->esp_ip_unbind != NULL) { - freeb(espstack->esp_ip_unbind); - espstack->esp_ip_unbind = NULL; - } - q->q_ptr = NULL; - netstack_rele(espstack->ipsecesp_netstack); - return (ENOMEM); - } - - espstack->esp_ip_unbind->b_datap->db_type = M_PROTO; - tur = (struct T_unbind_req *)espstack->esp_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } else { - qprocson(q); - } - - /* - * For now, there's not much I can do. I'll be getting a message - * passed down to me from keysock (in my wput), and a T_BIND_ACK - * up from IP (in my rput). - */ - + qprocson(q); return (0); } @@ -668,17 +619,6 @@ ipsecesp_close(queue_t *q) ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr; /* - * If esp_sadb.s_ip_q is attached to this instance, send a - * T_UNBIND_REQ to IP for the instance before doing - * a qprocsoff(). - */ - if (WR(q) == espstack->esp_sadb.s_ip_q && - espstack->esp_ip_unbind != NULL) { - putnext(WR(q), espstack->esp_ip_unbind); - espstack->esp_ip_unbind = NULL; - } - - /* * Clean up q_ptr, if needed. */ qprocsoff(q); @@ -693,45 +633,6 @@ ipsecesp_close(queue_t *q) (void) quntimeout(q, espstack->esp_event); } - if (WR(q) == espstack->esp_sadb.s_ip_q) { - /* - * If the esp_sadb.s_ip_q is attached to this instance, find - * another. The OCEXCL outer perimeter helps us here. - */ - espstack->esp_sadb.s_ip_q = NULL; - - /* - * Find a replacement queue for esp_sadb.s_ip_q. - */ - if (espstack->esp_pfkey_q != NULL && - espstack->esp_pfkey_q != RD(q)) { - /* - * See if we can use the pfkey_q. 
- */ - espstack->esp_sadb.s_ip_q = WR(espstack->esp_pfkey_q); - } - - if (espstack->esp_sadb.s_ip_q == NULL || - !sadb_t_bind_req(espstack->esp_sadb.s_ip_q, IPPROTO_ESP)) { - esp1dbg(espstack, ("ipsecesp: Can't reassign ip_q.\n")); - espstack->esp_sadb.s_ip_q = NULL; - } else { - espstack->esp_ip_unbind = - allocb(sizeof (struct T_unbind_req), BPRI_HI); - - if (espstack->esp_ip_unbind != NULL) { - struct T_unbind_req *tur; - - espstack->esp_ip_unbind->b_datap->db_type = - M_PROTO; - tur = (struct T_unbind_req *) - espstack->esp_ip_unbind->b_rptr; - tur->PRIM_type = T_UNBIND_REQ; - } - /* If it's NULL, I can't do much here. */ - } - } - netstack_rele(espstack->ipsecesp_netstack); return (0); } @@ -834,26 +735,27 @@ esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound) /* * Do incoming NAT-T manipulations for packet. + * Returns NULL if the mblk chain is consumed. */ -static ipsec_status_t +static mblk_t * esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc) { ipha_t *ipha = (ipha_t *)data_mp->b_rptr; - tcpha_t *tcph; + tcpha_t *tcpha; udpha_t *udpha; /* Initialize to our inbound cksum adjustment... 
*/ uint32_t sum = assoc->ipsa_inbound_cksum; switch (ipha->ipha_protocol) { case IPPROTO_TCP: - tcph = (tcpha_t *)(data_mp->b_rptr + + tcpha = (tcpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha)); #define DOWN_SUM(x) (x) = ((x) & 0xFFFF) + ((x) >> 16) - sum += ~ntohs(tcph->tha_sum) & 0xFFFF; + sum += ~ntohs(tcpha->tha_sum) & 0xFFFF; DOWN_SUM(sum); DOWN_SUM(sum); - tcph->tha_sum = ~htons(sum); + tcpha->tha_sum = ~htons(sum); break; case IPPROTO_UDP: udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha)); @@ -876,7 +778,7 @@ esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc) */ break; } - return (IPSEC_STATUS_SUCCESS); + return (data_mp); } @@ -968,10 +870,11 @@ esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen, if (ip6h->ip6_nxt == IPPROTO_ESP) { ip6h->ip6_nxt = nexthdr; } else { - ip6_pkt_t ipp; + ip_pkt_t ipp; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, + NULL); if (ipp.ipp_dstopts != NULL) { ipp.ipp_dstopts->ip6d_nxt = nexthdr; } else if (ipp.ipp_rthdr != NULL) { @@ -1227,16 +1130,14 @@ esp_set_usetime(ipsa_t *assoc, boolean_t inbound) /* * Handle ESP inbound data for IPv4 and IPv6. * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * mblk chain data_mp. 
*/ -ipsec_status_t -esp_inbound(mblk_t *ipsec_in_mp, void *arg) +mblk_t * +esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira) { - mblk_t *data_mp = ipsec_in_mp->b_cont; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; esph_t *esph = (esph_t *)arg; - ipsa_t *ipsa = ii->ipsec_in_esp_sa; - netstack_t *ns = ii->ipsec_in_ns; + ipsa_t *ipsa = ira->ira_ipsec_esp_sa; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1254,36 +1155,18 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) if (!sadb_replay_peek(ipsa, esph->esph_replay)) { ESP_BUMP_STAT(espstack, replay_early_failures); IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_early_replay), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* - * Has this packet already been processed by a hardware - * IPsec accelerator? - */ - if (ii->ipsec_in_accelerated) { - ipsec_status_t rv; - esp3dbg(espstack, - ("esp_inbound: pkt processed by ill=%d isv6=%d\n", - ii->ipsec_in_ill_index, !ii->ipsec_in_v4)); - rv = esp_inbound_accelerated(ipsec_in_mp, - data_mp, ii->ipsec_in_v4, ipsa); - return (rv); - } - ESP_BUMP_STAT(espstack, noaccel); - - /* * Adjust the IP header's payload length to reflect the removal * of the ICV. 
*/ - if (!ii->ipsec_in_v4) { + if (!(ira->ira_flags & IRAF_IS_IPV4)) { ip6_t *ip6h = (ip6_t *)data_mp->b_rptr; ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - ipsa->ipsa_mac_len); @@ -1294,7 +1177,7 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) } /* submit the request to the crypto framework */ - return (esp_submit_req_inbound(ipsec_in_mp, ipsa, + return (esp_submit_req_inbound(data_mp, ira, ipsa, (uint8_t *)esph - data_mp->b_rptr)); } @@ -1303,21 +1186,15 @@ esp_inbound(mblk_t *ipsec_in_mp, void *arg) * Called while holding the algorithm lock. */ static void -esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) +esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs, + netstack_t *ns) { sadb_comb_t *comb = (sadb_comb_t *)(prop + 1); - ipsec_out_t *io; ipsec_action_t *ap; ipsec_prot_t *prot; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; - io = (ipsec_out_t *)acqrec->ipsacq_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock)); prop->sadb_prop_exttype = SADB_EXT_PROPOSAL; @@ -1327,9 +1204,10 @@ esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs) prop->sadb_prop_replay = espstack->ipsecesp_replay_size; /* - * Based upon algorithm properties, and what-not, prioritize - * a proposal. If the IPSEC_OUT message has an algorithm specified, - * use it first and foremost. + * Based upon algorithm properties, and what-not, prioritize a + * proposal, based on the ordering of the ESP algorithms in the + * alternatives in the policy rule or socket that was placed + * in the acquire record. * * For each action in policy list * Add combination. If I've hit limit, return. @@ -1456,7 +1334,7 @@ esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns) /* Insert proposal here. 
*/ prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len); - esp_insert_prop(prop, acqrec, combs); + esp_insert_prop(prop, acqrec, combs, ns); samsg->sadb_msg_len += prop->sadb_prop_len; msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len); @@ -1756,13 +1634,11 @@ esp_port_freshness(uint32_t ports, ipsa_t *assoc) * If authentication was performed on the packet, this function is called * only if the authentication succeeded. * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * mblk chain data_mp. */ -static ipsec_status_t -esp_in_done(mblk_t *ipsec_in_mp) +static mblk_t * +esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in_mp->b_rptr; - mblk_t *data_mp; ipsa_t *assoc; uint_t espstart; uint32_t ivlen = 0; @@ -1770,11 +1646,11 @@ esp_in_done(mblk_t *ipsec_in_mp) esph_t *esph; kstat_named_t *counter; boolean_t is_natt; - netstack_t *ns = ii->ipsec_in_ns; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; - assoc = ii->ipsec_in_esp_sa; + assoc = ira->ira_ipsec_esp_sa; ASSERT(assoc != NULL); is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0); @@ -1782,26 +1658,25 @@ esp_in_done(mblk_t *ipsec_in_mp) /* get the pointer to the ESP header */ if (assoc->ipsa_encr_alg == SADB_EALG_NULL) { /* authentication-only ESP */ - espstart = ii->ipsec_in_crypto_data.cd_offset; - processed_len = ii->ipsec_in_crypto_data.cd_length; + espstart = ic->ic_crypto_data.cd_offset; + processed_len = ic->ic_crypto_data.cd_length; } else { /* encryption present */ ivlen = assoc->ipsa_iv_len; if (assoc->ipsa_auth_alg == SADB_AALG_NONE) { /* encryption-only ESP */ - espstart = ii->ipsec_in_crypto_data.cd_offset - + espstart = ic->ic_crypto_data.cd_offset - sizeof (esph_t) - assoc->ipsa_iv_len; - processed_len = ii->ipsec_in_crypto_data.cd_length + + processed_len = 
ic->ic_crypto_data.cd_length + ivlen; } else { /* encryption with authentication */ - espstart = ii->ipsec_in_crypto_dual_data.dd_offset1; - processed_len = ii->ipsec_in_crypto_dual_data.dd_len2 + + espstart = ic->ic_crypto_dual_data.dd_offset1; + processed_len = ic->ic_crypto_dual_data.dd_len2 + ivlen; } } - data_mp = ipsec_in_mp->b_cont; esph = (esph_t *)(data_mp->b_rptr + espstart); if (assoc->ipsa_auth_alg != IPSA_AALG_NONE || @@ -1840,8 +1715,11 @@ esp_in_done(mblk_t *ipsec_in_mp) goto drop_and_bail; } - if (is_natt) - esp_port_freshness(ii->ipsec_in_esp_udp_ports, assoc); + if (is_natt) { + ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS); + ASSERT(ira->ira_esp_udp_ports != 0); + esp_port_freshness(ira->ira_esp_udp_ports, assoc); + } } esp_set_usetime(assoc, B_TRUE); @@ -1863,44 +1741,41 @@ esp_in_done(mblk_t *ipsec_in_mp) * spews "branch, predict taken" code for this. */ - if (esp_strip_header(data_mp, ii->ipsec_in_v4, ivlen, &counter, - espstack)) { - - if (is_system_labeled()) { - cred_t *cr = assoc->ipsa_cred; + if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4), + ivlen, &counter, espstack)) { - if (cr != NULL) { - mblk_setcred(data_mp, cr, NOPID); + if (is_system_labeled() && assoc->ipsa_tsl != NULL) { + if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) { + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_ah_nomem), + &espstack->esp_dropper); + BUMP_MIB(ira->ira_ill->ill_ip_mib, + ipIfStatsInDiscards); + return (NULL); } - } if (is_natt) return (esp_fix_natt_checksums(data_mp, assoc)); - ASSERT(!is_system_labeled() || (DB_CRED(data_mp) != NULL)); - if (assoc->ipsa_state == IPSA_STATE_IDLE) { /* * Cluster buffering case. Tell caller that we're * handling the packet. 
*/ - sadb_buf_pkt(assoc, ipsec_in_mp, ns); - return (IPSEC_STATUS_PENDING); + sadb_buf_pkt(assoc, data_mp, ira); + return (NULL); } - return (IPSEC_STATUS_SUCCESS); + return (data_mp); } esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n")); drop_and_bail: IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN message's - * ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter, &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + return (NULL); } /* @@ -1908,11 +1783,10 @@ drop_and_bail: * argument is freed. */ static void -esp_log_bad_auth(mblk_t *ipsec_in) +esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - ipsa_t *assoc = ii->ipsec_in_esp_sa; - netstack_t *ns = ii->ipsec_in_ns; + ipsa_t *assoc = ira->ira_ipsec_esp_sa; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1928,11 +1802,7 @@ esp_log_bad_auth(mblk_t *ipsec_in) espstack->ipsecesp_netstack); IP_ESP_BUMP_STAT(ipss, in_discards); - /* - * TODO: Extract inbound interface from the IPSEC_IN - * message's ii->ipsec_in_rill_index. - */ - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_bad_auth), &espstack->esp_dropper); } @@ -1944,148 +1814,205 @@ esp_log_bad_auth(mblk_t *ipsec_in) * Returns B_TRUE if the AH processing was not needed or if it was * performed successfully. Returns B_FALSE and consumes the passed mblk * if AH processing was required but could not be performed. + * + * Returns data_mp unless data_mp was consumed/queued. 
*/ -static boolean_t -esp_do_outbound_ah(mblk_t *ipsec_mp) +static mblk_t * +esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - ipsec_status_t ipsec_rc; ipsec_action_t *ap; - ap = io->ipsec_out_act; + ap = ixa->ixa_ipsec_action; if (ap == NULL) { - ipsec_policy_t *pp = io->ipsec_out_policy; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; ap = pp->ipsp_act; } if (!ap->ipa_want_ah) - return (B_TRUE); + return (data_mp); - ASSERT(io->ipsec_out_ah_done == B_FALSE); - - if (io->ipsec_out_ah_sa == NULL) { - if (!ipsec_outbound_sa(ipsec_mp, IPPROTO_AH)) { - sadb_acquire(ipsec_mp, io, B_TRUE, B_FALSE); - return (B_FALSE); + /* + * Normally the AH SA would have already been put in place + * but it could have been flushed so we need to look for it. + */ + if (ixa->ixa_ipsec_ah_sa == NULL) { + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) { + sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE); + return (NULL); } } - ASSERT(io->ipsec_out_ah_sa != NULL); + ASSERT(ixa->ixa_ipsec_ah_sa != NULL); - io->ipsec_out_ah_done = B_TRUE; - ipsec_rc = io->ipsec_out_ah_sa->ipsa_output_func(ipsec_mp); - return (ipsec_rc == IPSEC_STATUS_SUCCESS); + data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa); + return (data_mp); } /* * Kernel crypto framework callback invoked after completion of async - * crypto requests. + * crypto requests for outbound packets. 
*/ static void -esp_kcf_callback(void *arg, int status) +esp_kcf_callback_outbound(void *arg, int status) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstackid_t stackid; - netstack_t *ns, *ns_arg; - ipsecesp_stack_t *espstack; + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; ipsec_stack_t *ipss; + ipsecesp_stack_t *espstack; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ipsec_crypto_t *ic; + ill_t *ill; - ASSERT(ipsec_mp->b_cont != NULL); + /* + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. + */ + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); - if (is_inbound) { - stackid = ii->ipsec_in_stackid; - ns_arg = ii->ipsec_in_ns; + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) { + /* Disappeared on us - no ill/ipst for MIB */ + /* We have nowhere to do stats since ixa_ipst could be NULL */ + if (ixas.ixa_nce != NULL) { + ill = ixas.ixa_nce->nce_ill; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + } + freemsg(data_mp); + goto done; + } + ns = ixas.ixa_ipst->ips_netstack; + espstack = ns->netstack_ipsecesp; + ipss = ns->netstack_ipsec; + ill = ixas.ixa_nce->nce_ill; + + if (status == CRYPTO_SUCCESS) { + /* + * If a ICV was computed, it was stored by the + * crypto framework at the end of the packet. 
+ */ + ipha_t *ipha = (ipha_t *)data_mp->b_rptr; + + esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE); + /* NAT-T packet. */ + if (IPH_HDR_VERSION(ipha) == IP_VERSION && + ipha->ipha_protocol == IPPROTO_UDP) + esp_prepare_udp(ns, data_mp, ipha); + + /* do AH processing if needed */ + data_mp = esp_do_outbound_ah(data_mp, &ixas); + if (data_mp == NULL) + goto done; + + (void) ip_output_post_ipsec(data_mp, &ixas); } else { - stackid = io->ipsec_out_stackid; - ns_arg = io->ipsec_out_ns; + /* Outbound shouldn't see invalid MAC */ + ASSERT(status != CRYPTO_INVALID_MAC); + + esp1dbg(espstack, + ("esp_kcf_callback_outbound: crypto failed with 0x%x\n", + status)); + ESP_BUMP_STAT(espstack, crypto_failures); + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_esp_crypto_failed), + &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); } +done: + ixa_cleanup(&ixas); + (void) ipsec_free_crypto_data(mp); +} + +/* + * Kernel crypto framework callback invoked after completion of async + * crypto requests for inbound packets. + */ +static void +esp_kcf_callback_inbound(void *arg, int status) +{ + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + netstack_t *ns; + ipsecesp_stack_t *espstack; + ipsec_stack_t *ipss; + mblk_t *data_mp; + ip_recv_attr_t iras; + ipsec_crypto_t *ic; /* - * Verify that the netstack is still around; could have vanished - * while kEf was doing its work. + * First remove the ipsec_crypto_t mblk + * Note that we need to ipsec_free_crypto_data(mp) once done with ic. */ - ns = netstack_find_by_stackid(stackid); - if (ns == NULL || ns != ns_arg) { - /* Disappeared on us */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); - return; + async_mp = ipsec_remove_crypto_data(mp, &ic); + ASSERT(async_mp != NULL); + + /* + * Extract the ip_recv_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while kEf was doing its work. 
+ */ + data_mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + goto done; } + ns = iras.ira_ill->ill_ipst->ips_netstack; espstack = ns->netstack_ipsecesp; ipss = ns->netstack_ipsec; if (status == CRYPTO_SUCCESS) { - if (is_inbound) { - if (esp_in_done(ipsec_mp) != IPSEC_STATUS_SUCCESS) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - ip_fanout_proto_again(ipsec_mp, NULL, NULL, NULL); - } else { - /* - * If a ICV was computed, it was stored by the - * crypto framework at the end of the packet. - */ - ipha_t *ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - - esp_set_usetime(io->ipsec_out_esp_sa, B_FALSE); - /* NAT-T packet. */ - if (ipha->ipha_protocol == IPPROTO_UDP) - esp_prepare_udp(ns, ipsec_mp->b_cont, ipha); - - /* do AH processing if needed */ - if (!esp_do_outbound_ah(ipsec_mp)) { - netstack_rele(ns); - return; - } - /* finish IPsec processing */ - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, - NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, ipsec_mp, ip6h, - NULL, NULL); - } - } + data_mp = esp_in_done(data_mp, &iras, ic); + if (data_mp == NULL) + goto done; + /* finish IPsec processing */ + ip_input_post_ipsec(data_mp, &iras); } else if (status == CRYPTO_INVALID_MAC) { - esp_log_bad_auth(ipsec_mp); - + esp_log_bad_auth(data_mp, &iras); } else { esp1dbg(espstack, ("esp_kcf_callback: crypto failed with 0x%x\n", status)); ESP_BUMP_STAT(espstack, crypto_failures); - if (is_inbound) - IP_ESP_BUMP_STAT(ipss, in_discards); - else - ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(ipsec_mp, is_inbound, NULL, NULL, + IP_ESP_BUMP_STAT(ipss, in_discards); + ip_drop_packet(data_mp, B_TRUE, iras.ira_ill, DROPPER(ipss, ipds_esp_crypto_failed), &espstack->esp_dropper); + 
BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards); } - netstack_rele(ns); +done: + ira_cleanup(&iras, B_TRUE); + (void) ipsec_free_crypto_data(mp); } /* * Invoked on crypto framework failure during inbound and outbound processing. */ static void -esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, - ipsecesp_stack_t *espstack) +esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc, + ill_t *ill, ipsecesp_stack_t *espstack) { ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec; esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n", is_inbound ? "inbound" : "outbound", kef_rc)); - ip_drop_packet(mp, is_inbound, NULL, NULL, + ip_drop_packet(data_mp, is_inbound, ill, DROPPER(ipss, ipds_esp_crypto_failed), &espstack->esp_dropper); ESP_BUMP_STAT(espstack, crypto_failures); @@ -2095,11 +2022,14 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, ESP_BUMP_STAT(espstack, out_discards); } -#define ESP_INIT_CALLREQ(_cr) { \ - (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_RESTRICTED; \ - (_cr)->cr_callback_arg = ipsec_mp; \ - (_cr)->cr_callback_func = esp_kcf_callback; \ -} +/* + * A statement-equivalent macro, _cr MUST point to a modifiable + * crypto_call_req_t. + */ +#define ESP_INIT_CALLREQ(_cr, _mp, _callback) \ + (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE; \ + (_cr)->cr_callback_arg = (_mp); \ + (_cr)->cr_callback_func = (_callback) #define ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) { \ (mac)->cd_format = CRYPTO_DATA_RAW; \ @@ -2132,44 +2062,45 @@ esp_crypto_failed(mblk_t *mp, boolean_t is_inbound, int kef_rc, (data)->dd_offset2 = off2; \ } -static ipsec_status_t -esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) +/* + * Returns data_mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
+ */ +static mblk_t * +esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira, + ipsa_t *assoc, uint_t esph_offset) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t do_auth; uint_t auth_offset, msg_len, auth_len; - crypto_call_req_t call_req; - mblk_t *esp_mp; + crypto_call_req_t call_req, *callrp; + mblk_t *mp; esph_t *esph_ptr; - int kef_rc = CRYPTO_FAILED; + int kef_rc; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t auth_ctx_tmpl; - boolean_t do_encr; + boolean_t do_auth, do_encr, force; uint_t encr_offset, encr_len; uint_t iv_len = assoc->ipsa_iv_len; crypto_ctx_template_t encr_ctx_tmpl; - netstack_t *ns = ii->ipsec_in_ns; - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsec_crypto_t *ic, icstack; uchar_t *iv_ptr; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * esp_kcf_callback(). - */ - ASSERT(ii->ipsec_in_stackid == ns->netstack_stackid); + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; + force = (assoc->ipsa_flags & IPSA_F_ASYNC); + +#ifdef IPSEC_LATENCY_TEST + kef_rc = CRYPTO_SUCCESS; +#else + kef_rc = CRYPTO_FAILED; +#endif /* * An inbound packet is of the form: - * IPSEC_IN -> [IP,options,ESP,IV,data,ICV,pad] + * [IP,options,ESP,IV,data,ICV,pad] */ - esp_mp = ipsec_mp->b_cont; esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset); iv_ptr = (uchar_t *)(esph_ptr + 1); /* Packet length starting at IP header ending after ESP ICV. 
*/ @@ -2178,8 +2109,6 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) encr_offset = esph_offset + sizeof (esph_t) + iv_len; encr_len = msg_len - encr_offset; - ESP_INIT_CALLREQ(&call_req); - /* * Counter mode algs need a nonce. This is setup in sadb_common_add(). * If for some reason we are using a SA which does not have a nonce @@ -2187,23 +2116,40 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) */ if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) && (assoc->ipsa_nonce == NULL)) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, + ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - if (do_auth) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_recv_attr_to_mblk(ira)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", esp_mp, + ira->ira_ill); + return (NULL); + } + linkb(mp, esp_mp); + callrp = &call_req; + ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. 
+ */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } + if (do_auth) { /* authentication context template */ IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, auth_ctx_tmpl); /* ICV to be verified */ - ESP_INIT_CRYPTO_MAC(&ii->ipsec_in_crypto_mac, + ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, esp_mp->b_wptr - icv_len); /* authentication starts at the ESP header */ @@ -2212,79 +2158,90 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) if (!do_encr) { /* authentication only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, auth_offset, auth_len); /* call the crypto framework */ kef_rc = crypto_mac_verify(&assoc->ipsa_amech, - &ii->ipsec_in_crypto_data, + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, auth_ctx_tmpl, - &ii->ipsec_in_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); } } if (do_encr) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; - /* encryption template */ IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR, encr_ctx_tmpl); /* Call the nonce update function. 
Also passes in IV */ (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len, - iv_ptr, &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data); + iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data); if (!do_auth) { /* decryption only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&ii->ipsec_in_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, encr_offset, encr_len); /* call the crypto framework */ kef_rc = crypto_decrypt((crypto_mechanism_t *) - &ii->ipsec_in_cmm, &ii->ipsec_in_crypto_data, + &ic->ic_cmm, &ic->ic_crypto_data, &assoc->ipsa_kcfencrkey, encr_ctx_tmpl, - NULL, &call_req); + NULL, callrp); } } if (do_auth && do_encr) { /* dual operation */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DUAL_DATA(&ii->ipsec_in_crypto_dual_data, + ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data, esp_mp, auth_offset, auth_len, encr_offset, encr_len - icv_len); /* specify IV */ - ii->ipsec_in_crypto_dual_data.dd_miscdata = (char *)iv_ptr; + ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr; /* call the framework */ kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech, - &assoc->ipsa_emech, &ii->ipsec_in_crypto_dual_data, + &assoc->ipsa_emech, &ic->ic_crypto_dual_data, &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey, - auth_ctx_tmpl, encr_ctx_tmpl, &ii->ipsec_in_crypto_mac, - NULL, &call_req); + auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac, + NULL, callrp); } switch (kef_rc) { case CRYPTO_SUCCESS: ESP_BUMP_STAT(espstack, crypto_sync); - return (esp_in_done(ipsec_mp)); + esp_mp = esp_in_done(esp_mp, ira, ic); + if (force) { + /* Free mp after we are done with ic */ + mp = ipsec_free_crypto_data(mp); + (void) ip_recv_attr_free_mblk(mp); + } + return (esp_mp); case CRYPTO_QUEUED: - /* esp_kcf_callback() will be invoked on completion */ + /* esp_kcf_callback_inbound() will be invoked on completion */ ESP_BUMP_STAT(espstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); case CRYPTO_INVALID_MAC: + if (force) { + mp = 
ipsec_free_crypto_data(mp); + esp_mp = ip_recv_attr_free_mblk(mp); + } ESP_BUMP_STAT(espstack, crypto_sync); - esp_log_bad_auth(ipsec_mp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + esp_log_bad_auth(esp_mp, ira); + /* esp_mp was passed to ip_drop_packet */ + return (NULL); } - esp_crypto_failed(ipsec_mp, B_TRUE, kef_rc, espstack); - return (IPSEC_STATUS_FAILED); + mp = ipsec_free_crypto_data(mp); + esp_mp = ip_recv_attr_free_mblk(mp); + BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards); + esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack); + /* esp_mp was passed to ip_drop_packet */ + return (NULL); } /* @@ -2293,6 +2250,9 @@ esp_submit_req_inbound(mblk_t *ipsec_mp, ipsa_t *assoc, uint_t esph_offset) * uses mblk-insertion to insert the UDP header. * TODO - If there is an easy way to prep a packet for HW checksums, make * it happen here. + * Note that this is used before both before calling ip_output_simple and + * in the esp datapath. The former could use IXAF_SET_ULP_CKSUM but not the + * latter. */ static void esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) @@ -2313,7 +2273,7 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) /* arr points to the IP header. */ arr = (uint16_t *)ipha; IP_STAT(ns->netstack_ip, ip_out_sw_cksum); - IP_STAT_UPDATE(ns->netstack_ip, ip_udp_out_sw_cksum_bytes, + IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes, ntohs(htons(ipha->ipha_length) - hlen)); /* arr[6-9] are the IP addresses. 
*/ cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] + @@ -2336,41 +2296,45 @@ esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha) static void actually_send_keepalive(void *arg) { - mblk_t *ipsec_mp = (mblk_t *)arg; - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - ipha_t *ipha; - netstack_t *ns; - - ASSERT(DB_TYPE(ipsec_mp) == M_CTL); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(ipsec_mp->b_cont != NULL); - ASSERT(DB_TYPE(ipsec_mp->b_cont) == M_DATA); - - ns = netstack_find_by_stackid(io->ipsec_out_stackid); - if (ns == NULL || ns != io->ipsec_out_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); - freemsg(ipsec_mp); + mblk_t *mp = (mblk_t *)arg; + ip_xmit_attr_t ixas; + netstack_t *ns; + netstackid_t stackid; + + stackid = (netstackid_t)(uintptr_t)mp->b_prev; + mp->b_prev = NULL; + ns = netstack_find_by_stackid(stackid); + if (ns == NULL) { + /* Disappeared */ + ip_drop_output("ipIfStatsOutDiscards", mp, NULL); + freemsg(mp); return; } - ipha = (ipha_t *)ipsec_mp->b_cont->b_rptr; - ip_wput_ipsec_out(NULL, ipsec_mp, ipha, NULL, NULL); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_zoneid = ALL_ZONES; + ixas.ixa_cred = kcred; + ixas.ixa_cpid = NOPID; + ixas.ixa_tsl = NULL; + ixas.ixa_ipst = ns->netstack_ip; + /* No ULP checksum; done by esp_prepare_udp */ + ixas.ixa_flags = IXAF_IS_IPV4 | IXAF_NO_IPSEC; + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); netstack_rele(ns); } /* - * Send a one-byte UDP NAT-T keepalive. Construct an IPSEC_OUT too that'll - * get fed into esp_send_udp/ip_wput_ipsec_out. + * Send a one-byte UDP NAT-T keepalive. 
*/ void ipsecesp_send_keepalive(ipsa_t *assoc) { - mblk_t *mp = NULL, *ipsec_mp = NULL; - ipha_t *ipha; - udpha_t *udpha; - ipsec_out_t *io; + mblk_t *mp; + ipha_t *ipha; + udpha_t *udpha; + netstack_t *ns = assoc->ipsa_netstack; ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock)); @@ -2399,85 +2363,78 @@ ipsecesp_send_keepalive(ipsa_t *assoc) mp->b_wptr = (uint8_t *)(udpha + 1); *(mp->b_wptr++) = 0xFF; - ipsec_mp = ipsec_alloc_ipsec_out(assoc->ipsa_netstack); - if (ipsec_mp == NULL) { - freeb(mp); - return; - } - ipsec_mp->b_cont = mp; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - io->ipsec_out_zoneid = - netstackid_to_zoneid(assoc->ipsa_netstack->netstack_stackid); - io->ipsec_out_stackid = assoc->ipsa_netstack->netstack_stackid; + esp_prepare_udp(ns, mp, ipha); - esp_prepare_udp(assoc->ipsa_netstack, mp, ipha); /* * We're holding an isaf_t bucket lock, so pawn off the actual * packet transmission to another thread. Just in case syncq * processing causes a same-bucket packet to be processed. */ - if (taskq_dispatch(esp_taskq, actually_send_keepalive, ipsec_mp, + mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid; + + if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp, TQ_NOSLEEP) == 0) { /* Assume no memory if taskq_dispatch() fails. */ - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(assoc->ipsa_netstack->netstack_ipsec, - ipds_esp_nomem), - &assoc->ipsa_netstack->netstack_ipsecesp->esp_dropper); + mp->b_prev = NULL; + ip_drop_packet(mp, B_FALSE, NULL, + DROPPER(ns->netstack_ipsec, ipds_esp_nomem), + &ns->netstack_ipsecesp->esp_dropper); } } -static ipsec_status_t -esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, - uint_t payload_len) +/* + * Returns mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
+ */ +static mblk_t * +esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc, + uchar_t *icv_buf, uint_t payload_len) { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; uint_t auth_len; - crypto_call_req_t call_req; - mblk_t *esp_mp, *data_mp, *ip_mp; + crypto_call_req_t call_req, *callrp; + mblk_t *esp_mp; esph_t *esph_ptr; + mblk_t *mp; int kef_rc = CRYPTO_FAILED; uint_t icv_len = assoc->ipsa_mac_len; crypto_ctx_template_t auth_ctx_tmpl; - boolean_t do_auth; - boolean_t do_encr; + boolean_t do_auth, do_encr, force; uint_t iv_len = assoc->ipsa_iv_len; crypto_ctx_template_t encr_ctx_tmpl; boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0); size_t esph_offset = (is_natt ? UDPH_SIZE : 0); - netstack_t *ns = io->ipsec_out_ns; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_crypto_t *ic, icstack; + uchar_t *iv_ptr; + crypto_data_t *cd_ptr = NULL; + ill_t *ill = ixa->ixa_nce->nce_ill; ipsec_stack_t *ipss = ns->netstack_ipsec; - uchar_t *iv_ptr; - crypto_data_t *cd_ptr = NULL; esp3dbg(espstack, ("esp_submit_req_outbound:%s", is_natt ? "natt" : "not natt")); - ASSERT(io->ipsec_out_type == IPSEC_OUT); - - /* - * In case kEF queues and calls back, keep netstackid_t for - * verification that the IP instance is still around in - * esp_kcf_callback(). 
- */ - io->ipsec_out_stackid = ns->netstack_stackid; - do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL; do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE; + force = (assoc->ipsa_flags & IPSA_F_ASYNC); + +#ifdef IPSEC_LATENCY_TEST + kef_rc = CRYPTO_SUCCESS; +#else + kef_rc = CRYPTO_FAILED; +#endif /* * Outbound IPsec packets are of the form: - * IPSEC_OUT -> [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV] + * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV] * unless it's NATT, then it's - * IPSEC_OUT -> [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV] + * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV] * Get a pointer to the mblk containing the ESP header. */ - ip_mp = ipsec_mp->b_cont; - esp_mp = ipsec_mp->b_cont->b_cont; - ASSERT(ip_mp != NULL && esp_mp != NULL); + ASSERT(data_mp->b_cont != NULL); + esp_mp = data_mp->b_cont; esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset); iv_ptr = (uchar_t *)(esph_ptr + 1); - data_mp = ipsec_mp->b_cont->b_cont->b_cont; /* * Combined mode algs need a nonce. This is setup in sadb_common_add(). @@ -2486,25 +2443,42 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, */ if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) && (assoc->ipsa_nonce == NULL)) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, NULL, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - ESP_INIT_CALLREQ(&call_req); + if (force) { + /* We are doing asynch; allocate mblks to hold state */ + if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL || + (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("ipIfStatsOutDiscards", data_mp, ill); + freemsg(data_mp); + return (NULL); + } + + linkb(mp, data_mp); + callrp = &call_req; + ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound); + } else { + /* + * If we know we are going to do sync then ipsec_crypto_t + * should be on the stack. 
+ */ + ic = &icstack; + bzero(ic, sizeof (*ic)); + callrp = NULL; + } - if (do_auth) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; + if (do_auth) { /* authentication context template */ IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH, auth_ctx_tmpl); /* where to store the computed mac */ - ESP_INIT_CRYPTO_MAC(&io->ipsec_out_crypto_mac, + ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac, icv_len, icv_buf); /* authentication starts at the ESP header */ @@ -2512,35 +2486,30 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, if (!do_encr) { /* authentication only */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, esp_mp, esph_offset, auth_len); /* call the crypto framework */ kef_rc = crypto_mac(&assoc->ipsa_amech, - &io->ipsec_out_crypto_data, + &ic->ic_crypto_data, &assoc->ipsa_kcfauthkey, auth_ctx_tmpl, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_mac, callrp); } } if (do_encr) { - /* force asynchronous processing? */ - if (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == - IPSEC_ALGS_EXEC_ASYNC) - call_req.cr_flag |= CRYPTO_ALWAYS_QUEUE; - /* encryption context template */ IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR, encr_ctx_tmpl); /* Call the nonce update function. 
*/ (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len, - iv_ptr, &io->ipsec_out_cmm, &io->ipsec_out_crypto_data); + iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data); if (!do_auth) { /* encryption only, skip mblk that contains ESP hdr */ /* initialize input data argument */ - ESP_INIT_CRYPTO_DATA(&io->ipsec_out_crypto_data, - data_mp, 0, payload_len); + ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data, + esp_mp->b_cont, 0, payload_len); /* * For combined mode ciphers, the ciphertext is the same @@ -2556,20 +2525,19 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, * for the cipher to use. */ if (assoc->ipsa_flags & IPSA_F_COMBINED) { - bcopy(&io->ipsec_out_crypto_data, - &io->ipsec_out_crypto_mac, + bcopy(&ic->ic_crypto_data, + &ic->ic_crypto_mac, sizeof (crypto_data_t)); - io->ipsec_out_crypto_mac.cd_length = + ic->ic_crypto_mac.cd_length = payload_len + icv_len; - cd_ptr = &io->ipsec_out_crypto_mac; + cd_ptr = &ic->ic_crypto_mac; } /* call the crypto framework */ kef_rc = crypto_encrypt((crypto_mechanism_t *) - &io->ipsec_out_cmm, - &io->ipsec_out_crypto_data, + &ic->ic_cmm, &ic->ic_crypto_data, &assoc->ipsa_kcfencrkey, encr_ctx_tmpl, - cd_ptr, &call_req); + cd_ptr, callrp); } } @@ -2584,49 +2552,58 @@ esp_submit_req_outbound(mblk_t *ipsec_mp, ipsa_t *assoc, uchar_t *icv_buf, * the authentication at the ESP header, i.e. use an * authentication offset of zero. 
*/ - ESP_INIT_CRYPTO_DUAL_DATA(&io->ipsec_out_crypto_dual_data, + ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data, esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len); /* specify IV */ - io->ipsec_out_crypto_dual_data.dd_miscdata = (char *)iv_ptr; + ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr; /* call the framework */ kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech, &assoc->ipsa_amech, NULL, &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey, encr_ctx_tmpl, auth_ctx_tmpl, - &io->ipsec_out_crypto_dual_data, - &io->ipsec_out_crypto_mac, &call_req); + &ic->ic_crypto_dual_data, + &ic->ic_crypto_mac, callrp); } switch (kef_rc) { case CRYPTO_SUCCESS: ESP_BUMP_STAT(espstack, crypto_sync); esp_set_usetime(assoc, B_FALSE); + if (force) { + mp = ipsec_free_crypto_data(mp); + data_mp = ip_xmit_attr_free_mblk(mp); + } if (is_natt) - esp_prepare_udp(ns, ipsec_mp->b_cont, - (ipha_t *)ipsec_mp->b_cont->b_rptr); - return (IPSEC_STATUS_SUCCESS); + esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr); + return (data_mp); case CRYPTO_QUEUED: - /* esp_kcf_callback() will be invoked on completion */ + /* esp_kcf_callback_outbound() will be invoked on completion */ ESP_BUMP_STAT(espstack, crypto_async); - return (IPSEC_STATUS_PENDING); + return (NULL); } - esp_crypto_failed(ipsec_mp, B_FALSE, kef_rc, espstack); - return (IPSEC_STATUS_FAILED); + if (force) { + mp = ipsec_free_crypto_data(mp); + data_mp = ip_xmit_attr_free_mblk(mp); + } + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack); + /* data_mp was passed to ip_drop_packet */ + return (NULL); } /* * Handle outbound IPsec processing for IPv4 and IPv6 - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in_mp. + * + * Returns data_mp if successfully completed the request. Returns + * NULL if it failed (and increments InDiscards) or if it is pending. 
*/ -static ipsec_status_t -esp_outbound(mblk_t *mp) +static mblk_t * +esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa) { - mblk_t *ipsec_out_mp, *data_mp, *espmp, *tailmp; - ipsec_out_t *io; + mblk_t *espmp, *tailmp; ipha_t *ipha; ip6_t *ip6h; esph_t *esph_ptr, *iv_ptr; @@ -2640,17 +2617,11 @@ esp_outbound(mblk_t *mp) uchar_t *icv_buf; udpha_t *udpha; boolean_t is_natt = B_FALSE; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - ipsec_out_mp = mp; - data_mp = ipsec_out_mp->b_cont; - - io = (ipsec_out_t *)ipsec_out_mp->b_rptr; - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t need_refrele = B_FALSE; ESP_BUMP_STAT(espstack, out_requests); @@ -2662,65 +2633,73 @@ esp_outbound(mblk_t *mp) * we might as well make use of msgpullup() and get the mblk into one * contiguous piece! */ - ipsec_out_mp->b_cont = msgpullup(data_mp, -1); - if (ipsec_out_mp->b_cont == NULL) { + tailmp = msgpullup(data_mp, -1); + if (tailmp == NULL) { esp0dbg(("esp_outbound: msgpullup() failed, " "dropping packet.\n")); - ipsec_out_mp->b_cont = data_mp; - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } else { - freemsg(data_mp); - data_mp = ipsec_out_mp->b_cont; + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return (NULL); } + freemsg(data_mp); + data_mp = tailmp; - assoc = io->ipsec_out_esp_sa; + assoc = ixa->ixa_ipsec_esp_sa; ASSERT(assoc != NULL); /* * Get the outer IP header in shape to escape this system.. 
*/ - if (is_system_labeled() && (assoc->ipsa_ocred != NULL)) { - int whack; - - mblk_setcred(data_mp, assoc->ipsa_ocred, NOPID); - if (io->ipsec_out_v4) - whack = sadb_whack_label(&data_mp, assoc); - else - whack = sadb_whack_label_v6(&data_mp, assoc); - if (whack != 0) { - ip_drop_packet(ipsec_out_mp, B_FALSE, NULL, - NULL, DROPPER(ipss, ipds_esp_nomem), + if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) { + /* + * Need to update packet with any CIPSO option and update + * ixa_tsl to capture the new label. + * We allocate a separate ixa for that purpose. + */ + ixa = ip_xmit_attr_duplicate(ixa); + if (ixa == NULL) { + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + return (NULL); } - ipsec_out_mp->b_cont = data_mp; - } + need_refrele = B_TRUE; + label_hold(assoc->ipsa_otsl); + ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl); + + data_mp = sadb_whack_label(data_mp, assoc, ixa, + DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); + if (data_mp == NULL) { + /* Packet dropped by sadb_whack_label */ + ixa_refrele(ixa); + return (NULL); + } + } /* * Reality check.... */ ipha = (ipha_t *)data_mp->b_rptr; /* So we can call esp_acquire(). 
*/ - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + af = AF_INET; divpoint = IPH_HDR_LENGTH(ipha); datalen = ntohs(ipha->ipha_length) - divpoint; nhp = (uint8_t *)&ipha->ipha_protocol; } else { - ip6_pkt_t ipp; + ip_pkt_t ipp; + + ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); af = AF_INET6; ip6h = (ip6_t *)ipha; bzero(&ipp, sizeof (ipp)); - divpoint = ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL); if (ipp.ipp_dstopts != NULL && ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) { /* @@ -2795,28 +2774,26 @@ esp_outbound(mblk_t *mp) */ if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) { - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_bytes_expire), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } espmp = allocb(esplen, BPRI_HI); if (espmp == NULL) { ESP_BUMP_STAT(espstack, out_discards); esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n")); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } espmp->b_wptr += esplen; esph_ptr = (esph_t *)espmp->b_rptr; @@ -2853,14 +2830,13 @@ esp_outbound(mblk_t *mp) ESP_BUMP_STAT(espstack, out_discards); sadb_replay_delete(assoc); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). 
- */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_replay), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } iv_ptr = (esph_ptr + 1); @@ -2887,9 +2863,11 @@ esp_outbound(mblk_t *mp) */ if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc, espstack)) { - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* Fix the IP header. */ @@ -2898,7 +2876,7 @@ esp_outbound(mblk_t *mp) protocol = *nhp; - if (io->ipsec_out_v4) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj); if (is_natt) { *nhp = IPPROTO_UDP; @@ -2922,15 +2900,14 @@ esp_outbound(mblk_t *mp) if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) { ESP_BUMP_STAT(espstack, out_discards); /* NOTE: esp_insert_esp() only fails if there's no memory. */ - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). - */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); freeb(espmp); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } /* Append padding (and leave room for ICV). */ @@ -2941,14 +2918,13 @@ esp_outbound(mblk_t *mp) if (tailmp->b_cont == NULL) { ESP_BUMP_STAT(espstack, out_discards); esp0dbg(("esp_outbound: Can't allocate tailmp.\n")); - /* - * TODO: Find the outbound IRE for this packet and - * pass it to ip_drop_packet(). 
- */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + if (need_refrele) + ixa_refrele(ixa); + return (NULL); } tailmp = tailmp->b_cont; } @@ -2968,29 +2944,6 @@ esp_outbound(mblk_t *mp) esp2dbg(espstack, (dump_msg(data_mp))); /* - * The packet is eligible for hardware acceleration if the - * following conditions are satisfied: - * - * 1. the packet will not be fragmented - * 2. the provider supports the algorithms specified by SA - * 3. there is no pending control message being exchanged - * 4. snoop is not attached - * 5. the destination address is not a multicast address - * - * All five of these conditions are checked by IP prior to - * sending the packet to ESP. - * - * But We, and We Alone, can, nay MUST check if the packet - * is over NATT, and then disqualify it from hardware - * acceleration. - */ - - if (io->ipsec_out_is_capab_ill && !(assoc->ipsa_flags & IPSA_F_NATT)) { - return (esp_outbound_accelerated(ipsec_out_mp, mac_len)); - } - ESP_BUMP_STAT(espstack, noaccel); - - /* * Okay. I've set up the pre-encryption ESP. Let's do it! */ @@ -3002,32 +2955,23 @@ esp_outbound(mblk_t *mp) icv_buf = NULL; } - return (esp_submit_req_outbound(ipsec_out_mp, assoc, icv_buf, - datalen + padlen + 2)); + data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf, + datalen + padlen + 2); + if (need_refrele) + ixa_refrele(ixa); + return (data_mp); } /* * IP calls this to validate the ICMP errors that * we got from the network. 
*/ -ipsec_status_t -ipsecesp_icmp_error(mblk_t *ipsec_mp) +mblk_t * +ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - boolean_t is_inbound = (ii->ipsec_in_type == IPSEC_IN); - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - if (is_inbound) { - ns = ii->ipsec_in_ns; - } else { - ipsec_out_t *io = (ipsec_out_t *)ipsec_mp->b_rptr; - - ns = io->ipsec_out_ns; - } - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; /* * Unless we get an entire packet back, this function is useless. @@ -3044,55 +2988,10 @@ ipsecesp_icmp_error(mblk_t *ipsec_mp) * very small, we discard here. */ IP_ESP_BUMP_STAT(ipss, in_discards); - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_icmp), &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); -} - -/* - * ESP module read put routine. - */ -/* ARGSUSED */ -static void -ipsecesp_rput(queue_t *q, mblk_t *mp) -{ - ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)q->q_ptr; - - ASSERT(mp->b_datap->db_type != M_CTL); /* No more IRE_DB_REQ. */ - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - /* TPI message of some sort. */ - switch (*((t_scalar_t *)mp->b_rptr)) { - case T_BIND_ACK: - esp3dbg(espstack, - ("Thank you IP from ESP for T_BIND_ACK\n")); - break; - case T_ERROR_ACK: - cmn_err(CE_WARN, - "ipsecesp: ESP received T_ERROR_ACK from IP."); - /* - * Make esp_sadb.s_ip_q NULL, and in the - * future, perhaps try again. - */ - espstack->esp_sadb.s_ip_q = NULL; - break; - case T_OK_ACK: - /* Probably from a (rarely sent) T_UNBIND_REQ. */ - break; - default: - esp0dbg(("Unknown M_{,PC}PROTO message.\n")); - } - freemsg(mp); - break; - default: - /* For now, passthru message. 
*/ - esp2dbg(espstack, ("ESP got unknown mblk type %d.\n", - mp->b_datap->db_type)); - putnext(q, mp); - } + return (NULL); } /* @@ -3102,7 +3001,7 @@ ipsecesp_rput(queue_t *q, mblk_t *mp) */ static boolean_t esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, - ipsecesp_stack_t *espstack, mblk_t *in_mp) + ipsecesp_stack_t *espstack, cred_t *cr) { mblk_t *pfkey_msg_mp, *keysock_out_mp; sadb_msg_t *samsg; @@ -3121,7 +3020,7 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, sadb_sens_t *sens; size_t sens_len = 0; sadb_ext_t *nextext; - cred_t *sens_cr = NULL; + ts_label_t *sens_tsl = NULL; /* Allocate the KEYSOCK_OUT. */ keysock_out_mp = sadb_keysock_out(serial); @@ -3130,11 +3029,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, return (B_FALSE); } - if (is_system_labeled() && (in_mp != NULL)) { - sens_cr = msg_getcred(in_mp, NULL); - - if (sens_cr != NULL) { - sens_len = sadb_sens_len_from_cred(sens_cr); + if (is_system_labeled() && (cr != NULL)) { + sens_tsl = crgetlabel(cr); + if (sens_tsl != NULL) { + sens_len = sadb_sens_len_from_label(sens_tsl); allocsize += sens_len; } } @@ -3268,10 +3166,10 @@ esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial, mutex_exit(&ipss->ipsec_alg_lock); - if (sens_cr != NULL) { + if (sens_tsl != NULL) { sens = (sadb_sens_t *)nextext; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - sens_cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + sens_tsl, sens_len); nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len); } @@ -3336,40 +3234,61 @@ ipsecesp_algs_changed(netstack_t *ns) /* * Stub function that taskq_dispatch() invokes to take the mblk (in arg) - * and put() it into AH and STREAMS again. + * and send() it into ESP and IP again. 
*/ static void inbound_task(void *arg) { - esph_t *esph; - mblk_t *mp = (mblk_t *)arg; - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - netstack_t *ns; - ipsecesp_stack_t *espstack; - int ipsec_rc; - - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns == NULL || ns != ii->ipsec_in_ns) { - /* Just freemsg(). */ - if (ns != NULL) - netstack_rele(ns); + mblk_t *mp = (mblk_t *)arg; + mblk_t *async_mp; + ip_recv_attr_t iras; + + async_mp = mp; + mp = async_mp->b_cont; + async_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(async_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); freemsg(mp); - return; + goto done; } - espstack = ns->netstack_ipsecesp; + esp_inbound_restart(mp, &iras); +done: + ira_cleanup(&iras, B_TRUE); +} + +/* + * Restart ESP after the SA has been added. + */ +static void +esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira) +{ + esph_t *esph; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; esp2dbg(espstack, ("in ESP inbound_task")); ASSERT(espstack != NULL); - esph = ipsec_inbound_esp_sa(mp, ns); - if (esph != NULL) { - ASSERT(ii->ipsec_in_esp_sa != NULL); - ipsec_rc = ii->ipsec_in_esp_sa->ipsa_input_func(mp, esph); - if (ipsec_rc == IPSEC_STATUS_SUCCESS) - ip_fanout_proto_again(mp, NULL, NULL, NULL); + mp = ipsec_inbound_esp_sa(mp, ira, &esph); + if (mp == NULL) + return; + + ASSERT(esph != NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_esp_sa != NULL); + + mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira); + if (mp == NULL) { + /* + * Either it failed or is pending. In the former case + * ipIfStatsInDiscards was increased. 
+ */ + return; } - netstack_rele(ns); + + ip_input_post_ipsec(mp, ira); } /* @@ -3533,17 +3452,21 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, if (larval != NULL) lpkt = sadb_clear_lpkt(larval); - rc = sadb_common_add(espstack->esp_sadb.s_ip_q, espstack->esp_pfkey_q, + rc = sadb_common_add(espstack->esp_pfkey_q, mp, samsg, ksi, primary, secondary, larval, clone, is_inbound, diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb); - if (rc == 0 && lpkt != NULL) - rc = !taskq_dispatch(esp_taskq, inbound_task, lpkt, TQ_NOSLEEP); - - if (rc != 0) { - ip_drop_packet(lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &espstack->esp_dropper); + if (lpkt != NULL) { + if (rc == 0) { + rc = !taskq_dispatch(esp_taskq, inbound_task, + lpkt, TQ_NOSLEEP); + } + if (rc != 0) { + lpkt = ip_recv_attr_free_mblk(lpkt); + ip_drop_packet(lpkt, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &espstack->esp_dropper); + } } /* @@ -3551,45 +3474,78 @@ esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, * esp_outbound() calls? 
*/ + /* Handle the packets queued waiting for the SA */ while (acq_msgs != NULL) { - mblk_t *mp = acq_msgs; + mblk_t *asyncmp; + mblk_t *data_mp; + ip_xmit_attr_t ixas; + ill_t *ill; + asyncmp = acq_msgs; acq_msgs = acq_msgs->b_next; - mp->b_next = NULL; - if (rc == 0) { - if (ipsec_outbound_sa(mp, IPPROTO_ESP)) { - ((ipsec_out_t *)(mp->b_rptr))-> - ipsec_out_esp_done = B_TRUE; - if (esp_outbound(mp) == IPSEC_STATUS_SUCCESS) { - ipha_t *ipha; - - /* do AH processing if needed */ - if (!esp_do_outbound_ah(mp)) - continue; - - ipha = (ipha_t *)mp->b_cont->b_rptr; - - /* finish IPsec processing */ - if (IPH_HDR_VERSION(ipha) == - IP_VERSION) { - ip_wput_ipsec_out(NULL, mp, - ipha, NULL, NULL); - } else { - ip6_t *ip6h = (ip6_t *)ipha; - ip_wput_ipsec_out_v6(NULL, - mp, ip6h, NULL, NULL); - } - } - continue; - } + asyncmp->b_next = NULL; + + /* + * Extract the ip_xmit_attr_t from the first mblk. + * Verifies that the netstack and ill is still around; could + * have vanished while iked was doing its work. + * On succesful return we have a nce_t and the ill/ipst can't + * disappear until we do the nce_refrele in ixa_cleanup. + */ + data_mp = asyncmp->b_cont; + asyncmp->b_cont = NULL; + if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) { + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, NULL, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &espstack->esp_dropper); + } else if (rc != 0) { + ill = ixas.ixa_nce->nce_ill; + ESP_BUMP_STAT(espstack, out_discards); + ip_drop_packet(data_mp, B_FALSE, ill, + DROPPER(ipss, ipds_sadb_acquire_timeout), + &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + } else { + esp_outbound_finish(data_mp, &ixas); } + ixa_cleanup(&ixas); + } + + return (rc); +} + +/* + * Process one of the queued messages (from ipsacq_mp) once the SA + * has been added. 
+ */ +static void +esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa) +{ + netstack_t *ns = ixa->ixa_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; + ill_t *ill = ixa->ixa_nce->nce_ill; + + if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) { ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(data_mp, B_FALSE, ill, DROPPER(ipss, ipds_sadb_acquire_timeout), &espstack->esp_dropper); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); + return; } - return (rc); + data_mp = esp_outbound(data_mp, ixa); + if (data_mp == NULL) + return; + + /* do AH processing if needed */ + data_mp = esp_do_outbound_ah(data_mp, ixa); + if (data_mp == NULL) + return; + + (void) ip_output_post_ipsec(data_mp, ixa); } /* @@ -3674,11 +3630,13 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns) return (EINVAL); } +#ifndef IPSEC_LATENCY_TEST if (assoc->sadb_sa_encrypt == SADB_EALG_NULL && assoc->sadb_sa_auth == SADB_AALG_NONE) { *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG; return (EINVAL); } +#endif if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) { *diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS; @@ -3734,7 +3692,11 @@ esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns) /* * First locate the authentication algorithm. */ +#ifdef IPSEC_LATENCY_TEST + if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) { +#else if (akey != NULL) { +#endif ipsec_alginfo_t *aalg; aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH] @@ -3883,7 +3845,7 @@ esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, return (sadb_purge_sa(mp, ksi, (sin->sin_family == AF_INET6) ? 
&espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4, diagnostic, - espstack->esp_pfkey_q, espstack->esp_sadb.s_ip_q)); + espstack->esp_pfkey_q)); } return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic, @@ -4024,7 +3986,7 @@ esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack) * Keysock takes care of the PF_KEY bookkeeping for this. */ if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid, - ksi->ks_in_serial, espstack, mp)) { + ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) { freemsg(mp); } else { /* @@ -4109,8 +4071,7 @@ esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack) samsg->sadb_msg_errno = kse->ks_err_errno; samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg)); /* - * Use the write-side of the esp_pfkey_q, in case there is - * no esp_sadb.s_ip_q. + * Use the write-side of the esp_pfkey_q */ sadb_in_acquire(samsg, &espstack->esp_sadb, WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack); @@ -4197,236 +4158,23 @@ ipsecesp_wput(queue_t *q, mblk_t *mp) } /* - * Process an outbound ESP packet that can be accelerated by a IPsec - * hardware acceleration capable Provider. - * The caller already inserted and initialized the ESP header. - * This function allocates a tagging M_CTL, and adds room at the end - * of the packet to hold the ICV if authentication is needed. - * - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_out. 
- */ -static ipsec_status_t -esp_outbound_accelerated(mblk_t *ipsec_out, uint_t icv_len) -{ - ipsec_out_t *io; - mblk_t *lastmp; - netstack_t *ns; - ipsecesp_stack_t *espstack; - ipsec_stack_t *ipss; - - io = (ipsec_out_t *)ipsec_out->b_rptr; - ns = io->ipsec_out_ns; - espstack = ns->netstack_ipsecesp; - ipss = ns->netstack_ipsec; - - ESP_BUMP_STAT(espstack, out_accelerated); - - /* mark packet as being accelerated in IPSEC_OUT */ - ASSERT(io->ipsec_out_accelerated == B_FALSE); - io->ipsec_out_accelerated = B_TRUE; - - /* - * add room at the end of the packet for the ICV if needed - */ - if (icv_len > 0) { - /* go to last mblk */ - lastmp = ipsec_out; /* For following while loop. */ - do { - lastmp = lastmp->b_cont; - } while (lastmp->b_cont != NULL); - - /* if not enough available room, allocate new mblk */ - if ((lastmp->b_wptr + icv_len) > lastmp->b_datap->db_lim) { - lastmp->b_cont = allocb(icv_len, BPRI_HI); - if (lastmp->b_cont == NULL) { - ESP_BUMP_STAT(espstack, out_discards); - ip_drop_packet(ipsec_out, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_esp_nomem), - &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } - lastmp = lastmp->b_cont; - } - lastmp->b_wptr += icv_len; - } - - return (IPSEC_STATUS_SUCCESS); -} - -/* - * Process an inbound accelerated ESP packet. - * On success returns B_TRUE, on failure returns B_FALSE and frees the - * mblk chain ipsec_in. 
- */ -static ipsec_status_t -esp_inbound_accelerated(mblk_t *ipsec_in, mblk_t *data_mp, boolean_t isv4, - ipsa_t *assoc) -{ - ipsec_in_t *ii = (ipsec_in_t *)ipsec_in->b_rptr; - mblk_t *hada_mp; - uint32_t icv_len = 0; - da_ipsec_t *hada; - ipha_t *ipha; - ip6_t *ip6h; - kstat_named_t *counter; - netstack_t *ns = ii->ipsec_in_ns; - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ESP_BUMP_STAT(espstack, in_accelerated); - - hada_mp = ii->ipsec_in_da; - ASSERT(hada_mp != NULL); - hada = (da_ipsec_t *)hada_mp->b_rptr; - - /* - * We only support one level of decapsulation in hardware, so - * nuke the pointer. - */ - ii->ipsec_in_da = NULL; - ii->ipsec_in_accelerated = B_FALSE; - - if (assoc->ipsa_auth_alg != IPSA_AALG_NONE) { - /* - * ESP with authentication. We expect the Provider to have - * computed the ICV and placed it in the hardware acceleration - * data attributes. - * - * Extract ICV length from attributes M_CTL and sanity check - * its value. We allow the mblk to be smaller than da_ipsec_t - * for a small ICV, as long as the entire ICV fits within the - * mblk. - * - * Also ensures that the ICV length computed by Provider - * corresponds to the ICV length of the agorithm specified by - * the SA. - */ - icv_len = hada->da_icv_len; - if ((icv_len != assoc->ipsa_mac_len) || - (icv_len > DA_ICV_MAX_LEN) || (MBLKL(hada_mp) < - (sizeof (da_ipsec_t) - DA_ICV_MAX_LEN + icv_len))) { - esp0dbg(("esp_inbound_accelerated: " - "ICV len (%u) incorrect or mblk too small (%u)\n", - icv_len, (uint32_t)(MBLKL(hada_mp)))); - counter = DROPPER(ipss, ipds_esp_bad_auth); - goto esp_in_discard; - } - } - - /* get pointers to IP header */ - if (isv4) { - ipha = (ipha_t *)data_mp->b_rptr; - } else { - ip6h = (ip6_t *)data_mp->b_rptr; - } - - /* - * Compare ICV in ESP packet vs ICV computed by adapter. - * We also remove the ICV from the end of the packet since - * it will no longer be needed. 
- * - * Assume that esp_inbound() already ensured that the pkt - * was in one mblk. - */ - ASSERT(data_mp->b_cont == NULL); - data_mp->b_wptr -= icv_len; - /* adjust IP header */ - if (isv4) - ipha->ipha_length = htons(ntohs(ipha->ipha_length) - icv_len); - else - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - icv_len); - if (icv_len && bcmp(hada->da_icv, data_mp->b_wptr, icv_len)) { - int af; - void *addr; - - if (isv4) { - addr = &ipha->ipha_dst; - af = AF_INET; - } else { - addr = &ip6h->ip6_dst; - af = AF_INET6; - } - - /* - * Log the event. Don't print to the console, block - * potential denial-of-service attack. - */ - ESP_BUMP_STAT(espstack, bad_auth); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "ESP Authentication failed spi %x, dst_addr %s", - assoc->ipsa_spi, addr, af, espstack->ipsecesp_netstack); - counter = DROPPER(ipss, ipds_esp_bad_auth); - goto esp_in_discard; - } - - esp3dbg(espstack, ("esp_inbound_accelerated: ESP authentication " - "succeeded, checking replay\n")); - - ipsec_in->b_cont = data_mp; - - /* - * Remove ESP header and padding from packet. - */ - if (!esp_strip_header(data_mp, ii->ipsec_in_v4, assoc->ipsa_iv_len, - &counter, espstack)) { - esp1dbg(espstack, ("esp_inbound_accelerated: " - "esp_strip_header() failed\n")); - goto esp_in_discard; - } - - freeb(hada_mp); - - if (is_system_labeled() && (assoc->ipsa_cred != NULL)) - mblk_setcred(data_mp, assoc->ipsa_cred, NOPID); - - /* - * Account for usage.. - */ - if (!esp_age_bytes(assoc, msgdsize(data_mp), B_TRUE)) { - /* The ipsa has hit hard expiration, LOG and AUDIT. 
*/ - ESP_BUMP_STAT(espstack, bytes_expired); - IP_ESP_BUMP_STAT(ipss, in_discards); - ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN, - "ESP association 0x%x, dst %s had bytes expire.\n", - assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam, - espstack->ipsecesp_netstack); - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_esp_bytes_expire), - &espstack->esp_dropper); - return (IPSEC_STATUS_FAILED); - } - - /* done processing the packet */ - return (IPSEC_STATUS_SUCCESS); - -esp_in_discard: - IP_ESP_BUMP_STAT(ipss, in_discards); - freeb(hada_mp); - - ipsec_in->b_cont = data_mp; /* For ip_drop_packet()'s sake... */ - ip_drop_packet(ipsec_in, B_TRUE, NULL, NULL, counter, - &espstack->esp_dropper); - - return (IPSEC_STATUS_FAILED); -} - -/* * Wrapper to allow IP to trigger an ESP association failure message * during inbound SA selection. */ void ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt, - uint32_t spi, void *addr, int af, ipsecesp_stack_t *espstack) + uint32_t spi, void *addr, int af, ip_recv_attr_t *ira) { - ipsec_stack_t *ipss = espstack->ipsecesp_netstack->netstack_ipsec; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; + ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; if (espstack->ipsecesp_log_unknown_spi) { ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi, addr, af, espstack->ipsecesp_netstack); } - ip_drop_packet(mp, B_TRUE, NULL, NULL, + ip_drop_packet(mp, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_esp_no_sa), &espstack->esp_dropper); } diff --git a/usr/src/uts/common/inet/ip/keysock.c b/usr/src/uts/common/inet/ip/keysock.c index ca82eeece0..855af28bb2 100644 --- a/usr/src/uts/common/inet/ip/keysock.c +++ b/usr/src/uts/common/inet/ip/keysock.c @@ -852,7 +852,7 @@ keysock_opt_get(queue_t *q, int level, int name, uchar_t *ptr) int keysock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name, uint_t inlen, uchar_t *invalp, uint_t 
*outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)invalp, errno = 0; keysock_t *ks = (keysock_t *)q->q_ptr; @@ -936,11 +936,9 @@ keysock_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)mp->b_rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &keysock_opt_obj, B_FALSE); + svr4_optcom_req(q, mp, cr, &keysock_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &keysock_opt_obj, B_FALSE); + tpi_optcom_req(q, mp, cr, &keysock_opt_obj); } break; case T_DATA_REQ: diff --git a/usr/src/uts/common/inet/ip/keysock_opt_data.c b/usr/src/uts/common/inet/ip/keysock_opt_data.c index d8d9f1d0ad..4dee663d42 100644 --- a/usr/src/uts/common/inet/ip/keysock_opt_data.c +++ b/usr/src/uts/common/inet/ip/keysock_opt_data.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 1996-1998,2001-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 1 @@ -51,11 +48,11 @@ */ opdes_t keysock_opt_arr[] = { - { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, (t_uscalar_t)sizeof (int), 0 }, }; @@ -88,7 +85,6 @@ optdb_obj_t keysock_opt_obj = { NULL, /* KEYSOCK default value function pointer */ keysock_opt_get, /* KEYSOCK get function pointer */ keysock_opt_set, /* KEYSOCK set function pointer */ - B_TRUE, /* KEYSOCK is tpi provider */ KEYSOCK_OPT_ARR_CNT, /* KEYSOCK option database count of entries */ keysock_opt_arr, /* KEYSOCK option database */ KEYSOCK_VALID_LEVELS_CNT, /* KEYSOCK valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/rts.c b/usr/src/uts/common/inet/ip/rts.c index ce3ac6faca..d5a1d84395 100644 --- a/usr/src/uts/common/inet/ip/rts.c +++ b/usr/src/uts/common/inet/ip/rts.c @@ -72,7 +72,6 @@ * Addresses are assigned to interfaces. * ICMP redirects are processed and a IRE_HOST/RTF_DYNAMIC is installed. * No route is found while sending a packet. - * When TCP requests IP to remove an IRE_CACHE of a troubled destination. 
* * Since all we do is reformat the messages between routing socket and * ioctl forms, no synchronization is necessary in this module; all @@ -113,7 +112,8 @@ static rtsparam_t lcl_param_arr[] = { static void rts_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error); -static void rts_input(void *, mblk_t *, void *); +static void rts_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void rts_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static mblk_t *rts_ioctl_alloc(mblk_t *data); static int rts_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t rts_param_register(IDP *ndp, rtsparam_t *rtspa, int cnt); @@ -211,28 +211,28 @@ rts_common_close(queue_t *q, conn_t *connp) if (!IPCL_IS_NONSTR(connp)) { qprocsoff(q); + } - /* - * Now we are truly single threaded on this stream, and can - * delete the things hanging off the connp, and finally the - * connp. - * We removed this connp from the fanout list, it cannot be - * accessed thru the fanouts, and we already waited for the - * conn_ref to drop to 0. We are already in close, so - * there cannot be any other thread from the top. qprocsoff - * has completed, and service has completed or won't run in - * future. - */ + /* + * Now we are truly single threaded on this stream, and can + * delete the things hanging off the connp, and finally the connp. + * We removed this connp from the fanout list, it cannot be + * accessed thru the fanouts, and we already waited for the + * conn_ref to drop to 0. We are already in close, so + * there cannot be any other thread from the top. qprocsoff + * has completed, and service has completed or won't run in + * future. 
+ */ + ASSERT(connp->conn_ref == 1); + + if (!IPCL_IS_NONSTR(connp)) { inet_minor_free(connp->conn_minor_arena, connp->conn_dev); } else { ip_free_helper_stream(connp); } - ASSERT(connp->conn_ref == 1); - connp->conn_ref--; ipcl_conn_destroy(connp); - return (0); } @@ -256,7 +256,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) { conn_t *connp; dev_t conn_dev; - rts_stack_t *rtss; rts_t *rts; /* If the stream is already open, return immediately. */ @@ -266,7 +265,6 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (sflag == MODOPEN) return (EINVAL); - /* * Since RTS is not used so heavily, allocating from the small * arena should be sufficient. @@ -278,44 +276,31 @@ rts_stream_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp = rts_open(flag, credp); ASSERT(connp != NULL); - *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); rts = connp->conn_rts; - rw_enter(&rts->rts_rwlock, RW_WRITER); connp->conn_dev = conn_dev; connp->conn_minor_arena = ip_minor_arena_sa; - /* - * Initialize the rts_t structure for this stream. - */ q->q_ptr = connp; WR(q)->q_ptr = connp; connp->conn_rq = q; connp->conn_wq = WR(q); - rtss = rts->rts_rtss; - q->q_hiwat = rtss->rtss_recv_hiwat; - WR(q)->q_hiwat = rtss->rtss_xmit_hiwat; - WR(q)->q_lowat = rtss->rtss_xmit_lowat; - - + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); - - qprocson(q); rw_exit(&rts->rts_rwlock); - /* - * Indicate the down IP module that this is a routing socket - * client by sending an RTS IOCTL without any user data. Although - * this is just a notification message (without any real routing - * request), we pass in any credential for correctness sake. 
- */ + + /* Indicate to IP that this is a routing socket client */ ip_rts_register(connp); + qprocson(q); + return (0); } @@ -352,22 +337,38 @@ rts_open(int flag, cred_t *credp) */ netstack_rele(ns); - rw_enter(&rts->rts_rwlock, RW_WRITER); ASSERT(connp->conn_rts == rts); ASSERT(rts->rts_connp == connp); + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_zoneid = zoneid; connp->conn_flow_cntrld = B_FALSE; - connp->conn_ulp_labeled = is_system_labeled(); - rts->rts_rtss = rtss; - rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; + + connp->conn_rcvbuf = rtss->rtss_recv_hiwat; + connp->conn_sndbuf = rtss->rtss_xmit_hiwat; + connp->conn_sndlowat = rtss->rtss_xmit_lowat; + connp->conn_rcvlowat = rts_mod_info.mi_lowat; + + connp->conn_family = PF_ROUTE; + connp->conn_so_type = SOCK_RAW; + /* SO_PROTOTYPE is always sent down by sockfs setting conn_proto */ connp->conn_recv = rts_input; + connp->conn_recvicmp = rts_icmp_input; + crhold(credp); connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* * rts sockets start out as bound and connected @@ -429,7 +430,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) { conn_t *connp = Q_TO_CONN(q); rts_t *rts = connp->conn_rts; - mblk_t *mp1; struct T_bind_req *tbr; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { @@ -444,16 +444,6 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) rts_err_ack(q, mp, TOUTSTATE, 0); return; } - /* - * Reallocate the message to make sure we have enough room for an - * address and the protocol type. 
- */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin_t), 1); - if (mp1 == NULL) { - rts_err_ack(q, mp, TSYSERR, ENOMEM); - return; - } - mp = mp1; tbr = (struct T_bind_req *)mp->b_rptr; if (tbr->ADDR_length != 0) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, @@ -465,6 +455,7 @@ rts_tpi_bind(queue_t *q, mblk_t *mp) tbr->ADDR_offset = (t_scalar_t)sizeof (struct T_bind_req); tbr->ADDR_length = 0; tbr->PRIM_type = T_BIND_ACK; + mp->b_datap->db_type = M_PCPROTO; rts->rts_state = TS_IDLE; qreply(q, mp); } @@ -545,70 +536,30 @@ static int rts_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { rts_t *rts = connp->conn_rts; - int *i1 = (int *)ptr; + conn_opt_arg_t coas; + int retval; ASSERT(RW_READ_HELD(&rts->rts_rwlock)); switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = rts->rts_debug; - break; - case SO_REUSEADDR: - *i1 = rts->rts_reuseaddr; - break; - case SO_TYPE: - *i1 = SOCK_RAW; - break; - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - *i1 = rts->rts_dontroute; - break; - case SO_USELOOPBACK: - *i1 = rts->rts_useloopback; - break; - case SO_BROADCAST: - *i1 = rts->rts_broadcast; - break; - case SO_PROTOTYPE: - *i1 = rts->rts_proto; - break; - /* - * The following two items can be manipulated, - * but changing them should do nothing. - */ - case SO_SNDBUF: - ASSERT(rts->rts_xmit_hiwat <= INT_MAX); - *i1 = (int)(rts->rts_xmit_hiwat); - break; - case SO_RCVBUF: - ASSERT(rts->rts_recv_hiwat <= INT_MAX); - *i1 = (int)(rts->rts_recv_hiwat); - break; - case SO_DOMAIN: - *i1 = PF_ROUTE; - break; - default: - return (-1); - } - break; + /* do this in conn_opt_get? 
*/ case SOL_ROUTE: switch (name) { case RT_AWARE: mutex_enter(&connp->conn_lock); - *i1 = connp->conn_rtaware; + *(int *)ptr = connp->conn_rtaware; mutex_exit(&connp->conn_lock); - break; + return (0); } break; - default: - return (-1); } - return ((int)sizeof (int)); + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* ARGSUSED */ @@ -620,6 +571,12 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, int *i1 = (int *)invalp; rts_t *rts = connp->conn_rts; rts_stack_t *rtss = rts->rts_rtss; + int error; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; ASSERT(RW_WRITE_HELD(&rts->rts_rwlock)); @@ -638,38 +595,6 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, switch (level) { case SOL_SOCKET: switch (name) { - case SO_REUSEADDR: - if (!checkonly) { - rts->rts_reuseaddr = *i1 ? 1 : 0; - connp->conn_reuseaddr = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_DEBUG: - if (!checkonly) - rts->rts_debug = *i1 ? 1 : 0; - break; /* goto sizeof (int) option return */ - /* - * The following three items are available here, - * but are only meaningful to IP. - */ - case SO_DONTROUTE: - if (!checkonly) { - rts->rts_dontroute = *i1 ? 1 : 0; - connp->conn_dontroute = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - if (!checkonly) { - rts->rts_useloopback = *i1 ? 1 : 0; - connp->conn_loopback = *i1 ? 1 : 0; - } - break; /* goto sizeof (int) option return */ - case SO_BROADCAST: - if (!checkonly) { - rts->rts_broadcast = *i1 ? 1 : 0; - connp->conn_broadcast = *i1 ? 
1 : 0; - } - break; /* goto sizeof (int) option return */ case SO_PROTOTYPE: /* * Routing socket applications that call socket() with @@ -678,13 +603,15 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, * down the SO_PROTOTYPE and rts_queue_input() * implements the filtering. */ - if (*i1 != AF_INET && *i1 != AF_INET6) + if (*i1 != AF_INET && *i1 != AF_INET6) { + *outlenp = 0; return (EPROTONOSUPPORT); - if (!checkonly) { - rts->rts_proto = *i1; - connp->conn_proto = *i1; } - break; /* goto sizeof (int) option return */ + if (!checkonly) + connp->conn_proto = *i1; + *outlenp = inlen; + return (0); + /* * The following two items can be manipulated, * but changing them should do nothing. @@ -694,36 +621,13 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - rts->rts_xmit_hiwat = *i1; - if (!IPCL_IS_NONSTR(connp)) - connp->conn_wq->q_hiwat = *i1; - } break; /* goto sizeof (int) option return */ case SO_RCVBUF: if (*i1 > rtss->rtss_max_buf) { *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - rts->rts_recv_hiwat = *i1; - rw_exit(&rts->rts_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - *i1); - rw_enter(&rts->rts_rwlock, RW_WRITER); - } - break; /* goto sizeof (int) option return */ - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. 
- */ - return (0); - default: - *outlenp = 0; - return (EINVAL); } break; case SOL_ROUTE: @@ -734,15 +638,17 @@ rts_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, connp->conn_rtaware = *i1; mutex_exit(&connp->conn_lock); } - break; /* goto sizeof (int) option return */ - default: - *outlenp = 0; - return (EINVAL); + *outlenp = inlen; + return (0); } break; - default: + } + /* Serialized setsockopt since we are D_MTQPAIR */ + error = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (error != 0) { *outlenp = 0; - return (EINVAL); + return (error); } /* * Common case of return from an option that is sizeof (int) @@ -832,7 +738,7 @@ rts_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) int rts_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { conn_t *connp = Q_TO_CONN(q); int error; @@ -1009,10 +915,6 @@ err_ret: * consumes the message or passes it downstream; it never queues a * a message. The data messages that go down are wrapped in an IOCTL * message. - * - * FIXME? Should we call IP rts_request directly? Could punt on returning - * errno in the case when it defers processing due to - * IPIF_CHANGING/ILL_CHANGING??? 
*/ static void rts_wput(queue_t *q, mblk_t *mp) @@ -1057,7 +959,7 @@ rts_wput(queue_t *q, mblk_t *mp) } return; } - ip_output(connp, mp1, q, IP_WPUT); + ip_wput_nondata(q, mp1); } @@ -1120,11 +1022,9 @@ rts_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &rts_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &rts_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &rts_opt_obj, B_TRUE); + tpi_optcom_req(q, mp, cr, &rts_opt_obj); } return; case O_T_CONN_RES: @@ -1168,7 +1068,7 @@ rts_wput_other(queue_t *q, mblk_t *mp) default: break; } - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); } /* @@ -1177,7 +1077,6 @@ rts_wput_other(queue_t *q, mblk_t *mp) static void rts_wput_iocdata(queue_t *q, mblk_t *mp) { - conn_t *connp = Q_TO_CONN(q); struct sockaddr *rtsaddr; mblk_t *mp1; STRUCT_HANDLE(strbuf, sb); @@ -1188,7 +1087,7 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } switch (mi_copy_state(q, mp, &mp1)) { @@ -1233,9 +1132,12 @@ rts_wput_iocdata(queue_t *q, mblk_t *mp) mi_copyout(q, mp); } +/* + * IP passes up a NULL ira. + */ /*ARGSUSED2*/ static void -rts_input(void *arg1, mblk_t *mp, void *arg2) +rts_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg1; rts_t *rts = connp->conn_rts; @@ -1248,27 +1150,17 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) case M_IOCACK: case M_IOCNAK: iocp = (struct iocblk *)mp->b_rptr; - if (IPCL_IS_NONSTR(connp)) { - ASSERT(rts->rts_flag & (RTS_REQ_PENDING)); - mutex_enter(&rts->rts_send_mutex); - rts->rts_flag &= ~RTS_REQ_INPROG; + ASSERT(!IPCL_IS_NONSTR(connp)); + if (rts->rts_flag & (RTS_WPUT_PENDING)) { + rts->rts_flag &= ~RTS_WPUT_PENDING; rts->rts_error = iocp->ioc_error; - cv_signal(&rts->rts_io_cv); - mutex_exit(&rts->rts_send_mutex); + /* + * Tell rts_wvw/qwait that we are done. 
+ * Note: there is no qwait_wakeup() we can use. + */ + qenable(connp->conn_rq); freemsg(mp); return; - } else { - if (rts->rts_flag & (RTS_WPUT_PENDING)) { - rts->rts_flag &= ~RTS_WPUT_PENDING; - rts->rts_error = iocp->ioc_error; - /* - * Tell rts_wvw/qwait that we are done. - * Note: there is no qwait_wakeup() we can use. - */ - qenable(connp->conn_rq); - freemsg(mp); - return; - } } break; case M_DATA: @@ -1316,6 +1208,12 @@ rts_input(void *arg1, mblk_t *mp, void *arg2) } } +/*ARGSUSED*/ +static void +rts_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + freemsg(mp); +} void rts_ddi_g_init(void) @@ -1427,11 +1325,6 @@ int rts_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlen, cred_t *cr) { - conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; - - ASSERT(rts != NULL); - bzero(addr, sizeof (struct sockaddr)); addr->sa_family = AF_ROUTE; *addrlen = sizeof (struct sockaddr); @@ -1444,7 +1337,11 @@ int rts_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlen, cred_t *cr) { - return (EOPNOTSUPP); + bzero(addr, sizeof (struct sockaddr)); + addr->sa_family = AF_ROUTE; + *addrlen = sizeof (struct sockaddr); + + return (0); } static int @@ -1461,7 +1358,6 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, rts_opt_obj.odb_opt_des_arr, rts_opt_obj.odb_opt_arr_cnt, - rts_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) @@ -1473,25 +1369,20 @@ rts_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, rw_enter(&rts->rts_rwlock, RW_READER); len = rts_opt_get(connp, level, option_name, optvalp_buf); rw_exit(&rts->rts_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ - error = ip_get_options(connp, level, option_name, - optvalp, optlen, cr); - } else { - /* - * update optlen and copy option value - */ - 
t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - error = 0; + if (len == -1) { + kmem_free(optvalp_buf, max_optbuf_len); + return (EINVAL); } + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); kmem_free(optvalp_buf, max_optbuf_len); - return (error); + return (0); } static int @@ -1505,7 +1396,6 @@ rts_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, rts_opt_obj.odb_opt_des_arr, rts_opt_obj.odb_opt_arr_cnt, - rts_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -1530,9 +1420,7 @@ static int rts_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { - mblk_t *mp1; conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; rt_msghdr_t *rtm; int error; @@ -1546,65 +1434,19 @@ rts_send(sock_lower_handle_t proto_handle, mblk_t *mp, */ if ((mp->b_wptr - mp->b_rptr) < sizeof (rt_msghdr_t)) { if (!pullupmsg(mp, sizeof (rt_msghdr_t))) { - rts->rts_error = EINVAL; freemsg(mp); - return (rts->rts_error); + return (EINVAL); } } rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_pid = curproc->p_pid; - mp1 = rts_ioctl_alloc(mp); - if (mp1 == NULL) { - ASSERT(rts != NULL); - freemsg(mp); - return (ENOMEM); - } - /* - * Allow only one outstanding request(ioctl) at any given time + * We are not constrained by the ioctl interface and + * ip_rts_request_common processing requests synchronously hence + * we can send them down concurrently. 
*/ - mutex_enter(&rts->rts_send_mutex); - while (rts->rts_flag & RTS_REQ_PENDING) { - int ret; - - ret = cv_wait_sig(&rts->rts_send_cv, &rts->rts_send_mutex); - if (ret <= 0) { - mutex_exit(&rts->rts_send_mutex); - freemsg(mp); - return (EINTR); - } - } - - rts->rts_flag |= RTS_REQ_PENDING; - - rts->rts_flag |= RTS_REQ_INPROG; - - mutex_exit(&rts->rts_send_mutex); - - CONN_INC_REF(connp); - - error = ip_rts_request_common(rts->rts_connp->conn_wq, mp1, connp, cr); - - mutex_enter(&rts->rts_send_mutex); - if (error == EINPROGRESS) { - ASSERT(rts->rts_flag & RTS_REQ_INPROG); - if (rts->rts_flag & RTS_REQ_INPROG) { - /* - * Once the request has been issued we wait for - * completion - */ - cv_wait(&rts->rts_io_cv, &rts->rts_send_mutex); - error = rts->rts_error; - } - } - - ASSERT((error != 0) || !(rts->rts_flag & RTS_REQ_INPROG)); - ASSERT(MUTEX_HELD(&rts->rts_send_mutex)); - - rts->rts_flag &= ~(RTS_REQ_PENDING | RTS_REQ_INPROG); - cv_signal(&rts->rts_send_cv); - mutex_exit(&rts->rts_send_mutex); + error = ip_rts_request_common(mp, connp, cr); return (error); } @@ -1614,8 +1456,6 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) { conn_t *connp; - rts_t *rts; - rts_stack_t *rtss; if (family != AF_ROUTE || type != SOCK_RAW || (proto != 0 && proto != AF_INET && proto != AF_INET6)) { @@ -1627,25 +1467,7 @@ rts_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, ASSERT(connp != NULL); connp->conn_flags |= IPCL_NONSTR; - rts = connp->conn_rts; - rtss = rts->rts_rtss; - - rts->rts_xmit_hiwat = rtss->rtss_xmit_hiwat; - rts->rts_xmit_lowat = rtss->rtss_xmit_lowat; - rts->rts_recv_hiwat = rtss->rtss_recv_hiwat; - rts->rts_recv_lowat = rts_mod_info.mi_lowat; - - ASSERT(rtss->rtss_ldi_ident != NULL); - - *errorp = ip_create_helper_stream(connp, rtss->rtss_ldi_ident); - if (*errorp != 0) { -#ifdef DEBUG - cmn_err(CE_CONT, "rts_create: create of IP helper stream" - " 
failed\n"); -#endif - (void) rts_close((sock_lower_handle_t)connp, 0, credp); - return (NULL); - } + connp->conn_proto = proto; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -1663,8 +1485,6 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - rts_t *rts = connp->conn_rts; - rts_stack_t *rtss = rts->rts_rtss; struct sock_proto_props sopp; connp->conn_upcalls = sock_upcalls; @@ -1673,8 +1493,8 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; sopp.sopp_wroff = 0; - sopp.sopp_rxhiwat = rtss->rtss_recv_hiwat; - sopp.sopp_rxlowat = rts_mod_info.mi_lowat; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxblk = INFPSZ; sopp.sopp_maxpsz = rts_mod_info.mi_maxpsz; sopp.sopp_minpsz = (rts_mod_info.mi_minpsz == 1) ? 0 : @@ -1689,12 +1509,7 @@ rts_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, 0, NULL, -1); - /* - * Indicate the down IP module that this is a routing socket - * client by sending an RTS IOCTL without any user data. Although - * this is just a notification message (without any real routing - * request), we pass in any credential for correctness sake. - */ + /* Indicate to IP that this is a routing socket client */ ip_rts_register(connp); } @@ -1743,6 +1558,27 @@ rts_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, conn_t *connp = (conn_t *)proto_handle; int error; + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. 
+ */ + if (connp->conn_helper_info == NULL) { + rts_stack_t *rtss = connp->conn_rts->rts_rtss; + + ASSERT(rtss->rtss_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, rtss->rtss_ldi_ident); + if (error != 0) { + ip0dbg(("rts_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: diff --git a/usr/src/uts/common/inet/ip/rts_opt_data.c b/usr/src/uts/common/inet/ip/rts_opt_data.c index 8a96edb668..1dd64a0317 100644 --- a/usr/src/uts/common/inet/ip/rts_opt_data.c +++ b/usr/src/uts/common/inet/ip/rts_opt_data.c @@ -40,6 +40,7 @@ #include <inet/optcom.h> #include <inet/rts_impl.h> +#include <inet/rts_impl.h> /* * Table of all known options handled on a RTS protocol stack. * @@ -49,21 +50,21 @@ */ opdes_t rts_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ 
SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { RT_AWARE, SOL_ROUTE, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; @@ -98,9 +99,8 @@ uint_t rts_max_optsize; /* initialized in _init() */ optdb_obj_t rts_opt_obj = { rts_opt_default, /* RTS default value function pointer */ - rts_tpi_opt_get, /* RTS get function pointer */ - rts_tpi_opt_set, /* RTS set function pointer */ - B_TRUE, /* RTS is tpi provider */ + rts_tpi_opt_get, /* RTS get function pointer */ + rts_tpi_opt_set, /* RTS set function pointer */ RTS_OPT_ARR_CNT, /* RTS option database count of entries */ rts_opt_arr, /* RTS option database */ RTS_VALID_LEVELS_CNT, /* RTS valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/sadb.c b/usr/src/uts/common/inet/ip/sadb.c index 784b3b08aa..5ae4f6da8e 100644 --- a/usr/src/uts/common/inet/ip/sadb.c +++ b/usr/src/uts/common/inet/ip/sadb.c @@ -59,7 +59,6 @@ #include <inet/ipsecesp.h> #include <sys/random.h> #include <sys/dlpi.h> -#include <sys/iphada.h> #include <sys/strsun.h> #include <sys/strsubr.h> #include <inet/ip_if.h> @@ -77,15 +76,13 @@ static mblk_t *sadb_extended_acquire(ipsec_selector_t *, ipsec_policy_t *, ipsec_action_t *, boolean_t, uint32_t, uint32_t, sadb_sens_t *, netstack_t *); -static void sadb_ill_df(ill_t *, mblk_t *, isaf_t *, int, boolean_t); -static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *, boolean_t, mblk_t **); -static 
void sadb_drain_torchq(queue_t *, mblk_t *); +static ipsa_t *sadb_torch_assoc(isaf_t *, ipsa_t *); static void sadb_destroy_acqlist(iacqf_t **, uint_t, boolean_t, netstack_t *); static void sadb_destroy(sadb_t *, netstack_t *); static mblk_t *sadb_sa2msg(ipsa_t *, sadb_msg_t *); -static cred_t *sadb_cred_from_sens(sadb_sens_t *, uint64_t *); -static sadb_sens_t *sadb_make_sens_ext(cred_t *cr, int *len); +static ts_label_t *sadb_label_from_sens(sadb_sens_t *, uint64_t *); +static sadb_sens_t *sadb_make_sens_ext(ts_label_t *tsl, int *len); static time_t sadb_add_time(time_t, uint64_t); static void lifetime_fuzz(ipsa_t *); @@ -96,12 +93,6 @@ static void destroy_ipsa_pair(ipsap_t *); static int update_pairing(ipsap_t *, ipsa_query_t *, keysock_in_t *, int *); static void ipsa_set_replay(ipsa_t *ipsa, uint32_t offset); -extern void (*cl_inet_getspi)(netstackid_t stack_id, uint8_t protocol, - uint8_t *ptr, size_t len, void *args); -extern int (*cl_inet_checkspi)(netstackid_t stack_id, uint8_t protocol, - uint32_t spi, void *args); -extern void (*cl_inet_deletespi)(netstackid_t stack_id, uint8_t protocol, - uint32_t spi, void *args); /* * ipsacq_maxpackets is defined here to make it tunable * from /etc/system. @@ -269,6 +260,7 @@ static void sadb_freeassoc(ipsa_t *ipsa) { ipsec_stack_t *ipss = ipsa->ipsa_netstack->netstack_ipsec; + mblk_t *asyncmp, *mp; ASSERT(ipss != NULL); ASSERT(MUTEX_NOT_HELD(&ipsa->ipsa_lock)); @@ -276,20 +268,24 @@ sadb_freeassoc(ipsa_t *ipsa) ASSERT(ipsa->ipsa_next == NULL); ASSERT(ipsa->ipsa_ptpn == NULL); + + asyncmp = sadb_clear_lpkt(ipsa); + if (asyncmp != NULL) { + mp = ip_recv_attr_free_mblk(asyncmp); + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_sadb_inlarval_timeout), + &ipss->ipsec_sadb_dropper); + } mutex_enter(&ipsa->ipsa_lock); - /* Don't call sadb_clear_lpkt() since we hold the ipsa_lock anyway. 
*/ - ip_drop_packet(ipsa->ipsa_lpkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_timeout), - &ipss->ipsec_sadb_dropper); - if (ipsa->ipsa_cred != NULL) { - crfree(ipsa->ipsa_cred); - ipsa->ipsa_cred = NULL; + if (ipsa->ipsa_tsl != NULL) { + label_rele(ipsa->ipsa_tsl); + ipsa->ipsa_tsl = NULL; } - if (ipsa->ipsa_ocred != NULL) { - crfree(ipsa->ipsa_ocred); - ipsa->ipsa_ocred = NULL; + if (ipsa->ipsa_otsl != NULL) { + label_rele(ipsa->ipsa_otsl); + ipsa->ipsa_otsl = NULL; } ipsec_destroy_ctx_tmpl(ipsa, IPSEC_ALG_AUTH); @@ -712,336 +708,6 @@ sadb_walker(isaf_t *table, uint_t numentries, } /* - * From the given SA, construct a dl_ct_ipsec_key and - * a dl_ct_ipsec structures to be sent to the adapter as part - * of a DL_CONTROL_REQ. - * - * ct_sa must point to the storage allocated for the key - * structure and must be followed by storage allocated - * for the SA information that must be sent to the driver - * as part of the DL_CONTROL_REQ request. - * - * The is_inbound boolean indicates whether the specified - * SA is part of an inbound SA table. - * - * Returns B_TRUE if the corresponding SA must be passed to - * a provider, B_FALSE otherwise; frees *mp if it returns B_FALSE. - */ -static boolean_t -sadb_req_from_sa(ipsa_t *sa, mblk_t *mp, boolean_t is_inbound) -{ - dl_ct_ipsec_key_t *keyp; - dl_ct_ipsec_t *sap; - void *ct_sa = mp->b_wptr; - - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - - keyp = (dl_ct_ipsec_key_t *)(ct_sa); - sap = (dl_ct_ipsec_t *)(keyp + 1); - - IPSECHW_DEBUG(IPSECHW_CAPAB, ("sadb_req_from_sa: " - "is_inbound = %d\n", is_inbound)); - - /* initialize flag */ - sap->sadb_sa_flags = 0; - if (is_inbound) { - sap->sadb_sa_flags |= DL_CT_IPSEC_INBOUND; - /* - * If an inbound SA has a peer, then mark it has being - * an outbound SA as well. - */ - if (sa->ipsa_haspeer) - sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND; - } else { - /* - * If an outbound SA has a peer, then don't send it, - * since we will send the copy from the inbound table. 
- */ - if (sa->ipsa_haspeer) { - freemsg(mp); - return (B_FALSE); - } - sap->sadb_sa_flags |= DL_CT_IPSEC_OUTBOUND; - } - - keyp->dl_key_spi = sa->ipsa_spi; - bcopy(sa->ipsa_dstaddr, keyp->dl_key_dest_addr, - DL_CTL_IPSEC_ADDR_LEN); - keyp->dl_key_addr_family = sa->ipsa_addrfam; - - sap->sadb_sa_auth = sa->ipsa_auth_alg; - sap->sadb_sa_encrypt = sa->ipsa_encr_alg; - - sap->sadb_key_len_a = sa->ipsa_authkeylen; - sap->sadb_key_bits_a = sa->ipsa_authkeybits; - bcopy(sa->ipsa_authkey, - sap->sadb_key_data_a, sap->sadb_key_len_a); - - sap->sadb_key_len_e = sa->ipsa_encrkeylen; - sap->sadb_key_bits_e = sa->ipsa_encrkeybits; - bcopy(sa->ipsa_encrkey, - sap->sadb_key_data_e, sap->sadb_key_len_e); - - mp->b_wptr += sizeof (dl_ct_ipsec_t) + sizeof (dl_ct_ipsec_key_t); - return (B_TRUE); -} - -/* - * Called from AH or ESP to format a message which will be used to inform - * IPsec-acceleration-capable ills of a SADB change. - * (It is not possible to send the message to IP directly from this function - * since the SA, if any, is locked during the call). - * - * dl_operation: DL_CONTROL_REQ operation (add, delete, update, etc) - * sa_type: identifies whether the operation applies to AH or ESP - * (must be one of SADB_SATYPE_AH or SADB_SATYPE_ESP) - * sa: Pointer to an SA. Must be non-NULL and locked - * for ADD, DELETE, GET, and UPDATE operations. - * This function returns an mblk chain that must be passed to IP - * for forwarding to the IPsec capable providers. 
- */ -mblk_t * -sadb_fmt_sa_req(uint_t dl_operation, uint_t sa_type, ipsa_t *sa, - boolean_t is_inbound) -{ - mblk_t *mp; - dl_control_req_t *ctrl; - boolean_t need_key = B_FALSE; - mblk_t *ctl_mp = NULL; - ipsec_ctl_t *ctl; - - /* - * 1 allocate and initialize DL_CONTROL_REQ M_PROTO - * 2 if a key is needed for the operation - * 2.1 initialize key - * 2.2 if a full SA is needed for the operation - * 2.2.1 initialize full SA info - * 3 return message; caller will call ill_ipsec_capab_send_all() - * to send the resulting message to IPsec capable ills. - */ - - ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP); - - /* - * Allocate DL_CONTROL_REQ M_PROTO - * We allocate room for the SA even if it's not needed - * by some of the operations (for example flush) - */ - mp = allocb(sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI); - if (mp == NULL) - return (NULL); - mp->b_datap->db_type = M_PROTO; - - /* initialize dl_control_req_t */ - ctrl = (dl_control_req_t *)mp->b_wptr; - ctrl->dl_primitive = DL_CONTROL_REQ; - ctrl->dl_operation = dl_operation; - ctrl->dl_type = sa_type == SADB_SATYPE_AH ? DL_CT_IPSEC_AH : - DL_CT_IPSEC_ESP; - ctrl->dl_key_offset = sizeof (dl_control_req_t); - ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_offset = sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_length = sizeof (dl_ct_ipsec_t); - mp->b_wptr += sizeof (dl_control_req_t); - - if ((dl_operation == DL_CO_SET) || (dl_operation == DL_CO_DELETE)) { - ASSERT(sa != NULL); - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - - need_key = B_TRUE; - - /* - * Initialize key and SA data. Note that for some - * operations the SA data is ignored by the provider - * (delete, etc.) 
- */ - if (!sadb_req_from_sa(sa, mp, is_inbound)) - return (NULL); - } - - /* construct control message */ - ctl_mp = allocb(sizeof (ipsec_ctl_t), BPRI_HI); - if (ctl_mp == NULL) { - cmn_err(CE_WARN, "sadb_fmt_sa_req: allocb failed\n"); - freemsg(mp); - return (NULL); - } - - ctl_mp->b_datap->db_type = M_CTL; - ctl_mp->b_wptr += sizeof (ipsec_ctl_t); - ctl_mp->b_cont = mp; - - ctl = (ipsec_ctl_t *)ctl_mp->b_rptr; - ctl->ipsec_ctl_type = IPSEC_CTL; - ctl->ipsec_ctl_len = sizeof (ipsec_ctl_t); - ctl->ipsec_ctl_sa_type = sa_type; - - if (need_key) { - /* - * Keep an additional reference on SA, since it will be - * needed by IP to send control messages corresponding - * to that SA from its perimeter. IP will do a - * IPSA_REFRELE when done with the request. - */ - ASSERT(MUTEX_HELD(&sa->ipsa_lock)); - IPSA_REFHOLD(sa); - ctl->ipsec_ctl_sa = sa; - } else - ctl->ipsec_ctl_sa = NULL; - - return (ctl_mp); -} - - -/* - * Called by sadb_ill_download() to dump the entries for a specific - * fanout table. For each SA entry in the table passed as argument, - * use mp as a template and constructs a full DL_CONTROL message, and - * call ill_dlpi_send(), provided by IP, to send the resulting - * messages to the ill. - */ -static void -sadb_ill_df(ill_t *ill, mblk_t *mp, isaf_t *fanout, int num_entries, - boolean_t is_inbound) -{ - ipsa_t *walker; - mblk_t *nmp, *salist; - int i, error = 0; - ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - - IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_ill_df: fanout at 0x%p ne=%d\n", - (void *)fanout, num_entries)); - /* - * For each IPSA hash bucket do: - * - Hold the mutex - * - Walk each entry, sending a corresponding request to IP - * for it. 
- */ - ASSERT(mp->b_datap->db_type == M_PROTO); - - for (i = 0; i < num_entries; i++) { - mutex_enter(&fanout[i].isaf_lock); - salist = NULL; - - for (walker = fanout[i].isaf_ipsa; walker != NULL; - walker = walker->ipsa_next) { - IPSECHW_DEBUG(IPSECHW_SADB, - ("sadb_ill_df: sending SA to ill via IP \n")); - /* - * Duplicate the template mp passed and - * complete DL_CONTROL_REQ data. - * To be more memory efficient, we could use - * dupb() for the M_CTL and copyb() for the M_PROTO - * as the M_CTL, since the M_CTL is the same for - * every SA entry passed down to IP for the same ill. - * - * Note that copymsg/copyb ensure that the new mblk - * is at least as large as the source mblk even if it's - * not using all its storage -- therefore, nmp - * has trailing space for sadb_req_from_sa to add - * the SA-specific bits. - */ - mutex_enter(&walker->ipsa_lock); - if (ipsec_capab_match(ill, - ill->ill_phyint->phyint_ifindex, ill->ill_isv6, - walker, ns)) { - nmp = copymsg(mp); - if (nmp == NULL) { - IPSECHW_DEBUG(IPSECHW_SADB, - ("sadb_ill_df: alloc error\n")); - error = ENOMEM; - mutex_exit(&walker->ipsa_lock); - break; - } - if (sadb_req_from_sa(walker, nmp, is_inbound)) { - nmp->b_next = salist; - salist = nmp; - } - } - mutex_exit(&walker->ipsa_lock); - } - mutex_exit(&fanout[i].isaf_lock); - while (salist != NULL) { - nmp = salist; - salist = nmp->b_next; - nmp->b_next = NULL; - ill_dlpi_send(ill, nmp); - } - if (error != 0) - break; /* out of for loop. */ - } -} - -/* - * Called by ill_ipsec_capab_add(). Sends a copy of the SADB of - * the type specified by sa_type to the specified ill. - * - * We call for each fanout table defined by the SADB (one per - * protocol). sadb_ill_df() finally calls ill_dlpi_send() for - * each SADB entry in order to send a corresponding DL_CONTROL_REQ - * message to the ill. 
- */ -void -sadb_ill_download(ill_t *ill, uint_t sa_type) -{ - mblk_t *protomp; /* prototype message */ - dl_control_req_t *ctrl; - sadbp_t *spp; - sadb_t *sp; - int dlt; - ip_stack_t *ipst = ill->ill_ipst; - netstack_t *ns = ipst->ips_netstack; - - ASSERT(sa_type == SADB_SATYPE_AH || sa_type == SADB_SATYPE_ESP); - - /* - * Allocate and initialize prototype answer. A duplicate for - * each SA is sent down to the interface. - */ - - /* DL_CONTROL_REQ M_PROTO mblk_t */ - protomp = allocb(sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t) + sizeof (dl_ct_ipsec_t), BPRI_HI); - if (protomp == NULL) - return; - protomp->b_datap->db_type = M_PROTO; - - dlt = (sa_type == SADB_SATYPE_AH) ? DL_CT_IPSEC_AH : DL_CT_IPSEC_ESP; - if (sa_type == SADB_SATYPE_ESP) { - ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; - - spp = &espstack->esp_sadb; - } else { - ipsecah_stack_t *ahstack = ns->netstack_ipsecah; - - spp = &ahstack->ah_sadb; - } - - ctrl = (dl_control_req_t *)protomp->b_wptr; - ctrl->dl_primitive = DL_CONTROL_REQ; - ctrl->dl_operation = DL_CO_SET; - ctrl->dl_type = dlt; - ctrl->dl_key_offset = sizeof (dl_control_req_t); - ctrl->dl_key_length = sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_offset = sizeof (dl_control_req_t) + - sizeof (dl_ct_ipsec_key_t); - ctrl->dl_data_length = sizeof (dl_ct_ipsec_t); - protomp->b_wptr += sizeof (dl_control_req_t); - - /* - * then for each SADB entry, we fill out the dl_ct_ipsec_key_t - * and dl_ct_ipsec_t - */ - sp = ill->ill_isv6 ? &(spp->s_v6) : &(spp->s_v4); - sadb_ill_df(ill, protomp, sp->sdb_of, sp->sdb_hashsize, B_FALSE); - sadb_ill_df(ill, protomp, sp->sdb_if, sp->sdb_hashsize, B_TRUE); - freemsg(protomp); -} - -/* * Call me to free up a security association fanout. Use the forever * variable to indicate freeing up the SAs (forever == B_FALSE, e.g. 
* an SADB_FLUSH message), or destroying everything (forever == B_TRUE, @@ -1119,30 +785,11 @@ sadb_destroy(sadb_t *sp, netstack_t *ns) ASSERT(sp->sdb_acq == NULL); } -static void -sadb_send_flush_req(sadbp_t *spp) -{ - mblk_t *ctl_mp; - - /* - * we've been unplumbed, or never were plumbed; don't go there. - */ - if (spp->s_ip_q == NULL) - return; - - /* have IP send a flush msg to the IPsec accelerators */ - ctl_mp = sadb_fmt_sa_req(DL_CO_FLUSH, spp->s_satype, NULL, B_TRUE); - if (ctl_mp != NULL) - putnext(spp->s_ip_q, ctl_mp); -} - void sadbp_flush(sadbp_t *spp, netstack_t *ns) { sadb_flush(&spp->s_v4, ns); sadb_flush(&spp->s_v6, ns); - - sadb_send_flush_req(spp); } void @@ -1151,7 +798,6 @@ sadbp_destroy(sadbp_t *spp, netstack_t *ns) sadb_destroy(&spp->s_v4, ns); sadb_destroy(&spp->s_v6, ns); - sadb_send_flush_req(spp); if (spp->s_satype == SADB_SATYPE_AH) { ipsec_stack_t *ipss = ns->netstack_ipsec; @@ -1259,11 +905,11 @@ sadb_cloneassoc(ipsa_t *ipsa) /* bzero and initialize locks, in case *_init() allocates... 
*/ mutex_init(&newbie->ipsa_lock, NULL, MUTEX_DEFAULT, NULL); - if (newbie->ipsa_cred != NULL) - crhold(newbie->ipsa_cred); + if (newbie->ipsa_tsl != NULL) + label_hold(newbie->ipsa_tsl); - if (newbie->ipsa_ocred != NULL) - crhold(newbie->ipsa_ocred); + if (newbie->ipsa_otsl != NULL) + label_hold(newbie->ipsa_otsl); /* * While somewhat dain-bramaged, the most graceful way to @@ -1554,14 +1200,14 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) encr = B_FALSE; } - if (ipsa->ipsa_cred != NULL) { - senslen = sadb_sens_len_from_cred(ipsa->ipsa_cred); + if (ipsa->ipsa_tsl != NULL) { + senslen = sadb_sens_len_from_label(ipsa->ipsa_tsl); alloclen += senslen; sensinteg = B_TRUE; } - if (ipsa->ipsa_ocred != NULL) { - osenslen = sadb_sens_len_from_cred(ipsa->ipsa_ocred); + if (ipsa->ipsa_otsl != NULL) { + osenslen = sadb_sens_len_from_label(ipsa->ipsa_otsl); alloclen += osenslen; osensinteg = B_TRUE; } @@ -1792,8 +1438,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) if (sensinteg) { sens = (sadb_sens_t *)walker; - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, - ipsa->ipsa_cred, senslen); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, + ipsa->ipsa_tsl, senslen); walker = (sadb_ext_t *)((uint64_t *)walker + walker->sadb_ext_len); @@ -1802,8 +1448,8 @@ sadb_sa2msg(ipsa_t *ipsa, sadb_msg_t *samsg) if (osensinteg) { sens = (sadb_sens_t *)walker; - sadb_sens_from_cred(sens, SADB_X_EXT_OUTER_SENS, - ipsa->ipsa_ocred, osenslen); + sadb_sens_from_label(sens, SADB_X_EXT_OUTER_SENS, + ipsa->ipsa_otsl, osenslen); if (ipsa->ipsa_mac_exempt) sens->sadb_x_sens_flags = SADB_X_SENS_IMPLICIT; @@ -2123,7 +1769,6 @@ sadb_addrcheck(queue_t *pfkey_q, mblk_t *mp, sadb_ext_t *ext, uint_t serial, sadb_address_t *addr = (sadb_address_t *)ext; struct sockaddr_in *sin; struct sockaddr_in6 *sin6; - ire_t *ire; int diagnostic, type; boolean_t normalized = B_FALSE; @@ -2249,18 +1894,12 @@ bail: /* * At this point, we're a unicast IPv6 address. * - * A ctable lookup for local is sufficient here. 
If we're - * local, return KS_IN_ADDR_ME, otherwise KS_IN_ADDR_NOTME. - * * XXX Zones alert -> me/notme decision needs to be tempered * by what zone we're in when we go to zone-aware IPsec. */ - ire = ire_ctable_lookup_v6(&sin6->sin6_addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ns->netstack_ip); - if (ire != NULL) { + if (ip_type_v6(&sin6->sin6_addr, ns->netstack_ip) == + IRE_LOCAL) { /* Hey hey, it's local. */ - IRE_REFRELE(ire); return (KS_IN_ADDR_ME); } } else { @@ -2272,23 +1911,17 @@ bail: /* * At this point we're a unicast or broadcast IPv4 address. * - * Lookup on the ctable for IRE_BROADCAST or IRE_LOCAL. - * A NULL return value is NOTME, otherwise, look at the - * returned ire for broadcast or not and return accordingly. + * Check if the address is IRE_BROADCAST or IRE_LOCAL. * * XXX Zones alert -> me/notme decision needs to be tempered * by what zone we're in when we go to zone-aware IPsec. */ - ire = ire_ctable_lookup(sin->sin_addr.s_addr, 0, - IRE_LOCAL | IRE_BROADCAST, NULL, ALL_ZONES, NULL, - MATCH_IRE_TYPE, ns->netstack_ip); - if (ire != NULL) { - /* Check for local or broadcast */ - type = ire->ire_type; - IRE_REFRELE(ire); - ASSERT(type == IRE_LOCAL || type == IRE_BROADCAST); - return ((type == IRE_LOCAL) ? 
KS_IN_ADDR_ME : - KS_IN_ADDR_MBCAST); + type = ip_type_v4(sin->sin_addr.s_addr, ns->netstack_ip); + switch (type) { + case IRE_LOCAL: + return (KS_IN_ADDR_ME); + case IRE_BROADCAST: + return (KS_IN_ADDR_MBCAST); } } @@ -2763,7 +2396,6 @@ struct sadb_purge_state ipsa_query_t sq; boolean_t inbnd; uint8_t sadb_sa_state; - mblk_t *mq; }; static void @@ -2785,7 +2417,7 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie) sadb_delete_cluster(entry); } entry->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(head, entry, ps->inbnd, &ps->mq); + (void) sadb_torch_assoc(head, entry); } /* @@ -2794,15 +2426,13 @@ sadb_purge_cb(isaf_t *head, ipsa_t *entry, void *cookie) */ int sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp, - int *diagnostic, queue_t *pfkey_q, queue_t *ip_q) + int *diagnostic, queue_t *pfkey_q) { struct sadb_purge_state ps; int error = sadb_form_query(ksi, 0, IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC, &ps.sq, diagnostic); - ps.mq = NULL; - if (error != 0) return (error); @@ -2819,9 +2449,6 @@ sadb_purge_sa(mblk_t *mp, keysock_in_t *ksi, sadb_t *sp, ps.inbnd = B_FALSE; sadb_walker(sp->sdb_of, sp->sdb_hashsize, sadb_purge_cb, &ps); - if (ps.mq != NULL) - sadb_drain_torchq(ip_q, ps.mq); - ASSERT(mp->b_cont != NULL); sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL); @@ -2870,12 +2497,11 @@ sadb_delpair_state_one(isaf_t *head, ipsa_t *entry, void *cookie) } entry->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(head, entry, B_FALSE, &ps->mq); + (void) sadb_torch_assoc(head, entry); if (peer_assoc != NULL) { mutex_enter(&peer_assoc->ipsa_lock); peer_assoc->ipsa_state = IPSA_STATE_DEAD; - (void) sadb_torch_assoc(inbound_bucket, peer_assoc, - B_FALSE, &ps->mq); + (void) sadb_torch_assoc(inbound_bucket, peer_assoc); } mutex_exit(&inbound_bucket->isaf_lock); } @@ -2889,7 +2515,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, int error; ps.sq.spp = spp; /* XXX param */ - ps.mq = 
NULL; error = sadb_form_query(ksi, IPSA_Q_DST|IPSA_Q_SRC, IPSA_Q_SRC|IPSA_Q_DST|IPSA_Q_SRCID|IPSA_Q_DSTID|IPSA_Q_KMC, @@ -2902,9 +2527,6 @@ sadb_delpair_state(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, sadb_walker(ps.sq.sp->sdb_of, ps.sq.sp->sdb_hashsize, sadb_delpair_state_one, &ps); - if (ps.mq != NULL) - sadb_drain_torchq(pfkey_q, ps.mq); - ASSERT(mp->b_cont != NULL); sadb_pfkey_echo(pfkey_q, mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL); @@ -2921,7 +2543,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, ipsa_query_t sq; ipsa_t *echo_target = NULL; ipsap_t ipsapp; - mblk_t *torchq = NULL; uint_t error = 0; if (sadb_msg_type == SADB_X_DELPAIR_STATE) @@ -2965,7 +2586,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, } ipsapp.ipsap_sa_ptr->ipsa_state = IPSA_STATE_DEAD; (void) sadb_torch_assoc(ipsapp.ipsap_bucket, - ipsapp.ipsap_sa_ptr, B_FALSE, &torchq); + ipsapp.ipsap_sa_ptr); /* * sadb_torch_assoc() releases the ipsa_lock * and calls sadb_unlinkassoc() which does a @@ -2984,7 +2605,7 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, ipsapp.ipsap_psa_ptr->ipsa_state = IPSA_STATE_DEAD; (void) sadb_torch_assoc(ipsapp.ipsap_pbucket, - ipsapp.ipsap_psa_ptr, B_FALSE, &torchq); + ipsapp.ipsap_psa_ptr); } else { /* * Only half of the "pair" has been deleted. @@ -3004,9 +2625,6 @@ sadb_delget_sa(mblk_t *mp, keysock_in_t *ksi, sadbp_t *spp, mutex_exit(&ipsapp.ipsap_pbucket->isaf_lock); } - if (torchq != NULL) - sadb_drain_torchq(spp->s_ip_q, torchq); - ASSERT(mp->b_cont != NULL); if (error == 0) @@ -3269,7 +2887,7 @@ sadb_nat_calculations(ipsa_t *newbie, sadb_address_t *natt_loc_ext, * case here. 
*/ int -sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, +sadb_common_add(queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi, isaf_t *primary, isaf_t *secondary, ipsa_t *newbie, boolean_t clone, boolean_t is_inbound, int *diagnostic, netstack_t *ns, sadbp_t *spp) @@ -3313,11 +2931,11 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, int error = 0; boolean_t isupdate = (newbie != NULL); uint32_t *src_addr_ptr, *dst_addr_ptr, *isrc_addr_ptr, *idst_addr_ptr; - mblk_t *ctl_mp = NULL; ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; ipsec_alginfo_t *alg; int rcode; + boolean_t async = B_FALSE; init_ipsa_pair(&ipsapp); @@ -3549,7 +3167,14 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, newbie->ipsa_authtmpl = NULL; newbie->ipsa_encrtmpl = NULL; +#ifdef IPSEC_LATENCY_TEST + if (akey != NULL && newbie->ipsa_auth_alg != SADB_AALG_NONE) { +#else if (akey != NULL) { +#endif + async = (ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == + IPSEC_ALGS_EXEC_ASYNC); + newbie->ipsa_authkeybits = akey->sadb_key_bits; newbie->ipsa_authkeylen = SADB_1TO8(akey->sadb_key_bits); /* In case we have to round up to the next byte... */ @@ -3604,6 +3229,8 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, if (ekey != NULL) { mutex_enter(&ipss->ipsec_alg_lock); + async = async || (ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == + IPSEC_ALGS_EXEC_ASYNC); alg = ipss->ipsec_alglists[IPSEC_ALG_ENCR] [newbie->ipsa_encr_alg]; @@ -3757,6 +3384,9 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, } } + if (async) + newbie->ipsa_flags |= IPSA_F_ASYNC; + /* * Ptrs to processing functions. 
*/ @@ -3812,7 +3442,7 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, if (sens != NULL) { uint64_t *bitmap = (uint64_t *)(sens + 1); - newbie->ipsa_cred = sadb_cred_from_sens(sens, bitmap); + newbie->ipsa_tsl = sadb_label_from_sens(sens, bitmap); } /* @@ -3820,41 +3450,55 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, */ if (osens != NULL) { uint64_t *bitmap = (uint64_t *)(osens + 1); - cred_t *cred, *effective_cred; + ts_label_t *tsl, *effective_tsl; uint32_t *peer_addr_ptr; + zoneid_t zoneid = GLOBAL_ZONEID; + zone_t *zone; peer_addr_ptr = is_inbound ? src_addr_ptr : dst_addr_ptr; - cred = sadb_cred_from_sens(osens, bitmap); + tsl = sadb_label_from_sens(osens, bitmap); newbie->ipsa_mac_exempt = CONN_MAC_DEFAULT; if (osens->sadb_x_sens_flags & SADB_X_SENS_IMPLICIT) { newbie->ipsa_mac_exempt = CONN_MAC_IMPLICIT; } - error = tsol_check_dest(cred, peer_addr_ptr, + error = tsol_check_dest(tsl, peer_addr_ptr, (af == AF_INET6)?IPV6_VERSION:IPV4_VERSION, - newbie->ipsa_mac_exempt, &effective_cred); + newbie->ipsa_mac_exempt, B_TRUE, &effective_tsl); if (error != 0) { - crfree(cred); + label_rele(tsl); mutex_exit(&newbie->ipsa_lock); goto error; } - if (effective_cred != NULL) { - crfree(cred); - cred = effective_cred; + if (effective_tsl != NULL) { + label_rele(tsl); + tsl = effective_tsl; } - newbie->ipsa_ocred = cred; + newbie->ipsa_otsl = tsl; + + zone = zone_find_by_label(tsl); + if (zone != NULL) { + zoneid = zone->zone_id; + zone_rele(zone); + } + /* + * For exclusive stacks we set the zoneid to zero to operate + * as if in the global zone for tsol_compute_label_v4/v6 + */ + if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; if (af == AF_INET6) { - error = tsol_compute_label_v6(cred, + error = tsol_compute_label_v6(tsl, zoneid, (in6_addr_t *)peer_addr_ptr, newbie->ipsa_opt_storage, ipst); } else { - error = tsol_compute_label(cred, *peer_addr_ptr, - 
newbie->ipsa_opt_storage, ipst); + error = tsol_compute_label_v4(tsl, zoneid, + *peer_addr_ptr, newbie->ipsa_opt_storage, ipst); } if (error != 0) { mutex_exit(&newbie->ipsa_lock); @@ -3916,9 +3560,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, mutex_enter(&primary->isaf_lock); } - IPSECHW_DEBUG(IPSECHW_SADB, ("sadb_common_add: spi = 0x%x\n", - newbie->ipsa_spi)); - /* * sadb_insertassoc() doesn't increment the reference * count. We therefore have to increment the @@ -3938,10 +3579,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, mutex_enter(&newbie->ipsa_lock); error = sadb_insertassoc(newbie, primary); - if (error == 0) { - ctl_mp = sadb_fmt_sa_req(DL_CO_SET, newbie->ipsa_type, newbie, - is_inbound); - } mutex_exit(&newbie->ipsa_lock); if (error != 0) { @@ -3982,13 +3619,6 @@ sadb_common_add(queue_t *ip_q, queue_t *pfkey_q, mblk_t *mp, sadb_msg_t *samsg, ASSERT(MUTEX_NOT_HELD(&newbie->ipsa_lock)); ASSERT(newbie_clone == NULL || (MUTEX_NOT_HELD(&newbie_clone->ipsa_lock))); - /* - * If hardware acceleration could happen, send it. - */ - if (ctl_mp != NULL) { - putnext(ip_q, ctl_mp); - ctl_mp = NULL; - } error_unlock: @@ -4037,8 +3667,6 @@ error: if (newbie_clone != NULL) { IPSA_REFRELE(newbie_clone); } - if (ctl_mp != NULL) - freemsg(ctl_mp); if (error == 0) { /* @@ -4315,37 +3943,12 @@ sadb_age_bytes(queue_t *pfkey_q, ipsa_t *assoc, uint64_t bytes, } /* - * Push one or more DL_CO_DELETE messages queued up by - * sadb_torch_assoc down to the underlying driver now that it's a - * convenient time for it (i.e., ipsa bucket locks not held). - */ -static void -sadb_drain_torchq(queue_t *q, mblk_t *mp) -{ - while (mp != NULL) { - mblk_t *next = mp->b_next; - mp->b_next = NULL; - if (q != NULL) - putnext(q, mp); - else - freemsg(mp); - mp = next; - } -} - -/* * "Torch" an individual SA. Returns NULL, so it can be tail-called from * sadb_age_assoc(). 
- * - * If SA is hardware-accelerated, and we can't allocate the mblk - * containing the DL_CO_DELETE, just return; it will remain in the - * table and be swept up by sadb_ager() in a subsequent pass. */ static ipsa_t * -sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq) +sadb_torch_assoc(isaf_t *head, ipsa_t *sa) { - mblk_t *mp; - ASSERT(MUTEX_HELD(&head->isaf_lock)); ASSERT(MUTEX_HELD(&sa->ipsa_lock)); ASSERT(sa->ipsa_state == IPSA_STATE_DEAD); @@ -4355,15 +3958,6 @@ sadb_torch_assoc(isaf_t *head, ipsa_t *sa, boolean_t inbnd, mblk_t **mq) */ head->isaf_gen++; - if (sa->ipsa_flags & IPSA_F_HW) { - mp = sadb_fmt_sa_req(DL_CO_DELETE, sa->ipsa_type, sa, inbnd); - if (mp == NULL) { - mutex_exit(&sa->ipsa_lock); - return (NULL); - } - mp->b_next = *mq; - *mq = mp; - } mutex_exit(&sa->ipsa_lock); sadb_unlinkassoc(sa); @@ -4404,7 +3998,7 @@ sadb_idle_activities(ipsa_t *assoc, time_t delta, boolean_t inbound) */ static ipsa_t * sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, - time_t current, int reap_delay, boolean_t inbound, mblk_t **mq) + time_t current, int reap_delay, boolean_t inbound) { ipsa_t *retval = NULL; boolean_t dropped_mutex = B_FALSE; @@ -4419,7 +4013,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, (assoc->ipsa_hardexpiretime != 0))) && (assoc->ipsa_hardexpiretime <= current)) { assoc->ipsa_state = IPSA_STATE_DEAD; - return (sadb_torch_assoc(head, assoc, inbound, mq)); + return (sadb_torch_assoc(head, assoc)); } /* @@ -4433,7 +4027,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, if (assoc->ipsa_hardexpiretime != 0 && assoc->ipsa_hardexpiretime <= current) { if (assoc->ipsa_state == IPSA_STATE_DEAD) - return (sadb_torch_assoc(head, assoc, inbound, mq)); + return (sadb_torch_assoc(head, assoc)); if (inbound) { sadb_delete_cluster(assoc); @@ -4516,8 +4110,7 @@ sadb_age_assoc(isaf_t *head, queue_t *pfkey_q, ipsa_t *assoc, * the second time sadb_ager() runs. 
*/ void -sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, - netstack_t *ns) +sadb_ager(sadb_t *sp, queue_t *pfkey_q, int reap_delay, netstack_t *ns) { int i; isaf_t *bucket; @@ -4527,7 +4120,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, templist_t *haspeerlist, *newbie; /* Snapshot current time now. */ time_t current = gethrestime_sec(); - mblk_t *mq = NULL; haspeerlist = NULL; /* @@ -4559,7 +4151,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, assoc = spare) { spare = assoc->ipsa_next; if (sadb_age_assoc(bucket, pfkey_q, assoc, current, - reap_delay, B_TRUE, &mq) != NULL) { + reap_delay, B_TRUE) != NULL) { /* * Put SA's which have a peer or SA's which * are paired on a list for processing after @@ -4585,10 +4177,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, mutex_exit(&bucket->isaf_lock); } - if (mq != NULL) { - sadb_drain_torchq(ip_q, mq); - mq = NULL; - } age_pair_peer_list(haspeerlist, sp, B_FALSE); haspeerlist = NULL; @@ -4600,7 +4188,7 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, assoc = spare) { spare = assoc->ipsa_next; if (sadb_age_assoc(bucket, pfkey_q, assoc, current, - reap_delay, B_FALSE, &mq) != NULL) { + reap_delay, B_FALSE) != NULL) { /* * sadb_age_assoc() increments the refcnt, * effectively doing an IPSA_REFHOLD(). 
@@ -4621,10 +4209,6 @@ sadb_ager(sadb_t *sp, queue_t *pfkey_q, queue_t *ip_q, int reap_delay, } mutex_exit(&bucket->isaf_lock); } - if (mq != NULL) { - sadb_drain_torchq(ip_q, mq); - mq = NULL; - } age_pair_peer_list(haspeerlist, sp, B_TRUE); @@ -5227,7 +4811,7 @@ update_pairing(ipsap_t *ipsapp, ipsa_query_t *sq, keysock_in_t *ksi, static ipsacq_t * sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, uint32_t *src, uint32_t *dst, uint32_t *isrc, uint32_t *idst, - uint64_t unique_id, cred_t *cr) + uint64_t unique_id, ts_label_t *tsl) { ipsacq_t *walker; sa_family_t fam; @@ -5257,7 +4841,7 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, (pp == walker->ipsacq_policy) && /* XXX do deep compares of ap/pp? */ (unique_id == walker->ipsacq_unique_id) && - (ipsec_label_match(cr, walker->ipsacq_cred))) + (ipsec_label_match(tsl, walker->ipsacq_tsl))) break; /* everything matched */ mutex_exit(&walker->ipsacq_lock); } @@ -5272,31 +4856,32 @@ sadb_checkacquire(iacqf_t *bucket, ipsec_action_t *ap, ipsec_policy_t *pp, * send the acquire up.. * * In cases where we need both AH and ESP, add the SA to the ESP ACQUIRE - * list. The ah_add_sa_finish() routines can look at the packet's ipsec_out_t - * and handle this case specially. + * list. The ah_add_sa_finish() routines can look at the packet's attached + * attributes and handle this case specially. 
*/ void -sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) +sadb_acquire(mblk_t *datamp, ip_xmit_attr_t *ixa, boolean_t need_ah, + boolean_t need_esp) { + mblk_t *asyncmp; sadbp_t *spp; sadb_t *sp; ipsacq_t *newbie; iacqf_t *bucket; - mblk_t *datamp = mp->b_cont; mblk_t *extended; ipha_t *ipha = (ipha_t *)datamp->b_rptr; ip6_t *ip6h = (ip6_t *)datamp->b_rptr; uint32_t *src, *dst, *isrc, *idst; - ipsec_policy_t *pp = io->ipsec_out_policy; - ipsec_action_t *ap = io->ipsec_out_act; + ipsec_policy_t *pp = ixa->ixa_ipsec_policy; + ipsec_action_t *ap = ixa->ixa_ipsec_action; sa_family_t af; int hashoffset; uint32_t seq; uint64_t unique_id = 0; ipsec_selector_t sel; - boolean_t tunnel_mode = io->ipsec_out_tunnel; - cred_t *cr = NULL; - netstack_t *ns = io->ipsec_out_ns; + boolean_t tunnel_mode = (ixa->ixa_flags & IXAF_IPSEC_TUNNEL) != 0; + ts_label_t *tsl = NULL; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; sadb_sens_t *sens = NULL; int sens_len; @@ -5315,12 +4900,10 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) spp = &ahstack->ah_sadb; } - sp = io->ipsec_out_v4 ? &spp->s_v4 : &spp->s_v6; - - ASSERT(mp->b_cont != NULL); + sp = (ixa->ixa_flags & IXAF_IS_IPV4) ? &spp->s_v4 : &spp->s_v6; if (is_system_labeled()) - cr = msg_getcred(mp->b_cont, NULL); + tsl = ixa->ixa_tsl; if (ap == NULL) ap = pp->ipsp_act; @@ -5328,7 +4911,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) ASSERT(ap != NULL); if (ap->ipa_act.ipa_apply.ipp_use_unique || tunnel_mode) - unique_id = SA_FORM_UNIQUE_ID(io); + unique_id = SA_FORM_UNIQUE_ID(ixa); /* * Set up an ACQUIRE record. 
@@ -5345,14 +4928,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) dst = (uint32_t *)&ipha->ipha_dst; af = AF_INET; hashoffset = OUTBOUND_HASH_V4(sp, ipha->ipha_dst); - ASSERT(io->ipsec_out_v4 == B_TRUE); + ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); } else { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); src = (uint32_t *)&ip6h->ip6_src; dst = (uint32_t *)&ip6h->ip6_dst; af = AF_INET6; hashoffset = OUTBOUND_HASH_V6(sp, ip6h->ip6_dst); - ASSERT(io->ipsec_out_v4 == B_FALSE); + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); } if (tunnel_mode) { @@ -5363,14 +4946,14 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) * with self-encapsulated protection. Until we better * support this, drop the packet. */ - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(datamp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_got_selfencap), &ipss->ipsec_spd_dropper); return; } /* Snag inner addresses. */ - isrc = io->ipsec_out_insrc; - idst = io->ipsec_out_indst; + isrc = ixa->ixa_ipsec_insrc; + idst = ixa->ixa_ipsec_indst; } else { isrc = idst = NULL; } @@ -5382,7 +4965,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) bucket = &(sp->sdb_acq[hashoffset]); mutex_enter(&bucket->iacqf_lock); newbie = sadb_checkacquire(bucket, ap, pp, src, dst, isrc, idst, - unique_id, cr); + unique_id, tsl); if (newbie == NULL) { /* @@ -5391,7 +4974,7 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) newbie = kmem_zalloc(sizeof (*newbie), KM_NOSLEEP); if (newbie == NULL) { mutex_exit(&bucket->iacqf_lock); - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(datamp, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_nomem), &ipss->ipsec_sadb_dropper); return; @@ -5433,11 +5016,30 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) */ ASSERT(MUTEX_HELD(&newbie->ipsacq_lock)); - mp->b_next = NULL; + /* + * Make the ip_xmit_attr_t into something 
we can queue. + * If no memory it frees datamp. + */ + asyncmp = ip_xmit_attr_to_mblk(ixa); + if (asyncmp != NULL) + linkb(asyncmp, datamp); + /* Queue up packet. Use b_next. */ - if (newbie->ipsacq_numpackets == 0) { + + if (asyncmp == NULL) { + /* Statistics for allocation failure */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + BUMP_MIB(&ixa->ixa_ipst->ips_ip_mib, + ipIfStatsOutDiscards); + } else { + BUMP_MIB(&ixa->ixa_ipst->ips_ip6_mib, + ipIfStatsOutDiscards); + } + ip_drop_output("No memory for asyncmp", datamp, NULL); + freemsg(datamp); + } else if (newbie->ipsacq_numpackets == 0) { /* First one. */ - newbie->ipsacq_mp = mp; + newbie->ipsacq_mp = asyncmp; newbie->ipsacq_numpackets = 1; newbie->ipsacq_expire = gethrestime_sec(); /* @@ -5448,28 +5050,28 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) newbie->ipsacq_seq = seq; newbie->ipsacq_addrfam = af; - newbie->ipsacq_srcport = io->ipsec_out_src_port; - newbie->ipsacq_dstport = io->ipsec_out_dst_port; - newbie->ipsacq_icmp_type = io->ipsec_out_icmp_type; - newbie->ipsacq_icmp_code = io->ipsec_out_icmp_code; + newbie->ipsacq_srcport = ixa->ixa_ipsec_src_port; + newbie->ipsacq_dstport = ixa->ixa_ipsec_dst_port; + newbie->ipsacq_icmp_type = ixa->ixa_ipsec_icmp_type; + newbie->ipsacq_icmp_code = ixa->ixa_ipsec_icmp_code; if (tunnel_mode) { - newbie->ipsacq_inneraddrfam = io->ipsec_out_inaf; - newbie->ipsacq_proto = io->ipsec_out_inaf == AF_INET6 ? + newbie->ipsacq_inneraddrfam = ixa->ixa_ipsec_inaf; + newbie->ipsacq_proto = ixa->ixa_ipsec_inaf == AF_INET6 ? 
IPPROTO_IPV6 : IPPROTO_ENCAP; - newbie->ipsacq_innersrcpfx = io->ipsec_out_insrcpfx; - newbie->ipsacq_innerdstpfx = io->ipsec_out_indstpfx; + newbie->ipsacq_innersrcpfx = ixa->ixa_ipsec_insrcpfx; + newbie->ipsacq_innerdstpfx = ixa->ixa_ipsec_indstpfx; IPSA_COPY_ADDR(newbie->ipsacq_innersrc, - io->ipsec_out_insrc, io->ipsec_out_inaf); + ixa->ixa_ipsec_insrc, ixa->ixa_ipsec_inaf); IPSA_COPY_ADDR(newbie->ipsacq_innerdst, - io->ipsec_out_indst, io->ipsec_out_inaf); + ixa->ixa_ipsec_indst, ixa->ixa_ipsec_inaf); } else { - newbie->ipsacq_proto = io->ipsec_out_proto; + newbie->ipsacq_proto = ixa->ixa_ipsec_proto; } newbie->ipsacq_unique_id = unique_id; - if (cr != NULL) { - crhold(cr); - newbie->ipsacq_cred = cr; + if (ixa->ixa_tsl != NULL) { + label_hold(ixa->ixa_tsl); + newbie->ipsacq_tsl = ixa->ixa_tsl; } } else { /* Scan to the end of the list & insert. */ @@ -5477,13 +5079,16 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) while (lastone->b_next != NULL) lastone = lastone->b_next; - lastone->b_next = mp; + lastone->b_next = asyncmp; if (newbie->ipsacq_numpackets++ == ipsacq_maxpackets) { newbie->ipsacq_numpackets = ipsacq_maxpackets; lastone = newbie->ipsacq_mp; newbie->ipsacq_mp = lastone->b_next; lastone->b_next = NULL; - ip_drop_packet(lastone, B_FALSE, NULL, NULL, + + /* Freeing the async message */ + lastone = ip_xmit_attr_free_mblk(lastone); + ip_drop_packet(lastone, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_toofull), &ipss->ipsec_sadb_dropper); } else { @@ -5518,17 +5123,17 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) * opportunities here in failure cases. */ (void) memset(&sel, 0, sizeof (sel)); - sel.ips_isv4 = io->ipsec_out_v4; + sel.ips_isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0; if (tunnel_mode) { - sel.ips_protocol = (io->ipsec_out_inaf == AF_INET) ? + sel.ips_protocol = (ixa->ixa_ipsec_inaf == AF_INET) ? 
IPPROTO_ENCAP : IPPROTO_IPV6; } else { - sel.ips_protocol = io->ipsec_out_proto; - sel.ips_local_port = io->ipsec_out_src_port; - sel.ips_remote_port = io->ipsec_out_dst_port; + sel.ips_protocol = ixa->ixa_ipsec_proto; + sel.ips_local_port = ixa->ixa_ipsec_src_port; + sel.ips_remote_port = ixa->ixa_ipsec_dst_port; } - sel.ips_icmp_type = io->ipsec_out_icmp_type; - sel.ips_icmp_code = io->ipsec_out_icmp_code; + sel.ips_icmp_type = ixa->ixa_ipsec_icmp_type; + sel.ips_icmp_code = ixa->ixa_ipsec_icmp_code; sel.ips_is_icmp_inv_acq = 0; if (af == AF_INET) { sel.ips_local_addr_v4 = ipha->ipha_src; @@ -5542,13 +5147,13 @@ sadb_acquire(mblk_t *mp, ipsec_out_t *io, boolean_t need_ah, boolean_t need_esp) if (extended == NULL) goto punt_extended; - if (cr != NULL) { + if (ixa->ixa_tsl != NULL) { /* * XXX MLS correct condition here? * XXX MLS other credential attributes in acquire? * XXX malloc failure? don't fall back to original? */ - sens = sadb_make_sens_ext(cr, &sens_len); + sens = sadb_make_sens_ext(ixa->ixa_tsl, &sens_len); if (sens == NULL) { freeb(extended); @@ -5585,13 +5190,13 @@ punt_extended: void sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) { - mblk_t *mp; + mblk_t *mp; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT(MUTEX_HELD(acqrec->ipsacq_linklock)); if (acqrec->ipsacq_policy != NULL) { - IPPOL_REFRELE(acqrec->ipsacq_policy, ns); + IPPOL_REFRELE(acqrec->ipsacq_policy); } if (acqrec->ipsacq_act != NULL) { IPACT_REFRELE(acqrec->ipsacq_act); @@ -5602,9 +5207,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) if (acqrec->ipsacq_next != NULL) acqrec->ipsacq_next->ipsacq_ptpn = acqrec->ipsacq_ptpn; - if (acqrec->ipsacq_cred) { - crfree(acqrec->ipsacq_cred); - acqrec->ipsacq_cred = NULL; + if (acqrec->ipsacq_tsl != NULL) { + label_rele(acqrec->ipsacq_tsl); + acqrec->ipsacq_tsl = NULL; } /* @@ -5618,7 +5223,9 @@ sadb_destroy_acquire(ipsacq_t *acqrec, netstack_t *ns) mp = acqrec->ipsacq_mp; acqrec->ipsacq_mp = mp->b_next; mp->b_next = NULL; - 
ip_drop_packet(mp, B_FALSE, NULL, NULL, + /* Freeing the async message */ + mp = ip_xmit_attr_free_mblk(mp); + ip_drop_packet(mp, B_FALSE, NULL, DROPPER(ipss, ipds_sadb_acquire_timeout), &ipss->ipsec_sadb_dropper); } @@ -5795,24 +5402,23 @@ sadb_action_to_ecomb(uint8_t *start, uint8_t *limit, ipsec_action_t *act, /* ARGSUSED */ int -sadb_sens_len_from_cred(cred_t *cr) +sadb_sens_len_from_label(ts_label_t *tsl) { int baselen = sizeof (sadb_sens_t) + _C_LEN * 4; return (roundup(baselen, sizeof (uint64_t))); } void -sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen) +sadb_sens_from_label(sadb_sens_t *sens, int exttype, ts_label_t *tsl, + int senslen) { uint8_t *bitmap; bslabel_t *sl; - ts_label_t *tsl; /* LINTED */ ASSERT((_C_LEN & 1) == 0); ASSERT((senslen & 7) == 0); - tsl = crgetlabel(cr); sl = label2bslabel(tsl); sens->sadb_sens_exttype = exttype; @@ -5830,14 +5436,14 @@ sadb_sens_from_cred(sadb_sens_t *sens, int exttype, cred_t *cr, int senslen) } static sadb_sens_t * -sadb_make_sens_ext(cred_t *cr, int *len) +sadb_make_sens_ext(ts_label_t *tsl, int *len) { /* XXX allocation failure? */ - int sens_len = sadb_sens_len_from_cred(cr); + int sens_len = sadb_sens_len_from_label(tsl); sadb_sens_t *sens = kmem_alloc(sens_len, KM_SLEEP); - sadb_sens_from_cred(sens, SADB_EXT_SENSITIVITY, cr, sens_len); + sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY, tsl, sens_len); *len = sens_len; @@ -5849,12 +5455,12 @@ sadb_make_sens_ext(cred_t *cr, int *len) * With a special designated "not a label" cred_t ? 
*/ /* ARGSUSED */ -cred_t * -sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap) +ts_label_t * +sadb_label_from_sens(sadb_sens_t *sens, uint64_t *bitmap) { int bitmap_len = SADB_64TO8(sens->sadb_sens_sens_len); bslabel_t sl; - cred_t *cr; + ts_label_t *tsl; if (sens->sadb_sens_integ_level != 0) return (NULL); @@ -5868,13 +5474,13 @@ sadb_cred_from_sens(sadb_sens_t *sens, uint64_t *bitmap) bcopy(bitmap, &((_bslabel_impl_t *)&sl)->compartments, bitmap_len); - cr = newcred_from_bslabel(&sl, sens->sadb_sens_dpd, KM_NOSLEEP); - if (cr == NULL) - return (cr); + tsl = labelalloc(&sl, sens->sadb_sens_dpd, KM_NOSLEEP); + if (tsl == NULL) + return (NULL); if (sens->sadb_x_sens_flags & SADB_X_SENS_UNLABELED) - crgetlabel(cr)->tsl_flags |= TSLF_UNLABELED; - return (cr); + tsl->tsl_flags |= TSLF_UNLABELED; + return (tsl); } /* End XXX label-library-leakage */ @@ -6359,12 +5965,13 @@ sadb_getspi(keysock_in_t *ksi, uint32_t master_spi, int *diagnostic, * * Caller frees the message, so we don't have to here. * - * NOTE: The ip_q parameter may be used in the future for ACQUIRE + * NOTE: The pfkey_q parameter may be used in the future for ACQUIRE * failures. */ /* ARGSUSED */ void -sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *ip_q, netstack_t *ns) +sadb_in_acquire(sadb_msg_t *samsg, sadbp_t *sp, queue_t *pfkey_q, + netstack_t *ns) { int i; ipsacq_t *acqrec; @@ -6624,36 +6231,6 @@ sadb_replay_delete(ipsa_t *assoc) } /* - * Given a queue that presumably points to IP, send a T_BIND_REQ for _proto_ - * down. The caller will handle the T_BIND_ACK locally. 
- */ -boolean_t -sadb_t_bind_req(queue_t *q, int proto) -{ - struct T_bind_req *tbr; - mblk_t *mp; - - mp = allocb_cred(sizeof (struct T_bind_req) + 1, kcred, NOPID); - if (mp == NULL) { - /* cmn_err(CE_WARN, */ - /* "sadb_t_bind_req(%d): couldn't allocate mblk\n", proto); */ - return (B_FALSE); - } - mp->b_datap->db_type = M_PCPROTO; - tbr = (struct T_bind_req *)mp->b_rptr; - mp->b_wptr += sizeof (struct T_bind_req); - tbr->PRIM_type = T_BIND_REQ; - tbr->ADDR_length = 0; - tbr->ADDR_offset = 0; - tbr->CONIND_number = 0; - *mp->b_wptr = (uint8_t)proto; - mp->b_wptr++; - - putnext(q, mp); - return (B_TRUE); -} - -/* * Special front-end to ipsec_rl_strlog() dealing with SA failure. * this is designed to take only a format string with "* %x * %s *", so * that "spi" is printed first, then "addr" is converted using inet_pton(). @@ -6676,7 +6253,6 @@ ipsec_assocfailure(short mid, short sid, char level, ushort_t sl, char *fmt, /* * Fills in a reference to the policy, if any, from the conn, in *ppp - * Releases a reference to the passed conn_t. 
*/ static void ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp) @@ -6684,15 +6260,14 @@ ipsec_conn_pol(ipsec_selector_t *sel, conn_t *connp, ipsec_policy_t **ppp) ipsec_policy_t *pp; ipsec_latch_t *ipl = connp->conn_latch; - if ((ipl != NULL) && (ipl->ipl_out_policy != NULL)) { - pp = ipl->ipl_out_policy; + if ((ipl != NULL) && (connp->conn_ixa->ixa_ipsec_policy != NULL)) { + pp = connp->conn_ixa->ixa_ipsec_policy; IPPOL_REFHOLD(pp); } else { - pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel, + pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, sel, connp->conn_netstack); } *ppp = pp; - CONN_DEC_REF(connp); } /* @@ -6753,6 +6328,7 @@ ipsec_udp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) mutex_exit(&connfp->connf_lock); ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } static conn_t * @@ -6866,6 +6442,7 @@ ipsec_tcp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) } ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } static void @@ -6895,21 +6472,27 @@ ipsec_sctp_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, pptr[0] = sel->ips_remote_port; pptr[1] = sel->ips_local_port; + /* + * For labeled systems, there's no need to check the + * label here. It's known to be good as we checked + * before allowing the connection to become bound. 
+ */ if (sel->ips_isv4) { in6_addr_t src, dst; IN6_IPADDR_TO_V4MAPPED(sel->ips_remote_addr_v4, &dst); IN6_IPADDR_TO_V4MAPPED(sel->ips_local_addr_v4, &src); connp = sctp_find_conn(&dst, &src, ports, ALL_ZONES, - ipst->ips_netstack->netstack_sctp); + 0, ipst->ips_netstack->netstack_sctp); } else { connp = sctp_find_conn(&sel->ips_remote_addr_v6, &sel->ips_local_addr_v6, ports, ALL_ZONES, - ipst->ips_netstack->netstack_sctp); + 0, ipst->ips_netstack->netstack_sctp); } if (connp == NULL) return; ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } /* @@ -6985,7 +6568,7 @@ ipsec_get_inverse_acquire_sel(ipsec_selector_t *sel, sadb_address_t *srcext, static int ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, sadb_address_t *innsrcext, sadb_address_t *inndstext, ipsec_tun_pol_t *itp, - int *diagnostic, netstack_t *ns) + int *diagnostic) { int err; ipsec_policy_head_t *polhead; @@ -7045,8 +6628,7 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, polhead = itp->itp_policy; ASSERT(polhead != NULL); rw_enter(&polhead->iph_lock, RW_READER); - *ppp = ipsec_find_policy_head(NULL, polhead, - IPSEC_TYPE_INBOUND, sel, ns); + *ppp = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND, sel); rw_exit(&polhead->iph_lock); /* @@ -7059,6 +6641,10 @@ ipsec_tun_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, return (0); } +/* + * For sctp conn_faddr is the primary address, hence this is of limited + * use for sctp. 
+ */ static void ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, ip_stack_t *ipst) @@ -7068,7 +6654,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, conn_t *connp; if (isv4) { - connfp = &ipst->ips_ipcl_proto_fanout[sel->ips_protocol]; + connfp = &ipst->ips_ipcl_proto_fanout_v4[sel->ips_protocol]; } else { connfp = &ipst->ips_ipcl_proto_fanout_v6[sel->ips_protocol]; } @@ -7076,17 +6662,20 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, mutex_enter(&connfp->connf_lock); for (connp = connfp->connf_head; connp != NULL; connp = connp->conn_next) { - if (!((isv4 && !((connp->conn_src == 0 || - connp->conn_src == sel->ips_local_addr_v4) && - (connp->conn_rem == 0 || - connp->conn_rem == sel->ips_remote_addr_v4))) || - (!isv4 && !((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_srcv6) || - IN6_ARE_ADDR_EQUAL(&connp->conn_srcv6, - &sel->ips_local_addr_v6)) && - (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_remv6) || - IN6_ARE_ADDR_EQUAL(&connp->conn_remv6, - &sel->ips_remote_addr_v6)))))) { - break; + if (isv4) { + if ((connp->conn_laddr_v4 == INADDR_ANY || + connp->conn_laddr_v4 == sel->ips_local_addr_v4) && + (connp->conn_faddr_v4 == INADDR_ANY || + connp->conn_faddr_v4 == sel->ips_remote_addr_v4)) + break; + } else { + if ((IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || + IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &sel->ips_local_addr_v6)) && + (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) || + IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &sel->ips_remote_addr_v6))) + break; } } if (connp == NULL) { @@ -7098,6 +6687,7 @@ ipsec_oth_pol(ipsec_selector_t *sel, ipsec_policy_t **ppp, mutex_exit(&connfp->connf_lock); ipsec_conn_pol(sel, connp, ppp); + CONN_DEC_REF(connp); } /* @@ -7245,7 +6835,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], isel.ips_isv4 = (sel.ips_protocol == IPPROTO_ENCAP); } /* Else isel is initialized by ipsec_tun_pol(). 
*/ err = ipsec_tun_pol(&isel, &pp, innsrcext, inndstext, itp, - &diagnostic, ns); + &diagnostic); /* * NOTE: isel isn't used for now, but in RFC 430x IPsec, it * may be. @@ -7263,8 +6853,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], * look in the global policy. */ if (pp == NULL) { - pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, NULL, &sel, - ns); + pp = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, NULL, &sel, ns); if (pp == NULL) { /* There's no global policy. */ err = ENOENT; @@ -7282,7 +6871,7 @@ ipsec_construct_inverse_acquire(sadb_msg_t *samsg, sadb_ext_t *extv[], (itp != NULL && (itp->itp_flags & ITPF_P_TUNNEL)), samsg->sadb_msg_seq, samsg->sadb_msg_pid, sens, ns); if (pp != NULL) { - IPPOL_REFRELE(pp, ns); + IPPOL_REFRELE(pp); } ASSERT(err == 0 && diagnostic == 0); if (retmp == NULL) @@ -7306,37 +6895,49 @@ bail: /* * sadb_set_lpkt: Return TRUE if we can swap in a value to ipsa->ipsa_lpkt and * freemsg the previous value. Return FALSE if we lost the race and the SA is - * in a non-LARVAL state. free clue: ip_drop_packet(NULL) is safe. + * in a non-LARVAL state. We also return FALSE if we can't allocate the attrmp. */ boolean_t -sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, netstack_t *ns) +sadb_set_lpkt(ipsa_t *ipsa, mblk_t *npkt, ip_recv_attr_t *ira) { - mblk_t *opkt; + mblk_t *opkt; + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; boolean_t is_larval; - /* - * Check the packet's netstack id in case we go asynch with a - * taskq_dispatch. 
- */ - ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_type == IPSEC_IN); - ASSERT(((ipsec_in_t *)npkt->b_rptr)->ipsec_in_stackid == - ns->netstack_stackid); - mutex_enter(&ipsa->ipsa_lock); is_larval = (ipsa->ipsa_state == IPSA_STATE_LARVAL); if (is_larval) { - opkt = ipsa->ipsa_lpkt; - ipsa->ipsa_lpkt = npkt; + mblk_t *attrmp; + + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + ill_t *ill = ira->ira_ill; + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", npkt, ill); + freemsg(npkt); + opkt = NULL; + is_larval = B_FALSE; + } else { + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = npkt; + npkt = attrmp; + opkt = ipsa->ipsa_lpkt; + ipsa->ipsa_lpkt = npkt; + } } else { /* We lost the race. */ opkt = NULL; } mutex_exit(&ipsa->ipsa_lock); - ip_drop_packet(opkt, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_sadb_inlarval_replace), - &ipss->ipsec_sadb_dropper); + if (opkt != NULL) { + opkt = ip_recv_attr_free_mblk(opkt); + ip_drop_packet(opkt, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_sadb_inlarval_replace), + &ipss->ipsec_sadb_dropper); + } return (is_larval); } @@ -7353,7 +6954,6 @@ sadb_clear_lpkt(ipsa_t *ipsa) opkt = ipsa->ipsa_lpkt; ipsa->ipsa_lpkt = NULL; mutex_exit(&ipsa->ipsa_lock); - return (opkt); } @@ -7361,18 +6961,18 @@ sadb_clear_lpkt(ipsa_t *ipsa) * Buffer a packet that's in IDLE state as set by Solaris Clustering. 
*/ void -sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) +sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, ip_recv_attr_t *ira) { + netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; - extern void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, - sa_family_t, in6_addr_t, in6_addr_t, void *); in6_addr_t *srcaddr = (in6_addr_t *)(&ipsa->ipsa_srcaddr); in6_addr_t *dstaddr = (in6_addr_t *)(&ipsa->ipsa_dstaddr); + mblk_t *mp; ASSERT(ipsa->ipsa_state == IPSA_STATE_IDLE); if (cl_inet_idlesa == NULL) { - ip_drop_packet(bpkt, B_TRUE, NULL, NULL, + ip_drop_packet(bpkt, B_TRUE, ira->ira_ill, DROPPER(ipss, ipds_sadb_inidle_overflow), &ipss->ipsec_sadb_dropper); return; @@ -7382,13 +6982,14 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) (ipsa->ipsa_type == SADB_SATYPE_AH) ? IPPROTO_AH : IPPROTO_ESP, ipsa->ipsa_spi, ipsa->ipsa_addrfam, *srcaddr, *dstaddr, NULL); - /* - * Check the packet's netstack id in case we go asynch with a - * taskq_dispatch. 
- */ - ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_type == IPSEC_IN); - ASSERT(((ipsec_in_t *)bpkt->b_rptr)->ipsec_in_stackid == - ns->netstack_stackid); + mp = ip_recv_attr_to_mblk(ira); + if (mp == NULL) { + ip_drop_packet(bpkt, B_TRUE, ira->ira_ill, + DROPPER(ipss, ipds_sadb_inidle_overflow), + &ipss->ipsec_sadb_dropper); + return; + } + linkb(mp, bpkt); mutex_enter(&ipsa->ipsa_lock); ipsa->ipsa_mblkcnt++; @@ -7399,16 +7000,17 @@ sadb_buf_pkt(ipsa_t *ipsa, mblk_t *bpkt, netstack_t *ns) ipsa->ipsa_bpkt_tail = bpkt; if (ipsa->ipsa_mblkcnt > SADB_MAX_IDLEPKTS) { mblk_t *tmp; + tmp = ipsa->ipsa_bpkt_head; ipsa->ipsa_bpkt_head = ipsa->ipsa_bpkt_head->b_next; - ip_drop_packet(tmp, B_TRUE, NULL, NULL, + tmp = ip_recv_attr_free_mblk(tmp); + ip_drop_packet(tmp, B_TRUE, NULL, DROPPER(ipss, ipds_sadb_inidle_overflow), &ipss->ipsec_sadb_dropper); ipsa->ipsa_mblkcnt --; } } mutex_exit(&ipsa->ipsa_lock); - } /* @@ -7419,30 +7021,28 @@ void sadb_clear_buf_pkt(void *ipkt) { mblk_t *tmp, *buf_pkt; - netstack_t *ns; - ipsec_in_t *ii; + ip_recv_attr_t iras; buf_pkt = (mblk_t *)ipkt; - ii = (ipsec_in_t *)buf_pkt->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ns = netstack_find_by_stackid(ii->ipsec_in_stackid); - if (ns != NULL && ns != ii->ipsec_in_ns) { - netstack_rele(ns); - ns = NULL; /* For while-loop below. */ - } - while (buf_pkt != NULL) { + mblk_t *data_mp; + tmp = buf_pkt->b_next; buf_pkt->b_next = NULL; - if (ns != NULL) - ip_fanout_proto_again(buf_pkt, NULL, NULL, NULL); - else - freemsg(buf_pkt); + + data_mp = buf_pkt->b_cont; + buf_pkt->b_cont = NULL; + if (!ip_recv_attr_from_mblk(buf_pkt, &iras)) { + /* The ill or ip_stack_t disappeared on us. 
*/ + ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL); + freemsg(data_mp); + } else { + ip_input_post_ipsec(data_mp, &iras); + } + ira_cleanup(&iras, B_TRUE); buf_pkt = tmp; } - if (ns != NULL) - netstack_rele(ns); } /* * Walker callback used by sadb_alg_update() to free/create crypto @@ -7454,6 +7054,8 @@ struct sadb_update_alg_state { ipsec_algtype_t alg_type; uint8_t alg_id; boolean_t is_added; + boolean_t async_auth; + boolean_t async_encr; }; static void @@ -7470,6 +7072,15 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie) mutex_enter(&entry->ipsa_lock); + if ((entry->ipsa_encr_alg != SADB_EALG_NONE && entry->ipsa_encr_alg != + SADB_EALG_NULL && update_state->async_encr) || + (entry->ipsa_auth_alg != SADB_AALG_NONE && + update_state->async_auth)) { + entry->ipsa_flags |= IPSA_F_ASYNC; + } else { + entry->ipsa_flags &= ~IPSA_F_ASYNC; + } + switch (update_state->alg_type) { case IPSEC_ALG_AUTH: if (entry->ipsa_auth_alg == update_state->alg_id) @@ -7511,8 +7122,11 @@ sadb_alg_update_cb(isaf_t *head, ipsa_t *entry, void *cookie) } /* - * Invoked by IP when an software crypto provider has been updated. - * The type and id of the corresponding algorithm is passed as argument. + * Invoked by IP when an software crypto provider has been updated, or if + * the crypto synchrony changes. The type and id of the corresponding + * algorithm is passed as argument. The type is set to ALL in the case of + * a synchrony change. + * * is_added is B_TRUE if the provider was added, B_FALSE if it was * removed. The function updates the SADB and free/creates the * context templates associated with SAs if needed. 
@@ -7529,12 +7143,17 @@ sadb_alg_update(ipsec_algtype_t alg_type, uint8_t alg_id, boolean_t is_added, struct sadb_update_alg_state update_state; ipsecah_stack_t *ahstack = ns->netstack_ipsecah; ipsecesp_stack_t *espstack = ns->netstack_ipsecesp; + ipsec_stack_t *ipss = ns->netstack_ipsec; update_state.alg_type = alg_type; update_state.alg_id = alg_id; update_state.is_added = is_added; + update_state.async_auth = ipss->ipsec_algs_exec_mode[IPSEC_ALG_AUTH] == + IPSEC_ALGS_EXEC_ASYNC; + update_state.async_encr = ipss->ipsec_algs_exec_mode[IPSEC_ALG_ENCR] == + IPSEC_ALGS_EXEC_ASYNC; - if (alg_type == IPSEC_ALG_AUTH) { + if (alg_type == IPSEC_ALG_AUTH || alg_type == IPSEC_ALG_ALL) { /* walk the AH tables only for auth. algorithm changes */ SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_of); SADB_ALG_UPDATE_WALK(ahstack->ah_sadb.s_v4, sdb_if); @@ -7693,15 +7312,15 @@ ipsec_check_key(crypto_mech_type_t mech_type, sadb_key_t *sadb_key, * * This is inelegant and really could use refactoring. */ -int -sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) +mblk_t * +sadb_whack_label_v4(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter, + ipdropper_t *dropper) { int delta; int plen; dblk_t *db; int hlen; uint8_t *opt_storage = assoc->ipsa_opt_storage; - mblk_t *mp = *mpp; ipha_t *ipha = (ipha_t *)mp->b_rptr; plen = ntohs(ipha->ipha_length); @@ -7731,8 +7350,10 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) - return (ENOMEM); + if (new_mp == NULL) { + ip_drop_packet(mp, B_FALSE, NULL, counter, dropper); + return (NULL); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -7743,7 +7364,7 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) new_mp->b_cont = mp->b_cont; freeb(mp); } - *mpp = mp = new_mp; + mp = new_mp; ipha = (ipha_t *)mp->b_rptr; } @@ -7768,11 +7389,12 @@ sadb_whack_label(mblk_t **mpp, ipsa_t *assoc) ipha->ipha_length = htons(plen); - return 
(0); + return (mp); } -int -sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) +mblk_t * +sadb_whack_label_v6(mblk_t *mp, ipsa_t *assoc, kstat_named_t *counter, + ipdropper_t *dropper) { int delta; int plen; @@ -7780,7 +7402,6 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) int hlen; uint8_t *opt_storage = assoc->ipsa_opt_storage; uint_t sec_opt_len; /* label option length not including type, len */ - mblk_t *mp = *mpp; ip6_t *ip6h = (ip6_t *)mp->b_rptr; plen = ntohs(ip6h->ip6_plen); @@ -7818,8 +7439,10 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) copylen = hdr_len; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) - return (ENOMEM); + if (new_mp == NULL) { + ip_drop_packet(mp, B_FALSE, NULL, counter, dropper); + return (NULL); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -7830,7 +7453,7 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) new_mp->b_cont = mp->b_cont; freeb(mp); } - *mpp = mp = new_mp; + mp = new_mp; ip6h = (ip6_t *)mp->b_rptr; } @@ -7856,10 +7479,46 @@ sadb_whack_label_v6(mblk_t **mpp, ipsa_t *assoc) ip6h->ip6_plen = htons(plen); - return (0); + return (mp); } +/* Whack the labels and update ip_xmit_attr_t as needed */ +mblk_t * +sadb_whack_label(mblk_t *mp, ipsa_t *assoc, ip_xmit_attr_t *ixa, + kstat_named_t *counter, ipdropper_t *dropper) +{ + int adjust; + int iplen; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + iplen = ntohs(ipha->ipha_length); + mp = sadb_whack_label_v4(mp, assoc, counter, dropper); + if (mp == NULL) + return (NULL); + + ipha = (ipha_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); + adjust = (int)ntohs(ipha->ipha_length) - iplen; + } else { + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + iplen = ntohs(ip6h->ip6_plen); + mp = sadb_whack_label_v6(mp, assoc, counter, dropper); + if (mp == NULL) 
+ return (NULL); + + ip6h = (ip6_t *)mp->b_rptr; + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); + adjust = (int)ntohs(ip6h->ip6_plen) - iplen; + } + ixa->ixa_pktlen += adjust; + ixa->ixa_ip_hdr_length += adjust; + return (mp); +} /* * If this is an outgoing SA then add some fuzz to the @@ -7969,7 +7628,7 @@ age_pair_peer_list(templist_t *haspeerlist, sadb_t *sp, boolean_t outbound) *((ipaddr_t *)&dying-> ipsa_srcaddr)); } - bucket = &(sp->sdb_of[outhash]); + bucket = &(sp->sdb_of[outhash]); } mutex_enter(&bucket->isaf_lock); diff --git a/usr/src/uts/common/inet/ip/spd.c b/usr/src/uts/common/inet/ip/spd.c index 37a9f47432..e6903cefc2 100644 --- a/usr/src/uts/common/inet/ip/spd.c +++ b/usr/src/uts/common/inet/ip/spd.c @@ -37,6 +37,7 @@ #include <sys/strsubr.h> #include <sys/strsun.h> #include <sys/strlog.h> +#include <sys/strsun.h> #include <sys/cmn_err.h> #include <sys/zone.h> @@ -59,7 +60,6 @@ #include <net/pfkeyv2.h> #include <net/pfpolicy.h> -#include <inet/ipsec_info.h> #include <inet/sadb.h> #include <inet/ipsec_impl.h> @@ -75,16 +75,8 @@ static void ipsec_update_present_flags(ipsec_stack_t *); static ipsec_act_t *ipsec_act_wildcard_expand(ipsec_act_t *, uint_t *, netstack_t *); -static void ipsec_out_free(void *); -static void ipsec_in_free(void *); -static mblk_t *ipsec_attach_global_policy(mblk_t **, conn_t *, - ipsec_selector_t *, netstack_t *); -static mblk_t *ipsec_apply_global_policy(mblk_t *, conn_t *, - ipsec_selector_t *, netstack_t *); static mblk_t *ipsec_check_ipsecin_policy(mblk_t *, ipsec_policy_t *, - ipha_t *, ip6_t *, uint64_t, netstack_t *); -static void ipsec_in_release_refs(ipsec_in_t *); -static void ipsec_out_release_refs(ipsec_out_t *); + ipha_t *, ip6_t *, uint64_t, ip_recv_attr_t *, netstack_t *); static void ipsec_action_free_table(ipsec_action_t *); static void ipsec_action_reclaim(void *); static void ipsec_action_reclaim_stack(netstack_t *); @@ -105,9 +97,9 @@ typedef enum { SELRET_NOMEM, SELRET_BADPKT, SELRET_SUCCESS, 
SELRET_TUNFRAG} static selret_t ipsec_init_inbound_sel(ipsec_selector_t *, mblk_t *, ipha_t *, ip6_t *, uint8_t); -static boolean_t ipsec_check_ipsecin_action(struct ipsec_in_s *, mblk_t *, +static boolean_t ipsec_check_ipsecin_action(ip_recv_attr_t *, mblk_t *, struct ipsec_action_s *, ipha_t *ipha, ip6_t *ip6h, const char **, - kstat_named_t **); + kstat_named_t **, netstack_t *); static void ipsec_unregister_prov_update(void); static void ipsec_prov_update_callback_stack(uint32_t, void *, netstack_t *); static boolean_t ipsec_compare_action(ipsec_policy_t *, ipsec_policy_t *); @@ -117,15 +109,13 @@ static void ipsec_kstat_destroy(ipsec_stack_t *); static int ipsec_free_tables(ipsec_stack_t *); static int tunnel_compare(const void *, const void *); static void ipsec_freemsg_chain(mblk_t *); -static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *, +static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, struct kstat_named *, ipdropper_t *); static boolean_t ipsec_kstat_init(ipsec_stack_t *); static void ipsec_kstat_destroy(ipsec_stack_t *); static int ipsec_free_tables(ipsec_stack_t *); static int tunnel_compare(const void *, const void *); static void ipsec_freemsg_chain(mblk_t *); -static void ip_drop_packet_chain(mblk_t *, boolean_t, ill_t *, ire_t *, - struct kstat_named *, ipdropper_t *); /* * Selector hash table is statically sized at module load time. 
@@ -150,16 +140,15 @@ static crypto_notify_handle_t prov_update_handle = NULL; static kmem_cache_t *ipsec_action_cache; static kmem_cache_t *ipsec_sel_cache; static kmem_cache_t *ipsec_pol_cache; -static kmem_cache_t *ipsec_info_cache; /* Frag cache prototypes */ -static void ipsec_fragcache_clean(ipsec_fragcache_t *); +static void ipsec_fragcache_clean(ipsec_fragcache_t *, ipsec_stack_t *); static ipsec_fragcache_entry_t *fragcache_delentry(int, - ipsec_fragcache_entry_t *, ipsec_fragcache_t *); + ipsec_fragcache_entry_t *, ipsec_fragcache_t *, ipsec_stack_t *); boolean_t ipsec_fragcache_init(ipsec_fragcache_t *); -void ipsec_fragcache_uninit(ipsec_fragcache_t *); -mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *, int, - ipsec_stack_t *); +void ipsec_fragcache_uninit(ipsec_fragcache_t *, ipsec_stack_t *ipss); +mblk_t *ipsec_fragcache_add(ipsec_fragcache_t *, mblk_t *, mblk_t *, + int, ipsec_stack_t *); int ipsec_hdr_pullup_needed = 0; int ipsec_weird_null_inbound_policy = 0; @@ -240,23 +229,28 @@ ipsec_freemsg_chain(mblk_t *mp) ASSERT(mp->b_prev == NULL); mpnext = mp->b_next; mp->b_next = NULL; - freemsg(mp); /* Always works, even if NULL */ + freemsg(mp); mp = mpnext; } } -/* ip_drop all messages in an mblk chain */ +/* + * ip_drop all messages in an mblk chain + * Can handle a b_next chain of ip_recv_attr_t mblks, or just a b_next chain + * of data. 
+ */ static void -ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *arriving, - ire_t *outbound_ire, struct kstat_named *counter, ipdropper_t *who_called) +ip_drop_packet_chain(mblk_t *mp, boolean_t inbound, ill_t *ill, + struct kstat_named *counter, ipdropper_t *who_called) { mblk_t *mpnext; while (mp != NULL) { ASSERT(mp->b_prev == NULL); mpnext = mp->b_next; mp->b_next = NULL; - ip_drop_packet(mp, inbound, arriving, outbound_ire, counter, - who_called); + if (ip_recv_attr_is_mblk(mp)) + mp = ip_recv_attr_free_mblk(mp); + ip_drop_packet(mp, inbound, ill, counter, who_called); mp = mpnext; } } @@ -287,7 +281,7 @@ ipsec_policy_cmpbyid(const void *a, const void *b) * ipsl_sel (selector set), so an entry with a NULL ipsp_sel is not * actually in-tree but rather a template node being used in * an avl_find query; see ipsec_policy_delete(). This gives us - * a placeholder in the ordering just before the the first entry with + * a placeholder in the ordering just before the first entry with * a key >= the one we're looking for, so we can walk forward from * that point to get the remaining entries with the same id. 
*/ @@ -443,7 +437,6 @@ ipsec_policy_g_destroy(void) kmem_cache_destroy(ipsec_action_cache); kmem_cache_destroy(ipsec_sel_cache); kmem_cache_destroy(ipsec_pol_cache); - kmem_cache_destroy(ipsec_info_cache); ipsec_unregister_prov_update(); @@ -693,9 +686,6 @@ ipsec_policy_g_init(void) ipsec_pol_cache = kmem_cache_create("ipsec_policy", sizeof (ipsec_policy_t), _POINTER_ALIGNMENT, NULL, NULL, NULL, NULL, NULL, 0); - ipsec_info_cache = kmem_cache_create("ipsec_info", - sizeof (ipsec_info_t), _POINTER_ALIGNMENT, NULL, NULL, - NULL, NULL, NULL, 0); /* * We want to be informed each time a stack is created or @@ -920,6 +910,7 @@ ipsec_copy_policy(const ipsec_policy_t *src) src->ipsp_sel->ipsl_refs++; HASH_NULL(dst, ipsp_hash); + dst->ipsp_netstack = src->ipsp_netstack; dst->ipsp_refs = 1; dst->ipsp_sel = src->ipsp_sel; dst->ipsp_act = src->ipsp_act; @@ -1469,7 +1460,7 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af) bzero(req, sizeof (*req)); - mutex_enter(&connp->conn_lock); + ASSERT(MUTEX_HELD(&connp->conn_lock)); ipl = connp->conn_latch; /* @@ -1478,20 +1469,20 @@ ipsec_req_from_conn(conn_t *connp, ipsec_req_t *req, int af) * look at configured policy. */ if (ipl != NULL) { - if (ipl->ipl_in_action != NULL) { - rv = ipsec_req_from_act(ipl->ipl_in_action, req); + if (connp->conn_latch_in_action != NULL) { + rv = ipsec_req_from_act(connp->conn_latch_in_action, + req); goto done; } - if (ipl->ipl_in_policy != NULL) { - rv = ipsec_req_from_act(ipl->ipl_in_policy->ipsp_act, - req); + if (connp->conn_latch_in_policy != NULL) { + rv = ipsec_req_from_act( + connp->conn_latch_in_policy->ipsp_act, req); goto done; } } if (connp->conn_policy != NULL) rv = ipsec_req_from_head(connp->conn_policy, req, af); done: - mutex_exit(&connp->conn_lock); return (rv); } @@ -1502,66 +1493,18 @@ ipsec_actvec_free(ipsec_act_t *act, uint_t nact) } /* - * When outbound policy is not cached, look it up the hard way and attach - * an ipsec_out_t to the packet.. 
- */ -static mblk_t * -ipsec_attach_global_policy(mblk_t **mp, conn_t *connp, ipsec_selector_t *sel, - netstack_t *ns) -{ - ipsec_policy_t *p; - - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, sel, ns); - - if (p == NULL) - return (NULL); - return (ipsec_attach_ipsec_out(mp, connp, p, sel->ips_protocol, ns)); -} - -/* - * We have an ipsec_out already, but don't have cached policy; fill it in - * with the right actions. - */ -static mblk_t * -ipsec_apply_global_policy(mblk_t *ipsec_mp, conn_t *connp, - ipsec_selector_t *sel, netstack_t *ns) -{ - ipsec_out_t *io; - ipsec_policy_t *p; - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ASSERT(ipsec_mp->b_cont->b_datap->db_type == M_DATA); - - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - if (io->ipsec_out_policy == NULL) { - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, io, sel, ns); - io->ipsec_out_policy = p; - } - return (ipsec_mp); -} - - -/* * Consumes a reference to ipsp. */ static mblk_t * -ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, +ipsec_check_loopback_policy(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_policy_t *ipsp) { - mblk_t *ipsec_mp; - ipsec_in_t *ii; - netstack_t *ns; - - if (!mctl_present) - return (first_mp); + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) + return (data_mp); - ipsec_mp = first_mp; + ASSERT(ira->ira_flags & IRAF_LOOPBACK); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ns = ii->ipsec_in_ns; - ASSERT(ii->ipsec_in_loopback); - IPPOL_REFRELE(ipsp, ns); + IPPOL_REFRELE(ipsp); /* * We should do an actual policy check here. Revisit this @@ -1569,7 +1512,7 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, * get there.) */ - return (first_mp); + return (data_mp); } /* @@ -1577,20 +1520,19 @@ ipsec_check_loopback_policy(mblk_t *first_mp, boolean_t mctl_present, * expected by the SAs it traversed on the way in. 
*/ static boolean_t -ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason, - kstat_named_t **counter, uint64_t pkt_unique) +ipsec_check_ipsecin_unique(ip_recv_attr_t *ira, const char **reason, + kstat_named_t **counter, uint64_t pkt_unique, netstack_t *ns) { uint64_t ah_mask, esp_mask; ipsa_t *ah_assoc; ipsa_t *esp_assoc; - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; - ASSERT(ii->ipsec_in_secure); - ASSERT(!ii->ipsec_in_loopback); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(!(ira->ira_flags & IRAF_LOOPBACK)); - ah_assoc = ii->ipsec_in_ah_sa; - esp_assoc = ii->ipsec_in_esp_sa; + ah_assoc = ira->ira_ipsec_ah_sa; + esp_assoc = ira->ira_ipsec_esp_sa; ASSERT((ah_assoc != NULL) || (esp_assoc != NULL)); ah_mask = (ah_assoc != NULL) ? ah_assoc->ipsa_unique_mask : 0; @@ -1621,30 +1563,30 @@ ipsec_check_ipsecin_unique(ipsec_in_t *ii, const char **reason, } static boolean_t -ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, - ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter) +ipsec_check_ipsecin_action(ip_recv_attr_t *ira, mblk_t *mp, ipsec_action_t *ap, + ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter, + netstack_t *ns) { boolean_t ret = B_TRUE; ipsec_prot_t *ipp; ipsa_t *ah_assoc; ipsa_t *esp_assoc; boolean_t decaps; - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT((ipha == NULL && ip6h != NULL) || (ip6h == NULL && ipha != NULL)); - if (ii->ipsec_in_loopback) { + if (ira->ira_flags & IRAF_LOOPBACK) { /* * Besides accepting pointer-equivalent actions, we also * accept any ICMP errors we generated for ourselves, * regardless of policy. If we do not wish to make this * assumption in the future, check here, and where - * icmp_loopback is initialized in ip.c and ip6.c. (Look for - * ipsec_out_icmp_loopback.) + * IXAF_TRUSTED_ICMP is initialized in ip.c and ip6.c. 
*/ - if (ap == ii->ipsec_in_action || ii->ipsec_in_icmp_loopback) + if (ap == ira->ira_ipsec_action || + (ira->ira_flags & IRAF_TRUSTED_ICMP)) return (B_TRUE); /* Deep compare necessary here?? */ @@ -1652,12 +1594,13 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, *reason = "loopback policy mismatch"; return (B_FALSE); } - ASSERT(!ii->ipsec_in_icmp_loopback); + ASSERT(!(ira->ira_flags & IRAF_TRUSTED_ICMP)); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - ah_assoc = ii->ipsec_in_ah_sa; - esp_assoc = ii->ipsec_in_esp_sa; + ah_assoc = ira->ira_ipsec_ah_sa; + esp_assoc = ira->ira_ipsec_esp_sa; - decaps = ii->ipsec_in_decaps; + decaps = (ira->ira_flags & IRAF_IPSEC_DECAPS); switch (ap->ipa_act.ipa_type) { case IPSEC_ACT_DISCARD: @@ -1744,10 +1687,10 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, } } } else if (esp_assoc != NULL) { - /* - * Don't allow this. Check IPSEC NOTE above - * ip_fanout_proto(). - */ + /* + * Don't allow this. Check IPSEC NOTE above + * ip_fanout_proto(). + */ *counter = DROPPER(ipss, ipds_spd_got_esp); *reason = "unexpected ESP"; ret = B_FALSE; @@ -1777,17 +1720,18 @@ ipsec_check_ipsecin_action(ipsec_in_t *ii, mblk_t *mp, ipsec_action_t *ap, ret = B_FALSE; break; } - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* * This can happen if we do a double policy-check on * a packet * XXX XXX should fix this case! 
*/ - IPACT_REFRELE(ii->ipsec_in_action); + IPACT_REFRELE(ira->ira_ipsec_action); } - ASSERT(ii->ipsec_in_action == NULL); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + ASSERT(ira->ira_ipsec_action == NULL); IPACT_REFHOLD(ap); - ii->ipsec_in_action = ap; + ira->ira_ipsec_action = ap; break; /* from switch */ } return (ret); @@ -1818,9 +1762,9 @@ static uint64_t conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h) { ipsec_selector_t sel; - uint8_t ulp = connp->conn_ulp; + uint8_t ulp = connp->conn_proto; - ASSERT(connp->conn_latch->ipl_in_policy != NULL); + ASSERT(connp->conn_latch_in_policy != NULL); if ((ulp == IPPROTO_TCP || ulp == IPPROTO_UDP || ulp == IPPROTO_SCTP) && (connp->conn_fport == 0 || connp->conn_lport == 0)) { @@ -1839,46 +1783,51 @@ conn_to_unique(conn_t *connp, mblk_t *data_mp, ipha_t *ipha, ip6_t *ip6h) SELRET_SUCCESS) { ASSERT(sel.ips_local_port == connp->conn_lport); ASSERT(sel.ips_remote_port == connp->conn_fport); - ASSERT(sel.ips_protocol == connp->conn_ulp); + ASSERT(sel.ips_protocol == connp->conn_proto); } - ASSERT(connp->conn_ulp != 0); + ASSERT(connp->conn_proto != 0); #endif return (SA_UNIQUE_ID(connp->conn_fport, connp->conn_lport, ulp, 0)); } /* - * Called to check policy on a latched connection, both from this file - * and from tcp.c + * Called to check policy on a latched connection. + * Note that we don't dereference conn_latch or conn_ihere since the conn might + * be closing. The caller passes a held ipsec_latch_t instead. 
*/ -boolean_t -ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, - ipha_t *ipha, ip6_t *ip6h, const char **reason, kstat_named_t **counter, - conn_t *connp) +static boolean_t +ipsec_check_ipsecin_latch(ip_recv_attr_t *ira, mblk_t *mp, ipsec_latch_t *ipl, + ipsec_action_t *ap, ipha_t *ipha, ip6_t *ip6h, const char **reason, + kstat_named_t **counter, conn_t *connp, netstack_t *ns) { - netstack_t *ns = ii->ipsec_in_ns; ipsec_stack_t *ipss = ns->netstack_ipsec; ASSERT(ipl->ipl_ids_latched == B_TRUE); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - if (!ii->ipsec_in_loopback) { + if (!(ira->ira_flags & IRAF_LOOPBACK)) { /* * Over loopback, there aren't real security associations, * so there are neither identities nor "unique" values * for us to check the packet against. */ - if ((ii->ipsec_in_ah_sa != NULL) && - (!spd_match_inbound_ids(ipl, ii->ipsec_in_ah_sa))) { - *counter = DROPPER(ipss, ipds_spd_ah_badid); - *reason = "AH identity mismatch"; - return (B_FALSE); + if (ira->ira_ipsec_ah_sa != NULL) { + if (!spd_match_inbound_ids(ipl, + ira->ira_ipsec_ah_sa)) { + *counter = DROPPER(ipss, ipds_spd_ah_badid); + *reason = "AH identity mismatch"; + return (B_FALSE); + } } - if ((ii->ipsec_in_esp_sa != NULL) && - (!spd_match_inbound_ids(ipl, ii->ipsec_in_esp_sa))) { - *counter = DROPPER(ipss, ipds_spd_esp_badid); - *reason = "ESP identity mismatch"; - return (B_FALSE); + if (ira->ira_ipsec_esp_sa != NULL) { + if (!spd_match_inbound_ids(ipl, + ira->ira_ipsec_esp_sa)) { + *counter = DROPPER(ipss, ipds_spd_esp_badid); + *reason = "ESP identity mismatch"; + return (B_FALSE); + } } /* @@ -1886,14 +1835,13 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, * In DEBUG kernels (see conn_to_unique()'s implementation), * verify this even if it REALLY slows things down. 
*/ - if (!ipsec_check_ipsecin_unique(ii, reason, counter, - conn_to_unique(connp, mp, ipha, ip6h))) { + if (!ipsec_check_ipsecin_unique(ira, reason, counter, + conn_to_unique(connp, mp, ipha, ip6h), ns)) { return (B_FALSE); } } - - return (ipsec_check_ipsecin_action(ii, mp, ipl->ipl_in_action, - ipha, ip6h, reason, counter)); + return (ipsec_check_ipsecin_action(ira, mp, ap, ipha, ip6h, reason, + counter, ns)); } /* @@ -1903,52 +1851,48 @@ ipsec_check_ipsecin_latch(ipsec_in_t *ii, mblk_t *mp, ipsec_latch_t *ipl, * Called from ipsec_check_global_policy, and ipsec_check_inbound_policy. * * Consumes a reference to ipsp. + * Returns the mblk if ok. */ static mblk_t * -ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp, - ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, netstack_t *ns) +ipsec_check_ipsecin_policy(mblk_t *data_mp, ipsec_policy_t *ipsp, + ipha_t *ipha, ip6_t *ip6h, uint64_t pkt_unique, ip_recv_attr_t *ira, + netstack_t *ns) { - ipsec_in_t *ii; ipsec_action_t *ap; const char *reason = "no policy actions found"; - mblk_t *data_mp, *ipsec_mp; - ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; + ipsec_stack_t *ipss = ns->netstack_ipsec; kstat_named_t *counter; counter = DROPPER(ipss, ipds_spd_got_secure); - data_mp = first_mp->b_cont; - ipsec_mp = first_mp; - ASSERT(ipsp != NULL); ASSERT((ipha == NULL && ip6h != NULL) || (ip6h == NULL && ipha != NULL)); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; + if (ira->ira_flags & IRAF_LOOPBACK) + return (ipsec_check_loopback_policy(data_mp, ira, ipsp)); - if (ii->ipsec_in_loopback) - return (ipsec_check_loopback_policy(first_mp, B_TRUE, ipsp)); - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ASSERT(ii->ipsec_in_secure); + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* * this can happen if we do a double policy-check on a packet * Would be nice to be able to delete this test.. 
*/ - IPACT_REFRELE(ii->ipsec_in_action); + IPACT_REFRELE(ira->ira_ipsec_action); } - ASSERT(ii->ipsec_in_action == NULL); + ASSERT(ira->ira_ipsec_action == NULL); - if (!SA_IDS_MATCH(ii->ipsec_in_ah_sa, ii->ipsec_in_esp_sa)) { + if (!SA_IDS_MATCH(ira->ira_ipsec_ah_sa, ira->ira_ipsec_esp_sa)) { reason = "inbound AH and ESP identities differ"; counter = DROPPER(ipss, ipds_spd_ahesp_diffid); goto drop; } - if (!ipsec_check_ipsecin_unique(ii, &reason, &counter, pkt_unique)) + if (!ipsec_check_ipsecin_unique(ira, &reason, &counter, pkt_unique, + ns)) goto drop; /* @@ -1957,21 +1901,21 @@ ipsec_check_ipsecin_policy(mblk_t *first_mp, ipsec_policy_t *ipsp, */ for (ap = ipsp->ipsp_act; ap != NULL; ap = ap->ipa_next) { - if (ipsec_check_ipsecin_action(ii, data_mp, ap, - ipha, ip6h, &reason, &counter)) { + if (ipsec_check_ipsecin_action(ira, data_mp, ap, + ipha, ip6h, &reason, &counter, ns)) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPPOL_REFRELE(ipsp, ns); - return (first_mp); + IPPOL_REFRELE(ipsp); + return (data_mp); } } drop: ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "ipsec inbound policy mismatch: %s, packet dropped\n", reason); - IPPOL_REFRELE(ipsp, ns); - ASSERT(ii->ipsec_in_action == NULL); + IPPOL_REFRELE(ipsp); + ASSERT(ira->ira_ipsec_action == NULL); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); return (NULL); } @@ -2075,7 +2019,7 @@ ipsec_find_policy_chain(ipsec_policy_t *best, ipsec_policy_t *chain, */ ipsec_policy_t * ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, - int direction, ipsec_selector_t *sel, netstack_t *ns) + int direction, ipsec_selector_t *sel) { ipsec_policy_t *curbest; ipsec_policy_root_t *root; @@ -2121,7 +2065,7 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, IPPOL_REFHOLD(curbest); if (best != NULL) { - IPPOL_REFRELE(best, ns); + 
IPPOL_REFRELE(best); } } @@ -2139,20 +2083,17 @@ ipsec_find_policy_head(ipsec_policy_t *best, ipsec_policy_head_t *head, * reference when done. */ ipsec_policy_t * -ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io, - ipsec_selector_t *sel, netstack_t *ns) +ipsec_find_policy(int direction, const conn_t *connp, ipsec_selector_t *sel, + netstack_t *ns) { ipsec_policy_t *p; ipsec_stack_t *ipss = ns->netstack_ipsec; p = ipsec_find_policy_head(NULL, &ipss->ipsec_system_policy, - direction, sel, ns); + direction, sel); if ((connp != NULL) && (connp->conn_policy != NULL)) { p = ipsec_find_policy_head(p, connp->conn_policy, - direction, sel, ns); - } else if ((io != NULL) && (io->ipsec_out_polhead != NULL)) { - p = ipsec_find_policy_head(p, io->ipsec_out_polhead, - direction, sel, ns); + direction, sel); } return (p); @@ -2172,21 +2113,16 @@ ipsec_find_policy(int direction, conn_t *connp, ipsec_out_t *io, * floor. */ mblk_t * -ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, - ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present, netstack_t *ns) +ipsec_check_global_policy(mblk_t *data_mp, conn_t *connp, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira, netstack_t *ns) { ipsec_policy_t *p; ipsec_selector_t sel; - mblk_t *data_mp, *ipsec_mp; boolean_t policy_present; kstat_named_t *counter; - ipsec_in_t *ii = NULL; uint64_t pkt_unique; - ipsec_stack_t *ipss = ns->netstack_ipsec; ip_stack_t *ipst = ns->netstack_ip; - - data_mp = mctl_present ? first_mp->b_cont : first_mp; - ipsec_mp = mctl_present ? 
first_mp : NULL; + ipsec_stack_t *ipss = ns->netstack_ipsec; sel.ips_is_icmp_inv_acq = 0; @@ -2203,13 +2139,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * No global policy and no per-socket policy; * just pass it back (but we shouldn't get here in that case) */ - return (first_mp); - } - - if (ipsec_mp != NULL) { - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ii = (ipsec_in_t *)(ipsec_mp->b_rptr); - ASSERT(ii->ipsec_in_type == IPSEC_IN); + return (data_mp); } /* @@ -2217,32 +2147,11 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * Otherwise consult system policy. */ if ((connp != NULL) && (connp->conn_latch != NULL)) { - p = connp->conn_latch->ipl_in_policy; + p = connp->conn_latch_in_policy; if (p != NULL) { IPPOL_REFHOLD(p); } /* - * The caller may have mistakenly assigned an ip6i_t as the - * ip6h for this packet, so take that corner-case into - * account. - */ - if (ip6h != NULL && ip6h->ip6_nxt == IPPROTO_RAW) { - ip6h++; - /* First check for bizarro split-mblk headers. */ - if ((uintptr_t)ip6h > (uintptr_t)data_mp->b_wptr || - ((uintptr_t)ip6h) + sizeof (ip6_t) > - (uintptr_t)data_mp->b_wptr) { - ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "ipsec_check_global_policy", ipha, ip6h, - B_TRUE, ns); - counter = DROPPER(ipss, ipds_spd_nomem); - goto fail; - } - /* Next, see if ip6i is at the end of an mblk. */ - if (ip6h == (ip6_t *)data_mp->b_wptr) - ip6h = (ip6_t *)data_mp->b_cont->b_rptr; - } - /* * Fudge sel for UNIQUE_ID setting below. */ pkt_unique = conn_to_unique(connp, data_mp, ipha, ip6h); @@ -2271,20 +2180,19 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, * local policy alone. 
*/ - p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel, - ns); + p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns); pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port, sel.ips_local_port, sel.ips_protocol, 0); } if (p == NULL) { - if (ipsec_mp == NULL) { + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * We have no policy; default to succeeding. * XXX paranoid system design doesn't do this. */ BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - return (first_mp); + return (data_mp); } else { counter = DROPPER(ipss, ipds_spd_got_secure); ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, @@ -2293,16 +2201,16 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, goto fail; } } - if ((ii != NULL) && (ii->ipsec_in_secure)) { - return (ipsec_check_ipsecin_policy(ipsec_mp, p, ipha, ip6h, - pkt_unique, ns)); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + return (ipsec_check_ipsecin_policy(data_mp, p, ipha, ip6h, + pkt_unique, ira, ns)); } if (p->ipsp_act->ipa_allow_clear) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPPOL_REFRELE(p, ns); - return (first_mp); + IPPOL_REFRELE(p); + return (data_mp); } - IPPOL_REFRELE(p, ns); + IPPOL_REFRELE(p); /* * If we reach here, we will drop the packet because it failed the * global policy check because the packet was cleartext, and it @@ -2313,7 +2221,7 @@ ipsec_check_global_policy(mblk_t *first_mp, conn_t *connp, counter = DROPPER(ipss, ipds_spd_got_clear); fail: - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, + ip_drop_packet(data_mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); @@ -2435,7 +2343,7 @@ ipsec_inbound_accept_clear(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h) case ICMP_FRAGMENTATION_NEEDED: /* * Be in sync with icmp_inbound, where we have - * already set ire_max_frag. 
+ * already set dce_pmtu */ #ifdef FRAGCACHE_DEBUG cmn_err(CE_WARN, "ICMP frag needed\n"); @@ -2496,27 +2404,44 @@ ipsec_latch_ids(ipsec_latch_t *ipl, ipsid_t *local, ipsid_t *remote) } void -ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii) +ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira) { ipsa_t *sa; + ipsec_latch_t *ipl = connp->conn_latch; if (!ipl->ipl_ids_latched) { ipsid_t *local = NULL; ipsid_t *remote = NULL; - if (!ii->ipsec_in_loopback) { - if (ii->ipsec_in_esp_sa != NULL) - sa = ii->ipsec_in_esp_sa; + if (!(ira->ira_flags & IRAF_LOOPBACK)) { + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + if (ira->ira_ipsec_esp_sa != NULL) + sa = ira->ira_ipsec_esp_sa; else - sa = ii->ipsec_in_ah_sa; + sa = ira->ira_ipsec_ah_sa; ASSERT(sa != NULL); local = sa->ipsa_dst_cid; remote = sa->ipsa_src_cid; } ipsec_latch_ids(ipl, local, remote); } - ipl->ipl_in_action = ii->ipsec_in_action; - IPACT_REFHOLD(ipl->ipl_in_action); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (connp->conn_latch_in_action != NULL) { + /* + * Previously cached action. This is probably + * harmless, but in DEBUG kernels, check for + * action equality. + * + * Preserve the existing action to preserve latch + * invariance. + */ + ASSERT(connp->conn_latch_in_action == + ira->ira_ipsec_action); + return; + } + connp->conn_latch_in_action = ira->ira_ipsec_action; + IPACT_REFHOLD(connp->conn_latch_in_action); + } } /* @@ -2527,27 +2452,25 @@ ipsec_latch_inbound(ipsec_latch_t *ipl, ipsec_in_t *ii) * see also ipsec_check_ipsecin_latch() and ipsec_check_global_policy() */ mblk_t * -ipsec_check_inbound_policy(mblk_t *first_mp, conn_t *connp, - ipha_t *ipha, ip6_t *ip6h, boolean_t mctl_present) +ipsec_check_inbound_policy(mblk_t *mp, conn_t *connp, + ipha_t *ipha, ip6_t *ip6h, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - boolean_t ret; - mblk_t *mp = mctl_present ? first_mp->b_cont : first_mp; - mblk_t *ipsec_mp = mctl_present ? 
first_mp : NULL; - ipsec_latch_t *ipl; - uint64_t unique_id; + boolean_t ret; + ipsec_latch_t *ipl; + ipsec_action_t *ap; + uint64_t unique_id; ipsec_stack_t *ipss; ip_stack_t *ipst; netstack_t *ns; ipsec_policy_head_t *policy_head; + ipsec_policy_t *p = NULL; ASSERT(connp != NULL); ns = connp->conn_netstack; ipss = ns->netstack_ipsec; ipst = ns->netstack_ip; - if (ipsec_mp == NULL) { -clear: + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { /* * This is the case where the incoming datagram is * cleartext and we need to see whether this client @@ -2559,49 +2482,49 @@ clear: mutex_enter(&connp->conn_lock); if (connp->conn_state_flags & CONN_CONDEMNED) { mutex_exit(&connp->conn_lock); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_spd_got_clear), + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); } - if ((ipl = connp->conn_latch) != NULL) { + if (connp->conn_latch != NULL) { /* Hold a reference in case the conn is closing */ - IPLATCH_REFHOLD(ipl); + p = connp->conn_latch_in_policy; + if (p != NULL) + IPPOL_REFHOLD(p); mutex_exit(&connp->conn_lock); /* * Policy is cached in the conn. 
*/ - if ((ipl->ipl_in_policy != NULL) && - (!ipl->ipl_in_policy->ipsp_act->ipa_allow_clear)) { + if (p != NULL && !p->ipsp_act->ipa_allow_clear) { ret = ipsec_inbound_accept_clear(mp, ipha, ip6h); if (ret) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPPOL_REFRELE(p); + return (mp); } else { ipsec_log_policy_failure( IPSEC_POLICY_MISMATCH, "ipsec_check_inbound_policy", ipha, ip6h, B_FALSE, ns); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, + ip_drop_packet(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - IPLATCH_REFRELE(ipl, ns); + IPPOL_REFRELE(p); return (NULL); } } else { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + if (p != NULL) + IPPOL_REFRELE(p); + return (mp); } } else { - uchar_t db_type; policy_head = connp->conn_policy; /* Hold a reference in case the conn is closing */ @@ -2611,50 +2534,22 @@ clear: /* * As this is a non-hardbound connection we need * to look at both per-socket policy and global - * policy. As this is cleartext, mark the mp as - * M_DATA in case if it is an ICMP error being - * reported before calling ipsec_check_global_policy - * so that it does not mistake it for IPSEC_IN. + * policy. */ - db_type = mp->b_datap->db_type; - mp->b_datap->db_type = M_DATA; - first_mp = ipsec_check_global_policy(first_mp, connp, - ipha, ip6h, mctl_present, ns); + mp = ipsec_check_global_policy(mp, connp, + ipha, ip6h, ira, ns); if (policy_head != NULL) IPPH_REFRELE(policy_head, ns); - if (first_mp != NULL) - mp->b_datap->db_type = db_type; - return (first_mp); + return (mp); } } - /* - * If it is inbound check whether the attached message - * is secure or not. We have a special case for ICMP, - * where we have a IPSEC_IN message and the attached - * message is not secure. See icmp_inbound_error_fanout - * for details. 
- */ - ASSERT(ipsec_mp != NULL); - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - if (!ii->ipsec_in_secure) - goto clear; - - /* - * mp->b_cont could be either a M_CTL message - * for icmp errors being sent up or a M_DATA message. - */ - ASSERT(mp->b_datap->db_type == M_CTL || mp->b_datap->db_type == M_DATA); - - ASSERT(ii->ipsec_in_type == IPSEC_IN); mutex_enter(&connp->conn_lock); /* Connection is closing */ if (connp->conn_state_flags & CONN_CONDEMNED) { mutex_exit(&connp->conn_lock); - ip_drop_packet(first_mp, B_TRUE, NULL, - NULL, DROPPER(ipss, ipds_spd_got_clear), + ip_drop_packet(mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); return (NULL); @@ -2679,58 +2574,64 @@ clear: * policy. It will check against conn or global * depending on whichever is stronger. */ - retmp = ipsec_check_global_policy(first_mp, connp, - ipha, ip6h, mctl_present, ns); + retmp = ipsec_check_global_policy(mp, connp, + ipha, ip6h, ira, ns); if (policy_head != NULL) IPPH_REFRELE(policy_head, ns); return (retmp); } IPLATCH_REFHOLD(ipl); + /* Hold reference on conn_latch_in_action in case conn is closing */ + ap = connp->conn_latch_in_action; + if (ap != NULL) + IPACT_REFHOLD(ap); mutex_exit(&connp->conn_lock); - if (ipl->ipl_in_action != NULL) { + if (ap != NULL) { /* Policy is cached & latched; fast(er) path */ const char *reason; kstat_named_t *counter; - if (ipsec_check_ipsecin_latch(ii, mp, ipl, - ipha, ip6h, &reason, &counter, connp)) { + if (ipsec_check_ipsecin_latch(ira, mp, ipl, ap, + ipha, ip6h, &reason, &counter, connp, ns)) { BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPLATCH_REFRELE(ipl); + IPACT_REFRELE(ap); + return (mp); } ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "ipsec inbound policy mismatch: %s, packet dropped\n", reason); - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 
counter, + ip_drop_packet(mp, B_TRUE, NULL, counter, &ipss->ipsec_spd_dropper); BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - IPLATCH_REFRELE(ipl, ns); + IPLATCH_REFRELE(ipl); + IPACT_REFRELE(ap); return (NULL); - } else if (ipl->ipl_in_policy == NULL) { + } + if ((p = connp->conn_latch_in_policy) == NULL) { ipsec_weird_null_inbound_policy++; - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + IPLATCH_REFRELE(ipl); + return (mp); } unique_id = conn_to_unique(connp, mp, ipha, ip6h); - IPPOL_REFHOLD(ipl->ipl_in_policy); - first_mp = ipsec_check_ipsecin_policy(first_mp, ipl->ipl_in_policy, - ipha, ip6h, unique_id, ns); + IPPOL_REFHOLD(p); + mp = ipsec_check_ipsecin_policy(mp, p, ipha, ip6h, unique_id, ira, ns); /* * NOTE: ipsecIn{Failed,Succeeeded} bumped by * ipsec_check_ipsecin_policy(). */ - if (first_mp != NULL) - ipsec_latch_inbound(ipl, ii); - IPLATCH_REFRELE(ipl, ns); - return (first_mp); + if (mp != NULL) + ipsec_latch_inbound(connp, ira); + IPLATCH_REFRELE(ipl); + return (mp); } /* - * Handle all sorts of cases like tunnel-mode, ICMP, and ip6i prepending. + * Handle all sorts of cases like tunnel-mode and ICMP. */ static int prepended_length(mblk_t *mp, uintptr_t hptr) @@ -2779,19 +2680,24 @@ prepended_length(mblk_t *mp, uintptr_t hptr) * should put this packet in a fragment-gathering queue. * Only returned if SEL_TUNNEL_MODE and SEL_PORT_POLICY * is set. + * + * Note that ipha/ip6h can be in a different mblk (mp->b_cont) in the case + * of tunneled packets. + * Also, mp->b_rptr can be an ICMP error where ipha/ip6h is the packet in + * error past the ICMP error. */ static selret_t ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint8_t sel_flags) { uint16_t *ports; - int outer_hdr_len = 0; /* For ICMP, tunnel-mode, or ip6i cases... */ + int outer_hdr_len = 0; /* For ICMP or tunnel-mode cases... 
*/ ushort_t hdr_len; mblk_t *spare_mp = NULL; uint8_t *nexthdrp, *transportp; uint8_t nexthdr; uint8_t icmp_proto; - ip6_pkt_t ipp; + ip_pkt_t ipp; boolean_t port_policy_present = (sel_flags & SEL_PORT_POLICY); boolean_t is_icmp = (sel_flags & SEL_IS_ICMP); boolean_t tunnel_mode = (sel_flags & SEL_TUNNEL_MODE); @@ -2802,44 +2708,14 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, if (ip6h != NULL) { outer_hdr_len = prepended_length(mp, (uintptr_t)ip6h); - nexthdr = ip6h->ip6_nxt; - - /* - * The caller may have mistakenly assigned an ip6i_t as the - * ip6h for this packet, so take that corner-case into - * account. - */ - if (nexthdr == IPPROTO_RAW) { - ip6h++; - /* First check for bizarro split-mblk headers. */ - if ((uintptr_t)ip6h > (uintptr_t)mp->b_wptr || - ((uintptr_t)ip6h) + sizeof (ip6_t) > - (uintptr_t)mp->b_wptr) { - return (SELRET_BADPKT); - } - /* Next, see if ip6i is at the end of an mblk. */ - if (ip6h == (ip6_t *)mp->b_wptr) - ip6h = (ip6_t *)mp->b_cont->b_rptr; - - nexthdr = ip6h->ip6_nxt; - - /* - * Finally, if we haven't adjusted for ip6i, do so - * now. ip6i_t structs are prepended, so an ICMP - * or tunnel packet would just be overwritten. 
- */ - if (outer_hdr_len == 0) - outer_hdr_len = sizeof (ip6i_t); - } - icmp_proto = IPPROTO_ICMPV6; sel->ips_isv4 = B_FALSE; sel->ips_local_addr_v6 = ip6h->ip6_dst; sel->ips_remote_addr_v6 = ip6h->ip6_src; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL); switch (nexthdr) { case IPPROTO_HOPOPTS: @@ -2852,7 +2728,6 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, */ if ((spare_mp = msgpullup(mp, -1)) == NULL) return (SELRET_NOMEM); - if (!ip_hdr_length_nexthdr_v6(spare_mp, (ip6_t *)(spare_mp->b_rptr + outer_hdr_len), &hdr_len, &nexthdrp)) { @@ -2930,6 +2805,10 @@ ipsec_init_inbound_sel(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, return (SELRET_SUCCESS); } +/* + * This is called with a b_next chain of messages from the fragcache code, + * hence it needs to discard a chain on error. + */ static boolean_t ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, int outer_hdr_len, ipsec_stack_t *ipss) @@ -2967,7 +2846,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, &hdr_len, &nexthdrp)) { /* Always works, even if NULL. */ ipsec_freemsg_chain(spare_mp); - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (B_FALSE); @@ -3005,7 +2884,7 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, */ if (spare_mp == NULL && (spare_mp = msgpullup(mp, -1)) == NULL) { - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (B_FALSE); @@ -3029,13 +2908,68 @@ ipsec_init_outbound_ports(ipsec_selector_t *sel, mblk_t *mp, ipha_t *ipha, } /* + * Prepend an mblk with a ipsec_crypto_t to the message chain. + * Frees the argument and returns NULL should the allocation fail. 
+ * Returns the pointer to the crypto data part. + */ +mblk_t * +ipsec_add_crypto_data(mblk_t *data_mp, ipsec_crypto_t **icp) +{ + mblk_t *mp; + + mp = allocb(sizeof (ipsec_crypto_t), BPRI_MED); + if (mp == NULL) { + freemsg(data_mp); + return (NULL); + } + bzero(mp->b_rptr, sizeof (ipsec_crypto_t)); + mp->b_wptr += sizeof (ipsec_crypto_t); + mp->b_cont = data_mp; + mp->b_datap->db_type = M_EVENT; /* For ASSERT */ + *icp = (ipsec_crypto_t *)mp->b_rptr; + return (mp); +} + +/* + * Remove what was prepended above. Return b_cont and a pointer to the + * crypto data. + * The caller must call ipsec_free_crypto_data for mblk once it is done + * with the crypto data. + */ +mblk_t * +ipsec_remove_crypto_data(mblk_t *crypto_mp, ipsec_crypto_t **icp) +{ + ASSERT(crypto_mp->b_datap->db_type == M_EVENT); + ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t)); + + *icp = (ipsec_crypto_t *)crypto_mp->b_rptr; + return (crypto_mp->b_cont); +} + +/* + * Free what was prepended above. Return b_cont. + */ +mblk_t * +ipsec_free_crypto_data(mblk_t *crypto_mp) +{ + mblk_t *mp; + + ASSERT(crypto_mp->b_datap->db_type == M_EVENT); + ASSERT(MBLKL(crypto_mp) == sizeof (ipsec_crypto_t)); + + mp = crypto_mp->b_cont; + freeb(crypto_mp); + return (mp); +} + +/* * Create an ipsec_action_t based on the way an inbound packet was protected. * Used to reflect traffic back to a sender. * * We don't bother interning the action into the hash table. 
*/ ipsec_action_t * -ipsec_in_to_out_action(ipsec_in_t *ii) +ipsec_in_to_out_action(ip_recv_attr_t *ira) { ipsa_t *ah_assoc, *esp_assoc; uint_t auth_alg = 0, encr_alg = 0, espa_alg = 0; @@ -3057,10 +2991,12 @@ ipsec_in_to_out_action(ipsec_in_t *ii) */ ap->ipa_act.ipa_type = IPSEC_ACT_APPLY; ap->ipa_act.ipa_log = 0; - ah_assoc = ii->ipsec_in_ah_sa; + ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE); + + ah_assoc = ira->ira_ipsec_ah_sa; ap->ipa_act.ipa_apply.ipp_use_ah = (ah_assoc != NULL); - esp_assoc = ii->ipsec_in_esp_sa; + esp_assoc = ira->ira_ipsec_esp_sa; ap->ipa_act.ipa_apply.ipp_use_esp = (esp_assoc != NULL); if (esp_assoc != NULL) { @@ -3074,7 +3010,8 @@ ipsec_in_to_out_action(ipsec_in_t *ii) ap->ipa_act.ipa_apply.ipp_encr_alg = (uint8_t)encr_alg; ap->ipa_act.ipa_apply.ipp_auth_alg = (uint8_t)auth_alg; ap->ipa_act.ipa_apply.ipp_esp_auth_alg = (uint8_t)espa_alg; - ap->ipa_act.ipa_apply.ipp_use_se = ii->ipsec_in_decaps; + ap->ipa_act.ipa_apply.ipp_use_se = + !!(ira->ira_flags & IRAF_IPSEC_DECAPS); unique = B_FALSE; if (esp_assoc != NULL) { @@ -3104,7 +3041,7 @@ ipsec_in_to_out_action(ipsec_in_t *ii) ap->ipa_act.ipa_apply.ipp_use_unique = unique; ap->ipa_want_unique = unique; ap->ipa_allow_clear = B_FALSE; - ap->ipa_want_se = ii->ipsec_in_decaps; + ap->ipa_want_se = !!(ira->ira_flags & IRAF_IPSEC_DECAPS); ap->ipa_want_ah = (ah_assoc != NULL); ap->ipa_want_esp = (esp_assoc != NULL); @@ -3500,13 +3437,14 @@ ipsec_sel_rel(ipsec_sel_t **spp, netstack_t *ns) * Free a policy rule which we know is no longer being referenced. 
*/ void -ipsec_policy_free(ipsec_policy_t *ipp, netstack_t *ns) +ipsec_policy_free(ipsec_policy_t *ipp) { ASSERT(ipp->ipsp_refs == 0); ASSERT(ipp->ipsp_sel != NULL); ASSERT(ipp->ipsp_act != NULL); + ASSERT(ipp->ipsp_netstack != NULL); - ipsec_sel_rel(&ipp->ipsp_sel, ns); + ipsec_sel_rel(&ipp->ipsp_sel, ipp->ipsp_netstack); IPACT_REFRELE(ipp->ipsp_act); kmem_cache_free(ipsec_pol_cache, ipp); } @@ -3544,6 +3482,7 @@ ipsec_policy_create(ipsec_selkey_t *keys, const ipsec_act_t *a, HASH_NULL(ipp, ipsp_hash); + ipp->ipsp_netstack = ns; /* Needed for ipsec_policy_free */ ipp->ipsp_refs = 1; /* caller's reference */ ipp->ipsp_sel = sp; ipp->ipsp_act = ap; @@ -3613,7 +3552,7 @@ ipsec_policy_delete(ipsec_policy_head_t *php, ipsec_selkey_t *keys, int dir, continue; } - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); php->iph_gen++; ipsec_update_present_flags(ns->netstack_ipsec); @@ -3664,7 +3603,7 @@ ipsec_policy_delete_index(ipsec_policy_head_t *php, uint64_t policy_index, break; } - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); found = B_TRUE; } @@ -3897,8 +3836,7 @@ ipsec_enter_policy(ipsec_policy_head_t *php, ipsec_policy_t *ipp, int direction, } static void -ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, - netstack_t *ns) +ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr) { ipsec_policy_t *ip, *nip; int af, chain, nchain; @@ -3906,7 +3844,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, for (af = 0; af < IPSEC_NAF; af++) { for (ip = ipr->ipr_nonhash[af]; ip != NULL; ip = nip) { nip = ip->ipsp_hash.hash_next; - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); } ipr->ipr_nonhash[af] = NULL; } @@ -3916,7 +3854,7 @@ ipsec_ipr_flush(ipsec_policy_head_t *php, ipsec_policy_root_t *ipr, for (ip = ipr->ipr_hash[chain].hash_head; ip != NULL; ip = nip) { nip = ip->ipsp_hash.hash_next; - IPPOL_UNCHAIN(php, ip, ns); + IPPOL_UNCHAIN(php, ip); } ipr->ipr_hash[chain].hash_head = NULL; } @@ -3954,8 +3892,9 
@@ ipsec_polhead_flush(ipsec_policy_head_t *php, netstack_t *ns) ASSERT(RW_WRITE_HELD(&php->iph_lock)); for (dir = 0; dir < IPSEC_NTYPES; dir++) - ipsec_ipr_flush(php, &php->iph_root[dir], ns); + ipsec_ipr_flush(php, &php->iph_root[dir]); + php->iph_gen++; ipsec_update_present_flags(ns->netstack_ipsec); } @@ -4066,727 +4005,219 @@ ipsec_polhead_split(ipsec_policy_head_t *php, netstack_t *ns) * * NOTE2: This function is called by cleartext cases, so it needs to be * in IP proper. + * + * Note: the caller has moved other parts of ira into ixa already. */ boolean_t -ipsec_in_to_out(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, zoneid_t zoneid) -{ - ipsec_in_t *ii; - ipsec_out_t *io; - boolean_t v4; - mblk_t *mp; - boolean_t secure; - uint_t ifindex; +ipsec_in_to_out(ip_recv_attr_t *ira, ip_xmit_attr_t *ixa, mblk_t *data_mp, + ipha_t *ipha, ip6_t *ip6h) +{ ipsec_selector_t sel; - ipsec_action_t *reflect_action = NULL; - netstack_t *ns; - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); + ipsec_action_t *reflect_action = NULL; + netstack_t *ns = ixa->ixa_ipst->ips_netstack; bzero((void*)&sel, sizeof (sel)); - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - mp = ipsec_mp->b_cont; - ASSERT(mp != NULL); - - if (ii->ipsec_in_action != NULL) { + if (ira->ira_ipsec_action != NULL) { /* transfer reference.. */ - reflect_action = ii->ipsec_in_action; - ii->ipsec_in_action = NULL; - } else if (!ii->ipsec_in_loopback) - reflect_action = ipsec_in_to_out_action(ii); - secure = ii->ipsec_in_secure; - ifindex = ii->ipsec_in_ill_index; - ns = ii->ipsec_in_ns; - v4 = ii->ipsec_in_v4; - - ipsec_in_release_refs(ii); /* No netstack_rele/hold needed */ - - /* - * Use the global zone's id if we don't have a specific zone - * identified. This is likely to happen when the received packet's - * destination is a Trusted Extensions all-zones address. We did - * not copy the zoneid from ii->ipsec_in_zone id because that - * information represents the zoneid we started input processing - * with. 
The caller should have a better idea of which zone the - * received packet was destined for. - */ - - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; + reflect_action = ira->ira_ipsec_action; + ira->ira_ipsec_action = NULL; + } else if (!(ira->ira_flags & IRAF_LOOPBACK)) + reflect_action = ipsec_in_to_out_action(ira); /* * The caller is going to send the datagram out which might - * go on the wire or delivered locally through ip_wput_local. + * go on the wire or delivered locally through ire_send_local. * * 1) If it goes out on the wire, new associations will be * obtained. - * 2) If it is delivered locally, ip_wput_local will convert - * this IPSEC_OUT to a IPSEC_IN looking at the requests. + * 2) If it is delivered locally, ire_send_local will convert + * this ip_xmit_attr_t back to a ip_recv_attr_t looking at the + * requests. */ + ixa->ixa_ipsec_action = reflect_action; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - bzero(io, sizeof (ipsec_out_t)); - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_frtn.free_func = ipsec_out_free; - io->ipsec_out_frtn.free_arg = (char *)io; - io->ipsec_out_act = reflect_action; - - if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, - ns->netstack_ipsec)) + if (!ipsec_init_outbound_ports(&sel, data_mp, ipha, ip6h, 0, + ns->netstack_ipsec)) { + /* Note: data_mp already consumed and ip_drop_packet done */ return (B_FALSE); - - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_proto = sel.ips_protocol; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; + } + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_proto = sel.ips_protocol; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; /* * Don't use global policy for this, as we want * to use the same protection that was applied to 
the inbound packet. + * Thus we set IXAF_NO_IPSEC is it arrived in the clear to make + * it be sent in the clear. */ - io->ipsec_out_use_global_policy = B_FALSE; - io->ipsec_out_proc_begin = B_FALSE; - io->ipsec_out_secure = secure; - io->ipsec_out_v4 = v4; - io->ipsec_out_ill_index = ifindex; - io->ipsec_out_zoneid = zoneid; - io->ipsec_out_ns = ns; /* No netstack_hold */ + if (ira->ira_flags & IRAF_IPSEC_SECURE) + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + else + ixa->ixa_flags |= IXAF_NO_IPSEC; return (B_TRUE); } -mblk_t * -ipsec_in_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns) -{ - ipsec_in_t *ii = (ipsec_in_t *)mp->b_rptr; - ipsec_in_t *nii; - mblk_t *nmp; - frtn_t nfrtn; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ASSERT(ii->ipsec_in_len == sizeof (ipsec_in_t)); - - nmp = ipsec_in_alloc(ii->ipsec_in_v4, ns); - if (nmp == NULL) { - ip_drop_packet_chain(cont, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - - ASSERT(nmp->b_datap->db_type == M_CTL); - ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t))); - - /* - * Bump refcounts. - */ - if (ii->ipsec_in_ah_sa != NULL) - IPSA_REFHOLD(ii->ipsec_in_ah_sa); - if (ii->ipsec_in_esp_sa != NULL) - IPSA_REFHOLD(ii->ipsec_in_esp_sa); - if (ii->ipsec_in_policy != NULL) - IPPH_REFHOLD(ii->ipsec_in_policy); - - /* - * Copy everything, but preserve the free routine provided by - * ipsec_in_alloc(). 
- */ - nii = (ipsec_in_t *)nmp->b_rptr; - nfrtn = nii->ipsec_in_frtn; - bcopy(ii, nii, sizeof (*ii)); - nii->ipsec_in_frtn = nfrtn; - - nmp->b_cont = cont; - - return (nmp); -} - -mblk_t * -ipsec_out_tag(mblk_t *mp, mblk_t *cont, netstack_t *ns) -{ - ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; - ipsec_out_t *nio; - mblk_t *nmp; - frtn_t nfrtn; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - - nmp = ipsec_alloc_ipsec_out(ns); - if (nmp == NULL) { - ip_drop_packet_chain(cont, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ASSERT(nmp->b_datap->db_type == M_CTL); - ASSERT(nmp->b_wptr == (nmp->b_rptr + sizeof (ipsec_info_t))); - - /* - * Bump refcounts. - */ - if (io->ipsec_out_ah_sa != NULL) - IPSA_REFHOLD(io->ipsec_out_ah_sa); - if (io->ipsec_out_esp_sa != NULL) - IPSA_REFHOLD(io->ipsec_out_esp_sa); - if (io->ipsec_out_polhead != NULL) - IPPH_REFHOLD(io->ipsec_out_polhead); - if (io->ipsec_out_policy != NULL) - IPPOL_REFHOLD(io->ipsec_out_policy); - if (io->ipsec_out_act != NULL) - IPACT_REFHOLD(io->ipsec_out_act); - if (io->ipsec_out_latch != NULL) - IPLATCH_REFHOLD(io->ipsec_out_latch); - if (io->ipsec_out_cred != NULL) - crhold(io->ipsec_out_cred); - - /* - * Copy everything, but preserve the free routine provided by - * ipsec_alloc_ipsec_out(). 
- */ - nio = (ipsec_out_t *)nmp->b_rptr; - nfrtn = nio->ipsec_out_frtn; - bcopy(io, nio, sizeof (*io)); - nio->ipsec_out_frtn = nfrtn; - - nmp->b_cont = cont; - - return (nmp); -} - -static void -ipsec_out_release_refs(ipsec_out_t *io) +void +ipsec_out_release_refs(ip_xmit_attr_t *ixa) { - netstack_t *ns = io->ipsec_out_ns; - - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - ASSERT(io->ipsec_out_ns != NULL); + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) + return; - /* Note: IPSA_REFRELE is multi-line macro */ - if (io->ipsec_out_ah_sa != NULL) - IPSA_REFRELE(io->ipsec_out_ah_sa); - if (io->ipsec_out_esp_sa != NULL) - IPSA_REFRELE(io->ipsec_out_esp_sa); - if (io->ipsec_out_polhead != NULL) - IPPH_REFRELE(io->ipsec_out_polhead, ns); - if (io->ipsec_out_policy != NULL) - IPPOL_REFRELE(io->ipsec_out_policy, ns); - if (io->ipsec_out_act != NULL) - IPACT_REFRELE(io->ipsec_out_act); - if (io->ipsec_out_cred != NULL) { - crfree(io->ipsec_out_cred); - io->ipsec_out_cred = NULL; + if (ixa->ixa_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_ah_sa); + ixa->ixa_ipsec_ah_sa = NULL; } - if (io->ipsec_out_latch) { - IPLATCH_REFRELE(io->ipsec_out_latch, ns); - io->ipsec_out_latch = NULL; + if (ixa->ixa_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixa->ixa_ipsec_esp_sa); + ixa->ixa_ipsec_esp_sa = NULL; } -} - -static void -ipsec_out_free(void *arg) -{ - ipsec_out_t *io = (ipsec_out_t *)arg; - ipsec_out_release_refs(io); - kmem_cache_free(ipsec_info_cache, arg); -} - -static void -ipsec_in_release_refs(ipsec_in_t *ii) -{ - netstack_t *ns = ii->ipsec_in_ns; - - ASSERT(ii->ipsec_in_ns != NULL); - - /* Note: IPSA_REFRELE is multi-line macro */ - if (ii->ipsec_in_ah_sa != NULL) - IPSA_REFRELE(ii->ipsec_in_ah_sa); - if (ii->ipsec_in_esp_sa != NULL) - IPSA_REFRELE(ii->ipsec_in_esp_sa); - if (ii->ipsec_in_policy != NULL) - IPPH_REFRELE(ii->ipsec_in_policy, ns); - if (ii->ipsec_in_da != NULL) { - freeb(ii->ipsec_in_da); - ii->ipsec_in_da = NULL; 
+ if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; } -} - -static void -ipsec_in_free(void *arg) -{ - ipsec_in_t *ii = (ipsec_in_t *)arg; - ipsec_in_release_refs(ii); - kmem_cache_free(ipsec_info_cache, arg); -} - -/* - * This is called only for outbound datagrams if the datagram needs to - * go out secure. A NULL mp can be passed to get an ipsec_out. This - * facility is used by ip_unbind. - * - * NOTE : o As the data part could be modified by ipsec_out_process etc. - * we can't make it fast by calling a dup. - */ -mblk_t * -ipsec_alloc_ipsec_out(netstack_t *ns) -{ - mblk_t *ipsec_mp; - ipsec_out_t *io = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP); - - if (io == NULL) - return (NULL); - - bzero(io, sizeof (ipsec_out_t)); - - io->ipsec_out_type = IPSEC_OUT; - io->ipsec_out_len = sizeof (ipsec_out_t); - io->ipsec_out_frtn.free_func = ipsec_out_free; - io->ipsec_out_frtn.free_arg = (char *)io; - - /* - * Set the zoneid to ALL_ZONES which is used as an invalid value. Code - * using ipsec_out_zoneid should assert that the zoneid has been set to - * a sane value. - */ - io->ipsec_out_zoneid = ALL_ZONES; - io->ipsec_out_ns = ns; /* No netstack_hold */ - - ipsec_mp = desballoc((uint8_t *)io, sizeof (ipsec_info_t), BPRI_HI, - &io->ipsec_out_frtn); - if (ipsec_mp == NULL) { - ipsec_out_free(io); - - return (NULL); + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; } - ipsec_mp->b_datap->db_type = M_CTL; - ipsec_mp->b_wptr = ipsec_mp->b_rptr + sizeof (ipsec_info_t); - - return (ipsec_mp); -} - -/* - * Attach an IPSEC_OUT; use pol for policy if it is non-null. - * Otherwise initialize using conn. - * - * If pol is non-null, we consume a reference to it. 
- */ -mblk_t * -ipsec_attach_ipsec_out(mblk_t **mp, conn_t *connp, ipsec_policy_t *pol, - uint8_t proto, netstack_t *ns) -{ - mblk_t *ipsec_mp; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT((pol != NULL) || (connp != NULL)); - - ipsec_mp = ipsec_alloc_ipsec_out(ns); - if (ipsec_mp == NULL) { - ipsec_rl_strlog(ns, IP_MOD_ID, 0, 0, SL_ERROR|SL_NOTE, - "ipsec_attach_ipsec_out: Allocation failure\n"); - ip_drop_packet(*mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - *mp = NULL; - return (NULL); + if (ixa->ixa_ipsec_latch) { + IPLATCH_REFRELE(ixa->ixa_ipsec_latch); + ixa->ixa_ipsec_latch = NULL; } - ipsec_mp->b_cont = *mp; - /* - * If *mp is NULL, ipsec_init_ipsec_out() won't/should not be using it. - */ - return (ipsec_init_ipsec_out(ipsec_mp, mp, connp, pol, proto, ns)); + /* Clear the soft references to the SAs */ + ixa->ixa_ipsec_ref[0].ipsr_sa = NULL; + ixa->ixa_ipsec_ref[0].ipsr_bucket = NULL; + ixa->ixa_ipsec_ref[0].ipsr_gen = 0; + ixa->ixa_ipsec_ref[1].ipsr_sa = NULL; + ixa->ixa_ipsec_ref[1].ipsr_bucket = NULL; + ixa->ixa_ipsec_ref[1].ipsr_gen = 0; + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } -/* - * Initialize the IPSEC_OUT (ipsec_mp) using pol if it is non-null. - * Otherwise initialize using conn. - * - * If pol is non-null, we consume a reference to it. 
- */ -mblk_t * -ipsec_init_ipsec_out(mblk_t *ipsec_mp, mblk_t **mp, conn_t *connp, - ipsec_policy_t *pol, uint8_t proto, netstack_t *ns) +void +ipsec_in_release_refs(ip_recv_attr_t *ira) { - ipsec_out_t *io; - ipsec_policy_t *p; - ipha_t *ipha; - ip6_t *ip6h; - ipsec_stack_t *ipss = ns->netstack_ipsec; - - ASSERT(ipsec_mp->b_cont == *mp); - - ASSERT((pol != NULL) || (connp != NULL)); - - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - ASSERT(ipsec_mp->b_wptr == (ipsec_mp->b_rptr + sizeof (ipsec_info_t))); - io = (ipsec_out_t *)ipsec_mp->b_rptr; - ASSERT(io->ipsec_out_type == IPSEC_OUT); - ASSERT(io->ipsec_out_len == sizeof (ipsec_out_t)); - io->ipsec_out_latch = NULL; - /* - * Set the zoneid when we have the connp. - * Otherwise, we're called from ip_wput_attach_policy() who will take - * care of setting the zoneid. - */ - if (connp != NULL) - io->ipsec_out_zoneid = connp->conn_zoneid; - - io->ipsec_out_ns = ns; /* No netstack_hold */ - - if (*mp != NULL) { - ipha = (ipha_t *)(*mp)->b_rptr; - if (IPH_HDR_VERSION(ipha) == IP_VERSION) { - io->ipsec_out_v4 = B_TRUE; - ip6h = NULL; - } else { - io->ipsec_out_v4 = B_FALSE; - ip6h = (ip6_t *)ipha; - ipha = NULL; - } - } else { - ASSERT(connp != NULL && connp->conn_policy_cached); - ip6h = NULL; - ipha = NULL; - io->ipsec_out_v4 = !connp->conn_pkt_isv6; - } - - p = NULL; - - /* - * Take latched policies over global policy. Check here again for - * this, in case we had conn_latch set while the packet was flying - * around in IP. 
- */ - if (connp != NULL && connp->conn_latch != NULL) { - ASSERT(ns == connp->conn_netstack); - p = connp->conn_latch->ipl_out_policy; - io->ipsec_out_latch = connp->conn_latch; - IPLATCH_REFHOLD(connp->conn_latch); - if (p != NULL) { - IPPOL_REFHOLD(p); - } - io->ipsec_out_src_port = connp->conn_lport; - io->ipsec_out_dst_port = connp->conn_fport; - io->ipsec_out_icmp_type = io->ipsec_out_icmp_code = 0; - if (pol != NULL) - IPPOL_REFRELE(pol, ns); - } else if (pol != NULL) { - ipsec_selector_t sel; - - bzero((void*)&sel, sizeof (sel)); - - p = pol; - /* - * conn does not have the port information. Get - * it from the packet. - */ + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) + return; - if (!ipsec_init_outbound_ports(&sel, *mp, ipha, ip6h, 0, - ns->netstack_ipsec)) { - /* Callee did ip_drop_packet() on *mp. */ - *mp = NULL; - freeb(ipsec_mp); - return (NULL); - } - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; + if (ira->ira_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_ah_sa); + ira->ira_ipsec_ah_sa = NULL; } - - io->ipsec_out_proto = proto; - io->ipsec_out_use_global_policy = B_TRUE; - io->ipsec_out_secure = (p != NULL); - io->ipsec_out_policy = p; - - if (p == NULL) { - if (connp->conn_policy != NULL) { - io->ipsec_out_secure = B_TRUE; - ASSERT(io->ipsec_out_latch == NULL); - ASSERT(io->ipsec_out_use_global_policy == B_TRUE); - io->ipsec_out_need_policy = B_TRUE; - ASSERT(io->ipsec_out_polhead == NULL); - IPPH_REFHOLD(connp->conn_policy); - io->ipsec_out_polhead = connp->conn_policy; - } - } else { - /* Handle explicit drop action. 
*/ - if (p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_DISCARD || - p->ipsp_act->ipa_act.ipa_type == IPSEC_ACT_REJECT) { - ip_drop_packet(ipsec_mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_explicit), - &ipss->ipsec_spd_dropper); - *mp = NULL; - ipsec_mp = NULL; - } + if (ira->ira_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ira->ira_ipsec_esp_sa); + ira->ira_ipsec_esp_sa = NULL; } - - return (ipsec_mp); + ira->ira_flags &= ~IRAF_IPSEC_SECURE; } /* - * Allocate an IPSEC_IN mblk. This will be prepended to an inbound datagram - * and keep track of what-if-any IPsec processing will be applied to the - * datagram. - */ -mblk_t * -ipsec_in_alloc(boolean_t isv4, netstack_t *ns) -{ - mblk_t *ipsec_in; - ipsec_in_t *ii = kmem_cache_alloc(ipsec_info_cache, KM_NOSLEEP); - - if (ii == NULL) - return (NULL); - - bzero(ii, sizeof (ipsec_info_t)); - ii->ipsec_in_type = IPSEC_IN; - ii->ipsec_in_len = sizeof (ipsec_in_t); - - ii->ipsec_in_v4 = isv4; - ii->ipsec_in_secure = B_TRUE; - ii->ipsec_in_ns = ns; /* No netstack_hold */ - ii->ipsec_in_stackid = ns->netstack_stackid; - - ii->ipsec_in_frtn.free_func = ipsec_in_free; - ii->ipsec_in_frtn.free_arg = (char *)ii; - - ii->ipsec_in_zoneid = ALL_ZONES; /* default for received packets */ - - ipsec_in = desballoc((uint8_t *)ii, sizeof (ipsec_info_t), BPRI_HI, - &ii->ipsec_in_frtn); - if (ipsec_in == NULL) { - ip1dbg(("ipsec_in_alloc: IPSEC_IN allocation failure.\n")); - ipsec_in_free(ii); - return (NULL); - } - - ipsec_in->b_datap->db_type = M_CTL; - ipsec_in->b_wptr += sizeof (ipsec_info_t); - - return (ipsec_in); -} - -/* - * This is called from ip_wput_local when a packet which needs - * security is looped back, to convert the IPSEC_OUT to a IPSEC_IN - * before fanout, where the policy check happens. In most of the - * cases, IPSEC processing has *never* been done. There is one case - * (ip_wput_ire_fragmentit -> ip_wput_frag -> icmp_frag_needed) where - * the packet is destined for localhost, IPSEC processing has already - * been done. 
+ * This is called from ire_send_local when a packet + * is looped back. We setup the ip_recv_attr_t "borrowing" the references + * held by the callers. + * Note that we don't do any IPsec but we carry the actions and IPSEC flags + * across so that the fanout policy checks see that IPsec was applied. * - * Future: This could happen after SA selection has occurred for - * outbound.. which will tell us who the src and dst identities are.. - * Then it's just a matter of splicing the ah/esp SA pointers from the - * ipsec_out_t to the ipsec_in_t. + * The caller should do ipsec_in_release_refs() on the ira by calling + * ira_cleanup(). */ void -ipsec_out_to_in(mblk_t *ipsec_mp) +ipsec_out_to_in(ip_xmit_attr_t *ixa, ill_t *ill, ip_recv_attr_t *ira) { - ipsec_in_t *ii; - ipsec_out_t *io; ipsec_policy_t *pol; ipsec_action_t *act; - boolean_t v4, icmp_loopback; - zoneid_t zoneid; - netstack_t *ns; - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); + /* Non-IPsec operations */ + ira->ira_free_flags = 0; + ira->ira_zoneid = ixa->ixa_zoneid; + ira->ira_cred = ixa->ixa_cred; + ira->ira_cpid = ixa->ixa_cpid; + ira->ira_tsl = ixa->ixa_tsl; + ira->ira_ill = ira->ira_rill = ill; + ira->ira_flags = ixa->ixa_flags & IAF_MASK; + ira->ira_no_loop_zoneid = ixa->ixa_no_loop_zoneid; + ira->ira_pktlen = ixa->ixa_pktlen; + ira->ira_ip_hdr_length = ixa->ixa_ip_hdr_length; + ira->ira_protocol = ixa->ixa_protocol; + ira->ira_mhip = NULL; + + ira->ira_flags |= IRAF_LOOPBACK | IRAF_L2SRC_LOOPBACK; + + ira->ira_sqp = ixa->ixa_sqp; + ira->ira_ring = NULL; + + ira->ira_ruifindex = ill->ill_phyint->phyint_ifindex; + ira->ira_rifindex = ira->ira_ruifindex; + + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) + return; - io = (ipsec_out_t *)ipsec_mp->b_rptr; + ira->ira_flags |= IRAF_IPSEC_SECURE; - v4 = io->ipsec_out_v4; - zoneid = io->ipsec_out_zoneid; - icmp_loopback = io->ipsec_out_icmp_loopback; - ns = io->ipsec_out_ns; + ira->ira_ipsec_ah_sa = NULL; + ira->ira_ipsec_esp_sa = NULL; - act = io->ipsec_out_act; 
+ act = ixa->ixa_ipsec_action; if (act == NULL) { - pol = io->ipsec_out_policy; + pol = ixa->ixa_ipsec_policy; if (pol != NULL) { act = pol->ipsp_act; IPACT_REFHOLD(act); } } - io->ipsec_out_act = NULL; - - ipsec_out_release_refs(io); /* No netstack_rele/hold needed */ - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - bzero(ii, sizeof (ipsec_in_t)); - ii->ipsec_in_type = IPSEC_IN; - ii->ipsec_in_len = sizeof (ipsec_in_t); - ii->ipsec_in_loopback = B_TRUE; - ii->ipsec_in_ns = ns; /* No netstack_hold */ - - ii->ipsec_in_frtn.free_func = ipsec_in_free; - ii->ipsec_in_frtn.free_arg = (char *)ii; - ii->ipsec_in_action = act; - ii->ipsec_in_zoneid = zoneid; - - /* - * In most of the cases, we can't look at the ipsec_out_XXX_sa - * because this never went through IPSEC processing. So, look at - * the requests and infer whether it would have gone through - * IPSEC processing or not. Initialize the "done" fields with - * the requests. The possible values for "done" fields are : - * - * 1) zero, indicates that a particular preference was never - * requested. - * 2) non-zero, indicates that it could be IPSEC_PREF_REQUIRED/ - * IPSEC_PREF_NEVER. If IPSEC_REQ_DONE is set, it means that - * IPSEC processing has been completed. - */ - ii->ipsec_in_secure = B_TRUE; - ii->ipsec_in_v4 = v4; - ii->ipsec_in_icmp_loopback = icmp_loopback; + ixa->ixa_ipsec_action = NULL; + ira->ira_ipsec_action = act; } /* - * Consults global policy to see whether this datagram should - * go out secure. If so it attaches a ipsec_mp in front and - * returns. + * Consults global policy and per-socket policy to see whether this datagram + * should go out secure. If so it updates the ip_xmit_attr_t + * Should not be used when connecting, since then we want to latch the policy. + * + * If connp is NULL we just look at the global policy. + * + * Returns NULL if the packet was dropped, in which case the MIB has + * been incremented and ip_drop_packet done. 
*/ mblk_t * -ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, - conn_t *connp, boolean_t unspec_src, zoneid_t zoneid) +ip_output_attach_policy(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, + const conn_t *connp, ip_xmit_attr_t *ixa) { - mblk_t *mp; - ipsec_out_t *io = NULL; ipsec_selector_t sel; - uint_t ill_index; - boolean_t conn_dontroutex; - boolean_t conn_multicast_loopx; - boolean_t policy_present; - ip_stack_t *ipst = ire->ire_ipst; + boolean_t policy_present; + ip_stack_t *ipst = ixa->ixa_ipst; netstack_t *ns = ipst->ips_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; + ipsec_policy_t *p; + ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen; ASSERT((ipha != NULL && ip6h == NULL) || (ip6h != NULL && ipha == NULL)); - bzero((void*)&sel, sizeof (sel)); - if (ipha != NULL) policy_present = ipss->ipsec_outbound_v4_policy_present; else policy_present = ipss->ipsec_outbound_v6_policy_present; - /* - * Fast Path to see if there is any policy. - */ - if (!policy_present) { - if (ipsec_mp->b_datap->db_type == M_CTL) { - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * If there is no global policy and ip_wput - * or ip_wput_multicast has attached this mp - * for multicast case, free the ipsec_mp and - * return the original mp. - */ - mp = ipsec_mp->b_cont; - freeb(ipsec_mp); - ipsec_mp = mp; - io = NULL; - } - ASSERT(io == NULL || !io->ipsec_out_tunnel); - } - if (((io == NULL) || (io->ipsec_out_polhead == NULL)) && - ((connp == NULL) || (connp->conn_policy == NULL))) - return (ipsec_mp); - } - ill_index = 0; - conn_multicast_loopx = conn_dontroutex = B_FALSE; - mp = ipsec_mp; - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - /* - * This is a connection where we have some per-socket - * policy or ip_wput has attached an ipsec_mp for - * the multicast datagram. 
- */ - io = (ipsec_out_t *)ipsec_mp->b_rptr; - if (!io->ipsec_out_secure) { - /* - * This ipsec_mp was allocated in ip_wput or - * ip_wput_multicast so that we will know the - * value of ill_index, conn_dontroute, - * conn_multicast_loop in the multicast case if - * we inherit global policy here. - */ - ill_index = io->ipsec_out_ill_index; - conn_dontroutex = io->ipsec_out_dontroute; - conn_multicast_loopx = io->ipsec_out_multicast_loop; - freeb(ipsec_mp); - ipsec_mp = mp; - io = NULL; - } - ASSERT(io == NULL || !io->ipsec_out_tunnel); - } + if (!policy_present && (connp == NULL || connp->conn_policy == NULL)) + return (mp); + + bzero((void*)&sel, sizeof (sel)); if (ipha != NULL) { - sel.ips_local_addr_v4 = (ipha->ipha_src != 0 ? - ipha->ipha_src : ire->ire_src_addr); + sel.ips_local_addr_v4 = ipha->ipha_src; sel.ips_remote_addr_v4 = ip_get_dst(ipha); - sel.ips_protocol = (uint8_t)ipha->ipha_protocol; sel.ips_isv4 = B_TRUE; } else { - ushort_t hdr_len; - uint8_t *nexthdrp; - boolean_t is_fragment; - sel.ips_isv4 = B_FALSE; - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - if (!unspec_src) - sel.ips_local_addr_v6 = ire->ire_src_addr_v6; - } else { - sel.ips_local_addr_v6 = ip6h->ip6_src; - } - - sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, &is_fragment); - if (is_fragment) { - /* - * It's a packet fragment for a packet that - * we have already processed (since IPsec processing - * is done before fragmentation), so we don't - * have to do policy checks again. Fragments can - * come back to us for processing if they have - * been queued up due to flow control. - */ - if (ipsec_mp->b_datap->db_type == M_CTL) { - mp = ipsec_mp->b_cont; - freeb(ipsec_mp); - ipsec_mp = mp; - } - return (ipsec_mp); - } - - /* IPv6 common-case. 
*/ - sel.ips_protocol = ip6h->ip6_nxt; - switch (ip6h->ip6_nxt) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_SCTP: - case IPPROTO_ICMPV6: - break; - default: - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, - &hdr_len, &nexthdrp)) { - BUMP_MIB(&ipst->ips_ip6_mib, - ipIfStatsOutDiscards); - freemsg(ipsec_mp); /* Not IPsec-related drop. */ - return (NULL); - } - sel.ips_protocol = *nexthdrp; - break; - } + sel.ips_local_addr_v6 = ip6h->ip6_src; + sel.ips_remote_addr_v6 = ip_get_dst_v6(ip6h, mp, NULL); } + sel.ips_protocol = ixa->ixa_protocol; if (!ipsec_init_outbound_ports(&sel, mp, ipha, ip6h, 0, ipss)) { if (ipha != NULL) { @@ -4794,65 +4225,36 @@ ip_wput_attach_policy(mblk_t *ipsec_mp, ipha_t *ipha, ip6_t *ip6h, ire_t *ire, } else { BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsOutDiscards); } - - /* Callee dropped the packet. */ + /* Note: mp already consumed and ip_drop_packet done */ return (NULL); } - if (io != NULL) { - /* - * We seem to have some local policy (we already have - * an ipsec_out). Look at global policy and see - * whether we have to inherit or not. - */ - io->ipsec_out_need_policy = B_FALSE; - ipsec_mp = ipsec_apply_global_policy(ipsec_mp, connp, - &sel, ns); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - ASSERT(io->ipsec_out_need_policy == B_FALSE); - return (ipsec_mp); + ASSERT(ixa->ixa_ipsec_policy == NULL); + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + ixa->ixa_ipsec_policy = p; + if (p != NULL) { + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp == NULL || connp->conn_policy == NULL) + ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } - /* - * We pass in a pointer to a pointer because mp can become - * NULL due to allocation failures or explicit drops. Callers - * of this function should assume a NULL mp means the packet - * was dropped. 
- */ - ipsec_mp = ipsec_attach_global_policy(&mp, connp, &sel, ns); - if (ipsec_mp == NULL) - return (mp); /* * Copy the right port information. */ - ASSERT(ipsec_mp->b_datap->db_type == M_CTL); - io = (ipsec_out_t *)ipsec_mp->b_rptr; - - ASSERT(io->ipsec_out_need_policy == B_FALSE); - ASSERT((io->ipsec_out_policy != NULL) || - (io->ipsec_out_act != NULL)); - io->ipsec_out_src_port = sel.ips_local_port; - io->ipsec_out_dst_port = sel.ips_remote_port; - io->ipsec_out_icmp_type = sel.ips_icmp_type; - io->ipsec_out_icmp_code = sel.ips_icmp_code; - /* - * Set ill_index, conn_dontroute and conn_multicast_loop - * for multicast datagrams. - */ - io->ipsec_out_ill_index = ill_index; - io->ipsec_out_dontroute = conn_dontroutex; - io->ipsec_out_multicast_loop = conn_multicast_loopx; - - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - io->ipsec_out_zoneid = zoneid; - return (ipsec_mp); + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + ixa->ixa_ipsec_proto = sel.ips_protocol; + return (mp); } /* * When appropriate, this function caches inbound and outbound policy - * for this connection. + * for this connection. The outbound policy is stored in conn_ixa. + * Note that it can not be used for SCTP since conn_faddr isn't set for SCTP. * * XXX need to work out more details about per-interface policy and * caching here! @@ -4866,20 +4268,38 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) netstack_t *ns = connp->conn_netstack; ipsec_stack_t *ipss = ns->netstack_ipsec; + connp->conn_ixa->ixa_ipsec_policy_gen = + ipss->ipsec_system_policy.iph_gen; /* * There is no policy latching for ICMP sockets because we can't * decide on which policy to use until we see the packet and get * type/code selectors. 
*/ - if (connp->conn_ulp == IPPROTO_ICMP || - connp->conn_ulp == IPPROTO_ICMPV6) { + if (connp->conn_proto == IPPROTO_ICMP || + connp->conn_proto == IPPROTO_ICMPV6) { connp->conn_in_enforce_policy = connp->conn_out_enforce_policy = B_TRUE; if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, ns); + IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; } - connp->conn_flags |= IPCL_CHECK_POLICY; + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy); + connp->conn_ixa->ixa_ipsec_policy = NULL; + } + if (connp->conn_ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action); + connp->conn_ixa->ixa_ipsec_action = NULL; + } + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; return (0); } @@ -4898,38 +4318,57 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) return (ENOMEM); } - sel.ips_protocol = connp->conn_ulp; + bzero((void*)&sel, sizeof (sel)); + + sel.ips_protocol = connp->conn_proto; sel.ips_local_port = connp->conn_lport; sel.ips_remote_port = connp->conn_fport; sel.ips_is_icmp_inv_acq = 0; sel.ips_isv4 = isv4; if (isv4) { - sel.ips_local_addr_v4 = connp->conn_src; - sel.ips_remote_addr_v4 = connp->conn_rem; + sel.ips_local_addr_v4 = connp->conn_laddr_v4; + sel.ips_remote_addr_v4 = connp->conn_faddr_v4; } else { - sel.ips_local_addr_v6 = connp->conn_srcv6; - sel.ips_remote_addr_v6 = connp->conn_remv6; + sel.ips_local_addr_v6 = connp->conn_laddr_v6; + sel.ips_remote_addr_v6 = connp->conn_faddr_v6; } - p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, NULL, &sel, - ns); - if (connp->conn_latch->ipl_in_policy != NULL) - IPPOL_REFRELE(connp->conn_latch->ipl_in_policy, ns); - 
connp->conn_latch->ipl_in_policy = p; + p = ipsec_find_policy(IPSEC_TYPE_INBOUND, connp, &sel, ns); + if (connp->conn_latch_in_policy != NULL) + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = p; connp->conn_in_enforce_policy = (p != NULL); - p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, NULL, &sel, - ns); - if (connp->conn_latch->ipl_out_policy != NULL) - IPPOL_REFRELE(connp->conn_latch->ipl_out_policy, ns); - connp->conn_latch->ipl_out_policy = p; + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + if (connp->conn_ixa->ixa_ipsec_policy != NULL) + IPPOL_REFRELE(connp->conn_ixa->ixa_ipsec_policy); + connp->conn_ixa->ixa_ipsec_policy = p; connp->conn_out_enforce_policy = (p != NULL); - + if (p != NULL) { + connp->conn_ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp->conn_policy == NULL) { + connp->conn_ixa->ixa_flags |= + IXAF_IPSEC_GLOBAL_POLICY; + } + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } /* Clear the latched actions too, in case we're recaching. 
*/ - if (connp->conn_latch->ipl_out_action != NULL) - IPACT_REFRELE(connp->conn_latch->ipl_out_action); - if (connp->conn_latch->ipl_in_action != NULL) - IPACT_REFRELE(connp->conn_latch->ipl_in_action); + if (connp->conn_ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(connp->conn_ixa->ixa_ipsec_action); + connp->conn_ixa->ixa_ipsec_action = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + connp->conn_ixa->ixa_ipsec_src_port = sel.ips_local_port; + connp->conn_ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + connp->conn_ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + connp->conn_ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + connp->conn_ixa->ixa_ipsec_proto = sel.ips_protocol; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; } /* @@ -4945,28 +4384,125 @@ ipsec_conn_cache_policy(conn_t *connp, boolean_t isv4) * global policy (because conn_policy_cached is already set). */ connp->conn_policy_cached = B_TRUE; - if (connp->conn_in_enforce_policy) - connp->conn_flags |= IPCL_CHECK_POLICY; return (0); } +/* + * When appropriate, this function caches outbound policy for faddr/fport. + * It is used when we are not connected i.e., when we can not latch the + * policy. 
+ */ void -iplatch_free(ipsec_latch_t *ipl, netstack_t *ns) -{ - if (ipl->ipl_out_policy != NULL) - IPPOL_REFRELE(ipl->ipl_out_policy, ns); - if (ipl->ipl_in_policy != NULL) - IPPOL_REFRELE(ipl->ipl_in_policy, ns); - if (ipl->ipl_in_action != NULL) - IPACT_REFRELE(ipl->ipl_in_action); - if (ipl->ipl_out_action != NULL) - IPACT_REFRELE(ipl->ipl_out_action); +ipsec_cache_outbound_policy(const conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, in_port_t dstport, ip_xmit_attr_t *ixa) +{ + boolean_t isv4 = (ixa->ixa_flags & IXAF_IS_IPV4) != 0; + boolean_t global_policy_present; + netstack_t *ns = connp->conn_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + + ixa->ixa_ipsec_policy_gen = ipss->ipsec_system_policy.iph_gen; + + /* + * There is no policy caching for ICMP sockets because we can't + * decide on which policy to use until we see the packet and get + * type/code selectors. + */ + if (connp->conn_proto == IPPROTO_ICMP || + connp->conn_proto == IPPROTO_ICMPV6) { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + } + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + return; + } + + global_policy_present = isv4 ? 
+ (ipss->ipsec_outbound_v4_policy_present || + ipss->ipsec_inbound_v4_policy_present) : + (ipss->ipsec_outbound_v6_policy_present || + ipss->ipsec_inbound_v6_policy_present); + + if ((connp->conn_policy != NULL) || global_policy_present) { + ipsec_selector_t sel; + ipsec_policy_t *p; + + bzero((void*)&sel, sizeof (sel)); + + sel.ips_protocol = connp->conn_proto; + sel.ips_local_port = connp->conn_lport; + sel.ips_remote_port = dstport; + sel.ips_is_icmp_inv_acq = 0; + sel.ips_isv4 = isv4; + if (isv4) { + IN6_V4MAPPED_TO_IPADDR(v6src, sel.ips_local_addr_v4); + IN6_V4MAPPED_TO_IPADDR(v6dst, sel.ips_remote_addr_v4); + } else { + sel.ips_local_addr_v6 = *v6src; + sel.ips_remote_addr_v6 = *v6dst; + } + + p = ipsec_find_policy(IPSEC_TYPE_OUTBOUND, connp, &sel, ns); + if (ixa->ixa_ipsec_policy != NULL) + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = p; + if (p != NULL) { + ixa->ixa_flags |= IXAF_IPSEC_SECURE; + if (connp->conn_policy == NULL) + ixa->ixa_flags |= IXAF_IPSEC_GLOBAL_POLICY; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + } + /* Clear the latched actions too, in case we're recaching. */ + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + + ixa->ixa_ipsec_src_port = sel.ips_local_port; + ixa->ixa_ipsec_dst_port = sel.ips_remote_port; + ixa->ixa_ipsec_icmp_type = sel.ips_icmp_type; + ixa->ixa_ipsec_icmp_code = sel.ips_icmp_code; + ixa->ixa_ipsec_proto = sel.ips_protocol; + } else { + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + if (ixa->ixa_ipsec_policy != NULL) { + IPPOL_REFRELE(ixa->ixa_ipsec_policy); + ixa->ixa_ipsec_policy = NULL; + } + if (ixa->ixa_ipsec_action != NULL) { + IPACT_REFRELE(ixa->ixa_ipsec_action); + ixa->ixa_ipsec_action = NULL; + } + } +} + +/* + * Returns B_FALSE if the policy has gone stale. 
+ */ +boolean_t +ipsec_outbound_policy_current(ip_xmit_attr_t *ixa) +{ + ipsec_stack_t *ipss = ixa->ixa_ipst->ips_netstack->netstack_ipsec; + + if (!(ixa->ixa_flags & IXAF_IPSEC_GLOBAL_POLICY)) + return (B_TRUE); + + return (ixa->ixa_ipsec_policy_gen == ipss->ipsec_system_policy.iph_gen); +} + +void +iplatch_free(ipsec_latch_t *ipl) +{ if (ipl->ipl_local_cid != NULL) IPSID_REFRELE(ipl->ipl_local_cid); if (ipl->ipl_remote_cid != NULL) IPSID_REFRELE(ipl->ipl_remote_cid); - if (ipl->ipl_local_id != NULL) - crfree(ipl->ipl_local_id); mutex_destroy(&ipl->ipl_lock); kmem_free(ipl, sizeof (*ipl)); } @@ -5622,18 +5158,19 @@ ipsec_unregister_prov_update(void) * SAs are available. If there's no per-tunnel policy, or a match comes back * with no match, then still return the packet and have global policy take * a crack at it in IP. + * This updates the ip_xmit_attr with the IPsec policy. * * Remember -> we can be forwarding packets. Keep that in mind w.r.t. * inner-packet contents. */ mblk_t * ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, - ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len) + ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len, + ip_xmit_attr_t *ixa) { ipsec_policy_head_t *polhead; ipsec_selector_t sel; - mblk_t *ipsec_mp, *ipsec_mp_head, *nmp; - ipsec_out_t *io; + mblk_t *nmp; boolean_t is_fragment; ipsec_policy_t *pol; ipsec_tun_pol_t *itp = iptun->iptun_itp; @@ -5644,6 +5181,15 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, outer_ipv4 != NULL && outer_ipv6 == NULL); /* We take care of inners in a bit. */ + /* Are the IPsec fields initialized at all? 
*/ + if (!(ixa->ixa_flags & IXAF_IPSEC_SECURE)) { + ASSERT(ixa->ixa_ipsec_policy == NULL); + ASSERT(ixa->ixa_ipsec_latch == NULL); + ASSERT(ixa->ixa_ipsec_action == NULL); + ASSERT(ixa->ixa_ipsec_ah_sa == NULL); + ASSERT(ixa->ixa_ipsec_esp_sa == NULL); + } + ASSERT(itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)); polhead = itp->itp_policy; @@ -5675,7 +5221,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, if (mp->b_cont != NULL) { nmp = msgpullup(mp, -1); if (nmp == NULL) { - ip_drop_packet(mp, B_FALSE, NULL, NULL, + ip_drop_packet(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); return (NULL); @@ -5734,8 +5280,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, ip6h = (ip6_t *)mp->b_rptr; if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip6_hdr_length, &v6_proto_p)) { - ip_drop_packet_chain(mp, B_FALSE, - NULL, NULL, DROPPER(ipss, + ip_drop_packet_chain(mp, B_FALSE, NULL, + DROPPER(ipss, ipds_spd_malformed_packet), &ipss->ipsec_spd_dropper); return (NULL); @@ -5761,8 +5307,8 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, sel.ips_remote_addr_v6 = inner_ipv6->ip6_dst; if (!ip_hdr_length_nexthdr_v6(mp, inner_ipv6, &ip6_hdr_length, &v6_proto_p)) { - ip_drop_packet_chain(mp, B_FALSE, - NULL, NULL, DROPPER(ipss, + ip_drop_packet_chain(mp, B_FALSE, NULL, + DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); return (NULL); @@ -5802,8 +5348,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, /* Success so far! 
*/ } rw_enter(&polhead->iph_lock, RW_READER); - pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, - &sel, ns); + pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_OUTBOUND, &sel); rw_exit(&polhead->iph_lock); if (pol == NULL) { /* @@ -5825,7 +5370,7 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, cmn_err(CE_WARN, "ipsec_tun_outbound(): No matching tunnel " "per-port policy\n"); #endif - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, + ip_drop_packet_chain(mp, B_FALSE, NULL, DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); return (NULL); @@ -5835,101 +5380,65 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, cmn_err(CE_WARN, "Having matching tunnel per-port policy\n"); #endif - /* Construct an IPSEC_OUT message. */ - ipsec_mp = ipsec_mp_head = ipsec_alloc_ipsec_out(ns); - if (ipsec_mp == NULL) { - IPPOL_REFRELE(pol, ns); - ip_drop_packet(mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ipsec_mp->b_cont = mp; - io = (ipsec_out_t *)ipsec_mp->b_rptr; - IPPH_REFHOLD(polhead); /* - * NOTE: free() function of ipsec_out mblk will release polhead and - * pol references. + * NOTE: ixa_cleanup() function will release pol references. */ - io->ipsec_out_polhead = polhead; - io->ipsec_out_policy = pol; + ixa->ixa_ipsec_policy = pol; /* * NOTE: There is a subtle difference between iptun_zoneid and * iptun_connp->conn_zoneid explained in iptun_conn_create(). When * interacting with the ip module, we must use conn_zoneid. */ - io->ipsec_out_zoneid = iptun->iptun_connp->conn_zoneid; - io->ipsec_out_v4 = (outer_ipv4 != NULL); - io->ipsec_out_secure = B_TRUE; + ixa->ixa_zoneid = iptun->iptun_connp->conn_zoneid; + + ASSERT((outer_ipv4 != NULL) ? 
(ixa->ixa_flags & IXAF_IS_IPV4) : + !(ixa->ixa_flags & IXAF_IS_IPV4)); + ASSERT(ixa->ixa_ipsec_policy != NULL); + ixa->ixa_flags |= IXAF_IPSEC_SECURE; if (!(itp->itp_flags & ITPF_P_TUNNEL)) { /* Set up transport mode for tunnelled packets. */ - io->ipsec_out_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP : + ixa->ixa_ipsec_proto = (inner_ipv4 != NULL) ? IPPROTO_ENCAP : IPPROTO_IPV6; - return (ipsec_mp); + return (mp); } /* Fill in tunnel-mode goodies here. */ - io->ipsec_out_tunnel = B_TRUE; + ixa->ixa_flags |= IXAF_IPSEC_TUNNEL; /* XXX Do I need to fill in all of the goodies here? */ if (inner_ipv4) { - io->ipsec_out_inaf = AF_INET; - io->ipsec_out_insrc[0] = + ixa->ixa_ipsec_inaf = AF_INET; + ixa->ixa_ipsec_insrc[0] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v4; - io->ipsec_out_indst[0] = + ixa->ixa_ipsec_indst[0] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v4; } else { - io->ipsec_out_inaf = AF_INET6; - io->ipsec_out_insrc[0] = + ixa->ixa_ipsec_inaf = AF_INET6; + ixa->ixa_ipsec_insrc[0] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[0]; - io->ipsec_out_insrc[1] = + ixa->ixa_ipsec_insrc[1] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[1]; - io->ipsec_out_insrc[2] = + ixa->ixa_ipsec_insrc[2] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[2]; - io->ipsec_out_insrc[3] = + ixa->ixa_ipsec_insrc[3] = pol->ipsp_sel->ipsl_key.ipsl_local.ipsad_v6.s6_addr32[3]; - io->ipsec_out_indst[0] = + ixa->ixa_ipsec_indst[0] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[0]; - io->ipsec_out_indst[1] = + ixa->ixa_ipsec_indst[1] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[1]; - io->ipsec_out_indst[2] = + ixa->ixa_ipsec_indst[2] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[2]; - io->ipsec_out_indst[3] = + ixa->ixa_ipsec_indst[3] = pol->ipsp_sel->ipsl_key.ipsl_remote.ipsad_v6.s6_addr32[3]; } - io->ipsec_out_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen; - io->ipsec_out_indstpfx = 
pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen; + ixa->ixa_ipsec_insrcpfx = pol->ipsp_sel->ipsl_key.ipsl_local_pfxlen; + ixa->ixa_ipsec_indstpfx = pol->ipsp_sel->ipsl_key.ipsl_remote_pfxlen; /* NOTE: These are used for transport mode too. */ - io->ipsec_out_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport; - io->ipsec_out_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport; - io->ipsec_out_proto = pol->ipsp_sel->ipsl_key.ipsl_proto; + ixa->ixa_ipsec_src_port = pol->ipsp_sel->ipsl_key.ipsl_lport; + ixa->ixa_ipsec_dst_port = pol->ipsp_sel->ipsl_key.ipsl_rport; + ixa->ixa_ipsec_proto = pol->ipsp_sel->ipsl_key.ipsl_proto; - /* - * The mp pointer still valid - * Add ipsec_out to each fragment. - * The fragment head already has one - */ - nmp = mp->b_next; - mp->b_next = NULL; - mp = nmp; - ASSERT(ipsec_mp != NULL); - while (mp != NULL) { - nmp = mp->b_next; - ipsec_mp->b_next = ipsec_out_tag(ipsec_mp_head, mp, ns); - if (ipsec_mp->b_next == NULL) { - ip_drop_packet_chain(ipsec_mp_head, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - ip_drop_packet_chain(mp, B_FALSE, NULL, NULL, - DROPPER(ipss, ipds_spd_nomem), - &ipss->ipsec_spd_dropper); - return (NULL); - } - ipsec_mp = ipsec_mp->b_next; - mp->b_next = NULL; - mp = nmp; - } - return (ipsec_mp_head); + return (mp); } /* @@ -5937,16 +5446,28 @@ ipsec_tun_outbound(mblk_t *mp, iptun_t *iptun, ipha_t *inner_ipv4, * calls ip_drop_packet() for me on NULL returns. */ mblk_t * -ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol, +ipsec_check_ipsecin_policy_reasm(mblk_t *attr_mp, ipsec_policy_t *pol, ipha_t *inner_ipv4, ip6_t *inner_ipv6, uint64_t pkt_unique, netstack_t *ns) { - /* Assume ipsec_mp is a chain of b_next-linked IPSEC_IN M_CTLs. */ + /* Assume attr_mp is a chain of b_next-linked ip_recv_attr mblk. 
*/ mblk_t *data_chain = NULL, *data_tail = NULL; - mblk_t *ii_next; - - while (ipsec_mp != NULL) { - ii_next = ipsec_mp->b_next; - ipsec_mp->b_next = NULL; /* No tripping asserts. */ + mblk_t *next; + mblk_t *data_mp; + ip_recv_attr_t iras; + + while (attr_mp != NULL) { + ASSERT(ip_recv_attr_is_mblk(attr_mp)); + next = attr_mp->b_next; + attr_mp->b_next = NULL; /* No tripping asserts. */ + + data_mp = attr_mp->b_cont; + attr_mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(attr_mp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + freemsg(data_mp); /* ip_drop_packet?? */ + ira_cleanup(&iras, B_TRUE); + goto fail; + } /* * Need IPPOL_REFHOLD(pol) for extras because @@ -5954,67 +5475,67 @@ ipsec_check_ipsecin_policy_reasm(mblk_t *ipsec_mp, ipsec_policy_t *pol, */ IPPOL_REFHOLD(pol); - if (ipsec_check_ipsecin_policy(ipsec_mp, pol, inner_ipv4, - inner_ipv6, pkt_unique, ns) != NULL) { - if (data_tail == NULL) { - /* First one */ - data_chain = data_tail = ipsec_mp->b_cont; - } else { - data_tail->b_next = ipsec_mp->b_cont; - data_tail = data_tail->b_next; - } - freeb(ipsec_mp); + data_mp = ipsec_check_ipsecin_policy(data_mp, pol, inner_ipv4, + inner_ipv6, pkt_unique, &iras, ns); + ira_cleanup(&iras, B_TRUE); + + if (data_mp == NULL) + goto fail; + + if (data_tail == NULL) { + /* First one */ + data_chain = data_tail = data_mp; } else { - /* - * ipsec_check_ipsecin_policy() freed ipsec_mp - * already. Need to get rid of any extra pol - * references, and any remaining bits as well. - */ - IPPOL_REFRELE(pol, ns); - ipsec_freemsg_chain(data_chain); - ipsec_freemsg_chain(ii_next); /* ipdrop stats? */ - return (NULL); + data_tail->b_next = data_mp; + data_tail = data_mp; } - ipsec_mp = ii_next; + attr_mp = next; } /* * One last release because either the loop bumped it up, or we never * called ipsec_check_ipsecin_policy(). */ - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); /* data_chain is ready for return to tun module. 
*/ return (data_chain); -} +fail: + /* + * Need to get rid of any extra pol + * references, and any remaining bits as well. + */ + IPPOL_REFRELE(pol); + ipsec_freemsg_chain(data_chain); + ipsec_freemsg_chain(next); /* ipdrop stats? */ + return (NULL); +} /* - * Returns B_TRUE if the inbound packet passed an IPsec policy check. Returns - * B_FALSE if it failed or if it is a fragment needing its friends before a + * Return a message if the inbound packet passed an IPsec policy check. Returns + * NULL if it failed or if it is a fragment needing its friends before a * policy check can be performed. * - * Expects a non-NULL *data_mp, an optional ipsec_mp, and a non-NULL polhead. - * data_mp may be reassigned with a b_next chain of packets if fragments + * Expects a non-NULL data_mp, and a non-NULL polhead. + * The returned mblk may be a b_next chain of packets if fragments * neeeded to be collected for a proper policy check. * - * Always frees ipsec_mp, but only frees data_mp if returns B_FALSE. This - * function calls ip_drop_packet() on data_mp if need be. + * This function calls ip_drop_packet() on data_mp if need be. * * NOTE: outer_hdr_len is signed. If it's a negative value, the caller * is inspecting an ICMP packet. */ -boolean_t -ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, +mblk_t * +ipsec_tun_inbound(ip_recv_attr_t *ira, mblk_t *data_mp, ipsec_tun_pol_t *itp, ipha_t *inner_ipv4, ip6_t *inner_ipv6, ipha_t *outer_ipv4, ip6_t *outer_ipv6, int outer_hdr_len, netstack_t *ns) { ipsec_policy_head_t *polhead; ipsec_selector_t sel; - mblk_t *message = (ipsec_mp == NULL) ? 
*data_mp : ipsec_mp; ipsec_policy_t *pol; uint16_t tmpport; selret_t rc; - boolean_t retval, port_policy_present, is_icmp, global_present; + boolean_t port_policy_present, is_icmp, global_present; in6_addr_t tmpaddr; ipaddr_t tmp4; uint8_t flags, *inner_hdr; @@ -6032,7 +5553,6 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, ASSERT(inner_ipv4 != NULL && inner_ipv6 == NULL || inner_ipv4 == NULL && inner_ipv6 != NULL); - ASSERT(message == *data_mp || message->b_cont == *data_mp); if (outer_hdr_len < 0) { outer_hdr_len = (-outer_hdr_len); @@ -6042,6 +5562,8 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) { + mblk_t *mp = data_mp; + polhead = itp->itp_policy; /* * We need to perform full Tunnel-Mode enforcement, @@ -6061,53 +5583,66 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, flags = ((port_policy_present ? SEL_PORT_POLICY : SEL_NONE) | (is_icmp ? SEL_IS_ICMP : SEL_NONE) | SEL_TUNNEL_MODE); - rc = ipsec_init_inbound_sel(&sel, *data_mp, inner_ipv4, + rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4, inner_ipv6, flags); switch (rc) { case SELRET_NOMEM: - ip_drop_packet(message, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_TUNFRAG: /* * At this point, if we're cleartext, we don't want * to go there. */ - if (ipsec_mp == NULL) { - ip_drop_packet(*data_mp, B_TRUE, NULL, NULL, + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); - *data_mp = NULL; - return (B_FALSE); + return (NULL); + } + /* + * If we need to queue the packet. First we + * get an mblk with the attributes. 
ipsec_fragcache_add + * will prepend that to the queued data and return + * a list of b_next messages each of which starts with + * the attribute mblk. + */ + mp = ip_recv_attr_to_mblk(ira); + if (mp == NULL) { + ip_drop_packet(data_mp, B_TRUE, NULL, + DROPPER(ipss, ipds_spd_nomem), + &ipss->ipsec_spd_dropper); + return (NULL); } - ASSERT(((ipsec_in_t *)ipsec_mp->b_rptr)-> - ipsec_in_secure); - message = ipsec_fragcache_add(&itp->itp_fragcache, - ipsec_mp, *data_mp, outer_hdr_len, ipss); + mp = ipsec_fragcache_add(&itp->itp_fragcache, + mp, data_mp, outer_hdr_len, ipss); - if (message == NULL) { + if (mp == NULL) { /* * Data is cached, fragment chain is not - * complete. I consume ipsec_mp and data_mp + * complete. */ - return (B_FALSE); + return (NULL); } /* * If we get here, we have a full fragment chain. * Reacquire headers and selectors from first fragment. */ - inner_hdr = message->b_cont->b_rptr; + ASSERT(ip_recv_attr_is_mblk(mp)); + data_mp = mp->b_cont; + inner_hdr = data_mp->b_rptr; if (outer_ipv4 != NULL) { inner_hdr += IPH_HDR_LENGTH( - (ipha_t *)message->b_cont->b_rptr); + (ipha_t *)data_mp->b_rptr); } else { - inner_hdr += ip_hdr_length_v6(message->b_cont, - (ip6_t *)message->b_cont->b_rptr); + inner_hdr += ip_hdr_length_v6(data_mp, + (ip6_t *)data_mp->b_rptr); } - ASSERT(inner_hdr <= message->b_cont->b_wptr); + ASSERT(inner_hdr <= data_mp->b_wptr); if (inner_ipv4 != NULL) { inner_ipv4 = (ipha_t *)inner_hdr; @@ -6121,7 +5656,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * Use SEL_TUNNEL_MODE to take into account the outer * header. Use SEL_POST_FRAG so we always get ports. 
*/ - rc = ipsec_init_inbound_sel(&sel, message->b_cont, + rc = ipsec_init_inbound_sel(&sel, data_mp, inner_ipv4, inner_ipv6, SEL_TUNNEL_MODE | SEL_POST_FRAG); switch (rc) { @@ -6132,17 +5667,15 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, */ break; case SELRET_NOMEM: - ip_drop_packet_chain(message, B_TRUE, - NULL, NULL, + ip_drop_packet_chain(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_BADPKT: - ip_drop_packet_chain(message, B_TRUE, - NULL, NULL, + ip_drop_packet_chain(mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); case SELRET_TUNFRAG: cmn_err(CE_WARN, "(TUNFRAG on 2nd call...)"); /* FALLTHRU */ @@ -6151,7 +5684,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, " returns bizarro 0x%x", rc); /* Guaranteed panic! */ ASSERT(rc == SELRET_NOMEM); - return (B_FALSE); + return (NULL); } /* FALLTHRU */ case SELRET_SUCCESS: @@ -6174,7 +5707,7 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, "ipsec_init_inbound_sel() returns bizarro 0x%x", rc); ASSERT(rc == SELRET_NOMEM); /* Guaranteed panic! */ - return (B_FALSE); + return (NULL); } if (is_icmp) { @@ -6192,42 +5725,54 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, /* find_policy_head() */ rw_enter(&polhead->iph_lock, RW_READER); pol = ipsec_find_policy_head(NULL, polhead, IPSEC_TYPE_INBOUND, - &sel, ns); + &sel); rw_exit(&polhead->iph_lock); if (pol != NULL) { - if (ipsec_mp == NULL || - !((ipsec_in_t *)ipsec_mp->b_rptr)-> - ipsec_in_secure) { - retval = pol->ipsp_act->ipa_allow_clear; - if (!retval) { + uint64_t pkt_unique; + + if (!(ira->ira_flags & IRAF_IPSEC_SECURE)) { + if (!pol->ipsp_act->ipa_allow_clear) { /* * XXX should never get here with * tunnel reassembled fragments? 
*/ - ASSERT(message->b_next == NULL); - ip_drop_packet(message, B_TRUE, NULL, - NULL, + ASSERT(mp == data_mp); + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_clear), &ipss->ipsec_spd_dropper); - } else if (ipsec_mp != NULL) { - freeb(ipsec_mp); + IPPOL_REFRELE(pol); + return (NULL); + } else { + IPPOL_REFRELE(pol); + return (mp); } - - IPPOL_REFRELE(pol, ns); - return (retval); } + pkt_unique = SA_UNIQUE_ID(sel.ips_remote_port, + sel.ips_local_port, + (inner_ipv4 == NULL) ? IPPROTO_IPV6 : + IPPROTO_ENCAP, sel.ips_protocol); + /* * NOTE: The following releases pol's reference and * calls ip_drop_packet() for me on NULL returns. * * "sel" is still good here, so let's use it! */ - *data_mp = ipsec_check_ipsecin_policy_reasm(message, - pol, inner_ipv4, inner_ipv6, SA_UNIQUE_ID( - sel.ips_remote_port, sel.ips_local_port, - (inner_ipv4 == NULL) ? IPPROTO_IPV6 : - IPPROTO_ENCAP, sel.ips_protocol), ns); - return (*data_mp != NULL); + if (data_mp == mp) { + /* A single packet without attributes */ + data_mp = ipsec_check_ipsecin_policy(data_mp, + pol, inner_ipv4, inner_ipv6, pkt_unique, + ira, ns); + } else { + /* + * We pass in the b_next chain of attr_mp's + * and get back a b_next chain of data_mp's. + */ + data_mp = ipsec_check_ipsecin_policy_reasm(mp, + pol, inner_ipv4, inner_ipv6, pkt_unique, + ns); + } + return (data_mp); } /* @@ -6237,11 +5782,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * a new-style tunnel-mode tunnel. */ if ((itp->itp_flags & ITPF_P_TUNNEL) && !is_icmp) { - ip_drop_packet_chain(message, B_TRUE, NULL, - NULL, + ip_drop_packet_chain(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_explicit), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); } } @@ -6251,24 +5795,22 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, * tunnel-mode tunnel, which either returns with a pass, or gets * hit by the ip_drop_packet_chain() call right above here. 
*/ + ASSERT(data_mp->b_next == NULL); /* If no per-tunnel security, check global policy now. */ - if (ipsec_mp != NULL && !global_present) { - if (((ipsec_in_t *)(ipsec_mp->b_rptr))-> - ipsec_in_icmp_loopback) { + if ((ira->ira_flags & IRAF_IPSEC_SECURE) && !global_present) { + if (ira->ira_flags & IRAF_TRUSTED_ICMP) { /* - * This is an ICMP message with an ipsec_mp - * attached. We should accept it. + * This is an ICMP message that was geenrated locally. + * We should accept it. */ - if (ipsec_mp != NULL) - freeb(ipsec_mp); - return (B_TRUE); + return (data_mp); } - ip_drop_packet(ipsec_mp, B_TRUE, NULL, NULL, + ip_drop_packet(data_mp, B_TRUE, NULL, DROPPER(ipss, ipds_spd_got_secure), &ipss->ipsec_spd_dropper); - return (B_FALSE); + return (NULL); } if (is_icmp) { @@ -6294,11 +5836,10 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } } - /* NOTE: Frees message if it returns NULL. */ - if (ipsec_check_global_policy(message, NULL, outer_ipv4, outer_ipv6, - (ipsec_mp != NULL), ns) == NULL) { - return (B_FALSE); - } + data_mp = ipsec_check_global_policy(data_mp, NULL, outer_ipv4, + outer_ipv6, ira, ns); + if (data_mp == NULL) + return (NULL); if (is_icmp) { /* Set things back to normal. */ @@ -6314,14 +5855,11 @@ ipsec_tun_inbound(mblk_t *ipsec_mp, mblk_t **data_mp, ipsec_tun_pol_t *itp, } } - if (ipsec_mp != NULL) - freeb(ipsec_mp); - /* * At this point, we pretend it's a cleartext accepted * packet. 
*/ - return (B_TRUE); + return (data_mp); } /* @@ -6365,7 +5903,7 @@ itp_unlink(ipsec_tun_pol_t *node, netstack_t *ns) rw_enter(&ipss->ipsec_tunnel_policy_lock, RW_WRITER); ipss->ipsec_tunnel_policy_gen++; - ipsec_fragcache_uninit(&node->itp_fragcache); + ipsec_fragcache_uninit(&node->itp_fragcache, ipss); avl_remove(&ipss->ipsec_tunnel_policies, node); rw_exit(&ipss->ipsec_tunnel_policy_lock); ITP_REFRELE(node, ns); @@ -6615,7 +6153,7 @@ ipsec_fragcache_init(ipsec_fragcache_t *frag) } void -ipsec_fragcache_uninit(ipsec_fragcache_t *frag) +ipsec_fragcache_uninit(ipsec_fragcache_t *frag, ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *fep; int i; @@ -6627,7 +6165,7 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag) fep = (frag->itpf_ptr)[i]; while (fep != NULL) { /* Returned fep is next in chain or NULL */ - fep = fragcache_delentry(i, fep, frag); + fep = fragcache_delentry(i, fep, frag, ipss); } } /* @@ -6658,10 +6196,12 @@ ipsec_fragcache_uninit(ipsec_fragcache_t *frag) /* * Add a fragment to the fragment cache. Consumes mp if NULL is returned. * Returns mp if a whole fragment has been assembled, NULL otherwise + * The returned mp could be a b_next chain of fragments. + * + * The iramp argument is set on inbound; NULL if outbound. */ - mblk_t * -ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, +ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *iramp, mblk_t *mp, int outer_hdr_len, ipsec_stack_t *ipss) { boolean_t is_v4; @@ -6672,7 +6212,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, uint8_t v6_proto; uint8_t *v6_proto_p; uint16_t ip6_hdr_length; - ip6_pkt_t ipp; + ip_pkt_t ipp; ip6_frag_t *fraghdr; ipsec_fragcache_entry_t *fep; int i; @@ -6680,10 +6220,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, int firstbyte, lastbyte; int offset; int last; - boolean_t inbound = (ipsec_mp != NULL); - mblk_t *first_mp = inbound ? 
ipsec_mp : mp; - - ASSERT(first_mp == mp || first_mp->b_cont == mp); + boolean_t inbound = (iramp != NULL); /* * You're on the slow path, so insure that every packet in the @@ -6692,14 +6229,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (mp->b_cont != NULL) { nmp = msgpullup(mp, -1); if (nmp == NULL) { - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } freemsg(mp); - if (ipsec_mp != NULL) - ipsec_mp->b_cont = nmp; mp = nmp; } @@ -6721,9 +6258,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, * If it fails we have a malformed packet */ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_packet), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } else { v6_proto = *v6_proto_p; @@ -6731,16 +6270,18 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &ipp, NULL); if (!(ipp.ipp_fields & IPPF_FRAGHDR)) { /* * We think this is a fragment, but didn't find * a fragment header. Something is wrong. 
*/ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } fraghdr = ipp.ipp_fraghdr; @@ -6759,7 +6300,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, */ itpf_time = gethrestime_sec(); if (itpf_time >= frag->itpf_expire_hint) - ipsec_fragcache_clean(frag); + ipsec_fragcache_clean(frag, ipss); /* Lookup to see if there is an existing entry */ @@ -6814,11 +6355,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, /* check for bogus fragments and delete the entry */ if (firstbyte > 0 && firstbyte <= 8) { if (fep != NULL) - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } @@ -6826,12 +6369,14 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (fep == NULL) { if (frag->itpf_freelist == NULL) { /* see if there is some space */ - ipsec_fragcache_clean(frag); + ipsec_fragcache_clean(frag, ipss); if (frag->itpf_freelist == NULL) { mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } } @@ -6879,7 +6424,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, ipha_t *niph; ipha_t *oniph; ip6_t *nip6h; - ip6_pkt_t nipp; + ip_pkt_t nipp; ip6_frag_t *nfraghdr; uint16_t nip6_hdr_length; uint8_t *nv6_proto_p; @@ -6929,14 +6474,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t 
*mp, if (!ip_hdr_length_nexthdr_v6(ndata_mp, nip6h, &nip6_hdr_length, &nv6_proto_p)) { mutex_exit(&frag->itpf_lock); - ip_drop_packet_chain(nmp, inbound, NULL, NULL, + ip_drop_packet_chain(nmp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); ipsec_freemsg_chain(ndata_mp); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } bzero(&nipp, sizeof (nipp)); - (void) ip_find_hdr_v6(ndata_mp, nip6h, &nipp, NULL); + (void) ip_find_hdr_v6(ndata_mp, nip6h, B_FALSE, &nipp, + NULL); nfraghdr = nipp.ipp_fraghdr; nfirstbyte = ntohs(nfraghdr->ip6f_offlg & IP6F_OFF_MASK); @@ -6968,11 +6516,13 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) - firstbyte)) { /* Overlapping data does not match */ - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_overlap_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } /* Part of defense for jolt2.c fragmentation attack */ @@ -6987,9 +6537,11 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, * ---------- ------ */ mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_evil_frag), &ipss->ipsec_spd_dropper); + if (inbound) + (void) ip_recv_attr_free_mblk(iramp); return (NULL); } @@ -7027,12 +6579,17 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (bcmp(data, ndata, MIN(lastbyte, nlastbyte) - nfirstbyte)) { /* Overlap mismatch */ - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, + ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, - NULL, DROPPER(ipss, + ip_drop_packet(mp, inbound, NULL, + 
DROPPER(ipss, ipds_spd_overlap_frag), &ipss->ipsec_spd_dropper); + if (inbound) { + (void) ip_recv_attr_free_mblk( + iramp); + } return (NULL); } } @@ -7046,21 +6603,31 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, prevmp = nmp; } - first_mp->b_next = nmp; + /* Prepend the attributes before we link it in */ + if (iramp != NULL) { + ASSERT(iramp->b_cont == NULL); + iramp->b_cont = mp; + mp = iramp; + iramp = NULL; + } + mp->b_next = nmp; if (prevmp == NULL) { - fep->itpfe_fraglist = first_mp; + fep->itpfe_fraglist = mp; } else { - prevmp->b_next = first_mp; + prevmp->b_next = mp; } if (last) fep->itpfe_last = 1; /* Part of defense for jolt2.c fragmentation attack */ if (++(fep->itpfe_depth) > IPSEC_MAX_FRAGS) { - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); - ip_drop_packet(first_mp, inbound, NULL, NULL, + if (inbound) + mp = ip_recv_attr_free_mblk(mp); + + ip_drop_packet(mp, inbound, NULL, DROPPER(ipss, ipds_spd_max_frags), &ipss->ipsec_spd_dropper); return (NULL); @@ -7078,7 +6645,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, #ifdef FRAGCACHE_DEBUG cmn_err(CE_WARN, "Last fragment cached.\n"); - cmn_err(CE_WARN, "mp = %p, first_mp = %p.\n", mp, first_mp); + cmn_err(CE_WARN, "mp = %p\n", mp); #endif offset = 0; @@ -7118,14 +6685,15 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, if (!ip_hdr_length_nexthdr_v6(data_mp, ip6h, &ip6_hdr_length, &v6_proto_p)) { mutex_exit(&frag->itpf_lock); - ip_drop_packet_chain(mp, inbound, NULL, NULL, + ip_drop_packet_chain(mp, inbound, NULL, DROPPER(ipss, ipds_spd_malformed_frag), &ipss->ipsec_spd_dropper); return (NULL); } v6_proto = *v6_proto_p; bzero(&ipp, sizeof (ipp)); - (void) ip_find_hdr_v6(data_mp, ip6h, &ipp, NULL); + (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, + NULL); fraghdr = ipp.ipp_fraghdr; firstbyte = ntohs(fraghdr->ip6f_offlg & 
IP6F_OFF_MASK); @@ -7163,7 +6731,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, (!is_v4 && !(fraghdr->ip6f_offlg & IP6F_MORE_FRAG))) { mp = fep->itpfe_fraglist; fep->itpfe_fraglist = NULL; - (void) fragcache_delentry(i, fep, frag); + (void) fragcache_delentry(i, fep, frag, ipss); mutex_exit(&frag->itpf_lock); if ((is_v4 && (firstbyte + ntohs(iph->ipha_length) > @@ -7171,7 +6739,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, ntohs(ip6h->ip6_plen) > 65535))) { /* It is an invalid "ping-o-death" packet */ /* Discard it */ - ip_drop_packet_chain(mp, inbound, NULL, NULL, + ip_drop_packet_chain(mp, inbound, NULL, DROPPER(ipss, ipds_spd_evil_frag), &ipss->ipsec_spd_dropper); return (NULL); @@ -7181,7 +6749,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, "mp->b_next = %p", mp, mp->b_next); #endif /* - * For inbound case, mp has ipsec_in b_next'd chain + * For inbound case, mp has attrmp b_next'd chain * For outbound case, it is just data mp chain */ return (mp); @@ -7202,7 +6770,7 @@ ipsec_fragcache_add(ipsec_fragcache_t *frag, mblk_t *ipsec_mp, mblk_t *mp, } static void -ipsec_fragcache_clean(ipsec_fragcache_t *frag) +ipsec_fragcache_clean(ipsec_fragcache_t *frag, ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *fep; int i; @@ -7221,7 +6789,7 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag) while (fep) { if (fep->itpfe_exp < itpf_time) { /* found */ - fep = fragcache_delentry(i, fep, frag); + fep = fragcache_delentry(i, fep, frag, ipss); } else { if (fep->itpfe_exp < earlyexp) { earlyfep = fep; @@ -7237,12 +6805,12 @@ ipsec_fragcache_clean(ipsec_fragcache_t *frag) /* if (!found) */ if (frag->itpf_freelist == NULL) - (void) fragcache_delentry(earlyi, earlyfep, frag); + (void) fragcache_delentry(earlyi, earlyfep, frag, ipss); } static ipsec_fragcache_entry_t * fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep, - ipsec_fragcache_t *frag) + ipsec_fragcache_t *frag, 
ipsec_stack_t *ipss) { ipsec_fragcache_entry_t *targp; ipsec_fragcache_entry_t *nextp = fep->itpfe_next; @@ -7250,7 +6818,12 @@ fragcache_delentry(int slot, ipsec_fragcache_entry_t *fep, ASSERT(MUTEX_HELD(&frag->itpf_lock)); /* Free up any fragment list still in cache entry */ - ipsec_freemsg_chain(fep->itpfe_fraglist); + if (fep->itpfe_fraglist != NULL) { + ip_drop_packet_chain(fep->itpfe_fraglist, + ip_recv_attr_is_mblk(fep->itpfe_fraglist), NULL, + DROPPER(ipss, ipds_spd_nomem), &ipss->ipsec_spd_dropper); + } + fep->itpfe_fraglist = NULL; targp = (frag->itpf_ptr)[slot]; ASSERT(targp != 0); diff --git a/usr/src/uts/common/inet/ip/spdsock.c b/usr/src/uts/common/inet/ip/spdsock.c index e15d23fdd8..1b25af4a97 100644 --- a/usr/src/uts/common/inet/ip/spdsock.c +++ b/usr/src/uts/common/inet/ip/spdsock.c @@ -58,7 +58,6 @@ #include <inet/nd.h> #include <inet/ip_if.h> #include <inet/optcom.h> -#include <inet/ipsec_info.h> #include <inet/ipsec_impl.h> #include <inet/spdsock.h> #include <inet/sadb.h> @@ -1150,9 +1149,8 @@ spdsock_addrule(queue_t *q, ipsec_policy_head_t *iph, mblk_t *mp, fail: rw_exit(&iph->iph_lock); - while ((--rulep) >= &rules[0]) { - IPPOL_REFRELE(rulep->pol, spds->spds_netstack); - } + while ((--rulep) >= &rules[0]) + IPPOL_REFRELE(rulep->pol); ipsec_actvec_free(actp, nact); fail2: if (itp != NULL) { @@ -2519,8 +2517,8 @@ error: * be invoked either once IPsec is loaded on a cached request, or * when a request is received while IPsec is loaded. 
*/ -static void -spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) +static int +spdsock_do_updatealg(spd_ext_t *extv[], spd_stack_t *spds) { struct spd_ext_actions *actp; struct spd_attribute *attr, *endattr; @@ -2529,17 +2527,15 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) ipsec_algtype_t alg_type = 0; boolean_t skip_alg = B_TRUE, doing_proto = B_FALSE; uint_t i, cur_key, cur_block, algid; + int diag = -1; - *diag = -1; ASSERT(MUTEX_HELD(&spds->spds_alg_lock)); /* parse the message, building the list of algorithms */ actp = (struct spd_ext_actions *)extv[SPD_EXT_ACTION]; - if (actp == NULL) { - *diag = SPD_DIAGNOSTIC_NO_ACTION_EXT; - return; - } + if (actp == NULL) + return (SPD_DIAGNOSTIC_NO_ACTION_EXT); start = (uint64_t *)actp; end = (start + actp->spd_actions_len); @@ -2583,7 +2579,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) ss1dbg(spds, ("spdsock_do_updatealg: " "invalid alg id %d\n", attr->spd_attr_value)); - *diag = SPD_DIAGNOSTIC_ALG_ID_RANGE; + diag = SPD_DIAGNOSTIC_ALG_ID_RANGE; goto bail; } alg->alg_id = attr->spd_attr_value; @@ -2623,7 +2619,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_key >= alg->alg_nkey_sizes) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many key sizes\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_KEY_SIZES; goto bail; } alg->alg_key_sizes[cur_key++] = attr->spd_attr_value; @@ -2659,7 +2655,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_block >= alg->alg_nblock_sizes) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many block sizes\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; goto bail; } alg->alg_block_sizes[cur_block++] = @@ -2686,7 +2682,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) cur_block >= alg->alg_nparams) { ss1dbg(spds, ("spdsock_do_updatealg: " "too many 
params\n")); - *diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; + diag = SPD_DIAGNOSTIC_ALG_NUM_BLOCK_SIZES; goto bail; } /* @@ -2703,7 +2699,7 @@ spdsock_do_updatealg(spd_ext_t *extv[], int *diag, spd_stack_t *spds) if (attr->spd_attr_value > CRYPTO_MAX_MECH_NAME) { ss1dbg(spds, ("spdsock_do_updatealg: " "mech name too long\n")); - *diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN; + diag = SPD_DIAGNOSTIC_ALG_MECH_NAME_LEN; goto bail; } mech_name = (char *)(attr + 1); @@ -2751,6 +2747,7 @@ bail: for (algid = 0; algid < IPSEC_MAX_ALGS; algid++) if (spds->spds_algs[alg_type][algid] != NULL) ipsec_alg_free(spds->spds_algs[alg_type][algid]); + return (diag); } /* @@ -2803,9 +2800,12 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[]) int diag; mutex_enter(&spds->spds_alg_lock); - spdsock_do_updatealg(extv, &diag, spds); - mutex_exit(&spds->spds_alg_lock); + diag = spdsock_do_updatealg(extv, spds); if (diag == -1) { + /* Keep the lock held while we walk the SA tables. */ + sadb_alg_update(IPSEC_ALG_ALL, 0, 0, + spds->spds_netstack); + mutex_exit(&spds->spds_alg_lock); spd_echo(q, mp); if (audit_active) { cred_t *cr; @@ -2817,6 +2817,7 @@ spdsock_updatealg(queue_t *q, mblk_t *mp, spd_ext_t *extv[]) cpid); } } else { + mutex_exit(&spds->spds_alg_lock); spdsock_diag(q, mp, diag); if (audit_active) { cred_t *cr; @@ -3117,10 +3118,7 @@ spdsock_update_pending_algs(netstack_t *ns) mutex_enter(&spds->spds_alg_lock); if (spds->spds_algs_pending) { - int diag; - - spdsock_do_updatealg(spds->spds_extv_algs, &diag, - spds); + (void) spdsock_do_updatealg(spds->spds_extv_algs, spds); spds->spds_algs_pending = B_FALSE; } mutex_exit(&spds->spds_alg_lock); @@ -3265,7 +3263,7 @@ spdsock_opt_get(queue_t *q, int level, int name, uchar_t *ptr) int spdsock_opt_set(queue_t *q, uint_t mgmt_flags, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)invalp; 
spdsock_t *ss = (spdsock_t *)q->q_ptr; @@ -3337,11 +3335,9 @@ spdsock_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)mp->b_rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, - &spdsock_opt_obj, B_FALSE); + svr4_optcom_req(q, mp, cr, &spdsock_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, - &spdsock_opt_obj, B_FALSE); + tpi_optcom_req(q, mp, cr, &spdsock_opt_obj); } break; case T_DATA_REQ: diff --git a/usr/src/uts/common/inet/ip/spdsock_opt_data.c b/usr/src/uts/common/inet/ip/spdsock_opt_data.c index df797bb37a..c5438f29cc 100644 --- a/usr/src/uts/common/inet/ip/spdsock_opt_data.c +++ b/usr/src/uts/common/inet/ip/spdsock_opt_data.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #define _SUN_TPI_VERSION 1 @@ -53,9 +51,9 @@ */ opdes_t spdsock_opt_arr[] = { - { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT, + { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, 0, (t_uscalar_t)sizeof (int), 0 }, - { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_PASSNEXT, + { SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, 0, (t_uscalar_t)sizeof (int), 0 }, }; @@ -88,7 +86,6 @@ optdb_obj_t spdsock_opt_obj = { NULL, /* SPDSOCK default value function pointer */ spdsock_opt_get, /* SPDSOCK get function pointer */ spdsock_opt_set, /* SPDSOCK set function pointer */ - B_TRUE, /* SPDSOCK is tpi provider */ SPDSOCK_OPT_ARR_CNT, /* SPDSOCK option database count of entries */ spdsock_opt_arr, /* SPDSOCK option database */ SPDSOCK_VALID_LEVELS_CNT, /* SPDSOCK valid level count of entries */ diff --git a/usr/src/uts/common/inet/ip/tn_ipopt.c b/usr/src/uts/common/inet/ip/tn_ipopt.c index 359b8d4623..1ce050ec69 100644 --- a/usr/src/uts/common/inet/ip/tn_ipopt.c +++ b/usr/src/uts/common/inet/ip/tn_ipopt.c @@ -271,38 +271,40 @@ tsol_get_option_v6(mblk_t *mp, 
tsol_ip_label_t *label_type, uchar_t **buffer) * tsol_check_dest() * * This routine verifies if a destination is allowed to recieve messages - * based on the message cred's security label. If any adjustments to - * the cred are needed due to the connection's MAC mode or - * the destination's ability to receive labels, an "effective cred" - * will be returned. + * based on the security label. If any adjustments to the label are needed + * due to the connection's MAC mode or the destination's ability + * to receive labels, an "effective label" will be returned. + * + * zone_is_global is set if the actual zoneid is global. That is, it is + * not set for an exclusive-IP zone. * - * On successful return, effective_cred will point to the new creds needed - * or will be NULL if new creds aren't needed. On error, effective_cred - * is NULL. + * On successful return, effective_tsl will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. 
* * Returns: - * 0 Have or constructed appropriate credentials - * EHOSTUNREACH The credentials failed the remote host accreditation + * 0 Label (was|is now) correct + * EHOSTUNREACH The label failed the remote host accreditation * ENOMEM Memory allocation failure */ int -tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, - uint_t mac_mode, cred_t **effective_cred) +tsol_check_dest(const ts_label_t *tsl, const void *dst, + uchar_t version, uint_t mac_mode, boolean_t zone_is_global, + ts_label_t **effective_tsl) { - ts_label_t *tsl, *newtsl = NULL; + ts_label_t *newtsl = NULL; tsol_tpc_t *dst_rhtp; - zoneid_t zoneid; - if (effective_cred != NULL) - *effective_cred = NULL; + if (effective_tsl != NULL) + *effective_tsl = NULL; ASSERT(version == IPV4_VERSION || (version == IPV6_VERSION && !IN6_IS_ADDR_V4MAPPED((in6_addr_t *)dst))); /* Always pass kernel level communication (NULL label) */ - if ((tsl = crgetlabel(credp)) == NULL) { + if (tsl == NULL) { DTRACE_PROBE2(tx__tnopt__log__info__labeling__mac__allownull, - char *, "destination ip(1) with null cred was passed", + char *, "destination ip(1) with null label was passed", ipaddr_t, dst); return (0); } @@ -358,9 +360,8 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, } if (!blequal(&dst_rhtp->tpc_tp.tp_def_label, &tsl->tsl_label)) { - zoneid = crgetzoneid(credp); if (mac_mode != CONN_MAC_AWARE || - !(zoneid == GLOBAL_ZONEID || + !(zone_is_global || bldominates(&tsl->tsl_label, &dst_rhtp->tpc_tp.tp_def_label))) { DTRACE_PROBE4( @@ -438,51 +439,43 @@ tsol_check_dest(const cred_t *credp, const void *dst, uchar_t version, } /* - * Generate a new cred if we modified the security label or - * label flags. + * Return the new label. 
*/ if (newtsl != NULL) { - if (effective_cred != NULL) { - *effective_cred = copycred_from_tslabel(credp, - newtsl, KM_NOSLEEP); - } - label_rele(newtsl); - if (effective_cred != NULL && *effective_cred == NULL) { - TPC_RELE(dst_rhtp); - return (ENOMEM); - } + if (effective_tsl != NULL) + *effective_tsl = newtsl; + else + label_rele(newtsl); } TPC_RELE(dst_rhtp); return (0); } /* - * tsol_compute_label() + * tsol_compute_label_v4() * * This routine computes the IP label that should be on a packet based on the * connection and destination information. * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exclusive-IP zones). + * * Returns: * 0 Fetched label * EHOSTUNREACH No route to destination * EINVAL Label cannot be computed */ int -tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, - ip_stack_t *ipst) +tsol_compute_label_v4(const ts_label_t *tsl, zoneid_t zoneid, ipaddr_t dst, + uchar_t *opt_storage, ip_stack_t *ipst) { uint_t sec_opt_len; - ts_label_t *tsl; - ire_t *ire, *sire = NULL; - tsol_ire_gw_secattr_t *attrp; - zoneid_t zoneid, ip_zoneid; - - ASSERT(credp != NULL); + ire_t *ire; + tsol_ire_gw_secattr_t *attrp = NULL; if (opt_storage != NULL) opt_storage[IPOPT_OLEN] = 0; - if ((tsl = crgetlabel(credp)) == NULL) + if (tsl == NULL) return (0); /* always pass multicast */ @@ -493,67 +486,44 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, return (0); if (tsl->tsl_flags & TSLF_UNLABELED) { - /* * The destination is unlabeled. Only add a label if the * destination is not a broadcast/local/loopback address, * the destination is not on the same subnet, and the * next-hop gateway is labeled. - * - * For exclusive stacks we set the zoneid to zero - * to operate as if we are in the global zone for - * IRE lookups. 
*/ - zoneid = crgetzoneid(credp); - if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) - ip_zoneid = GLOBAL_ZONEID; - else - ip_zoneid = zoneid; - - ire = ire_cache_lookup(dst, ip_zoneid, tsl, ipst); - - if (ire != NULL && (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | - IRE_LOOPBACK | IRE_INTERFACE)) != 0) { - IRE_REFRELE(ire); - return (0); - } else if (ire == NULL) { - ire = ire_ftable_lookup(dst, 0, 0, 0, NULL, &sire, - ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst); - } - - /* no route to destination */ - if (ire == NULL) { + ire = ire_route_recursive_v4(dst, 0, NULL, zoneid, tsl, + MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + /* no route to destination */ + ire_refrele(ire); DTRACE_PROBE3( tx__tnopt__log__info__labeling__routedst__v4, char *, "No route to unlabeled dest ip(1) with " - "creds(2).", ipaddr_t, dst, cred_t *, credp); + "with label(2).", ipaddr_t, dst, ts_label_t *, tsl); return (EHOSTUNREACH); } + if (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK | + IRE_INTERFACE)) { + ire_refrele(ire); + return (0); + } /* - * Prefix IRE from f-table lookup means that the destination - * is not directly connected; check the next-hop attributes. + * ire_route_recursive gives us the first attrp it finds + * in the recursive lookup. */ - if (sire != NULL) { - ASSERT(ire != NULL); - IRE_REFRELE(ire); - ire = sire; - } - /* * Return now if next hop gateway is unlabeled. There is * no need to generate a CIPSO option for this message. 
*/ - attrp = ire->ire_gw_secattr; if (attrp == NULL || attrp->igsa_rhc == NULL || attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) { - IRE_REFRELE(ire); + ire_refrele(ire); return (0); } - - IRE_REFRELE(ire); - + ire_refrele(ire); } /* compute the CIPSO option */ @@ -562,8 +532,8 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, if (sec_opt_len == 0) { DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v4, - char *, "options lack length for dest ip(1) with creds(2).", - ipaddr_t, dst, cred_t *, credp); + char *, "options lack length for dest ip(1) with label(2).", + ipaddr_t, dst, ts_label_t *, tsl); return (EINVAL); } @@ -575,6 +545,9 @@ tsol_compute_label(const cred_t *credp, ipaddr_t dst, uchar_t *opt_storage, * header, move the 'buflen' bytes back to fill the gap, and return the number * of bytes removed (as zero or negative number). Assumes that the headers are * sane. + * + * Note that tsol_remove_secopt does not adjust ipha_length but + * tsol_remove_secopt_v6 does adjust ip6_plen. */ int tsol_remove_secopt(ipha_t *ipha, int buflen) @@ -659,6 +632,9 @@ tsol_remove_secopt(ipha_t *ipha, int buflen) * option cannot be inserted. (Note that negative return values are possible * when noops must be compressed, and that only -1 indicates error. Successful * return value is always evenly divisible by 4, by definition.) + * + * Note that tsol_prepend_option does not adjust ipha_length but + * tsol_prepend_option_v6 does adjust ip6_plen. */ int tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen) @@ -810,28 +786,39 @@ tsol_prepend_option(uchar_t *optbuf, ipha_t *ipha, int buflen) } /* - * tsol_check_label() + * tsol_check_label_v4() * * This routine computes the IP label that should be on the packet based on the - * connection and destination information. If the label is there, it returns - * zero, so the caller knows that the label is syncronized, and further calls - * are not required. 
If the label isn't right, then the right one is inserted. + * connection and destination information. It's called by the IP forwarding + * logic and by ip_output_simple. The ULPs generate the labels before calling + * conn_ip_output. If any adjustments to + * the label are needed due to the connection's MAC-exempt status or + * the destination's ability to receive labels, an "effective label" + * will be returned. * * The packet's header is clear before entering IPsec's engine. * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). + * zone_is_global is set if the actual zoneid is global. + * + * On successful return, effective_tslp will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. + * * Returns: - * 0 Label on packet (was|is now) correct + * 0 Label on (was|is now) correct * EACCES The packet failed the remote host accreditation. * ENOMEM Memory allocation failure. * EINVAL Label cannot be computed */ int -tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, - ip_stack_t *ipst, pid_t pid) +tsol_check_label_v4(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp, + uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst, + ts_label_t **effective_tslp) { mblk_t *mp = *mpp; ipha_t *ipha; - cred_t *effective_cred = NULL; + ts_label_t *effective_tsl = NULL; uchar_t opt_storage[IP_MAX_OPT_LENGTH]; uint_t hlen; uint_t sec_opt_len; @@ -839,19 +826,18 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, int delta_remove = 0, delta_add, adjust; int retv; + *effective_tslp = NULL; opt_storage[IPOPT_OPTVAL] = 0; ipha = (ipha_t *)mp->b_rptr; /* * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred with a modified label - * or label flags. Apply any such cred to the message block - * for use in future routing decisions. 
+ * the security label of the message data. tsol_check_dest() + * may create a new effective label or label flags. */ - retv = tsol_check_dest(credp, &ipha->ipha_dst, IPV4_VERSION, - mac_mode, &effective_cred); + retv = tsol_check_dest(tsl, &ipha->ipha_dst, IPV4_VERSION, + mac_mode, zone_is_global, &effective_tsl); if (retv != 0) return (retv); @@ -859,16 +845,15 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, * Calculate the security label to be placed in the text * of the message (if any). */ - if (effective_cred != NULL) { - if ((retv = tsol_compute_label(effective_cred, + if (effective_tsl != NULL) { + if ((retv = tsol_compute_label_v4(effective_tsl, zoneid, ipha->ipha_dst, opt_storage, ipst)) != 0) { - crfree(effective_cred); + label_rele(effective_tsl); return (retv); } - mblk_setcred(mp, effective_cred, pid); - crfree(effective_cred); + *effective_tslp = effective_tsl; } else { - if ((retv = tsol_compute_label(credp, + if ((retv = tsol_compute_label_v4(tsl, zoneid, ipha->ipha_dst, opt_storage, ipst)) != 0) { return (retv); } @@ -890,10 +875,6 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, return (0); } - if (msg_getcred(mp, NULL) == NULL) { - mblk_setcred(mp, (cred_t *)credp, NOPID); - } - /* * If there is an option there, then it must be the wrong one; delete. 
*/ @@ -918,8 +899,13 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, copylen = 256; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) + if (new_mp == NULL) { + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (ENOMEM); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -948,6 +934,10 @@ tsol_check_label(const cred_t *credp, mblk_t **mpp, uint_t mac_mode, return (0); param_prob: + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (EINVAL); } @@ -972,19 +962,17 @@ param_prob: * i.e starting from the IP6OPT_LS but not including the pad at the end. * The user must prepend two octets (either padding or next header / length) * and append padding out to the next 8 octet boundary. + * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). */ int -tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, - uchar_t *opt_storage, ip_stack_t *ipst) +tsol_compute_label_v6(const ts_label_t *tsl, zoneid_t zoneid, + const in6_addr_t *dst, uchar_t *opt_storage, ip_stack_t *ipst) { - ts_label_t *tsl; uint_t sec_opt_len; uint32_t doi; - zoneid_t zoneid, ip_zoneid; - ire_t *ire, *sire; - tsol_ire_gw_secattr_t *attrp; - - ASSERT(credp != NULL); + ire_t *ire; + tsol_ire_gw_secattr_t *attrp = NULL; if (ip6opt_ls == 0) return (EINVAL); @@ -992,15 +980,13 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (opt_storage != NULL) opt_storage[IPOPT_OLEN] = 0; - if ((tsl = crgetlabel(credp)) == NULL) + if (tsl == NULL) return (0); /* Always pass multicast */ if (IN6_IS_ADDR_MULTICAST(dst)) return (0); - zoneid = crgetzoneid(credp); - /* * Fill in a V6 label. 
If a new format is added here, make certain * that the maximum size of this label is reflected in sys/tsol/tnet.h @@ -1012,62 +998,41 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (tsl->tsl_flags & TSLF_UNLABELED) { /* * The destination is unlabeled. Only add a label if the - * destination is not broadcast/local/loopback address, + * destination is not a broadcast/local/loopback address, * the destination is not on the same subnet, and the * next-hop gateway is labeled. - * - * For exclusive stacks we set the zoneid to zero to - * operate as if we are in the global zone when - * performing IRE lookups and conn_t comparisons. */ - if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) - ip_zoneid = GLOBAL_ZONEID; - else - ip_zoneid = zoneid; - - sire = NULL; - ire = ire_cache_lookup_v6(dst, ip_zoneid, tsl, ipst); - - if (ire != NULL && (ire->ire_type & (IRE_LOCAL | - IRE_LOOPBACK | IRE_INTERFACE)) != 0) { - IRE_REFRELE(ire); - return (0); - } else if (ire == NULL) { - ire = ire_ftable_lookup_v6(dst, NULL, NULL, 0, NULL, - &sire, ip_zoneid, 0, tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT | MATCH_IRE_SECATTR), ipst); - } - - /* no route to destination */ - if (ire == NULL) { + ire = ire_route_recursive_v6(dst, 0, NULL, zoneid, tsl, + MATCH_IRE_SECATTR, B_TRUE, 0, ipst, NULL, &attrp, NULL); + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + /* no route to destination */ + ire_refrele(ire); DTRACE_PROBE3( tx__tnopt__log__info__labeling__routedst__v6, char *, "No route to unlabeled dest ip6(1) with " - "creds(2).", in6_addr_t *, dst, cred_t *, credp); + "label(2).", in6_addr_t *, dst, ts_label_t *, tsl); return (EHOSTUNREACH); } - + if (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | + IRE_INTERFACE)) { + ire_refrele(ire); + return (0); + } /* - * Prefix IRE from f-table lookup means that the destination - * is not directly connected; check the next-hop attributes. 
+ * ire_route_recursive gives us the first attrp it finds + * in the recursive lookup. */ - if (sire != NULL) { - ASSERT(ire != NULL); - IRE_REFRELE(ire); - ire = sire; - } - /* * Return now if next hop gateway is unlabeled. There is * no need to generate a CIPSO option for this message. */ - attrp = ire->ire_gw_secattr; if (attrp == NULL || attrp->igsa_rhc == NULL || attrp->igsa_rhc->rhc_tpc->tpc_tp.host_type == UNLABELED) { - IRE_REFRELE(ire); + ire_refrele(ire); return (0); } - IRE_REFRELE(ire); + ire_refrele(ire); } /* compute the CIPSO option */ @@ -1079,7 +1044,7 @@ tsol_compute_label_v6(const cred_t *credp, const in6_addr_t *dst, if (sec_opt_len == 0) { DTRACE_PROBE3(tx__tnopt__log__error__labeling__lostops__v6, char *, "options lack length for dest ip6(1) with " - "creds(2).", in6_addr_t *, dst, cred_t *, credp); + "label(2).", in6_addr_t *, dst, ts_label_t *, tsl); return (EINVAL); } @@ -1188,6 +1153,9 @@ tsol_find_secopt_v6( * Header and data following the label option that is deleted are copied * (i.e. slid backward) to the right position, and returns the number * of bytes removed (as zero or negative number.) + * + * Note that tsol_remove_secopt does not adjust ipha_length but + * tsol_remove_secopt_v6 does adjust ip6_plen. */ int tsol_remove_secopt_v6(ip6_t *ip6h, int buflen) @@ -1286,6 +1254,9 @@ tsol_remove_secopt_v6(ip6_t *ip6h, int buflen) * extra option being added. Header and data following the position where * the label option is inserted are copied (i.e. slid forward) to the right * position. + * + * Note that tsol_prepend_option does not adjust ipha_length but + * tsol_prepend_option_v6 does adjust ip6_plen. */ int tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen) @@ -1368,22 +1339,36 @@ tsol_prepend_option_v6(uchar_t *optbuf, ip6_t *ip6h, int buflen) * tsol_check_label_v6() * * This routine computes the IP label that should be on the packet based on the - * connection and destination information. 
It's called only by the IP - * forwarding logic, because all internal modules atop IP know how to generate - * their own labels. + * connection and destination information. It's called by the IP forwarding + * logic and by ip_output_simple. The ULPs generate the labels before calling + * conn_ip_output. If any adjustments to + * the label are needed due to the connection's MAC-exempt status or + * the destination's ability to receive labels, an "effective label" + * will be returned. + * + * The packet's header is clear before entering IPsec's engine. + * + * The zoneid is the IP zoneid (i.e., GLOBAL_ZONEID for exlusive-IP zones). + * zone_is_global is set if the actual zoneid is global. + * + * On successful return, effective_tslp will point to the new label needed + * or will be NULL if a new label isn't needed. On error, effective_tsl will + * point to NULL. * * Returns: - * 0 Label on packet was already correct + * 0 Label on (was|is now) correct * EACCES The packet failed the remote host accreditation. * ENOMEM Memory allocation failure. + * EINVAL Label cannot be computed */ int -tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, - ip_stack_t *ipst, pid_t pid) +tsol_check_label_v6(const ts_label_t *tsl, zoneid_t zoneid, mblk_t **mpp, + uint_t mac_mode, boolean_t zone_is_global, ip_stack_t *ipst, + ts_label_t **effective_tslp) { mblk_t *mp = *mpp; ip6_t *ip6h; - cred_t *effective_cred; + ts_label_t *effective_tsl = NULL; /* * Label option length is limited to IP_MAX_OPT_LENGTH for * symmetry with IPv4. Can be relaxed if needed @@ -1399,16 +1384,16 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, uint_t hbhlen; boolean_t hbh_needed; + *effective_tslp = NULL; + /* * Verify the destination is allowed to receive packets at - * the security label of the message data. check_dest() - * may create a new effective cred with a modified label - * or label flags. 
Apply any such cred to the message block - * for use in future routing decisions. + * the security label of the message data. tsol_check_dest() + * may create a new effective label or label flags. */ ip6h = (ip6_t *)mp->b_rptr; - retv = tsol_check_dest(credp, &ip6h->ip6_dst, IPV6_VERSION, - mode, &effective_cred); + retv = tsol_check_dest(tsl, &ip6h->ip6_dst, IPV6_VERSION, + mac_mode, zone_is_global, &effective_tsl); if (retv != 0) return (retv); @@ -1416,16 +1401,15 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, * Calculate the security label to be placed in the text * of the message (if any). */ - if (effective_cred != NULL) { - if ((retv = tsol_compute_label_v6(effective_cred, + if (effective_tsl != NULL) { + if ((retv = tsol_compute_label_v6(effective_tsl, zoneid, &ip6h->ip6_dst, opt_storage, ipst)) != 0) { - crfree(effective_cred); + label_rele(effective_tsl); return (retv); } - mblk_setcred(mp, effective_cred, pid); - crfree(effective_cred); + *effective_tslp = effective_tsl; } else { - if ((retv = tsol_compute_label_v6(credp, + if ((retv = tsol_compute_label_v6(tsl, zoneid, &ip6h->ip6_dst, opt_storage, ipst)) != 0) return (retv); } @@ -1457,10 +1441,6 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, return (0); } - if (msg_getcred(mp, NULL) == NULL) { - mblk_setcred(mp, (cred_t *)credp, NOPID); - } - if (secopt != NULL && sec_opt_len != 0 && (bcmp(opt_storage, secopt, sec_opt_len + 2) == 0)) { /* The packet has the correct label already */ @@ -1499,8 +1479,13 @@ tsol_check_label_v6(const cred_t *credp, mblk_t **mpp, uint_t mode, copylen = hdr_len; new_mp = allocb_tmpl(hlen + copylen + (mp->b_rptr - mp->b_datap->db_base), mp); - if (new_mp == NULL) + if (new_mp == NULL) { + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; + } return (ENOMEM); + } /* keep the bias */ new_mp->b_rptr += mp->b_rptr - mp->b_datap->db_base; @@ -1522,208 +1507,13 @@ tsol_check_label_v6(const cred_t 
*credp, mblk_t **mpp, uint_t mode, ASSERT(mp->b_wptr + delta_add <= DB_LIM(mp)); mp->b_wptr += delta_add; + /* tsol_prepend_option_v6 has adjusted ip6_plen */ return (0); param_prob: - return (EINVAL); -} - -/* - * Update the given IPv6 "sticky options" structure to contain the provided - * label, which is encoded as an IPv6 option. Existing label is removed if - * necessary, and storage is allocated/freed/resized. - * - * Returns 0 on success, errno on failure. - */ -int -tsol_update_sticky(ip6_pkt_t *ipp, uint_t *labellen, const uchar_t *labelopt) -{ - int rawlen, optlen, newlen; - uchar_t *newopts; - - /* - * rawlen is the size of the IPv6 label to be inserted from labelopt. - * optlen is the total length of that option, including any necessary - * headers and padding. newlen is the new size of the total hop-by-hop - * options buffer, including user options. - */ - ASSERT(*labellen <= ipp->ipp_hopoptslen); - ASSERT((ipp->ipp_hopopts == NULL && ipp->ipp_hopoptslen == 0) || - (ipp->ipp_hopopts != NULL && ipp->ipp_hopoptslen != 0)); - - if ((rawlen = labelopt[1]) != 0) { - rawlen += 2; /* add in header size */ - optlen = (2 + rawlen + 7) & ~7; - } else { - optlen = 0; - } - newlen = ipp->ipp_hopoptslen + optlen - *labellen; - if (newlen == 0 && ipp->ipp_hopopts != NULL) { - /* Deleting all existing hop-by-hop options */ - kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); - ipp->ipp_hopopts = NULL; - ipp->ipp_fields &= ~IPPF_HOPOPTS; - } else if (optlen != *labellen) { - /* If the label not same size as last time, then reallocate */ - if (newlen > IP6_MAX_OPT_LENGTH) - return (EHOSTUNREACH); - newopts = kmem_alloc(newlen, KM_NOSLEEP); - if (newopts == NULL) - return (ENOMEM); - /* - * If the user has hop-by-hop stickyoptions set, then copy his - * options in after the security label. 
- */ - if (ipp->ipp_hopoptslen > *labellen) { - bcopy(ipp->ipp_hopopts + *labellen, newopts + optlen, - ipp->ipp_hopoptslen - *labellen); - /* - * Stomp out any header gunk here - this was the - * previous next-header and option length field. - */ - newopts[optlen] = IP6OPT_PADN; - newopts[optlen + 1] = 0; - } - if (ipp->ipp_hopopts != NULL) - kmem_free(ipp->ipp_hopopts, ipp->ipp_hopoptslen); - ipp->ipp_hopopts = (ip6_hbh_t *)newopts; - } - ipp->ipp_hopoptslen = newlen; - *labellen = optlen; - - newopts = (uchar_t *)ipp->ipp_hopopts; - - /* If there are any options, then fix up reported length */ - if (newlen > 0) { - newopts[1] = (newlen + 7) / 8 - 1; - ipp->ipp_fields |= IPPF_HOPOPTS; - } - - /* If there's a label, then insert it now */ - if (optlen > 0) { - /* skip next-header and length fields */ - newopts += 2; - bcopy(labelopt, newopts, rawlen); - newopts += rawlen; - /* make sure padding comes out right */ - optlen -= 2 + rawlen; - if (optlen == 1) { - newopts[0] = IP6OPT_PAD1; - } else if (optlen > 1) { - newopts[0] = IP6OPT_PADN; - optlen -= 2; - newopts[1] = optlen; - if (optlen > 0) - bzero(newopts + 2, optlen); - } - } - return (0); -} - -int -tsol_update_options(uchar_t **opts, uint_t *totlen, uint_t *labellen, - const uchar_t *labelopt) -{ - int optlen, newlen; - uchar_t *newopts; - - optlen = (labelopt[IPOPT_OLEN] + 3) & ~3; - newlen = *totlen + optlen - *labellen; - if (optlen > *labellen) { - if (newlen > IP_MAX_OPT_LENGTH) - return (EHOSTUNREACH); - newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI); - if (newopts == NULL) - return (ENOMEM); - if (*totlen > *labellen) { - bcopy(*opts + *labellen, newopts + optlen, - *totlen - *labellen); - } - if (*opts != NULL) - mi_free((char *)*opts); - *opts = newopts; - } else if (optlen < *labellen) { - if (newlen == 0 && *opts != NULL) { - mi_free((char *)*opts); - *opts = NULL; - } - if (*totlen > *labellen) { - ovbcopy(*opts + *labellen, *opts + optlen, - *totlen - *labellen); - } - } - *totlen = newlen; - 
*labellen = optlen; - if (optlen > 0) { - newopts = *opts; - bcopy(labelopt, newopts, optlen); - /* check if there are user-supplied options that follow */ - if (optlen < newlen) { - /* compute amount of embedded alignment needed */ - optlen -= newopts[IPOPT_OLEN]; - newopts += newopts[IPOPT_OLEN]; - while (--optlen >= 0) - *newopts++ = IPOPT_NOP; - } else if (optlen != newopts[IPOPT_OLEN]) { - /* - * The label option is the only option and it is - * not a multiple of 4 bytes. - */ - optlen -= newopts[IPOPT_OLEN]; - newopts += newopts[IPOPT_OLEN]; - while (--optlen >= 0) - *newopts++ = IPOPT_EOL; - } + if (effective_tsl != NULL) { + label_rele(effective_tsl); + *effective_tslp = NULL; } - return (0); -} - -/* - * This does the bulk of the processing for setting IPPROTO_IP {T_,}IP_OPTIONS. - */ -boolean_t -tsol_option_set(uchar_t **opts, uint_t *optlen, uint_t labellen, - const uchar_t *useropts, uint_t userlen) -{ - int newlen; - uchar_t *newopts; - - newlen = userlen + labellen; - if (newlen > *optlen) { - /* need more room */ - newopts = (uchar_t *)mi_alloc(newlen, BPRI_HI); - if (newopts == NULL) - return (B_FALSE); - /* - * The supplied *opts can't be NULL in this case, - * since there's an existing label. 
- */ - if (labellen > 0) - bcopy(*opts, newopts, labellen); - if (*opts != NULL) - mi_free((char *)*opts); - *opts = newopts; - } - - if (newlen == 0) { - /* special case -- no remaining IP options at all */ - if (*opts != NULL) { - mi_free((char *)*opts); - *opts = NULL; - } - } else if (userlen > 0) { - /* merge in the user's options */ - newopts = *opts; - if (labellen > 0) { - int extra = labellen - newopts[IPOPT_OLEN]; - - newopts += newopts[IPOPT_OLEN]; - while (--extra >= 0) - *newopts++ = IPOPT_NOP; - } - bcopy(useropts, newopts, userlen); - } - - *optlen = newlen; - return (B_TRUE); + return (EINVAL); } diff --git a/usr/src/uts/common/inet/ip/tnet.c b/usr/src/uts/common/inet/ip/tnet.c index 1e5c0eb170..262d5bc339 100644 --- a/usr/src/uts/common/inet/ip/tnet.c +++ b/usr/src/uts/common/inet/ip/tnet.c @@ -133,16 +133,7 @@ int tsol_strict_error; * - A set of route-related attributes that only get set for prefix * IREs. If this is non-NULL, the prefix IRE has been associated * with a set of gateway security attributes by way of route add/ - * change functionality. This field stays NULL for IRE_CACHEs. - * - * igsa_gcgrp - * - * - Group of gc's which only gets set for IRE_CACHEs. Each of the gc - * points to a gcdb record that contains the security attributes - * used to perform the credential checks of the packet which uses - * the IRE. If the group is not empty, the list of gc's can be - * traversed starting at gcgrp_head. This field stays NULL for - * prefix IREs. + * change functionality. 
*/ static kmem_cache_t *ire_gw_secattr_cache; @@ -223,7 +214,6 @@ ire_gw_secattr_constructor(void *buf, void *cdrarg, int kmflags) attrp->igsa_rhc = NULL; attrp->igsa_gc = NULL; - attrp->igsa_gcgrp = NULL; return (0); } @@ -257,14 +247,9 @@ ire_gw_secattr_free(tsol_ire_gw_secattr_t *attrp) GC_REFRELE(attrp->igsa_gc); attrp->igsa_gc = NULL; } - if (attrp->igsa_gcgrp != NULL) { - GCGRP_REFRELE(attrp->igsa_gcgrp); - attrp->igsa_gcgrp = NULL; - } ASSERT(attrp->igsa_rhc == NULL); ASSERT(attrp->igsa_gc == NULL); - ASSERT(attrp->igsa_gcgrp == NULL); kmem_cache_free(ire_gw_secattr_cache, attrp); } @@ -387,9 +372,6 @@ rtsa_validate(const struct rtsa_s *rp) /* * A brief explanation of the reference counting scheme: * - * Prefix IREs have a non-NULL igsa_gc and a NULL igsa_gcgrp; - * IRE_CACHEs have it vice-versa. - * * Apart from dynamic references due to to reference holds done * actively by threads, we have the following references: * @@ -402,8 +384,6 @@ rtsa_validate(const struct rtsa_s *rp) * to the gc_refcnt. * * gcgrp_refcnt: - * - An IRE_CACHE that points to an igsa_gcgrp contributes a reference - * to the gcgrp_refcnt of the associated tsol_gcgrp_t. * - Every tsol_gc_t in the chain headed by tsol_gcgrp_t contributes * a reference to the gcgrp_refcnt. */ @@ -613,7 +593,6 @@ gcgrp_inactive(tsol_gcgrp_t *gcgrp) mod_hash_t *hashp; ASSERT(MUTEX_HELD(&gcgrp_lock)); - ASSERT(!RW_LOCK_HELD(&gcgrp->gcgrp_rwlock)); ASSERT(gcgrp != NULL && gcgrp->gcgrp_refcnt == 0); ASSERT(gcgrp->gcgrp_head == NULL && gcgrp->gcgrp_count == 0); @@ -686,21 +665,21 @@ cipso_to_sl(const uchar_t *option, bslabel_t *sl) } /* - * If present, parse a CIPSO label in the incoming packet and - * construct a ts_label_t that reflects the CIPSO label and attach it - * to the dblk cred. Later as the mblk flows up through the stack any + * If present, parse the CIPSO label in the incoming packet and + * construct a ts_label_t that reflects the CIPSO label and put it in + * the ip_recv_attr_t. 
Later as the packet flows up through the stack any * code that needs to examine the packet label can inspect the label - * from the dblk cred. This function is called right in ip_rput for - * all packets, i.e. locally destined and to be forwarded packets. The - * forwarding path needs to examine the label to determine how to - * forward the packet. + * from the ira_tsl. This function is + * called right in ip_input for all packets, i.e. locally destined and + * to be forwarded packets. The forwarding path needs to examine the label + * to determine how to forward the packet. * * This routine pulls all message text up into the first mblk. * For IPv4, only the first 20 bytes of the IP header are guaranteed * to exist. For IPv6, only the IPv6 header is guaranteed to exist. */ boolean_t -tsol_get_pkt_label(mblk_t *mp, int version) +tsol_get_pkt_label(mblk_t *mp, int version, ip_recv_attr_t *ira) { tsol_tpc_t *src_rhtp = NULL; uchar_t *opt_ptr = NULL; @@ -713,7 +692,6 @@ tsol_get_pkt_label(mblk_t *mp, int version) const void *src; const ip6_t *ip6h; cred_t *credp; - pid_t cpid; int proto; ASSERT(DB_TYPE(mp) == M_DATA); @@ -846,28 +824,37 @@ tsol_get_pkt_label(mblk_t *mp, int version) return (B_FALSE); } - /* Make sure no other thread is messing with this mblk */ - ASSERT(DB_REF(mp) == 1); - /* Preserve db_cpid */ - credp = msg_extractcred(mp, &cpid); - if (credp == NULL) { + if (ira->ira_cred == NULL) { credp = newcred_from_bslabel(&sl, doi, KM_NOSLEEP); + if (credp == NULL) + return (B_FALSE); } else { cred_t *newcr; - newcr = copycred_from_bslabel(credp, &sl, doi, + newcr = copycred_from_bslabel(ira->ira_cred, &sl, doi, KM_NOSLEEP); - crfree(credp); + if (newcr == NULL) + return (B_FALSE); + if (ira->ira_free_flags & IRA_FREE_CRED) { + crfree(ira->ira_cred); + ira->ira_free_flags &= ~IRA_FREE_CRED; + ira->ira_cred = NULL; + } credp = newcr; } - if (credp == NULL) - return (B_FALSE); - crgetlabel(credp)->tsl_flags |= label_flags; - - mblk_setcred(mp, credp, cpid); - 
crfree(credp); /* mblk has ref on cred */ + /* + * Put the label in ira_tsl for convinience, while keeping + * the cred in ira_cred for getpeerucred which is used to get + * labels with TX. + * Note: no explicit refcnt/free_flag for ira_tsl. The free_flag + * for IRA_FREE_CRED is sufficient for both. + */ + ira->ira_tsl = crgetlabel(credp); + ira->ira_cred = credp; + ira->ira_free_flags |= IRA_FREE_CRED; + ira->ira_tsl->tsl_flags |= label_flags; return (B_TRUE); } @@ -878,25 +865,25 @@ tsol_get_pkt_label(mblk_t *mp, int version) */ boolean_t tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, - boolean_t shared_addr, const conn_t *connp) + ip_recv_attr_t *ira, const conn_t *connp) { const cred_t *credp; ts_label_t *plabel, *conn_plabel; tsol_tpc_t *tp; boolean_t retv; const bslabel_t *label, *conn_label; + boolean_t shared_addr = (ira->ira_flags & IRAF_TX_SHARED_ADDR); /* - * The cases in which this can happen are: - * - IPv6 Router Alert, where ip_rput_data_v6 deliberately skips - * over the label attachment process. - * - MLD output looped-back to ourselves. - * - IPv4 Router Discovery, where tsol_get_pkt_label intentionally - * avoids the labeling process. - * We trust that all valid paths in the code set the cred pointer when - * needed. + * tsol_get_pkt_label intentionally avoids the labeling process for: + * - IPv6 router and neighbor discovery as well as redirects. + * - MLD packets. (Anything between ICMPv6 code 130 and 138.) + * - IGMP packets. + * - IPv4 router discovery. + * In those cases ire_cred is NULL. */ - if ((credp = msg_getcred(mp, NULL)) == NULL) + credp = ira->ira_cred; + if (credp == NULL) return (B_TRUE); /* @@ -904,17 +891,18 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, * same zoneid as the selected destination, then no checks are * necessary. Membership in the zone is enough proof. This is * intended to be a hot path through this function. 
+ * Note: Using crgetzone here is ok since the peer is local. */ if (!crisremote(credp) && crgetzone(credp) == crgetzone(connp->conn_cred)) return (B_TRUE); - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; conn_plabel = crgetlabel(connp->conn_cred); ASSERT(plabel != NULL && conn_plabel != NULL); label = label2bslabel(plabel); - conn_label = label2bslabel(crgetlabel(connp->conn_cred)); + conn_label = label2bslabel(conn_plabel); /* @@ -954,12 +942,8 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, blequal(label, conn_label)) return (B_TRUE); - /* - * conn_zoneid is global for an exclusive stack, thus we use - * conn_cred to get the zoneid - */ if ((connp->conn_mac_mode == CONN_MAC_DEFAULT) || - (crgetzoneid(connp->conn_cred) != GLOBAL_ZONEID && + (!connp->conn_zone_is_global && (plabel->tsl_doi != conn_plabel->tsl_doi || !bldominates(conn_label, label)))) { DTRACE_PROBE3( @@ -1046,16 +1030,13 @@ tsol_receive_local(const mblk_t *mp, const void *addr, uchar_t version, } boolean_t -tsol_can_accept_raw(mblk_t *mp, boolean_t check_host) +tsol_can_accept_raw(mblk_t *mp, ip_recv_attr_t *ira, boolean_t check_host) { ts_label_t *plabel = NULL; tsol_tpc_t *src_rhtp, *dst_rhtp; boolean_t retv; - cred_t *credp; - credp = msg_getcred(mp, NULL); - if (credp != NULL) - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; /* We are bootstrapping or the internal template was never deleted */ if (plabel == NULL) @@ -1144,7 +1125,7 @@ tsol_can_accept_raw(mblk_t *mp, boolean_t check_host) * TSLF_UNLABELED flag is sufficient. */ boolean_t -tsol_can_reply_error(const mblk_t *mp) +tsol_can_reply_error(const mblk_t *mp, ip_recv_attr_t *ira) { ts_label_t *plabel = NULL; tsol_tpc_t *rhtp; @@ -1152,7 +1133,6 @@ tsol_can_reply_error(const mblk_t *mp) const ip6_t *ip6h; boolean_t retv; bslabel_t *pktbs; - cred_t *credp; /* Caller must pull up at least the IP header */ ASSERT(MBLKL(mp) >= (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION ? 
@@ -1161,9 +1141,7 @@ tsol_can_reply_error(const mblk_t *mp) if (!tsol_strict_error) return (B_TRUE); - credp = msg_getcred(mp, NULL); - if (credp != NULL) - plabel = crgetlabel(credp); + plabel = ira->ira_tsl; /* We are bootstrapping or the internal template was never deleted */ if (plabel == NULL) @@ -1227,33 +1205,30 @@ tsol_can_reply_error(const mblk_t *mp) } /* - * Finds the zone associated with the given packet. Returns GLOBAL_ZONEID if - * the zone cannot be located. + * Finds the zone associated with the receive attributes. Returns GLOBAL_ZONEID + * if the zone cannot be located. * * This is used by the classifier when the packet matches an ALL_ZONES IRE, and * there's no MLP defined. * * Note that we assume that this is only invoked in the ALL_ZONES case. - * Handling other cases would require handle exclusive stack zones where either + * Handling other cases would require handling exclusive IP zones where either * this routine or the callers would have to map from * the zoneid (zone->zone_id) to what IP uses in conn_zoneid etc. 
*/ zoneid_t -tsol_packet_to_zoneid(const mblk_t *mp) +tsol_attr_to_zoneid(const ip_recv_attr_t *ira) { - cred_t *cr = msg_getcred(mp, NULL); zone_t *zone; ts_label_t *label; - if (cr != NULL) { - if ((label = crgetlabel(cr)) != NULL) { - zone = zone_find_by_label(label); - if (zone != NULL) { - zoneid_t zoneid = zone->zone_id; + if ((label = ira->ira_tsl) != NULL) { + zone = zone_find_by_label(label); + if (zone != NULL) { + zoneid_t zoneid = zone->zone_id; - zone_rele(zone); - return (zoneid); - } + zone_rele(zone); + return (zoneid); } } return (GLOBAL_ZONEID); @@ -1273,7 +1248,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) /* Not in Trusted mode or IRE is local/loopback/broadcast/interface */ if (!is_system_labeled() || (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK | IRE_BROADCAST | - IRE_INTERFACE))) + IRE_IF_ALL | IRE_MULTICAST | IRE_NOROUTE))) goto done; /* @@ -1304,29 +1279,16 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) mutex_enter(&attrp->igsa_lock); /* - * Depending on the IRE type (prefix vs. cache), we seek the group + * We seek the group * structure which contains all security credentials of the gateway. - * A prefix IRE is associated with at most one gateway credential, - * while a cache IRE is associated with every credentials that the - * gateway has. + * An offline IRE is associated with at most one gateway credential. 
*/ - if ((gc = attrp->igsa_gc) != NULL) { /* prefix */ + if ((gc = attrp->igsa_gc) != NULL) { gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { /* cache */ - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - if (gc == NULL) { - /* gc group is empty, so the drop lock now */ - ASSERT(gcgrp->gcgrp_count == 0); - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - } - - if (gcgrp != NULL) GCGRP_REFHOLD(gcgrp); + } if ((gw_rhc = attrp->igsa_rhc) != NULL) { /* @@ -1354,12 +1316,11 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) ASSERT(ga->ga_af == AF_INET6); paddr = &ga->ga_addr; } - } else if (ire->ire_ipversion == IPV6_VERSION && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - paddr = &ire->ire_gateway_addr_v6; - } else if (ire->ire_ipversion == IPV4_VERSION && - ire->ire_gateway_addr != INADDR_ANY) { - paddr = &ire->ire_gateway_addr; + } else if (ire->ire_type & IRE_OFFLINK) { + if (ire->ire_ipversion == IPV6_VERSION) + paddr = &ire->ire_gateway_addr_v6; + else if (ire->ire_ipversion == IPV4_VERSION) + paddr = &ire->ire_gateway_addr; } /* We've found a gateway address to do the template lookup */ @@ -1408,6 +1369,7 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) } if (gc != NULL) { + tsol_gcdb_t *gcdb; /* * In the case of IRE_CACHE we've got one or more gateway @@ -1418,18 +1380,9 @@ tsol_ire_match_gwattr(ire_t *ire, const ts_label_t *tsl) * just the route itself, so the loop is executed only once. 
*/ ASSERT(gcgrp != NULL); - do { - gcdb = gc->gc_db; - if (tsl->tsl_doi == gcdb->gcdb_doi && - _blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) - break; - if (ire->ire_type == IRE_CACHE) - gc = gc->gc_next; - else - gc = NULL; - } while (gc != NULL); - - if (gc == NULL) { + gcdb = gc->gc_db; + if (tsl->tsl_doi != gcdb->gcdb_doi || + !_blinrange(&tsl->tsl_label, &gcdb->gcdb_slrange)) { DTRACE_PROBE3( tx__ip__log__drop__irematch__nogcmatched, char *, "ire(1), tsl(2): all gc failed match", @@ -1493,12 +1446,13 @@ done: /* * Performs label accreditation checks for packet forwarding. + * Add or remove a CIPSO option as needed. * * Returns a pointer to the modified mblk if allowed for forwarding, * or NULL if the packet must be dropped. */ mblk_t * -tsol_ip_forward(ire_t *ire, mblk_t *mp) +tsol_ip_forward(ire_t *ire, mblk_t *mp, const ip_recv_attr_t *ira) { tsol_ire_gw_secattr_t *attrp = NULL; ipha_t *ipha; @@ -1516,11 +1470,14 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) boolean_t need_tpc_rele = B_FALSE; ipaddr_t *gw; ip_stack_t *ipst = ire->ire_ipst; - cred_t *credp; - pid_t pid; + int err; + ts_label_t *effective_tsl = NULL; ASSERT(ire != NULL && mp != NULL); - ASSERT(ire->ire_stq != NULL); + /* + * Note that the ire is the first one found, i.e., an IRE_OFFLINK if + * the destination is offlink. + */ af = (ire->ire_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; @@ -1530,16 +1487,6 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) psrc = &ipha->ipha_src; pdst = &ipha->ipha_dst; proto = ipha->ipha_protocol; - - /* - * off_link is TRUE if destination not directly reachable. - * Surya note: we avoid creation of per-dst IRE_CACHE entries - * for forwarded packets, so we set off_link to be TRUE - * if the packet dst is different from the ire_addr of - * the ire for the nexthop. 
- */ - off_link = ((ipha->ipha_dst != ire->ire_addr) || - (ire->ire_gateway_addr != INADDR_ANY)); if (!tsol_get_option_v4(mp, &label_type, &opt_ptr)) return (NULL); } else { @@ -1561,14 +1508,15 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) } proto = *nexthdrp; } - - /* destination not directly reachable? */ - off_link = !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); if (!tsol_get_option_v6(mp, &label_type, &opt_ptr)) return (NULL); } + /* + * off_link is TRUE if destination not directly reachable. + */ + off_link = (ire->ire_type & IRE_OFFLINK); - if ((tsl = msg_getlabel(mp)) == NULL) + if ((tsl = ira->ira_tsl) == NULL) return (mp); if (tsl->tsl_flags & TSLF_IMPLICIT_IN) { @@ -1611,11 +1559,7 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) attrp = ire->ire_gw_secattr; gw_rhtp = attrp->igsa_rhc->rhc_tpc; } else { - /* - * use the ire_addr if this is the IRE_CACHE of nexthop - */ - gw = (ire->ire_gateway_addr == NULL? &ire->ire_addr : - &ire->ire_gateway_addr); + gw = &ire->ire_gateway_addr; gw_rhtp = find_tpc(gw, ire->ire_ipversion, B_FALSE); need_tpc_rele = B_TRUE; } @@ -1702,7 +1646,13 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) /* adjust is negative */ ASSERT((mp->b_wptr + adjust) >= mp->b_rptr); mp->b_wptr += adjust; - + /* + * Note that caller adjusts ira_pktlen and + * ira_ip_hdr_length + * + * For AF_INET6 note that tsol_remove_secopt_v6 + * adjusted ip6_plen. + */ if (af == AF_INET) { ipha = (ipha_t *)mp->b_rptr; iplen = ntohs(ipha->ipha_length) + adjust; @@ -1729,17 +1679,34 @@ tsol_ip_forward(ire_t *ire, mblk_t *mp) (!off_link || gw_rhtp->tpc_tp.host_type == UNLABELED)) goto keep_label; - - credp = msg_getcred(mp, &pid); - if ((af == AF_INET && - tsol_check_label(credp, &mp, CONN_MAC_DEFAULT, ipst, pid) != 0) || - (af == AF_INET6 && - tsol_check_label_v6(credp, &mp, CONN_MAC_DEFAULT, ipst, - pid) != 0)) { + /* + * Since we are forwarding packets we use GLOBAL_ZONEID for + * the IRE lookup in tsol_check_label. 
+ * Since mac_exempt is false the zoneid isn't used for anything + * but the IRE lookup, hence we set zone_is_global to false. + */ + if (af == AF_INET) { + err = tsol_check_label_v4(tsl, GLOBAL_ZONEID, &mp, + CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl); + } else { + err = tsol_check_label_v6(tsl, GLOBAL_ZONEID, &mp, + CONN_MAC_DEFAULT, B_FALSE, ipst, &effective_tsl); + } + if (err != 0) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ip_drop_output("tsol_check_label", mp, NULL); + freemsg(mp); mp = NULL; goto keep_label; } + /* + * The effective_tsl must never affect the routing decision, hence + * we ignore it here. + */ + if (effective_tsl != NULL) + label_rele(effective_tsl); + if (af == AF_INET) { ipha = (ipha_t *)mp->b_rptr; ipha->ipha_hdr_checksum = 0; @@ -1885,13 +1852,13 @@ tsol_rtsa_init(rt_msghdr_t *rtm, tsol_rtsecattr_t *sp, caddr_t cp) } int -tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp) +tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc) { tsol_ire_gw_secattr_t *attrp; boolean_t exists = B_FALSE; in_addr_t ga_addr4; void *paddr = NULL; + tsol_gcgrp_t *gcgrp = NULL; ASSERT(ire != NULL); @@ -1917,20 +1884,16 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, if (attrp->igsa_gc != NULL) GC_REFRELE(attrp->igsa_gc); - if (attrp->igsa_gcgrp != NULL) - GCGRP_REFRELE(attrp->igsa_gcgrp); } ASSERT(!exists || MUTEX_HELD(&attrp->igsa_lock)); /* * References already held by caller and we keep them; - * note that both gc and gcgrp may be set to NULL to - * clear out igsa_gc and igsa_gcgrp, respectively. + * note that gc may be set to NULL to clear out igsa_gc. 
*/ attrp->igsa_gc = gc; - attrp->igsa_gcgrp = gcgrp; - if (gcgrp == NULL && gc != NULL) { + if (gc != NULL) { gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); } @@ -1955,12 +1918,11 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, ASSERT(ga->ga_af == AF_INET6); paddr = &ga->ga_addr; } - } else if (ipversion == IPV6_VERSION && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6)) { - paddr = &ire->ire_gateway_addr_v6; - } else if (ipversion == IPV4_VERSION && - ire->ire_gateway_addr != INADDR_ANY) { - paddr = &ire->ire_gateway_addr; + } else if (ire->ire_type & IRE_OFFLINK) { + if (ipversion == IPV6_VERSION) + paddr = &ire->ire_gateway_addr_v6; + else if (ipversion == IPV4_VERSION) + paddr = &ire->ire_gateway_addr; } /* @@ -1990,7 +1952,7 @@ tsol_ire_init_gwattr(ire_t *ire, uchar_t ipversion, tsol_gc_t *gc, * If we can't figure out what it is, then return mlptSingle. That's actually * an error case. * - * The callers are assume to pass in zone->zone_id and not the zoneid that + * The callers are assumed to pass in zone->zone_id and not the zoneid that * is stored in a conn_t (since the latter will be GLOBAL_ZONEID in an * exclusive stack zone). 
*/ @@ -2022,23 +1984,28 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr, version = IPV4_VERSION; } + /* Check whether the IRE_LOCAL (or ipif) is ALL_ZONES */ if (version == IPV4_VERSION) { in4 = *(const in_addr_t *)addr; if ((in4 == INADDR_ANY) || CLASSD(in4)) { return (mlptBoth); } - ire = ire_cache_lookup(in4, ip_zoneid, NULL, ipst); + ire = ire_ftable_lookup_v4(in4, 0, 0, IRE_LOCAL|IRE_LOOPBACK, + NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, + 0, ipst, NULL); } else { if (IN6_IS_ADDR_UNSPECIFIED((const in6_addr_t *)addr) || IN6_IS_ADDR_MULTICAST((const in6_addr_t *)addr)) { return (mlptBoth); } - ire = ire_cache_lookup_v6(addr, ip_zoneid, NULL, ipst); + ire = ire_ftable_lookup_v6(addr, 0, 0, IRE_LOCAL|IRE_LOOPBACK, + NULL, ip_zoneid, NULL, MATCH_IRE_TYPE | MATCH_IRE_ZONEONLY, + 0, ipst, NULL); } /* * If we can't find the IRE, then we have to behave exactly like - * ip_bind_laddr{,_v6}. That means looking up the IPIF so that users - * can bind to addresses on "down" interfaces. + * ip_laddr_verify_{v4,v6}. That means looking up the IPIF so that + * users can bind to addresses on "down" interfaces. * * If we can't find that either, then the bind is going to fail, so * just give up. 
Note that there's a miniscule chance that the address @@ -2047,10 +2014,10 @@ tsol_mlp_addr_type(zoneid_t zoneid, uchar_t version, const void *addr, if (ire == NULL) { if (version == IPV4_VERSION) ipif = ipif_lookup_addr(*(const in_addr_t *)addr, NULL, - ip_zoneid, NULL, NULL, NULL, NULL, ipst); + ip_zoneid, ipst); else ipif = ipif_lookup_addr_v6((const in6_addr_t *)addr, - NULL, ip_zoneid, NULL, NULL, NULL, NULL, ipst); + NULL, ip_zoneid, ipst); if (ipif == NULL) { return (mlptSingle); } diff --git a/usr/src/uts/common/inet/ip2mac_impl.h b/usr/src/uts/common/inet/ip2mac_impl.h index 19d0931441..9a09e14487 100644 --- a/usr/src/uts/common/inet/ip2mac_impl.h +++ b/usr/src/uts/common/inet/ip2mac_impl.h @@ -37,10 +37,10 @@ extern "C" { #ifdef _KERNEL -extern void nce_cb_dispatch(nce_t *); -extern void nce_ip2mac_response(ip2mac_t *, nce_t *); -extern void nce_cb_refhold_locked(nce_t *); -extern void nce_cb_refrele(nce_t *); +extern void ncec_cb_dispatch(ncec_t *); +extern void ncec_ip2mac_response(ip2mac_t *, ncec_t *); +extern void ncec_cb_refhold_locked(ncec_t *); +extern void ncec_cb_refrele(ncec_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index 5408ab9e55..10c6c81ba2 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -57,105 +57,12 @@ typedef enum { IP6_SCOPE_GLOBAL } in6addr_scope_t; -#ifdef _KERNEL +/* From RFC 3542 - setting for IPV6_USE_MIN_MTU socket option */ +#define IPV6_USE_MIN_MTU_MULTICAST -1 /* Default */ +#define IPV6_USE_MIN_MTU_NEVER 0 +#define IPV6_USE_MIN_MTU_ALWAYS 1 -/* - * Private header used between the transports and IP to carry the content - * of the options IPV6_PKTINFO/IPV6_RECVPKTINFO (the interface index only) - * and IPV6_NEXTHOP. - * Also used to specify that raw sockets do not want the UDP/TCP transport - * checksums calculated in IP (akin to IP_HDR_INCLUDED) and provide for - * IPV6_CHECKSUM on the transmit side (using ip6i_checksum_off). 
- * - * When this header is used it must be the first header in the packet i.e. - * before the real ip6 header. The use of a next header value of 255 - * (IPPROTO_RAW) in this header indicates its presence. Note that - * ip6_nxt = IPPROTO_RAW indicates that "this" header is ip6_info - the - * next header is always IPv6. - * - * Note that ip6i_nexthop is at the same offset as ip6_dst so that - * this header can be kept in the packet while the it passes through - * ip_newroute* and the ndp code. Those routines will use ip6_dst for - * resolution. - * - * Implementation offset assumptions about ip6_info_t and ip6_t fields - * and their alignments shown in figure below - * - * ip6_info (Private headers from transports to IP) header below - * _______________________________________________________________ _ _ _ _ _ - * | .... | ip6i_nxt (255)| ......................|ip6i_nexthop| ...ip6_t. - * --------------------------------------------------------------- - - - - - - * ^ ^ - * <---- >| same offset for {ip6i_nxt,ip6_nxt} ^ - * ^ ^ - * <------^-------------------------------------->| same offset for - * ^ ^ {ip6i_nxthop,ip6_dst} - * _______________________________________________________________ _ _ _ - * | .... | ip6_nxt | ......................|ip6_dst | .other hdrs... 
- * --------------------------------------------------------------- - - - - * ip6_t (IPv6 protocol) header above - */ -struct ip6_info { - union { - struct ip6_info_ctl { - uint32_t ip6i_un1_flow; - uint16_t ip6i_un1_plen; /* payload length */ - uint8_t ip6i_un1_nxt; /* next header */ - uint8_t ip6i_un1_hlim; /* hop limit */ - } ip6i_un1; - } ip6i_ctlun; - int ip6i_flags; /* See below */ - int ip6i_ifindex; - int ip6i_checksum_off; - int ip6i_pad; - in6_addr_t ip6i_nexthop; /* Same offset as ip6_dst */ -}; -typedef struct ip6_info ip6i_t; - -#define ip6i_flow ip6i_ctlun.ip6i_un1.ip6i_un1_flow -#define ip6i_vcf ip6i_flow /* Version, class, flow */ -#define ip6i_nxt ip6i_ctlun.ip6i_un1.ip6i_un1_nxt -#define ip6i_hops ip6i_ctlun.ip6i_un1.ip6i_un1_hlim - -/* ip6_info flags */ -#define IP6I_IFINDEX 0x1 /* ip6i_ifindex is set (to nonzero value) */ -#define IP6I_NEXTHOP 0x2 /* ip6i_nexthop is different than ip6_dst */ -#define IP6I_NO_ULP_CKSUM 0x4 - /* - * Do not generate TCP/UDP/SCTP transport checksum. - * Used by raw sockets. Does not affect the - * generation of transport checksums for ICMPv6 - * since such packets always arrive through - * a raw socket. - */ -#define IP6I_UNSPEC_SRC 0x8 - /* Used to carry conn_unspec_src through ip_newroute* */ -#define IP6I_RAW_CHECKSUM 0x10 - /* Compute checksum and stuff in ip6i_checksum_off */ -#define IP6I_VERIFY_SRC 0x20 /* Verify ip6_src. Used when IPV6_PKTINFO */ -#define IP6I_IPMP_PROBE 0x40 /* IPMP (in.mpathd) probe packet */ - /* 0x80 - 0x100 available */ -#define IP6I_DONTFRAG 0x200 /* Don't fragment this packet */ -#define IP6I_HOPLIMIT 0x400 /* hoplimit has been set by the sender */ - -/* - * These constants refer to the IPV6_USE_MIN_MTU API. The - * actually values used in the API are these values shifted down - * 10 bits minus 2 [-1, 1]. 0 (-2 after conversion) is considered - * the same as the default (-1). IP6I_API_USE_MIN_MTU(f, x) returns - * the flags field updated with min mtu. 
IP6I_USE_MIN_MTU_API takes the - * field and returns the API value (+ the -2 value). - */ -#define IP6I_USE_MIN_MTU_UNICAST 0x400 -#define IP6I_USE_MIN_MTU_ALWAYS 0x800 -#define IP6I_USE_MIN_MTU_NEVER 0xC00 -#define IP6I_USE_MIN_MTU_API(x) ((((x) & 0xC00) >> 10) - 2) -#define IP6I_API_USE_MIN_MTU(f, x) (((f) & ~0xC00) &\ - ((((x) + 2) & 0x3) << 11)) -#define IPV6_USE_MIN_MTU_DEFAULT -2 -#define IPV6_USE_MIN_MTU_UNICAST -1 -#define IPV6_USE_MIN_MTU_ALWAYS 0 -#define IPV6_USE_MIN_MTU_NEVER 1 +#ifdef _KERNEL /* Extract the scope from a multicast address */ #ifdef _BIG_ENDIAN @@ -195,28 +102,18 @@ typedef struct ip6_info ip6i_t; #define MIN_EHDR_LEN 8 #define MAX_EHDR_LEN 2048 -/* - * The high-order bit of the version field is used by the transports to - * indicate a reachability confirmation to IP. - */ -#define IP_FORWARD_PROG_BIT 0x8 - #ifdef _BIG_ENDIAN #define IPV6_DEFAULT_VERS_AND_FLOW 0x60000000 #define IPV6_VERS_AND_FLOW_MASK 0xF0000000 -#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 28) #define V6_MCAST 0xFF000000 #define V6_LINKLOCAL 0xFE800000 #define IPV6_FLOW_TCLASS(x) (((x) & IPV6_FLOWINFO_TCLASS) >> 20) #define IPV6_TCLASS_FLOW(f, c) (((f) & ~IPV6_FLOWINFO_TCLASS) |\ ((c) << 20)) - #else #define IPV6_DEFAULT_VERS_AND_FLOW 0x00000060 #define IPV6_VERS_AND_FLOW_MASK 0x000000F0 -#define IP_FORWARD_PROG ((uint32_t)IP_FORWARD_PROG_BIT << 4) - #define V6_MCAST 0x000000FF #define V6_LINKLOCAL 0x000080FE @@ -328,71 +225,66 @@ extern const in6_addr_t ipv6_unspecified_group; * FUNCTION PROTOTYPES */ -struct ipsec_out_s; - extern void convert2ascii(char *buf, const in6_addr_t *addr); extern char *inet_ntop(int, const void *, char *, int); extern int inet_pton(int, char *, void *); -extern void icmp_time_exceeded_v6(queue_t *, mblk_t *, uint8_t, - boolean_t, boolean_t, zoneid_t, ip_stack_t *); -extern void icmp_unreachable_v6(queue_t *, mblk_t *, uint8_t, - boolean_t, boolean_t, zoneid_t, ip_stack_t *); -extern void icmp_inbound_error_fanout_v6(queue_t *, 
mblk_t *, ip6_t *, - icmp6_t *, ill_t *, ill_t *, boolean_t, zoneid_t); -extern boolean_t conn_wantpacket_v6(conn_t *, ill_t *, ip6_t *, int, zoneid_t); -extern mblk_t *ip_add_info_v6(mblk_t *, ill_t *, const in6_addr_t *); +extern void icmp_param_problem_nexthdr_v6(mblk_t *, boolean_t, + ip_recv_attr_t *); +extern void icmp_pkt2big_v6(mblk_t *, uint32_t, boolean_t, + ip_recv_attr_t *); +extern void icmp_time_exceeded_v6(mblk_t *, uint8_t, boolean_t, + ip_recv_attr_t *); +extern void icmp_unreachable_v6(mblk_t *, uint8_t, boolean_t, + ip_recv_attr_t *); +extern mblk_t *icmp_inbound_v6(mblk_t *, ip_recv_attr_t *); +extern void icmp_inbound_error_fanout_v6(mblk_t *, icmp6_t *, + ip_recv_attr_t *); +extern void icmp_update_out_mib_v6(ill_t *, icmp6_t *); + +extern boolean_t conn_wantpacket_v6(conn_t *, ip_recv_attr_t *, ip6_t *); + extern in6addr_scope_t ip_addr_scope_v6(const in6_addr_t *); -extern mblk_t *ip_bind_v6(queue_t *, mblk_t *, conn_t *, ip6_pkt_t *); -extern void ip_build_hdrs_v6(uchar_t *, uint_t, ip6_pkt_t *, uint8_t); -extern int ip_fanout_send_icmp_v6(queue_t *, mblk_t *, uint_t, - uint_t, uint8_t, uint_t, boolean_t, zoneid_t, ip_stack_t *); -extern int ip_find_hdr_v6(mblk_t *, ip6_t *, ip6_pkt_t *, uint8_t *); -extern in6_addr_t ip_get_dst_v6(ip6_t *, mblk_t *, boolean_t *); +extern void ip_build_hdrs_v6(uchar_t *, uint_t, const ip_pkt_t *, uint8_t, + uint32_t); +extern void ip_fanout_udp_multi_v6(mblk_t *, ip6_t *, uint16_t, uint16_t, + ip_recv_attr_t *); +extern void ip_fanout_send_icmp_v6(mblk_t *, uint_t, uint8_t, + ip_recv_attr_t *); +extern void ip_fanout_proto_v6(mblk_t *, ip6_t *, ip_recv_attr_t *); +extern int ip_find_hdr_v6(mblk_t *, ip6_t *, boolean_t, ip_pkt_t *, + uint8_t *); +extern in6_addr_t ip_get_dst_v6(ip6_t *, const mblk_t *, boolean_t *); extern ip6_rthdr_t *ip_find_rthdr_v6(ip6_t *, uint8_t *); -extern int ip_hdr_complete_v6(ip6_t *, zoneid_t, ip_stack_t *); extern boolean_t ip_hdr_length_nexthdr_v6(mblk_t *, ip6_t *, uint16_t *, 
uint8_t **); extern int ip_hdr_length_v6(mblk_t *, ip6_t *); -extern int ip_check_v6_mblk(mblk_t *, ill_t *); extern uint32_t ip_massage_options_v6(ip6_t *, ip6_rthdr_t *, netstack_t *); -extern void ip_wput_frag_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, int); -extern void ip_wput_ipsec_out_v6(queue_t *, mblk_t *, ip6_t *, ill_t *, - ire_t *); -extern int ip_total_hdrs_len_v6(ip6_pkt_t *); +extern void ip_forward_xmit_v6(nce_t *, mblk_t *, ip6_t *, ip_recv_attr_t *, + uint32_t, uint32_t); +extern mblk_t *ip_fraghdr_add_v6(mblk_t *, uint32_t, ip_xmit_attr_t *); +extern int ip_fragment_v6(mblk_t *, nce_t *, iaflags_t, uint_t, uint32_t, + uint32_t, zoneid_t, zoneid_t, pfirepostfrag_t postfragfn, + uintptr_t *ixa_cookie); +extern int ip_process_options_v6(mblk_t *, ip6_t *, + uint8_t *, uint_t, uint8_t, ip_recv_attr_t *); +extern void ip_process_rthdr(mblk_t *, ip6_t *, ip6_rthdr_t *, + ip_recv_attr_t *); +extern int ip_total_hdrs_len_v6(const ip_pkt_t *); +extern mblk_t *ipsec_early_ah_v6(mblk_t *, ip_recv_attr_t *); extern int ipsec_ah_get_hdr_size_v6(mblk_t *, boolean_t); -extern void ip_wput_v6(queue_t *, mblk_t *); -extern void ip_wput_local_v6(queue_t *, ill_t *, ip6_t *, mblk_t *, - ire_t *, int, zoneid_t); -extern void ip_output_v6(void *, mblk_t *, void *, int); -extern void ip_xmit_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, - struct ipsec_out_s *); +extern void ip_send_potential_redirect_v6(mblk_t *, ip6_t *, ire_t *, + ip_recv_attr_t *); extern void ip_rput_v6(queue_t *, mblk_t *); -extern void ip_rput_data_v6(queue_t *, ill_t *, mblk_t *, ip6_t *, - uint_t, mblk_t *, mblk_t *); -extern void mld_input(queue_t *, mblk_t *, ill_t *); +extern mblk_t *mld_input(mblk_t *, ip_recv_attr_t *); extern void mld_joingroup(ilm_t *); extern void mld_leavegroup(ilm_t *); extern void mld_timeout_handler(void *); extern void pr_addr_dbg(char *, int, const void *); -extern int ip_multirt_apply_membership_v6(int (*fn)(conn_t *, boolean_t, - const in6_addr_t *, int, 
mcast_record_t, const in6_addr_t *, mblk_t *), - ire_t *, conn_t *, boolean_t, const in6_addr_t *, mcast_record_t, - const in6_addr_t *, mblk_t *); -extern void ip_newroute_ipif_v6(queue_t *, mblk_t *, ipif_t *, - const in6_addr_t *, const in6_addr_t *, int, zoneid_t); -extern void ip_newroute_v6(queue_t *, mblk_t *, const in6_addr_t *, - const in6_addr_t *, ill_t *, zoneid_t, ip_stack_t *); extern void *ip6_kstat_init(netstackid_t, ip6_stat_t *); extern void ip6_kstat_fini(netstackid_t, kstat_t *); -extern size_t ip6_get_src_preferences(conn_t *, uint32_t *); -extern int ip6_set_src_preferences(conn_t *, uint32_t); -extern int ip6_set_pktinfo(cred_t *, conn_t *, struct in6_pktinfo *); -extern int ip_proto_bind_laddr_v6(conn_t *, mblk_t **, uint8_t, - const in6_addr_t *, uint16_t, boolean_t); -extern int ip_proto_bind_connected_v6(conn_t *, mblk_t **, - uint8_t, in6_addr_t *, uint16_t, const in6_addr_t *, ip6_pkt_t *, - uint16_t, boolean_t, boolean_t, cred_t *); +extern size_t ip6_get_src_preferences(ip_xmit_attr_t *, uint32_t *); +extern int ip6_set_src_preferences(ip_xmit_attr_t *, uint32_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_arp.h b/usr/src/uts/common/inet/ip_arp.h new file mode 100644 index 0000000000..2cb7e7a05a --- /dev/null +++ b/usr/src/uts/common/inet/ip_arp.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IP_ARP_H +#define _IP_ARP_H + +/* + * Data-structures and functions related to the IP STREAMS queue that handles + * packets with the SAP set to 0x806 (ETHERTYPE_ARP). + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <inet/ip.h> +#include <inet/ip_ndp.h> +#include <sys/stream.h> + +#ifdef _KERNEL +extern struct streamtab dummymodinfo; + +struct arl_ill_common_s; +/* + * The arl_s structure tracks the state of the associated ARP stream. + */ +typedef struct arl_s { + queue_t *arl_rq; + queue_t *arl_wq; + ip_stack_t *arl_ipst; + zoneid_t arl_zoneid; + cred_t *arl_credp; + ip_m_t arl_media; + struct arl_ill_common_s *arl_common; + int arl_muxid; + uint_t arl_ppa; + t_uscalar_t arl_sap; + t_uscalar_t arl_sap_length; + uint_t arl_phys_addr_length; + char *arl_name; + int arl_name_length; + t_uscalar_t arl_mactype; +#define arl_first_mp_to_free arl_dlpi_deferred + mblk_t *arl_dlpi_deferred; + mblk_t *arl_unbind_mp; + mblk_t *arl_detach_mp; +#define arl_last_mp_to_free arl_detach_mp + uint_t arl_state_flags; + uint_t + arl_needs_attach:1, + arl_dlpi_style_set:1, + arl_pad_to_bit_31:30; + uint_t arl_refcnt; + kcondvar_t arl_cv; + t_uscalar_t arl_dlpi_pending; + kmutex_t arl_lock; + int arl_error; +} arl_t; + +/* + * The arl_ill_common_t structure is a super-structure that contains pointers + * to a pair of matching ill_t, arl_t structures. Given an arl_t (or + * ill_t) the corresponding ill_t (or arl_t) must be obtained by + * synchronizing on the ai_lock, and ensuring that the desired ill/arl + * pointer is non-null, not condemned. 
The arl_ill_common_t is allocated in + * arl_init() and freed only when both the ill_t and the arl_t structures + * become NULL. + * Lock hierarchy: the ai_lock must be take before the ill_lock or arl_lock. + */ + +typedef struct arl_ill_common_s { + kmutex_t ai_lock; + ill_t *ai_ill; + arl_t *ai_arl; + kcondvar_t ai_ill_unplumb_done; /* sent from ip_modclose() */ +} arl_ill_common_t; + +extern boolean_t arp_no_defense; + +extern struct module_info arp_mod_info; +extern int arp_ll_up(ill_t *); +extern int arp_ll_down(ill_t *); +extern boolean_t arp_announce(ncec_t *); +extern boolean_t arp_probe(ncec_t *); +extern int arp_request(ncec_t *, in_addr_t, ill_t *); +extern void arp_failure(mblk_t *, ip_recv_attr_t *); +extern int arl_wait_for_info_ack(arl_t *); +extern int arl_init(queue_t *, arl_t *); +extern void arl_set_muxid(ill_t *, int); +extern int arl_get_muxid(ill_t *); +extern void arp_send_replumb_conf(ill_t *); +extern void arp_unbind_complete(ill_t *); +extern ill_t *arl_to_ill(arl_t *); +#endif + +#define ARP_RETRANS_TIMER 500 /* time in milliseconds */ + +/* The following are arl_state_flags */ +#define ARL_LL_SUBNET_PENDING 0x01 /* Waiting for DL_INFO_ACK from drv */ +#define ARL_CONDEMNED 0x02 /* No more new ref's to the ILL */ +#define ARL_DL_UNBIND_IN_PROGRESS 0x04 /* UNBIND_REQ is sent */ +#define ARL_LL_BIND_PENDING 0x0020 /* BIND sent */ +#define ARL_LL_UP 0x0040 /* BIND acked */ +#define ARL_LL_DOWN 0x0080 +#define ARL_LL_UNBOUND 0x0100 /* UNBIND acked */ +#define ARL_LL_REPLUMBING 0x0200 /* replumb in progress */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IP_ARP_H */ diff --git a/usr/src/uts/common/inet/ip_ftable.h b/usr/src/uts/common/inet/ip_ftable.h index 6a3a05183b..d8fa9e566d 100644 --- a/usr/src/uts/common/inet/ip_ftable.h +++ b/usr/src/uts/common/inet/ip_ftable.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ @@ -56,7 +56,7 @@ struct rt_entry { * * The comment below (and for other netstack_t references) refers * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's + * such as the references from open endpoints (ill_t and conn_t's * pointers). Internally within IP we rely on IP's ability to cleanup e.g. * ire_t's when an ill goes away. */ @@ -74,26 +74,8 @@ int rtfunc(struct radix_node *, void *); typedef struct rt_entry rt_t; typedef struct rtfuncarg rtf_t; -/* For ire_forward() */ -enum ire_forward_action { - Forward_ok, /* OK to use this IRE to forward */ - Forward_check_multirt, /* CGTP multirt check required */ - Forward_ret_icmp_err, /* Callers to return an ICMP error */ - Forward_blackhole /* Packet is silently discarded */ -}; - struct ts_label_s; -extern ire_t *ire_ftable_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int, - const ipif_t *, ire_t **, zoneid_t, uint32_t, - const struct ts_label_s *, int, ip_stack_t *); -extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *); -extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t); extern void ire_delete_host_redirects(ipaddr_t, ip_stack_t *); -extern ire_t *ire_ihandle_lookup_onlink(ire_t *); -extern ire_t *ire_forward(ipaddr_t, enum ire_forward_action *, ire_t *, - ire_t *, const struct ts_label_s *, ip_stack_t *); -extern ire_t *ire_forward_simple(ipaddr_t, enum ire_forward_action *, - ip_stack_t *); extern irb_t *ire_get_bucket(ire_t *); extern uint_t ifindex_lookup(const struct sockaddr *, zoneid_t); extern int ipfil_sendpkt(const struct sockaddr *, mblk_t *, uint_t, zoneid_t); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index 80a9f74e88..d081d9256b 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -80,12 +80,12 @@ extern "C" { #define IFF_PHYINTINST_FLAGS (IFF_DEBUG|IFF_NOTRAILERS|IFF_NOARP| \ 
IFF_MULTICAST|IFF_ROUTER|IFF_NONUD|IFF_NORTEXCH|IFF_IPV4|IFF_IPV6| \ - IFF_XRESOLV|IFF_COS_ENABLED) + IFF_COS_ENABLED|IFF_FIXEDMTU) #define IFF_LOGINT_FLAGS (IFF_UP|IFF_BROADCAST|IFF_POINTOPOINT| \ IFF_UNNUMBERED|IFF_DHCPRUNNING|IFF_PRIVATE|IFF_NOXMIT|IFF_NOLOCAL| \ IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_NOFAILOVER| \ - IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) + IFF_PREFERRED|IFF_TEMPORARY|IFF_DUPLICATE) #define PHYI_LOOPBACK IFF_LOOPBACK /* is a loopback net */ #define PHYI_RUNNING IFF_RUNNING /* resources allocated */ @@ -109,8 +109,8 @@ extern "C" { #define ILLF_NORTEXCH IFF_NORTEXCH /* No routing info exchange */ #define ILLF_IPV4 IFF_IPV4 /* IPv4 interface */ #define ILLF_IPV6 IFF_IPV6 /* IPv6 interface */ -#define ILLF_XRESOLV IFF_XRESOLV /* IPv6 external resolver */ #define ILLF_COS_ENABLED IFF_COS_ENABLED /* Is CoS marking supported */ +#define ILLF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_UP IFF_UP /* interface is up */ #define IPIF_BROADCAST IFF_BROADCAST /* broadcast address valid */ @@ -126,7 +126,6 @@ extern "C" { #define IPIF_NOFAILOVER IFF_NOFAILOVER /* No failover on NIC failure */ #define IPIF_PREFERRED IFF_PREFERRED /* Prefer as source address */ #define IPIF_TEMPORARY IFF_TEMPORARY /* RFC3041 */ -#define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ #define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ #ifdef DEBUG @@ -135,6 +134,12 @@ extern "C" { #define ILL_MAC_PERIM_HELD(ill) #endif +/* + * match flags for ipif_lookup_addr_common* functions + */ +#define IPIF_MATCH_ILLGRP 0x00000001 +#define IPIF_MATCH_NONDUP 0x00000002 + /* for ipif_resolver_up */ enum ip_resolver_action { Res_act_initial, /* initial address establishment */ @@ -143,134 +148,144 @@ enum ip_resolver_action { Res_act_none /* do nothing */ }; -extern mblk_t *ill_arp_alloc(ill_t *, const uchar_t *, caddr_t); -extern mblk_t *ipif_area_alloc(ipif_t *, uint_t); -extern mblk_t *ipif_ared_alloc(ipif_t *); -extern mblk_t 
*ill_ared_alloc(ill_t *, ipaddr_t); -extern mblk_t *ill_arie_alloc(ill_t *, const char *, const void *); -extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern int ill_add_ires(ill_t *); +extern void ill_delete_ires(ill_t *); extern void ill_dlpi_done(ill_t *, t_uscalar_t); +extern boolean_t ill_dlpi_pending(ill_t *, t_uscalar_t); +extern void ill_dlpi_dispatch(ill_t *, mblk_t *); extern void ill_dlpi_send(ill_t *, mblk_t *); extern void ill_dlpi_send_deferred(ill_t *); +extern void ill_dlpi_queue(ill_t *, mblk_t *); +extern void ill_dlpi_send_queued(ill_t *); +extern void ill_mcast_queue(ill_t *, mblk_t *); +extern void ill_mcast_send_queued(ill_t *); +extern void ill_mcast_timer_start(ip_stack_t *); extern void ill_capability_done(ill_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); /* NOTE: Keep unmodified ill_lookup_on_ifindex for ipp for now */ -extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t, - queue_t *, mblk_t *, ipsq_func_t, int *); -extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, queue_t *, mblk_t *, - ipsq_func_t, int *, ip_stack_t *); -extern ill_t *ill_lookup_on_name(char *, boolean_t, - boolean_t, queue_t *, mblk_t *, ipsq_func_t, int *, boolean_t *, +extern ill_t *ill_lookup_on_ifindex_global_instance(uint_t, boolean_t); +extern ill_t *ill_lookup_on_ifindex(uint_t, boolean_t, ip_stack_t *); +extern ill_t *ill_lookup_on_ifindex_zoneid(uint_t, zoneid_t, boolean_t, ip_stack_t *); +extern ill_t *ill_lookup_on_name(char *, boolean_t, + boolean_t, boolean_t *, ip_stack_t *); +extern boolean_t ip_ifindex_valid(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_next_ifindex(uint_t, boolean_t, ip_stack_t *); extern uint_t ill_get_ifindex_by_name(char *, ip_stack_t *); -extern void ill_grp_cache_delete(ire_t *, char *); -extern void ill_ipif_cache_delete(ire_t *, char *); -extern void ill_stq_cache_delete(ire_t *, char *); +extern uint_t ill_get_upper_ifindex(const ill_t *); extern void 
ill_delete(ill_t *); extern void ill_delete_tail(ill_t *); extern int ill_dl_phys(ill_t *, ipif_t *, mblk_t *, queue_t *); -extern int ill_dls_info(struct sockaddr_dl *, const ipif_t *); +extern int ill_dls_info(struct sockaddr_dl *, const ill_t *); extern void ill_fastpath_ack(ill_t *, mblk_t *); -extern void ill_fastpath_nack(ill_t *); extern int ill_fastpath_probe(ill_t *, mblk_t *); -extern void ill_fastpath_flush(ill_t *); extern int ill_forward_set(ill_t *, boolean_t); extern void ill_frag_prune(ill_t *, uint_t); extern void ill_frag_free_pkts(ill_t *, ipfb_t *, ipf_t *, int); extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); -extern void ill_refresh_bcast(ill_t *); extern void ill_restart_dad(ill_t *, boolean_t); extern void ill_setdefaulttoken(ill_t *); extern void ill_setdesttoken(ill_t *); +extern void ill_set_inputfn(ill_t *); +extern void ill_set_inputfn_all(ip_stack_t *); extern int ill_set_phys_addr(ill_t *, mblk_t *); extern int ill_replumb(ill_t *, mblk_t *); extern void ill_set_ndmp(ill_t *, mblk_t *, uint_t, uint_t); -extern mblk_t *ill_pending_mp_get(ill_t *, conn_t **, uint_t); -extern boolean_t ill_pending_mp_add(ill_t *, conn_t *, mblk_t *); extern boolean_t ill_is_freeable(ill_t *ill); extern void ill_refhold(ill_t *); extern void ill_refhold_locked(ill_t *); -extern int ill_check_and_refhold(ill_t *); +extern boolean_t ill_check_and_refhold(ill_t *); extern void ill_refrele(ill_t *); extern boolean_t ill_waiter_inc(ill_t *); extern void ill_waiter_dcr(ill_t *); extern void ill_trace_ref(ill_t *); extern void ill_untrace_ref(ill_t *); +extern void ill_downi(ire_t *, char *); +extern void ill_downi_if_clone(ire_t *, char *); extern boolean_t ill_down_start(queue_t *, mblk_t *); +extern ill_t *ill_lookup_group_v4(ipaddr_t, zoneid_t, + ip_stack_t *, boolean_t *, ipaddr_t *); extern ill_t *ill_lookup_group_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); + ip_stack_t *, boolean_t *, in6_addr_t *); extern 
void ill_capability_ack(ill_t *, mblk_t *); extern void ill_capability_probe(ill_t *); extern void ill_capability_reset(ill_t *, boolean_t); extern void ill_taskq_dispatch(ip_stack_t *); -extern void ill_mtu_change(ire_t *, char *); +extern void ill_get_name(const ill_t *, char *, int); +extern void ill_group_cleanup(ill_t *); extern int ill_up_ipifs(ill_t *, queue_t *, mblk_t *); +extern void ip_update_source_selection(ip_stack_t *); extern uint_t ill_appaddr_cnt(const ill_t *); extern uint_t ill_ptpaddr_cnt(const ill_t *); +extern uint_t ill_admupaddr_cnt(const ill_t *); + +extern ill_t *ill_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); +extern void ill_save_ire(ill_t *, ire_t *); +extern void ill_remove_saved_ire(ill_t *, ire_t *); +extern int ill_recover_saved_ire(ill_t *); extern void ip_interface_cleanup(ip_stack_t *); extern void ipif_get_name(const ipif_t *, char *, int); extern ipif_t *ipif_getby_indexes(uint_t, uint_t, boolean_t, ip_stack_t *); extern void ipif_init(ip_stack_t *); -extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, queue_t *, - mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern boolean_t ip_addr_exists(ipaddr_t, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr(ipaddr_t, ill_t *, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_exact(ipaddr_t, ill_t *, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_nondup(ipaddr_t, ill_t *, zoneid_t, + ip_stack_t *); extern ipif_t *ipif_lookup_addr_v6(const in6_addr_t *, ill_t *, zoneid_t, - queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern boolean_t ip_addr_exists_v6(const in6_addr_t *, zoneid_t, ip_stack_t *); extern ipif_t *ipif_lookup_addr_exact_v6(const in6_addr_t *, ill_t *, ip_stack_t *); +extern ipif_t *ipif_lookup_addr_nondup_v6(const in6_addr_t *, ill_t *, + zoneid_t, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid(ipaddr_t, ill_t *, ip_stack_t *); extern zoneid_t ipif_lookup_addr_zoneid_v6(const in6_addr_t *, ill_t *, ip_stack_t *); 
-extern ipif_t *ipif_lookup_group(ipaddr_t, zoneid_t, ip_stack_t *); -extern ipif_t *ipif_lookup_group_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); -extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t, - queue_t *, mblk_t *, ipsq_func_t, int *, ip_stack_t *); -extern ipif_t *ipif_lookup_multicast(ip_stack_t *, zoneid_t, boolean_t); +extern ipif_t *ipif_lookup_interface(ipaddr_t, ipaddr_t, ip_stack_t *); extern ipif_t *ipif_lookup_remote(ill_t *, ipaddr_t, zoneid_t); -extern ipif_t *ipif_lookup_onlink_addr(ipaddr_t, zoneid_t, ip_stack_t *); -extern ipif_t *ipif_lookup_seqid(ill_t *, uint_t); -extern boolean_t ipif_lookup_zoneid(ill_t *, zoneid_t, int, ipif_t **); -extern ipif_t *ipif_select_source(ill_t *, ipaddr_t, zoneid_t); -extern boolean_t ipif_usesrc_avail(ill_t *, zoneid_t); +extern boolean_t ipif_lookup_testaddr_v6(ill_t *, const in6_addr_t *, + ipif_t **); +extern boolean_t ipif_lookup_testaddr_v4(ill_t *, const in_addr_t *, + ipif_t **); +extern ipif_t *ipif_select_source_v4(ill_t *, ipaddr_t, zoneid_t, boolean_t, + boolean_t *); +extern boolean_t ipif_zone_avail(uint_t, boolean_t, zoneid_t, ip_stack_t *); +extern ipif_t *ipif_good_addr(ill_t *, zoneid_t); +extern int ip_select_source_v4(ill_t *, ipaddr_t, ipaddr_t, ipaddr_t, + zoneid_t, ip_stack_t *, ipaddr_t *, uint32_t *, uint64_t *); extern void ipif_refhold(ipif_t *); extern void ipif_refhold_locked(ipif_t *); extern void ipif_refrele(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); -extern void ipif_resolver_down(ipif_t *); extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); -extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); -extern void ipif_down_tail(ipif_t *); +extern int ipif_down_tail(ipif_t *); extern void ipif_multicast_down(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); extern int ipif_ndp_up(ipif_t *, boolean_t); -extern int 
ipif_ndp_setup_multicast(ipif_t *, struct nce_s **); extern int ipif_up_done(ipif_t *); extern int ipif_up_done_v6(ipif_t *); extern void ipif_up_notify(ipif_t *); -extern void ipif_update_other_ipifs_v6(ipif_t *); -extern void ipif_recreate_interface_routes_v6(ipif_t *, ipif_t *); -extern void ill_update_source_selection(ill_t *); extern ipif_t *ipif_select_source_v6(ill_t *, const in6_addr_t *, boolean_t, - uint32_t, zoneid_t); + uint32_t, zoneid_t, boolean_t, boolean_t *); +extern int ip_select_source_v6(ill_t *, const in6_addr_t *, + const in6_addr_t *, zoneid_t, ip_stack_t *, uint_t, uint32_t, in6_addr_t *, + uint32_t *, uint64_t *); extern boolean_t ipif_cant_setlinklocal(ipif_t *); extern void ipif_setlinklocal(ipif_t *); extern void ipif_setdestlinklocal(ipif_t *); -extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, - mblk_t *, ipsq_func_t, int *, ip_stack_t *); +extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, + ip_stack_t *); extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); +extern void ipif_nce_down(ipif_t *ipif); +extern int ipif_arp_down(ipif_t *ipif); extern void ipif_mask_reply(ipif_t *); extern int ipif_up(ipif_t *, queue_t *, mblk_t *); @@ -290,7 +305,7 @@ extern void qwriter_ip(ill_t *, queue_t *, mblk_t *, ipsq_func_t, int, boolean_t); typedef int ip_extract_func_t(queue_t *, mblk_t *, const ip_ioctl_cmd_t *, - cmd_info_t *, ipsq_func_t); + cmd_info_t *); extern ip_extract_func_t ip_extract_arpreq, ip_extract_lifreq; @@ -298,16 +313,14 @@ extern int ip_addr_availability_check(ipif_t *); extern void ip_ll_subnet_defaults(ill_t *, mblk_t *); extern int ip_rt_add(ipaddr_t, ipaddr_t, ipaddr_t, ipaddr_t, int, - ipif_t *, ire_t **, boolean_t, queue_t *, mblk_t *, ipsq_func_t, - struct rtsa_s *, ip_stack_t *); + ill_t *, ire_t **, boolean_t, struct rtsa_s *, ip_stack_t *, zoneid_t); extern int ip_rt_add_v6(const in6_addr_t *, const in6_addr_t *, - 
const in6_addr_t *, const in6_addr_t *, int, ipif_t *, ire_t **, - queue_t *, mblk_t *, ipsq_func_t, struct rtsa_s *, ip_stack_t *ipst); + const in6_addr_t *, const in6_addr_t *, int, ill_t *, ire_t **, + struct rtsa_s *, ip_stack_t *, zoneid_t); extern int ip_rt_delete(ipaddr_t, ipaddr_t, ipaddr_t, uint_t, int, - ipif_t *, boolean_t, queue_t *, mblk_t *, ipsq_func_t, ip_stack_t *); + ill_t *, boolean_t, ip_stack_t *, zoneid_t); extern int ip_rt_delete_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, uint_t, int, ipif_t *, queue_t *, mblk_t *, - ipsq_func_t, ip_stack_t *); + const in6_addr_t *, uint_t, int, ill_t *, ip_stack_t *, zoneid_t); extern int ip_siocdelndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_siocqueryndp_v6(ipif_t *, sin_t *, queue_t *, mblk_t *, @@ -454,11 +467,12 @@ extern int ip_sioctl_get_lifsrcof(ipif_t *, sin_t *, queue_t *, extern void ip_sioctl_copyin_resume(ipsq_t *, queue_t *, mblk_t *, void *); extern void ip_sioctl_copyin_setup(queue_t *, mblk_t *); -extern void ip_sioctl_iocack(ipsq_t *, queue_t *, mblk_t *, void *); extern ip_ioctl_cmd_t *ip_sioctl_lookup(int); - -extern void conn_delete_ire(conn_t *, caddr_t); -extern boolean_t phyint_exists(uint_t, ip_stack_t *); +extern void ipif_delete_ires_v4(ipif_t *); +extern void ipif_delete_ires_v6(ipif_t *); +extern int ipif_arp_up(ipif_t *, enum ip_resolver_action, boolean_t); +extern void ipif_dup_recovery(void *); +extern void ipif_do_recovery(ipif_t *); /* * Notes on reference tracing on ill, ipif, ire, nce data structures: diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 5f9d674e17..694f7a63b0 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -50,10 +50,12 @@ extern "C" { #define IP_HDR_CSUM_TTL_ADJUST 256 #define IP_TCP_CSUM_COMP IPPROTO_TCP #define IP_UDP_CSUM_COMP IPPROTO_UDP +#define IP_ICMPV6_CSUM_COMP IPPROTO_ICMPV6 #else #define 
IP_HDR_CSUM_TTL_ADJUST 1 #define IP_TCP_CSUM_COMP (IPPROTO_TCP << 8) #define IP_UDP_CSUM_COMP (IPPROTO_UDP << 8) +#define IP_ICMPV6_CSUM_COMP (IPPROTO_ICMPV6 << 8) #endif #define TCP_CHECKSUM_OFFSET 16 @@ -62,240 +64,20 @@ extern "C" { #define UDP_CHECKSUM_OFFSET 6 #define UDP_CHECKSUM_SIZE 2 +#define ICMPV6_CHECKSUM_OFFSET 2 +#define ICMPV6_CHECKSUM_SIZE 2 + #define IPH_TCPH_CHECKSUMP(ipha, hlen) \ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + TCP_CHECKSUM_OFFSET))) #define IPH_UDPH_CHECKSUMP(ipha, hlen) \ ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + UDP_CHECKSUM_OFFSET))) +#define IPH_ICMPV6_CHECKSUMP(ipha, hlen) \ + ((uint16_t *)(((uchar_t *)(ipha)) + ((hlen) + ICMPV6_CHECKSUM_OFFSET))) + #define ILL_HCKSUM_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_HCKSUM) != 0) -/* - * Macro that performs software checksum calculation on the IP header. - */ -#define IP_HDR_CKSUM(ipha, sum, v_hlen_tos_len, ttl_protocol) { \ - (sum) += (ttl_protocol) + (ipha)->ipha_ident + \ - ((v_hlen_tos_len) >> 16) + \ - ((v_hlen_tos_len) & 0xFFFF) + \ - (ipha)->ipha_fragment_offset_and_flags; \ - (sum) = (((sum) & 0xFFFF) + ((sum) >> 16)); \ - (sum) = ~((sum) + ((sum) >> 16)); \ - (ipha)->ipha_hdr_checksum = (uint16_t)(sum); \ -} - -#define IS_IP_HDR_HWCKSUM(ipsec, mp, ill) \ - ((!ipsec) && (DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM) && \ - ILL_HCKSUM_CAPABLE(ill) && dohwcksum) - -/* - * This macro acts as a wrapper around IP_CKSUM_XMIT_FAST, and it performs - * several checks on the IRE and ILL (among other things) in order to see - * whether or not hardware checksum offload is allowed for the outgoing - * packet. It assumes that the caller has held a reference to the IRE. - */ -#define IP_CKSUM_XMIT(ill, ire, mp, ihp, up, proto, start, end, \ - max_frag, ipsec_len, pseudo) { \ - uint32_t _hck_flags; \ - /* \ - * We offload checksum calculation to hardware when IPsec isn't \ - * present and if fragmentation isn't required. 
We also check \ - * if M_DATA fastpath is safe to be used on the corresponding \ - * IRE; this check is performed without grabbing ire_lock but \ - * instead by holding a reference to it. This is sufficient \ - * for IRE_CACHE; for IRE_BROADCAST on non-Ethernet links, the \ - * DL_NOTE_FASTPATH_FLUSH indication could come up from the \ - * driver and trigger the IRE (hence fp_mp) deletion. This is \ - * why only IRE_CACHE type is eligible for offload. \ - * \ - * The presense of IP options also forces the network stack to \ - * calculate the checksum in software. This is because: \ - * \ - * Wrap around: certain partial-checksum NICs (eri, ce) limit \ - * the size of "start offset" width to 6-bit. This effectively \ - * sets the largest value of the offset to 64-bytes, starting \ - * from the MAC header. When the cumulative MAC and IP headers \ - * exceed such limit, the offset will wrap around. This causes \ - * the checksum to be calculated at the wrong place. \ - * \ - * IPv4 source routing: none of the full-checksum capable NICs \ - * is capable of correctly handling the IPv4 source-routing \ - * option for purposes of calculating the pseudo-header; the \ - * actual destination is different from the destination in the \ - * header which is that of the next-hop. (This case may not be \ - * true for NICs which can parse IPv6 extension headers, but \ - * we choose to simplify the implementation by not offloading \ - * checksum when they are present.) 
\ - * \ - */ \ - if ((ill) != NULL && ILL_HCKSUM_CAPABLE(ill) && \ - !((ire)->ire_flags & RTF_MULTIRT) && \ - (!((ire)->ire_type & IRE_BROADCAST) || \ - (ill)->ill_type == IFT_ETHER) && \ - (ipsec_len) == 0 && \ - (((ire)->ire_ipversion == IPV4_VERSION && \ - (start) == IP_SIMPLE_HDR_LENGTH && \ - ((ire)->ire_nce != NULL && \ - (ire)->ire_nce->nce_fp_mp != NULL && \ - MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) || \ - ((ire)->ire_ipversion == IPV6_VERSION && \ - (start) == IPV6_HDR_LEN && \ - (ire)->ire_nce->nce_fp_mp != NULL && \ - MBLKHEAD(mp) >= MBLKL((ire)->ire_nce->nce_fp_mp))) && \ - (max_frag) >= (uint_t)((end) + (ipsec_len)) && \ - dohwcksum) { \ - _hck_flags = (ill)->ill_hcksum_capab->ill_hcksum_txflags; \ - } else { \ - _hck_flags = 0; \ - } \ - IP_CKSUM_XMIT_FAST((ire)->ire_ipversion, _hck_flags, mp, ihp, \ - up, proto, start, end, pseudo); \ -} - -/* - * Based on the device capabilities, this macro either marks an outgoing - * packet with hardware checksum offload information or calculate the - * checksum in software. If the latter is performed, the checksum field - * of the dblk is cleared; otherwise it will be non-zero and contain the - * necessary flag(s) for the driver. - */ -#define IP_CKSUM_XMIT_FAST(ipver, hck_flags, mp, ihp, up, proto, start, \ - end, pseudo) { \ - uint32_t _sum; \ - /* \ - * Underlying interface supports hardware checksum offload for \ - * the payload; leave the payload checksum for the hardware to \ - * calculate. N.B: We only need to set up checksum info on the \ - * first mblk. \ - */ \ - DB_CKSUMFLAGS(mp) = 0; \ - if (((ipver) == IPV4_VERSION && \ - ((hck_flags) & HCKSUM_INET_FULL_V4)) || \ - ((ipver) == IPV6_VERSION && \ - ((hck_flags) & HCKSUM_INET_FULL_V6))) { \ - /* \ - * Hardware calculates pseudo-header, header and the \ - * payload checksums, so clear the checksum field in \ - * the protocol header. 
\ - */ \ - *(up) = 0; \ - DB_CKSUMFLAGS(mp) |= HCK_FULLCKSUM; \ - } else if ((hck_flags) & HCKSUM_INET_PARTIAL) { \ - /* \ - * Partial checksum offload has been enabled. Fill \ - * the checksum field in the protocl header with the \ - * pseudo-header checksum value. \ - */ \ - _sum = ((proto) == IPPROTO_UDP) ? \ - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ - _sum += *(up) + (pseudo); \ - _sum = (_sum & 0xFFFF) + (_sum >> 16); \ - *(up) = (_sum & 0xFFFF) + (_sum >> 16); \ - /* \ - * Offsets are relative to beginning of IP header. \ - */ \ - DB_CKSUMSTART(mp) = (start); \ - DB_CKSUMSTUFF(mp) = ((proto) == IPPROTO_UDP) ? \ - (start) + UDP_CHECKSUM_OFFSET : \ - (start) + TCP_CHECKSUM_OFFSET; \ - DB_CKSUMEND(mp) = (end); \ - DB_CKSUMFLAGS(mp) |= HCK_PARTIALCKSUM; \ - } else { \ - /* \ - * Software checksumming. \ - */ \ - _sum = ((proto) == IPPROTO_UDP) ? \ - IP_UDP_CSUM_COMP : IP_TCP_CSUM_COMP; \ - _sum += (pseudo); \ - _sum = IP_CSUM(mp, start, _sum); \ - *(up) = (uint16_t)(((proto) == IPPROTO_UDP) ? \ - (_sum ? _sum : ~_sum) : _sum); \ - } \ - /* \ - * Hardware supports IP header checksum offload; clear the \ - * contents of IP header checksum field as expected by NIC. \ - * Do this only if we offloaded either full or partial sum. \ - */ \ - if ((ipver) == IPV4_VERSION && DB_CKSUMFLAGS(mp) != 0 && \ - ((hck_flags) & HCKSUM_IPHDRCKSUM)) { \ - DB_CKSUMFLAGS(mp) |= HCK_IPV4_HDRCKSUM; \ - ((ipha_t *)(ihp))->ipha_hdr_checksum = 0; \ - } \ -} - -/* - * Macro to inspect the checksum of a fully-reassembled incoming datagram. - */ -#define IP_CKSUM_RECV_REASS(hck_flags, off, pseudo, sum, err) { \ - (err) = B_FALSE; \ - if ((hck_flags) & HCK_FULLCKSUM) { \ - /* \ - * The sum of all fragment checksums should \ - * result in -0 (0xFFFF) or otherwise invalid. 
\ - */ \ - if ((sum) != 0xFFFF) \ - (err) = B_TRUE; \ - } else if ((hck_flags) & HCK_PARTIALCKSUM) { \ - (sum) += (pseudo); \ - (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ - (sum) = ((sum) & 0xFFFF) + ((sum) >> 16); \ - if (~(sum) & 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((sum) = IP_CSUM(mp, off, pseudo)) != 0) { \ - (err) = B_TRUE; \ - } \ -} - -/* - * This macro inspects an incoming packet to see if the checksum value - * contained in it is valid; if the hardware has provided the information, - * the value is verified, otherwise it performs software checksumming. - * The checksum value is returned to caller. - */ -#define IP_CKSUM_RECV(hck_flags, sum, cksum_start, ulph_off, mp, mp1, err) { \ - int32_t _len; \ - \ - (err) = B_FALSE; \ - if ((hck_flags) & HCK_FULLCKSUM) { \ - /* \ - * Full checksum has been computed by the hardware \ - * and has been attached. If the driver wants us to \ - * verify the correctness of the attached value, in \ - * order to protect against faulty hardware, compare \ - * it against -0 (0xFFFF) to see if it's valid. \ - */ \ - (sum) = DB_CKSUM16(mp); \ - if (!((hck_flags) & HCK_FULLCKSUM_OK) && (sum) != 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((hck_flags) & HCK_PARTIALCKSUM) && \ - ((mp1) == NULL || (mp1)->b_cont == NULL) && \ - (ulph_off) >= DB_CKSUMSTART(mp) && \ - ((_len = (ulph_off) - DB_CKSUMSTART(mp)) & 1) == 0) { \ - uint32_t _adj; \ - /* \ - * Partial checksum has been calculated by hardware \ - * and attached to the packet; in addition, any \ - * prepended extraneous data is even byte aligned, \ - * and there are at most two mblks associated with \ - * the packet. If any such data exists, we adjust \ - * the checksum; also take care any postpended data. 
\ - */ \ - IP_ADJCKSUM_PARTIAL(cksum_start, mp, mp1, _len, _adj); \ - /* \ - * One's complement subtract extraneous checksum \ - */ \ - (sum) += DB_CKSUM16(mp); \ - if (_adj >= (sum)) \ - (sum) = ~(_adj - (sum)) & 0xFFFF; \ - else \ - (sum) -= _adj; \ - (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ - (sum) = ((sum) & 0xFFFF) + ((int)(sum) >> 16); \ - if (~(sum) & 0xFFFF) \ - (err) = B_TRUE; \ - } else if (((sum) = IP_CSUM(mp, ulph_off, sum)) != 0) { \ - (err) = B_TRUE; \ - } \ -} /* * Macro to adjust a given checksum value depending on any prepended @@ -338,98 +120,37 @@ extern "C" { } \ } -#define ILL_MDT_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_MDT) != 0) - -/* - * ioctl identifier and structure for Multidata Transmit update - * private M_CTL communication from IP to ULP. - */ -#define MDT_IOC_INFO_UPDATE (('M' << 8) + 1020) - -typedef struct ip_mdt_info_s { - uint_t mdt_info_id; /* MDT_IOC_INFO_UPDATE */ - ill_mdt_capab_t mdt_capab; /* ILL MDT capabilities */ -} ip_mdt_info_t; +#define IS_SIMPLE_IPH(ipha) \ + ((ipha)->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION) /* - * Macro that determines whether or not a given ILL is allowed for MDT. + * Currently supported flags for LSO. */ -#define ILL_MDT_USABLE(ill) \ - (ILL_MDT_CAPABLE(ill) && \ - ill->ill_mdt_capab != NULL && \ - ill->ill_mdt_capab->ill_mdt_version == MDT_VERSION_2 && \ - ill->ill_mdt_capab->ill_mdt_on != 0) +#define LSO_BASIC_TCP_IPV4 DLD_LSO_BASIC_TCP_IPV4 +#define LSO_BASIC_TCP_IPV6 DLD_LSO_BASIC_TCP_IPV6 -#define ILL_LSO_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_DLD_LSO) != 0) +#define ILL_LSO_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_LSO) != 0) -/* - * ioctl identifier and structure for Large Segment Offload - * private M_CTL communication from IP to ULP. 
- */ -#define LSO_IOC_INFO_UPDATE (('L' << 24) + ('S' << 16) + ('O' << 8)) - -typedef struct ip_lso_info_s { - uint_t lso_info_id; /* LSO_IOC_INFO_UPDATE */ - ill_lso_capab_t lso_capab; /* ILL LSO capabilities */ -} ip_lso_info_t; - -/* - * Macro that determines whether or not a given ILL is allowed for LSO. - */ #define ILL_LSO_USABLE(ill) \ (ILL_LSO_CAPABLE(ill) && \ - ill->ill_lso_capab != NULL && \ - ill->ill_lso_capab->ill_lso_on != 0) + ill->ill_lso_capab != NULL) -#define ILL_LSO_TCP_USABLE(ill) \ +#define ILL_LSO_TCP_IPV4_USABLE(ill) \ (ILL_LSO_USABLE(ill) && \ - ill->ill_lso_capab->ill_lso_flags & DLD_LSO_TX_BASIC_TCP_IPV4) + ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV4) -/* - * Macro that determines whether or not a given CONN may be considered - * for fast path prior to proceeding further with LSO or Multidata. - */ -#define CONN_IS_LSO_MD_FASTPATH(connp) \ - ((connp)->conn_dontroute == 0 && /* SO_DONTROUTE */ \ - !((connp)->conn_nexthop_set) && /* IP_NEXTHOP */ \ - (connp)->conn_outgoing_ill == NULL) /* IP{V6}_BOUND_IF */ - -/* Definitions for fragmenting IP packets using MDT. */ - -/* - * Smaller and private version of pdescinfo_t used specifically for IP, - * which allows for only a single payload span per packet. - */ -typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; +#define ILL_LSO_TCP_IPV6_USABLE(ill) \ + (ILL_LSO_USABLE(ill) && \ + ill->ill_lso_capab->ill_lso_flags & LSO_BASIC_TCP_IPV6) -/* - * Macro version of ip_can_frag_mdt() which avoids the function call if we - * only examine a single message block. - */ -#define IP_CAN_FRAG_MDT(mp, hdr_len, len) \ - (((mp)->b_cont == NULL) ? \ - (MBLKL(mp) >= ((hdr_len) + ip_wput_frag_mdt_min)) : \ - ip_can_frag_mdt((mp), (hdr_len), (len))) +#define ILL_ZCOPY_CAPABLE(ill) \ + (((ill)->ill_capabilities & ILL_CAPAB_ZEROCOPY) != 0) -/* - * Macro that determines whether or not a given IPC requires - * outbound IPSEC processing. 
- */ -#define CONN_IPSEC_OUT_ENCAPSULATED(connp) \ - ((connp)->conn_out_enforce_policy || \ - ((connp)->conn_latch != NULL && \ - (connp)->conn_latch->ipl_out_policy != NULL)) +#define ILL_ZCOPY_USABLE(ill) \ + (ILL_ZCOPY_CAPABLE(ill) && (ill->ill_zerocopy_capab != NULL) && \ + (ill->ill_zerocopy_capab->ill_zerocopy_flags != 0)) -/* - * Macro that checks whether or not a particular UDP conn is - * flow-controlling on the read-side. - * - * Note that this check is done after the conn is found in - * the UDP fanout table. - */ -#define CONN_UDP_FLOWCTLD(connp) !canputnext((connp)->conn_rq) /* Macro that follows definitions of flags for mac_tx() (see mac_client.h) */ #define IP_DROP_ON_NO_DESC 0x01 /* Equivalent to MAC_DROP_ON_NO_DESC */ @@ -437,74 +158,7 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \ - if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ - ill_dld_direct_t *idd; \ - uintptr_t cookie; \ - conn_t *udp_connp = (conn_t *)connp; \ - \ - idd = &(ill)->ill_dld_capab->idc_direct; \ - /* \ - * Send the packet directly to DLD, where it \ - * may be queued depending on the availability \ - * of transmit resources at the media layer. \ - * Ignore the returned value for the time being \ - * In future, we may want to take this into \ - * account and flow control the TCP. \ - */ \ - cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \ - (uintptr_t)(hint), flag); \ - \ - /* \ - * non-NULL cookie indicates flow control situation \ - * and the cookie itself identifies this specific \ - * Tx ring that is blocked. This cookie is used to \ - * block the UDP conn that is sending packets over \ - * this specific Tx ring. \ - */ \ - if ((cookie != NULL) && (udp_connp != NULL) && \ - (udp_connp->conn_ulp == IPPROTO_UDP)) { \ - idl_tx_list_t *idl_txl; \ - ip_stack_t *ipst; \ - \ - /* \ - * Flow controlled. 
\ - */ \ - DTRACE_PROBE2(ill__send__tx__cookie, \ - uintptr_t, cookie, conn_t *, udp_connp); \ - ipst = udp_connp->conn_netstack->netstack_ip; \ - idl_txl = \ - &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\ - mutex_enter(&idl_txl->txl_lock); \ - if (udp_connp->conn_direct_blocked || \ - (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \ - cookie) == 0)) { \ - DTRACE_PROBE1(ill__tx__not__blocked, \ - boolean, \ - udp_connp->conn_direct_blocked); \ - } else if (idl_txl->txl_cookie != NULL && \ - idl_txl->txl_cookie != cookie) { \ - udp_t *udp = udp_connp->conn_udp; \ - udp_stack_t *us = udp->udp_us; \ - \ - DTRACE_PROBE2(ill__send__tx__collision, \ - uintptr_t, cookie, \ - uintptr_t, idl_txl->txl_cookie); \ - UDP_STAT(us, udp_cookie_coll); \ - } else { \ - udp_connp->conn_direct_blocked = B_TRUE;\ - idl_txl->txl_cookie = cookie; \ - conn_drain_insert(udp_connp, idl_txl); \ - DTRACE_PROBE1(ill__send__tx__insert, \ - conn_t *, udp_connp); \ - } \ - mutex_exit(&idl_txl->txl_lock); \ - } \ - } else { \ - putnext((ire)->ire_stq, mp); \ - } \ -} - +/* This macro is used by the mac layer */ #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ (DB_TYPE(mp) != M_DATA || DB_REF(mp) != 1 || !OK_32PTR(ipha) || \ (((uchar_t *)ipha + IP_SIMPLE_HDR_LENGTH) >= (mp)->b_wptr)) @@ -520,13 +174,11 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; netstackid_to_zoneid((ipst)->ips_netstack->netstack_stackid) : \ (zoneid)) -extern int ip_wput_frag_mdt_min; -extern boolean_t ip_can_frag_mdt(mblk_t *, ssize_t, ssize_t); -extern mblk_t *ip_prepend_zoneid(mblk_t *, zoneid_t, ip_stack_t *); extern void ill_flow_enable(void *, ip_mac_tx_cookie_t); -extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_stack_t *, zoneid_t); +extern zoneid_t ip_get_zoneid_v4(ipaddr_t, mblk_t *, ip_recv_attr_t *, + zoneid_t); extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, - ip_stack_t *, zoneid_t); + ip_recv_attr_t *, zoneid_t); /* * flag passed in by IP based protocols to get a 
private ip stream with @@ -542,8 +194,6 @@ extern zoneid_t ip_get_zoneid_v6(in6_addr_t *, mblk_t *, const ill_t *, #define DEV_IP "/devices/pseudo/ip@0:ip" #define DEV_IP6 "/devices/pseudo/ip6@0:ip6" -extern struct kmem_cache *ip_helper_stream_cache; - #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip_ire.h b/usr/src/uts/common/inet/ip_ire.h index f4882f7640..d4dfd9c97e 100644 --- a/usr/src/uts/common/inet/ip_ire.h +++ b/usr/src/uts/common/inet/ip_ire.h @@ -68,106 +68,26 @@ extern "C" { ((addr).s6_addr8[14] & (mask).s6_addr8[14]) ^ \ ((addr).s6_addr8[15] & (mask).s6_addr8[15])) & ((table_size) - 1)) +#define IRE_HIDDEN_TYPE(ire_type) ((ire_type) & \ + (IRE_HOST | IRE_PREFIX | IRE_DEFAULT | IRE_IF_ALL | IRE_BROADCAST)) + /* * match parameter definitions for IRE lookup routines. */ #define MATCH_IRE_DSTONLY 0x0000 /* Match just the address */ #define MATCH_IRE_TYPE 0x0001 /* Match IRE type */ -#define MATCH_IRE_SRC 0x0002 /* Match IRE source address */ -#define MATCH_IRE_MASK 0x0004 /* Match IRE mask */ -#define MATCH_IRE_WQ 0x0008 /* Match IRE ire_stq to write queue */ -#define MATCH_IRE_GW 0x0010 /* Match IRE gateway */ -#define MATCH_IRE_IPIF 0x0020 /* Match IRE ipif */ -#define MATCH_IRE_RECURSIVE 0x0040 /* Do recursive lookup if necessary */ -#define MATCH_IRE_DEFAULT 0x0080 /* Return default route if no route */ - /* found. */ -#define MATCH_IRE_RJ_BHOLE 0x0100 /* During lookup if we hit an ire */ - /* with RTF_REJECT or RTF_BLACKHOLE, */ - /* return the ire. No recursive */ - /* lookup should be done. */ -#define MATCH_IRE_IHANDLE 0x0200 /* Match IRE on ihandle */ -#define MATCH_IRE_MARK_TESTHIDDEN 0x0400 /* Match IRE_MARK_TESTHIDDEN IREs */ - -/* - * MATCH_IRE_PARENT is used whenever we unconditionally want to get the - * parent IRE (sire) while recursively searching IREs for an offsubnet - * destination. 
With this flag, even if no IRE_CACHETABLE or IRE_INTERFACE - * is found to help resolving IRE_OFFSUBNET in lookup routines, the - * IRE_OFFSUBNET sire, if any, is returned to the caller. - */ -/* UNUSED 0x0800 */ -#define MATCH_IRE_ILL 0x1000 /* Match IRE on the ill */ - -#define MATCH_IRE_PARENT 0x2000 /* Match parent ire, if any, */ - /* even if ire is not matched. */ -#define MATCH_IRE_ZONEONLY 0x4000 /* Match IREs in specified zone, ie */ +#define MATCH_IRE_MASK 0x0002 /* Match IRE mask */ +#define MATCH_IRE_SHORTERMASK 0x0004 /* A mask shorter than the argument */ +#define MATCH_IRE_GW 0x0008 /* Match IRE gateway */ +#define MATCH_IRE_ILL 0x0010 /* Match IRE on the ill */ +#define MATCH_IRE_ZONEONLY 0x0020 /* Match IREs in specified zone, ie */ /* don't match IRE_LOCALs from other */ /* zones or shared IREs */ -#define MATCH_IRE_MARK_PRIVATE_ADDR 0x8000 /* Match IRE ire_marks with */ - /* IRE_MARK_PRIVATE_ADDR. */ -#define MATCH_IRE_SECATTR 0x10000 /* Match gateway security attributes */ -#define MATCH_IRE_COMPLETE 0x20000 /* ire_ftable_lookup() can return */ - /* IRE_CACHE entry only if it is */ - /* ND_REACHABLE */ +#define MATCH_IRE_SECATTR 0x0040 /* Match gateway security attributes */ +#define MATCH_IRE_TESTHIDDEN 0x0080 /* Match ire_testhidden IREs */ -/* - * Any ire to nce association is long term, and - * the refhold and refrele may be done by different - * threads. So all cases of making or breaking ire to - * nce association should all effectively use the NOTR variants. - * To understand the *effectively* part read on. - * - * ndp_lookup() and ndp_add_v4()/ndp_add_v6() implicitly do - * NCE_REFHOLD. So wherever we make ire to nce association after - * calling these functions, we effectively want to end up with - * NCE_REFHOLD_NOTR. We call this macro to achieve this effect. This - * macro changes a NCE_REFHOLD to a NCE_REFHOLD_NOTR. 
The macro's - * NCE_REFRELE cancels off ndp_lookup[ndp_add]'s implicit NCE_REFHOLD, - * and what you are left with is a NCE_REFHOLD_NOTR - */ -#define NCE_REFHOLD_TO_REFHOLD_NOTR(nce) { \ - NCE_REFHOLD_NOTR(nce); \ - NCE_REFRELE(nce); \ -} - -/* - * find the next ire_t entry in the ire_next chain starting at ire - * that is not CONDEMNED. ire is set to NULL if we reach the end of the list. - * Caller must hold the ire_bucket lock. - */ +#define MAX_IRE_RECURSION 4 /* Max IREs in ire_route_recursive */ -#define IRE_FIND_NEXT_ORIGIN(ire) { \ - while ((ire) != NULL && ((ire)->ire_marks & IRE_MARK_CONDEMNED))\ - (ire) = (ire)->ire_next; \ -} - - -/* Structure for ire_cache_count() */ -typedef struct { - int icc_total; /* Total number of IRE_CACHE */ - int icc_unused; /* # off/no PMTU unused since last reclaim */ - int icc_offlink; /* # offlink without PMTU information */ - int icc_pmtu; /* # offlink with PMTU information */ - int icc_onlink; /* # onlink */ -} ire_cache_count_t; - -/* - * Structure for ire_cache_reclaim(). Each field is a fraction i.e. 1 meaning - * reclaim all, N meaning reclaim 1/Nth of all entries, 0 meaning reclaim none. - * - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. - */ -typedef struct { - int icr_unused; /* Fraction for unused since last reclaim */ - int icr_offlink; /* Fraction for offlink without PMTU info */ - int icr_pmtu; /* Fraction for offlink with PMTU info */ - int icr_onlink; /* Fraction for onlink */ - ip_stack_t *icr_ipst; /* Does not have a netstack_hold */ -} ire_cache_reclaim_t; /* * We use atomics so that we get an accurate accounting on the ires. 
@@ -176,180 +96,250 @@ typedef struct { #define BUMP_IRE_STATS(ire_stats, x) atomic_add_64(&(ire_stats).x, 1) #ifdef _KERNEL -/* - * Structure for passing args for the IRE cache lookup functions. - */ -typedef struct ire_ctable_args_s { - void *ict_addr; - void *ict_gateway; - int ict_type; - const ipif_t *ict_ipif; - zoneid_t ict_zoneid; - const ts_label_t *ict_tsl; - int ict_flags; - ip_stack_t *ict_ipst; - queue_t *ict_wq; -} ire_ctable_args_t; - struct ts_label_s; struct nce_s; +/* + * structure for passing args between ire_ftable_lookup and ire_find_best_route + */ +typedef struct ire_ftable_args_s { + in6_addr_t ift_addr_v6; + in6_addr_t ift_mask_v6; + in6_addr_t ift_gateway_v6; +#define ift_addr V4_PART_OF_V6(ift_addr_v6) +#define ift_mask V4_PART_OF_V6(ift_mask_v6) +#define ift_gateway V4_PART_OF_V6(ift_gateway_v6) + int ift_type; + const ill_t *ift_ill; + zoneid_t ift_zoneid; + const ts_label_t *ift_tsl; + int ift_flags; + ire_t *ift_best_ire; +} ire_ftable_args_t; extern ipaddr_t ip_plen_to_mask(uint_t); extern in6_addr_t *ip_plen_to_mask_v6(uint_t, in6_addr_t *); extern int ip_ire_advise(queue_t *, mblk_t *, cred_t *); extern int ip_ire_delete(queue_t *, mblk_t *, cred_t *); -extern boolean_t ip_ire_clookup_and_delete(ipaddr_t, ipif_t *, ip_stack_t *); -extern void ip_ire_clookup_and_delete_v6(const in6_addr_t *, - ip_stack_t *); - -extern void ip_ire_req(queue_t *, mblk_t *); +extern void ip_ire_reclaim(void *); extern int ip_mask_to_plen(ipaddr_t); extern int ip_mask_to_plen_v6(const in6_addr_t *); -extern ire_t *ipif_to_ire(const ipif_t *); -extern ire_t *ipif_to_ire_v6(const ipif_t *); - -extern int ire_add(ire_t **, queue_t *, mblk_t *, ipsq_func_t, boolean_t); -extern void ire_add_then_send(queue_t *, ire_t *, mblk_t *); -extern int ire_add_v6(ire_t **, queue_t *, mblk_t *, ipsq_func_t); -extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire, queue_t *q, - mblk_t *mp, ipsq_func_t func); +extern ire_t *ire_add(ire_t *); +extern ire_t 
*ire_add_v6(ire_t *); +extern int ire_atomic_start(irb_t *irb_ptr, ire_t *ire); extern void ire_atomic_end(irb_t *irb_ptr, ire_t *ire); -extern void ire_cache_count(ire_t *, char *); -extern ire_t *ire_cache_lookup(ipaddr_t, zoneid_t, - const struct ts_label_s *, ip_stack_t *); -extern ire_t *ire_cache_lookup_simple(ipaddr_t, ip_stack_t *); -extern ire_t *ire_cache_lookup_v6(const in6_addr_t *, zoneid_t, - const struct ts_label_s *, ip_stack_t *); -extern void ire_cache_reclaim(ire_t *, char *); - -extern ire_t *ire_create_mp(uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, ipaddr_t, - uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, - ip_stack_t *); -extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, - ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, - tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t **ire_check_and_create_bcast(ipif_t *, ipaddr_t, - ire_t **, int); -extern ire_t **ire_create_bcast(ipif_t *, ipaddr_t, ire_t **); -extern ire_t *ire_init(ire_t *, uchar_t *, uchar_t *, uchar_t *, uchar_t *, - uint_t *, struct nce_s *, queue_t *, queue_t *, ushort_t, ipif_t *, - ipaddr_t, uint32_t, uint32_t, uint32_t, const iulp_t *, tsol_gc_t *, - tsol_gcgrp_t *, ip_stack_t *); - -extern boolean_t ire_init_common(ire_t *, uint_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, uint32_t, uint32_t, uint32_t, uchar_t, - const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, - const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *, - tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - -extern ire_t *ire_create_mp_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, 
const in6_addr_t *, struct nce_s *, queue_t *, - queue_t *, ushort_t, ipif_t *, - const in6_addr_t *, uint32_t, uint32_t, uint_t, const iulp_t *, - tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); - +extern ire_t *ire_create(uchar_t *, uchar_t *, uchar_t *, + ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); -extern void ire_clookup_delete_cache_gw(ipaddr_t, zoneid_t, - ip_stack_t *); -extern void ire_clookup_delete_cache_gw_v6(const in6_addr_t *, zoneid_t, +extern ire_t **ire_create_bcast(ill_t *, ipaddr_t, zoneid_t, ire_t **); +extern ire_t *ire_create_if_clone(ire_t *, const in6_addr_t *, uint_t *); +extern ire_t *ire_lookup_bcast(ill_t *, ipaddr_t, zoneid_t); +extern int ire_init_v4(ire_t *, uchar_t *, uchar_t *, uchar_t *, + ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); +extern int ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t, tsol_gc_t *, ip_stack_t *); -extern ire_t *ire_ctable_lookup(ipaddr_t, ipaddr_t, int, const ipif_t *, - zoneid_t, const struct ts_label_s *, int, ip_stack_t *); +extern int ire_init_common(ire_t *, ushort_t, ill_t *, zoneid_t, uint_t, + uchar_t, tsol_gc_t *, ip_stack_t *); -extern ire_t *ire_ctable_lookup_v6(const in6_addr_t *, const in6_addr_t *, - int, const ipif_t *, zoneid_t, const struct ts_label_s *, int, - ip_stack_t *); +extern ire_t *ire_create_v6(const in6_addr_t *, const in6_addr_t *, + const in6_addr_t *, ushort_t, ill_t *, zoneid_t, uint_t, + tsol_gc_t *, ip_stack_t *); extern void ire_delete(ire_t *); -extern void ire_delete_cache_gw(ire_t *, char *); -extern void ire_delete_cache_gw_v6(ire_t *, char *); -extern void ire_delete_cache_v6(ire_t *, char *); extern void ire_delete_v6(ire_t *); -extern void ire_expire(ire_t *, char *); +/* + * ire_pref used to make sure we don't set up routing loops in the ire_dep + * chain. 
+ */ +extern int ire_pref(ire_t *); +extern boolean_t ire_dep_build(ire_t *[], uint_t [], uint_t); +extern void ire_dep_delete_if_clone(ire_t *); +extern void ire_dep_incr_generation(ire_t *); +extern void ire_dep_remove(ire_t *); +extern void ire_dep_unbuild(ire_t *[], uint_t); +extern uint_t ire_dep_validate_generations(ire_t *); +extern void ire_dep_invalidate_generations(ire_t *); +extern boolean_t ire_determine_nce_capable(ire_t *); extern void ire_flush_cache_v4(ire_t *, int); extern void ire_flush_cache_v6(ire_t *, int); +extern ire_t *ire_ftable_lookup_v4(ipaddr_t, ipaddr_t, ipaddr_t, int, + const ill_t *, zoneid_t, const struct ts_label_s *, int, uint32_t, + ip_stack_t *, uint_t *); extern ire_t *ire_ftable_lookup_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t, - uint32_t, const struct ts_label_s *, int, ip_stack_t *); - -extern ire_t *ire_ihandle_lookup_onlink(ire_t *); -extern ire_t *ire_ihandle_lookup_offlink(ire_t *, ire_t *); -extern ire_t *ire_ihandle_lookup_offlink_v6(ire_t *, ire_t *); - -extern boolean_t ire_local_same_lan(ire_t *, ire_t *); -extern boolean_t ire_local_ok_across_zones(ire_t *, zoneid_t, void *, - const struct ts_label_s *, ip_stack_t *); - -extern ire_t *ire_lookup_local(zoneid_t, ip_stack_t *); -extern ire_t *ire_lookup_local_v6(zoneid_t, ip_stack_t *); - -extern ire_t *ire_lookup_multi(ipaddr_t, zoneid_t, ip_stack_t *); -extern ire_t *ire_lookup_multi_v6(const in6_addr_t *, zoneid_t, - ip_stack_t *); - + const in6_addr_t *, int, const ill_t *, zoneid_t, + const struct ts_label_s *, int, uint32_t, ip_stack_t *, uint_t *); + +extern ire_t *ire_ftable_lookup_simple_v4(ipaddr_t, uint32_t, ip_stack_t *, + uint_t *); +extern ire_t *ire_ftable_lookup_simple_v6(const in6_addr_t *, uint32_t, + ip_stack_t *, uint_t *); + +extern boolean_t ire_gateway_ok_zone_v4(ipaddr_t, zoneid_t, ill_t *, + const ts_label_t *, ip_stack_t *, boolean_t); +extern boolean_t 
ire_gateway_ok_zone_v6(const in6_addr_t *, zoneid_t, ill_t *, + const ts_label_t *, ip_stack_t *, boolean_t); + +extern ire_t *ire_alt_local(ire_t *, zoneid_t, const ts_label_t *, + const ill_t *, uint_t *); + +extern ill_t *ire_lookup_multi_ill_v4(ipaddr_t, zoneid_t, ip_stack_t *, + boolean_t *, ipaddr_t *); +extern ill_t *ire_lookup_multi_ill_v6(const in6_addr_t *, zoneid_t, + ip_stack_t *, boolean_t *, in6_addr_t *); + +extern ire_t *ire_nexthop(ire_t *); +extern ill_t *ire_nexthop_ill(ire_t *); +extern ill_t *ire_nce_ill(ire_t *); + +extern ire_t *ire_reject(ip_stack_t *, boolean_t); +extern ire_t *ire_blackhole(ip_stack_t *, boolean_t); +extern ire_t *ire_multicast(ill_t *); + +/* The different ire_recvfn functions */ +extern void ire_recv_forward_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noroute_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_broadcast_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multicast_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multirt_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_loopback_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_local_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noaccept_v4(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); + +extern void ire_recv_forward_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_noroute_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multicast_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_multirt_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_loopback_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); +extern void ire_recv_local_v6(ire_t *, mblk_t *, void *, ip_recv_attr_t *); +extern void ire_recv_noaccept_v6(ire_t *, mblk_t *, void *, + ip_recv_attr_t *); + +extern void irb_refhold(irb_t *); +extern 
void irb_refhold_locked(irb_t *); +extern void irb_refrele(irb_t *); +extern void irb_increment_generation(irb_t *); + +extern void ire_refhold(ire_t *); +extern void ire_refhold_notr(ire_t *); +extern void ire_refhold_locked(ire_t *); extern void ire_refrele(ire_t *); extern void ire_refrele_notr(ire_t *); -extern ire_t *ire_route_lookup(ipaddr_t, ipaddr_t, ipaddr_t, int, - const ipif_t *, ire_t **, zoneid_t, const struct ts_label_s *, int, - ip_stack_t *); - -extern ire_t *ire_route_lookup_v6(const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, int, const ipif_t *, ire_t **, zoneid_t, - const struct ts_label_s *, int, ip_stack_t *); - -extern ill_t *ire_to_ill(const ire_t *); +extern void ire_make_condemned(ire_t *); +extern boolean_t ire_no_good(ire_t *); +extern nce_t *ire_handle_condemned_nce(nce_t *, ire_t *, ipha_t *, ip6_t *, + boolean_t); + +extern ire_t *ire_round_robin(irb_t *, ire_ftable_args_t *, uint_t, + ire_t *, ip_stack_t *); + +extern ire_t *ire_route_recursive_v4(ipaddr_t, uint_t, const ill_t *, + zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, ip_stack_t *, + ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_v6(const in6_addr_t *, uint_t, + const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, + ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_dstonly_v4(ipaddr_t, boolean_t, + uint32_t, ip_stack_t *); +extern ire_t *ire_route_recursive_dstonly_v6(const in6_addr_t *, boolean_t, + uint32_t, ip_stack_t *); +extern ire_t *ire_route_recursive_impl_v4(ire_t *ire, ipaddr_t, uint_t, + const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, uint32_t, + ip_stack_t *, ipaddr_t *, tsol_ire_gw_secattr_t **, uint_t *); +extern ire_t *ire_route_recursive_impl_v6(ire_t *ire, const in6_addr_t *, + uint_t, const ill_t *, zoneid_t, const ts_label_t *, uint_t, boolean_t, + uint32_t, ip_stack_t *, in6_addr_t *, tsol_ire_gw_secattr_t 
**, uint_t *); + +/* The different ire_sendfn functions */ +extern int ire_send_local_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multirt_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_noroute_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multicast_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_broadcast_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_wire_v4(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_local_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multirt_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_noroute_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_multicast_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); +extern int ire_send_wire_v6(ire_t *, mblk_t *, void *, + ip_xmit_attr_t *, uint32_t *); + +extern nce_t *ire_to_nce_pkt(ire_t *, mblk_t *); +extern nce_t *ire_to_nce(ire_t *, ipaddr_t, const in6_addr_t *); + +/* Different ire_postfragfn functions */ +extern int ip_xmit(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_loopcheck(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_multirt_v4(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); +extern int ip_postfrag_multirt_v6(mblk_t *, struct nce_s *, + iaflags_t, uint_t, uint32_t, zoneid_t, zoneid_t, uintptr_t *); + +extern void ip_postfrag_loopback(mblk_t *, struct nce_s *, + iaflags_t, uint_t, zoneid_t); +extern int ire_revalidate_nce(ire_t *); + +extern ire_t *ip_select_route_pkt(mblk_t *, ip_xmit_attr_t *, + uint_t *, int *, boolean_t *); +extern ire_t *ip_select_route(const in6_addr_t 
*, ip_xmit_attr_t *, + uint_t *, in6_addr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v4(ipaddr_t, ip_xmit_attr_t *, + uint_t *, ipaddr_t *, int *, boolean_t *); +extern ire_t *ip_select_route_v6(const in6_addr_t *, ip_xmit_attr_t *, + uint_t *, in6_addr_t *, int *, boolean_t *); extern void ire_walk(pfv_t, void *, ip_stack_t *); extern void ire_walk_ill(uint_t, uint_t, pfv_t, void *, ill_t *); -extern void ire_walk_ill_v4(uint_t, uint_t, pfv_t, void *, ill_t *); -extern void ire_walk_ill_v6(uint_t, uint_t, pfv_t, void *, ill_t *); extern void ire_walk_v4(pfv_t, void *, zoneid_t, ip_stack_t *); extern void ire_walk_ill_tables(uint_t match_flags, uint_t ire_type, pfv_t func, void *arg, size_t ftbl_sz, size_t htbl_sz, - irb_t **ipftbl, size_t ctbl_sz, irb_t *ipctbl, ill_t *ill, + irb_t **ipftbl, ill_t *ill, zoneid_t zoneid, ip_stack_t *); extern void ire_walk_v6(pfv_t, void *, zoneid_t, ip_stack_t *); -extern boolean_t ire_multirt_lookup(ire_t **, ire_t **, uint32_t, int *, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_need_resolve(ipaddr_t, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_lookup_v6(ire_t **, ire_t **, uint32_t, - const struct ts_label_s *, ip_stack_t *); -extern boolean_t ire_multirt_need_resolve_v6(const in6_addr_t *, - const struct ts_label_s *, ip_stack_t *); - -extern ire_t *ipif_lookup_multi_ire(ipif_t *, ipaddr_t); -extern ire_t *ipif_lookup_multi_ire_v6(ipif_t *, const in6_addr_t *); - -extern ire_t *ire_get_next_bcast_ire(ire_t *, ire_t *); -extern ire_t *ire_get_next_default_ire(ire_t *, ire_t *); - -extern void ire_arpresolve(ire_t *); -extern void ire_freemblk(ire_t *); extern boolean_t ire_match_args(ire_t *, ipaddr_t, ipaddr_t, ipaddr_t, - int, const ipif_t *, zoneid_t, uint32_t, const struct ts_label_s *, int, - queue_t *); -extern int ire_nce_init(ire_t *, struct nce_s *); + int, const ill_t *, zoneid_t, const struct ts_label_s *, int); +extern boolean_t 
ire_match_args_v6(ire_t *, const in6_addr_t *, + const in6_addr_t *, const in6_addr_t *, int, const ill_t *, zoneid_t, + const ts_label_t *, int); + +extern struct nce_s *arp_nce_init(ill_t *, in_addr_t, int); extern boolean_t ire_walk_ill_match(uint_t, uint_t, ire_t *, ill_t *, zoneid_t, ip_stack_t *); -extern ire_t *ire_arpresolve_lookup(ipaddr_t, ipaddr_t, ipif_t *, zoneid_t, - ip_stack_t *, queue_t *); +extern void ire_increment_generation(ire_t *); +extern void ire_increment_multicast_generation(ip_stack_t *, boolean_t); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_multi.h b/usr/src/uts/common/inet/ip_multi.h index 7dee133967..c41ef99e3e 100644 --- a/usr/src/uts/common/inet/ip_multi.h +++ b/usr/src/uts/common/inet/ip_multi.h @@ -49,18 +49,9 @@ typedef enum { } ilg_stat_t; /* - * Flags shared via ips_mrt_flags, used by mcast_restart_timers_thread(). - */ -typedef enum { - IP_MRT_STOP = 0x1, /* request to stop thread */ - IP_MRT_DONE = 0x2, /* indication that thread is stopped */ - IP_MRT_RUN = 0x4 /* request to restart timers */ -} ip_mrt_flags_t; - -/* * Extern functions */ -extern mblk_t *igmp_input(queue_t *, mblk_t *, ill_t *); +extern mblk_t *igmp_input(mblk_t *, ip_recv_attr_t *); extern void igmp_joingroup(ilm_t *); extern void igmp_leavegroup(ilm_t *); extern void igmp_slowtimo(void *); @@ -73,85 +64,64 @@ extern void mld_statechange(ilm_t *, mcast_record_t, slist_t *); extern void mld_slowtimo(void *); extern void ilg_delete_all(conn_t *connp); -extern ilg_t *ilg_lookup_ill_v6(conn_t *, const in6_addr_t *, - ill_t *); -extern ilg_t *ilg_lookup_ill_withsrc(conn_t *, ipaddr_t, ipaddr_t, - ill_t *); -extern ilg_t *ilg_lookup_ill_withsrc_v6(conn_t *, const in6_addr_t *, - const in6_addr_t *, ill_t *); +extern boolean_t conn_hasmembers_ill_withsrc_v4(conn_t *, ipaddr_t, + ipaddr_t, ill_t *); +extern boolean_t conn_hasmembers_ill_withsrc_v6(conn_t *, + const in6_addr_t *, const in6_addr_t *, ill_t *); extern void ill_leave_multicast(ill_t *); 
extern void ill_recover_multicast(ill_t *); -extern int ip_get_dlpi_mbcast(ill_t *, mblk_t *); - -extern void ilm_free(ipif_t *); -extern ilm_t *ilm_lookup_ill(ill_t *, ipaddr_t, zoneid_t); -extern ilm_t *ilm_lookup_ill_v6(ill_t *, const in6_addr_t *, - boolean_t, zoneid_t); -extern ilm_t *ilm_lookup_ipif(ipif_t *, ipaddr_t); - -extern int ilm_numentries_v6(ill_t *, const in6_addr_t *); -extern int ilm_walk_ipif(ipif_t *); -extern int ilm_walk_ill(ill_t *); -extern void ilm_walker_cleanup(ill_t *); -extern int ip_ll_send_disabmulti_req(ill_t *, const in6_addr_t *); -extern int ip_ll_send_enabmulti_req(ill_t *, const in6_addr_t *); - -extern int ip_addmulti(ipaddr_t, ipif_t *, ilg_stat_t, - mcast_record_t, slist_t *); -extern int ip_addmulti_v6(const in6_addr_t *, ill_t *, - zoneid_t, ilg_stat_t, mcast_record_t, slist_t *); -extern int ip_delmulti(ipaddr_t, ipif_t *, boolean_t, boolean_t); -extern int ip_delmulti_v6(const in6_addr_t *, ill_t *, - zoneid_t, boolean_t, boolean_t); +extern void ip_dlur_to_mhi(ill_t *, mblk_t *, + struct mac_header_info_s *); + +/* These make up the data path interface used by ip_output and ip_input */ +extern boolean_t ill_hasmembers_v4(ill_t *, ipaddr_t); +extern boolean_t ill_hasmembers_v6(ill_t *, const in6_addr_t *); +extern boolean_t ill_hasmembers_otherzones_v4(ill_t *, ipaddr_t, + zoneid_t); +extern boolean_t ill_hasmembers_otherzones_v6(ill_t *, + const in6_addr_t *, zoneid_t); +extern zoneid_t ill_hasmembers_nextzone_v4(ill_t *, ipaddr_t, zoneid_t); +extern zoneid_t ill_hasmembers_nextzone_v6(ill_t *, const in6_addr_t *, + zoneid_t); + +extern ilm_t *ip_addmulti(const in6_addr_t *, ill_t *, zoneid_t, + int *); +extern int ip_delmulti(ilm_t *); +extern int ip_mforward(mblk_t *, ip_recv_attr_t *); +extern void ip_mroute_decap(mblk_t *, ip_recv_attr_t *); extern int ill_join_allmulti(ill_t *); extern void ill_leave_allmulti(ill_t *); extern int ip_join_allmulti(uint_t, boolean_t, ip_stack_t *); extern int 
ip_leave_allmulti(uint_t, boolean_t, ip_stack_t *); extern void ip_purge_allmulti(ill_t *); -extern void ip_multicast_loopback(queue_t *, ill_t *, mblk_t *, - int, zoneid_t); -extern int ip_mforward(ill_t *, ipha_t *, mblk_t *); -extern void ip_mroute_decap(queue_t *, mblk_t *, ill_t *); extern int ip_mroute_mrt(mblk_t *, ip_stack_t *); extern int ip_mroute_stats(mblk_t *, ip_stack_t *); extern int ip_mroute_vif(mblk_t *, ip_stack_t *); -extern int ip_mrouter_done(mblk_t *, ip_stack_t *); -extern int ip_mrouter_get(int, queue_t *, uchar_t *); -extern int ip_mrouter_set(int, queue_t *, int, uchar_t *, int, - mblk_t *); +extern int ip_mrouter_done(ip_stack_t *); +extern int ip_mrouter_get(int, conn_t *, uchar_t *); +extern int ip_mrouter_set(int, conn_t *, int, uchar_t *, int); extern void ip_mrouter_stack_init(ip_stack_t *); extern void ip_mrouter_stack_destroy(ip_stack_t *); -extern int ip_opt_add_group(conn_t *, boolean_t, ipaddr_t, - ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *); -extern int ip_opt_delete_group(conn_t *, boolean_t, ipaddr_t, - ipaddr_t, uint_t *, mcast_record_t, ipaddr_t, mblk_t *); -extern int ip_opt_add_group_v6(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *); -extern int ip_opt_delete_group_v6(conn_t *, boolean_t, - const in6_addr_t *, int, mcast_record_t, const in6_addr_t *, mblk_t *); +extern int ip_opt_add_group(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); +extern int ip_opt_delete_group(conn_t *, boolean_t, + const in6_addr_t *, ipaddr_t, uint_t, mcast_record_t, const in6_addr_t *); extern int mrt_ioctl(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); extern int ip_sioctl_msfilter(ipif_t *, sin_t *, queue_t *, mblk_t *, ip_ioctl_cmd_t *, void *); -extern int ip_extract_msfilter(queue_t *, mblk_t *, - const ip_ioctl_cmd_t *, cmd_info_t *, ipsq_func_t); extern int ip_copyin_msfilter(queue_t *, mblk_t *); -extern void 
ip_wput_ctl(queue_t *, mblk_t *); - -extern int pim_input(queue_t *, mblk_t *, ill_t *); -extern void reset_conn_ipif(ipif_t *); -extern void reset_conn_ill(ill_t *); +extern mblk_t *pim_input(mblk_t *, ip_recv_attr_t *); +extern void update_conn_ill(ill_t *, ip_stack_t *); extern void reset_mrt_ill(ill_t *); extern void reset_mrt_vif_ipif(ipif_t *); -extern void mcast_restart_timers_thread(ip_stack_t *); +extern void igmp_start_timers(unsigned, ip_stack_t *); +extern void mld_start_timers(unsigned, ip_stack_t *); extern void ilm_inactive(ilm_t *); -extern ilm_t *ilm_walker_start(ilm_walker_t *, ill_t *); -extern ilm_t *ilm_walker_step(ilm_walker_t *, ilm_t *); -extern void ilm_walker_finish(ilm_walker_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h index c1a48b1f1a..21c907f3f3 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -35,7 +35,7 @@ /* * Internal definitions for the kernel implementation of the IPv6 - * Neighbor Discovery Protocol (NDP). + * Neighbor Discovery Protocol (NDP) and Address Resolution Protocol (ARP). */ #ifdef __cplusplus @@ -48,131 +48,149 @@ extern "C" { * callbacks set up with ip2mac interface, waiting for result * of neighbor resolution. */ -typedef struct nce_cb_s { - list_node_t nce_cb_node; - void *nce_cb_id; - uint32_t nce_cb_flags; - ip2mac_callback_t *nce_cb_func; - void *nce_cb_arg; -} nce_cb_t; +typedef struct ncec_cb_s { + list_node_t ncec_cb_node; /* next entry in list */ + void *ncec_cb_id; + uint32_t ncec_cb_flags; + ip2mac_callback_t *ncec_cb_func; + void *ncec_cb_arg; +} ncec_cb_t; #define NCE_CB_DISPATCHED 0x00000001 /* - * NDP Cache Entry + * Core information tracking Neighbor Reachability is tracked in the + * ncec_s/ncec_t. The information contained in the ncec_t does not contain + * any link-specific details other than the pointer to the ill_t itself. + * The link-specific information is tracked in the nce_t structure. 
*/ -typedef struct nce_s { - struct nce_s *nce_next; /* Hash chain next pointer */ - struct nce_s **nce_ptpn; /* Pointer to previous next */ - struct ill_s *nce_ill; /* Associated ill */ - uint16_t nce_flags; /* See below */ - uint16_t nce_state; /* See reachability states in if.h */ - int16_t nce_pcnt; /* Probe counter */ - uint16_t nce_rcnt; /* Retransmit counter */ - in6_addr_t nce_addr; /* address of the nighbor */ - in6_addr_t nce_mask; /* If not all ones, mask allows an */ - /* entry to respond to requests for a group of addresses, for */ - /* instantance multicast addresses */ - in6_addr_t nce_extract_mask; /* For mappings */ - uint32_t nce_ll_extract_start; /* For mappings */ -#define nce_first_mp_to_free nce_fp_mp - mblk_t *nce_fp_mp; /* link layer fast path mp */ - mblk_t *nce_res_mp; /* DL_UNITDATA_REQ */ - mblk_t *nce_qd_mp; /* Head outgoing queued packets */ -#define nce_last_mp_to_free nce_qd_mp - mblk_t *nce_timer_mp; /* NDP timer mblk */ - mblk_t *nce_mp; /* mblk we are in, last to be freed */ - uint64_t nce_last; /* Time last reachable in msec */ - uint32_t nce_refcnt; /* nce active usage count */ - kmutex_t nce_lock; /* See comments on top for what */ +struct ncec_s { + struct ncec_s *ncec_next; /* Hash chain next pointer */ + struct ncec_s **ncec_ptpn; /* Pointer to previous next */ + struct ill_s *ncec_ill; /* Associated ill */ + uint16_t ncec_flags; /* See below */ + uint16_t ncec_state; /* See reachability states in if.h */ + int16_t ncec_pcnt; /* Probe counter */ + uint16_t ncec_rcnt; /* Retransmit counter */ + in6_addr_t ncec_addr; /* address of the nighbor */ + uchar_t *ncec_lladdr; + mblk_t *ncec_qd_mp; /* Head outgoing queued packets */ + uint64_t ncec_last; /* Time last reachable in msec */ + uint32_t ncec_refcnt; /* ncec active usage count */ + kmutex_t ncec_lock; /* See comments on top for what */ /* this field protects */ - int nce_unsolicit_count; /* Unsolicited Adv count */ - struct nce_s *nce_fastpath; /* for fastpath list */ - 
timeout_id_t nce_timeout_id; - uchar_t nce_ipversion; /* IPv4(ARP)/IPv6(NDP) version */ - uint_t nce_defense_count; /* number of NDP conflicts */ - uint_t nce_defense_time; /* last time defended (secs) */ - uint64_t nce_init_time; /* time when it was set to ND_INITIAL */ - boolean_t nce_trace_disable; /* True when alloc fails */ - list_t nce_cb; - uint_t nce_cb_walker_cnt; + int ncec_unsolicit_count; /* Unsolicited Adv count */ + timeout_id_t ncec_timeout_id; + uchar_t ncec_ipversion; /* IPv4(ARP)/IPv6(NDP) version */ + uint_t ncec_defense_count; /* number of NDP conflicts */ + uint_t ncec_last_time_defended; /* last time defended (secs) */ + uint64_t ncec_init_time; /* time when it was set to ND_INITIAL */ + boolean_t ncec_trace_disable; /* True when alloc fails */ + /* + * interval to keep track of DAD probes. + */ + clock_t ncec_xmit_interval; + ip_stack_t *ncec_ipst; /* Does not have a netstack_hold */ + list_t ncec_cb; /* callbacks waiting for resolution */ + uint_t ncec_cb_walker_cnt; + uint_t ncec_nprobes; + uint_t ncec_lladdr_length; +}; + +/* + * The nce_t list hangs off the ill_s and tracks information that depends + * on the underlying physical link. Thus when the ill goes down, + * the nce_t list has to be flushed. This is done as part of ill_delete() + * + * When the fastpath ack comes back in ill_fastpath_ack we call + * nce_fastpath_update to update the nce_t. We never actually + * flush the fastpath list, which is kept as an index into the + * ncec_t structures. + * + * when we ndp_delete, we remove the nce entries pointing + * at the dying ncec from the ill_fastpath_list chain. 
+ * + */ +struct nce_s { + list_node_t nce_node; + ill_t *nce_ill; + boolean_t nce_is_condemned; + in6_addr_t nce_addr; + /* + * link-layer specific fields below + */ + mblk_t *nce_dlur_mp; /* DL_UNITDATA_REQ mp */ + mblk_t *nce_fp_mp; /* fast path mp */ + struct ncec_s *nce_common; + kmutex_t nce_lock; + uint32_t nce_refcnt; uint_t nce_ipif_cnt; /* number of ipifs with the nce_addr */ /* as their local address */ -} nce_t; +}; /* * The ndp_g_t structure contains protocol specific information needed * to synchronize and manage neighbor cache entries for IPv4 and IPv6. * There are 2 such structures, ips_ndp4 and ips_ndp6. * ips_ndp6 contains the data structures needed for IPv6 Neighbor Discovery. - * ips_ndp4 has IPv4 link layer info in its nce_t structures - * Note that the nce_t is not currently used as the arp cache itself; - * it is used for the following purposes: - * - queue packets in nce_qd_mp while waiting for arp resolution to complete - * - nce_{res, fp}_mp are used to track DL_UNITDATA request/responses. - * - track state of ARP resolution in the nce_state; + * ips_ndp4 contains the data structures for IPv4 ARP. * * Locking notes: * ndp_g_lock protects neighbor cache tables access and - * insertion/removal of cache entries into/from these tables. - * nce_lock protects nce_pcnt, nce_rcnt, nce_qd_mp nce_state, nce_res_mp, - * nce_refcnt, nce_last, and nce_cb_walker_cnt. - * nce_refcnt is incremented for every ire pointing to this nce and - * every time ndp_lookup() finds an nce. - * Should there be a need to obtain nce_lock and ndp_g_lock, ndp_g_lock is + * insertion/removal of cache entries into/from these tables. The ncec_lock + * and nce_lock protect fields in the ncec_t and nce_t structures. + * Should there be a need to obtain nce[c]_lock and ndp_g_lock, ndp_g_lock is * acquired first. - * To avoid becoming exclusive when deleting NCEs, ndp_walk() routine holds - * the ndp_g_lock (i.e global lock) and marks NCEs to be deleted with - * NCE_F_CONDEMNED. 
When all active users of such NCEs are gone the walk - * routine passes a list for deletion to nce_ire_delete_list(). - * - * When the link-layer address of some onlink host changes, ARP will send - * an AR_CN_ANNOUNCE message to ip so that stale neighbor-cache - * information will not get used. This message is processed in ip_arp_news() - * by walking the nce list, and updating as appropriate. The ndp_g_hw_change - * flag is set by ip_arp_news() to notify nce_t users that ip_arp_news() is - * in progress. */ typedef struct ndp_g_s { kmutex_t ndp_g_lock; /* Lock protecting cache hash table */ - nce_t *nce_mask_entries; /* mask not all ones */ - nce_t *nce_hash_tbl[NCE_TABLE_SIZE]; + ncec_t *nce_hash_tbl[NCE_TABLE_SIZE]; int ndp_g_walker; /* # of active thread walking hash list */ boolean_t ndp_g_walker_cleanup; /* true implies defer deletion. */ - int ndp_g_hw_change; /* non-zero if nce flush in progress */ } ndp_g_t; -#define NDP_HW_CHANGE_INCR(ndp) { \ - mutex_enter(&(ndp)->ndp_g_lock); \ - (ndp)->ndp_g_hw_change++; \ - mutex_exit(&(ndp)->ndp_g_lock); \ -} - -#define NDP_HW_CHANGE_DECR(ndp) { \ - mutex_enter(&(ndp)->ndp_g_lock); \ - (ndp)->ndp_g_hw_change--; \ - mutex_exit(&(ndp)->ndp_g_lock); \ -} - -/* nce_flags */ -#define NCE_F_PERMANENT 0x1 -#define NCE_F_MAPPING 0x2 +/* ncec_flags */ +#define NCE_F_MYADDR 0x1 /* ipif exists for the ncec_addr */ +#define NCE_F_UNVERIFIED 0x2 /* DAD in progress. */ #define NCE_F_ISROUTER 0x4 -/* unused 0x8 */ +#define NCE_F_FAST 0x8 + +/* + * NCE_F_NONUD is used to disable IPv6 Neighbor Unreachability Detection or + * IPv4 aging and maps to the ATF_PERM flag for arp(1m) + */ #define NCE_F_NONUD 0x10 + #define NCE_F_ANYCAST 0x20 #define NCE_F_CONDEMNED 0x40 #define NCE_F_UNSOL_ADV 0x80 #define NCE_F_BCAST 0x100 +#define NCE_F_MCAST 0x200 + +/* + * NCE_F_PUBLISH is set for all ARP/ND entries that we announce. This + * includes locally configured addresses as well as those that we proxy for. 
+ */ +#define NCE_F_PUBLISH 0x400 + +/* + * NCE_F_AUTHORITY is set for any address that we have authoritatitve + * information for. This includes locally configured addresses as well + * as statically configured arp entries that are set up using the "permanent" + * option described in arp(1m). The NCE_F_AUTHORITY asserts that we would + * reject any updates for that nce's (host, link-layer-address) information + */ +#define NCE_F_AUTHORITY 0x800 -#define NCE_EXTERNAL_FLAGS_MASK \ - (NCE_F_PERMANENT | NCE_F_MAPPING | NCE_F_ISROUTER | NCE_F_NONUD | \ - NCE_F_ANYCAST | NCE_F_UNSOL_ADV) +#define NCE_F_DELAYED 0x1000 /* rescheduled on dad_defend_rate */ +#define NCE_F_STATIC 0x2000 /* State REACHABLE, STALE, DELAY or PROBE */ -#define NCE_ISREACHABLE(nce) \ - (((((nce)->nce_state) >= ND_REACHABLE) && \ - ((nce)->nce_state) <= ND_PROBE)) +#define NCE_ISREACHABLE(ncec) \ + (((((ncec)->ncec_state) >= ND_REACHABLE) && \ + ((ncec)->ncec_state) <= ND_PROBE)) + +#define NCE_ISCONDEMNED(ncec) ((ncec)->ncec_flags & NCE_F_CONDEMNED) /* NDP flags set in SOL/ADV requests */ #define NDP_UNICAST 0x1 @@ -184,95 +202,14 @@ typedef struct ndp_g_s { /* Number of packets queued in NDP for a neighbor */ #define ND_MAX_Q 4 - -#ifdef DEBUG -#define NCE_TRACE_REF(nce) nce_trace_ref(nce) -#define NCE_UNTRACE_REF(nce) nce_untrace_ref(nce) -#else -#define NCE_TRACE_REF(nce) -#define NCE_UNTRACE_REF(nce) -#endif - -#define NCE_REFHOLD(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - (nce)->nce_refcnt++; \ - ASSERT((nce)->nce_refcnt != 0); \ - NCE_TRACE_REF(nce); \ - mutex_exit(&(nce)->nce_lock); \ -} - -#define NCE_REFHOLD_NOTR(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - (nce)->nce_refcnt++; \ - ASSERT((nce)->nce_refcnt != 0); \ - mutex_exit(&(nce)->nce_lock); \ -} - -#define NCE_REFHOLD_LOCKED(nce) { \ - ASSERT(MUTEX_HELD(&(nce)->nce_lock)); \ - (nce)->nce_refcnt++; \ - NCE_TRACE_REF(nce); \ -} - -/* nce_inactive destroys the mutex thus no mutex_exit is needed */ -#define NCE_REFRELE(nce) { \ - 
mutex_enter(&(nce)->nce_lock); \ - NCE_UNTRACE_REF(nce); \ - ASSERT((nce)->nce_refcnt != 0); \ - if (--(nce)->nce_refcnt == 0) \ - ndp_inactive(nce); \ - else { \ - mutex_exit(&(nce)->nce_lock);\ - } \ -} - -#define NCE_REFRELE_NOTR(nce) { \ - mutex_enter(&(nce)->nce_lock); \ - ASSERT((nce)->nce_refcnt != 0); \ - if (--(nce)->nce_refcnt == 0) \ - ndp_inactive(nce); \ - else { \ - mutex_exit(&(nce)->nce_lock);\ - } \ -} - -#define NDP_RESTART_TIMER(nce, ms) { \ - ASSERT(!MUTEX_HELD(&(nce)->nce_lock)); \ - if ((nce)->nce_timeout_id != 0) { \ - /* Ok to untimeout bad id. we don't hold a lock. */ \ - (void) untimeout((nce)->nce_timeout_id); \ - } \ - mutex_enter(&(nce)->nce_lock); \ - /* Don't start the timer if the nce has been deleted */ \ - if (!((nce)->nce_flags & NCE_F_CONDEMNED)) \ - nce->nce_timeout_id = timeout(ndp_timer, nce, \ - MSEC_TO_TICK(ms) == 0 ? 1 : MSEC_TO_TICK(ms)); \ - mutex_exit(&(nce)->nce_lock); \ -} - -/* Structure for ndp_cache_count() */ -typedef struct { - int ncc_total; /* Total number of NCEs */ - int ncc_host; /* NCE entries without R bit set */ -} ncc_cache_count_t; - -/* - * Structure of ndp_cache_reclaim(). Each field is a fraction i.e. 1 means - * reclaim all, N means reclaim 1/Nth of all entries, 0 means reclaim none. - */ -typedef struct { - int ncr_host; /* Fraction for host entries */ -} nce_cache_reclaim_t; - /* - * Structure for nce_delete_hw_changed; specifies an IPv4 address to link-layer - * address mapping. Any route that has a cached copy of a mapping for that - * IPv4 address that doesn't match the given mapping must be purged. 
+ * Structure for nce_update_hw_changed; */ typedef struct { ipaddr_t hwm_addr; /* IPv4 address */ - uint_t hwm_hwlen; /* Length of hardware address (may be 0) */ + uint_t hwm_hwlen; /* Length of hardware address (may be 0) */ uchar_t *hwm_hwaddr; /* Pointer to new hardware address, if any */ + int hwm_flags; } nce_hw_map_t; /* When SAP is greater than zero address appears before SAP */ @@ -284,6 +221,15 @@ typedef struct { ((sizeof (dl_unitdata_req_t)) + ((ill)->ill_phys_addr_length)) : \ (sizeof (dl_unitdata_req_t))) +#define NCE_MYADDR(ncec) (((ncec)->ncec_flags & NCE_F_MYADDR) != 0) + +/* + * NCE_PUBLISH() identifies the addresses that we are publishing. This + * includes locally configured address (NCE_MYADDR()) as well as those that + * we are proxying. + */ +#define NCE_PUBLISH(ncec) ((ncec->ncec_flags & NCE_F_PUBLISH) != 0) + #ifdef _BIG_ENDIAN #define NCE_LL_SAP_COPY(ill, mp) \ { \ @@ -327,55 +273,65 @@ typedef struct { /* NDP Cache Entry Hash Table */ #define NCE_TABLE_SIZE 256 -extern void ndp_cache_count(nce_t *, char *); -extern void ndp_cache_reclaim(nce_t *, char *); -extern void ndp_delete(nce_t *); -extern void ndp_delete_per_ill(nce_t *, uchar_t *); -extern void ndp_fastpath_flush(nce_t *, char *); -extern boolean_t ndp_fastpath_update(nce_t *, void *); +extern void ip_nce_reclaim(void *); +extern void ncec_delete(ncec_t *); +extern void ncec_delete_per_ill(ncec_t *, uchar_t *); +extern void nce_fastpath_update(ill_t *, mblk_t *); extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); -extern void ndp_inactive(nce_t *); -extern void ndp_input(ill_t *, mblk_t *, mblk_t *); -extern boolean_t ndp_lookup_ipaddr(in_addr_t, netstack_t *); -extern nce_t *ndp_lookup_v6(ill_t *, boolean_t, const in6_addr_t *, - boolean_t); -extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); -extern int ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, +extern void ncec_inactive(ncec_t *); +extern void ndp_input(mblk_t *, ip_recv_attr_t 
*); +extern ncec_t *ncec_lookup_illgrp_v6(ill_t *, const in6_addr_t *); +extern ncec_t *ncec_lookup_illgrp_v4(ill_t *, const in_addr_t *); +extern nce_t *nce_lookup_v4(ill_t *, const in_addr_t *); +extern nce_t *nce_lookup_v6(ill_t *, const in6_addr_t *); +extern void nce_make_unreachable(ncec_t *); +extern mblk_t *ndp_mcastreq(ill_t *, const in6_addr_t *, uint32_t, uint32_t, mblk_t *); -extern int ndp_noresolver(ill_t *, const in6_addr_t *); -extern void ndp_process(nce_t *, uchar_t *, uint32_t, boolean_t); +extern nce_t *ndp_nce_init(ill_t *, const in6_addr_t *, int); +extern void nce_process(ncec_t *, uchar_t *, uint32_t, boolean_t); extern int ndp_query(ill_t *, lif_nd_req_t *); -extern int ndp_resolver(ill_t *, const in6_addr_t *, mblk_t *, zoneid_t); extern int ndp_sioc_update(ill_t *, lif_nd_req_t *); extern boolean_t ndp_verify_optlen(nd_opt_hdr_t *, int); -extern void ndp_timer(void *); -extern void ndp_walk(ill_t *, pfi_t, void *, ip_stack_t *); -extern void ndp_walk_common(ndp_g_t *, ill_t *, pfi_t, +extern void nce_timer(void *); +extern void ncec_walk(ill_t *, pfi_t, void *, ip_stack_t *); +extern void ncec_walk_common(ndp_g_t *, ill_t *, pfi_t, void *, boolean_t); -extern boolean_t ndp_restart_dad(nce_t *); -extern void ndp_do_recovery(ipif_t *); -extern void nce_resolv_failed(nce_t *); -extern void arp_resolv_failed(nce_t *); -extern void nce_fastpath_list_add(nce_t *); -extern void nce_fastpath_list_delete(nce_t *); -extern void nce_fastpath_list_dispatch(ill_t *, - boolean_t (*)(nce_t *, void *), void *); -extern void nce_queue_mp_common(nce_t *, mblk_t *, boolean_t); -extern void nce_delete_hw_changed(nce_t *, void *); -extern void nce_fastpath(nce_t *); -extern int ndp_add_v6(ill_t *, uchar_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint32_t, uint16_t, uint16_t, - nce_t **); -extern int ndp_lookup_then_add_v6(ill_t *, boolean_t, uchar_t *, - const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, uint32_t, - uint16_t, 
uint16_t, nce_t **); -extern int ndp_lookup_then_add_v4(ill_t *, - const in_addr_t *, uint16_t, nce_t **, nce_t *); -extern void ip_ndp_resolve(nce_t *); +extern boolean_t nce_restart_dad(ncec_t *); +extern void ndp_resolv_failed(ncec_t *); +extern void arp_resolv_failed(ncec_t *); +extern void nce_fastpath_list_delete(ill_t *, ncec_t *, list_t *); +extern void nce_queue_mp(ncec_t *, mblk_t *, boolean_t); +extern void nce_update_hw_changed(ncec_t *, void *); +extern int nce_lookup_then_add_v6(ill_t *, uchar_t *, uint_t, + const in6_addr_t *, uint16_t, uint16_t, nce_t **); +extern int nce_lookup_then_add_v4(ill_t *, uchar_t *, uint_t, + const in_addr_t *, uint16_t, uint16_t, nce_t **); +extern boolean_t nce_cmp_ll_addr(const ncec_t *, const uchar_t *, uint32_t); +extern void nce_update(ncec_t *, uint16_t, uchar_t *); +extern nce_t *nce_lookup_mapping(ill_t *, const in6_addr_t *); + +extern void nce_restart_timer(ncec_t *, uint_t); +extern void ncec_refrele(ncec_t *); +extern void ncec_refhold(ncec_t *); +extern void ncec_refrele_notr(ncec_t *); +extern void ncec_refhold_notr(ncec_t *); +extern void nce_resolv_ok(ncec_t *); +extern uint32_t ndp_solicit(ncec_t *, in6_addr_t, ill_t *); +extern boolean_t ip_nce_conflict(mblk_t *, ip_recv_attr_t *, ncec_t *); +extern boolean_t ndp_announce(ncec_t *); +extern void ip_nce_lookup_and_update(ipaddr_t *, ipif_t *, ip_stack_t *, + uchar_t *, int, int); +extern void nce_refrele(nce_t *); +extern void nce_refhold(nce_t *); +extern void nce_delete(nce_t *); +extern void nce_flush(ill_t *, boolean_t); +extern void nce_walk(ill_t *, pfi_t, void *); +extern void ip_ndp_resolve(struct ncec_s *); +extern void ip_addr_recover(ipsq_t *, queue_t *, mblk_t *, void *); #ifdef DEBUG -extern void nce_trace_ref(nce_t *); -extern void nce_untrace_ref(nce_t *); +extern void nce_trace_ref(ncec_t *); +extern void nce_untrace_ref(ncec_t *); #endif #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_netinfo.h 
b/usr/src/uts/common/inet/ip_netinfo.h index b34cf0751e..a496248e23 100644 --- a/usr/src/uts/common/inet/ip_netinfo.h +++ b/usr/src/uts/common/inet/ip_netinfo.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,10 +41,13 @@ extern void ip_net_init(ip_stack_t *, netstack_t *); extern void ip_net_destroy(ip_stack_t *); extern void ipv4_hook_init(ip_stack_t *); extern void ipv6_hook_init(ip_stack_t *); +extern void arp_hook_init(ip_stack_t *); extern void ipv4_hook_destroy(ip_stack_t *); extern void ipv6_hook_destroy(ip_stack_t *); +extern void arp_hook_destroy(ip_stack_t *); extern void ipv4_hook_shutdown(ip_stack_t *); extern void ipv6_hook_shutdown(ip_stack_t *); +extern void arp_hook_shutdown(ip_stack_t *); extern void ip_ne_queue_func(void *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_rts.h b/usr/src/uts/common/inet/ip_rts.h index 61bc451995..f5cbedd370 100644 --- a/usr/src/uts/common/inet/ip_rts.h +++ b/usr/src/uts/common/inet/ip_rts.h @@ -48,7 +48,8 @@ extern "C" { #ifdef _KERNEL extern void ip_rts_change(int, ipaddr_t, ipaddr_t, - ipaddr_t, ipaddr_t, ipaddr_t, int, int, int, ip_stack_t *); + ipaddr_t, ipaddr_t, ipaddr_t, int, int, + int, ip_stack_t *); extern void ip_rts_change_v6(int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, int, int, int, @@ -74,15 +75,17 @@ extern size_t rts_data_msg_size(int, sa_family_t, uint_t); extern void rts_fill_msg_v6(int, int, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, const ipif_t *, mblk_t *, - uint_t, const tsol_gc_t *); + const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, + const ill_t *, mblk_t *, const tsol_gc_t *); extern size_t rts_header_msg_size(int); +extern void rts_merge_metrics(iulp_t *, const iulp_t *); + extern 
void rts_queue_input(mblk_t *, conn_t *, sa_family_t, uint_t, ip_stack_t *); -extern int ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *, cred_t *); +extern int ip_rts_request_common(mblk_t *mp, conn_t *, cred_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index b5d9715c65..d2f6c07234 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -38,6 +38,7 @@ extern "C" { #ifdef _KERNEL #include <sys/list.h> + /* * IP statistics. */ @@ -46,52 +47,45 @@ extern "C" { ((ipst)->ips_ip_statistics.x.value.ui64 += (n)) typedef struct ip_stat { - kstat_named_t ipsec_fanout_proto; kstat_named_t ip_udp_fannorm; kstat_named_t ip_udp_fanmb; - kstat_named_t ip_udp_fanothers; - kstat_named_t ip_udp_fast_path; - kstat_named_t ip_udp_slow_path; - kstat_named_t ip_udp_input_err; - kstat_named_t ip_tcppullup; - kstat_named_t ip_tcpoptions; - kstat_named_t ip_multipkttcp; - kstat_named_t ip_tcp_fast_path; - kstat_named_t ip_tcp_slow_path; - kstat_named_t ip_tcp_input_error; + kstat_named_t ip_recv_pullup; kstat_named_t ip_db_ref; - kstat_named_t ip_notaligned1; - kstat_named_t ip_notaligned2; - kstat_named_t ip_multimblk3; - kstat_named_t ip_multimblk4; - kstat_named_t ip_ipoptions; - kstat_named_t ip_classify_fail; + kstat_named_t ip_notaligned; + kstat_named_t ip_multimblk; kstat_named_t ip_opt; - kstat_named_t ip_udp_rput_local; kstat_named_t ipsec_proto_ahesp; kstat_named_t ip_conn_flputbq; kstat_named_t ip_conn_walk_drain; kstat_named_t ip_out_sw_cksum; + kstat_named_t ip_out_sw_cksum_bytes; kstat_named_t ip_in_sw_cksum; - kstat_named_t ip_trash_ire_reclaim_calls; - kstat_named_t ip_trash_ire_reclaim_success; - kstat_named_t ip_ire_arp_timer_expired; - kstat_named_t ip_ire_redirect_timer_expired; - kstat_named_t ip_ire_pmtu_timer_expired; - kstat_named_t ip_input_multi_squeue; + kstat_named_t ip_ire_reclaim_calls; + kstat_named_t ip_ire_reclaim_deleted; + kstat_named_t 
ip_nce_reclaim_calls; + kstat_named_t ip_nce_reclaim_deleted; + kstat_named_t ip_dce_reclaim_calls; + kstat_named_t ip_dce_reclaim_deleted; kstat_named_t ip_tcp_in_full_hw_cksum_err; kstat_named_t ip_tcp_in_part_hw_cksum_err; kstat_named_t ip_tcp_in_sw_cksum_err; - kstat_named_t ip_tcp_out_sw_cksum_bytes; kstat_named_t ip_udp_in_full_hw_cksum_err; kstat_named_t ip_udp_in_part_hw_cksum_err; kstat_named_t ip_udp_in_sw_cksum_err; - kstat_named_t ip_udp_out_sw_cksum_bytes; - kstat_named_t ip_frag_mdt_pkt_out; - kstat_named_t ip_frag_mdt_discarded; - kstat_named_t ip_frag_mdt_allocfail; - kstat_named_t ip_frag_mdt_addpdescfail; - kstat_named_t ip_frag_mdt_allocd; + kstat_named_t conn_in_recvdstaddr; + kstat_named_t conn_in_recvopts; + kstat_named_t conn_in_recvif; + kstat_named_t conn_in_recvslla; + kstat_named_t conn_in_recvucred; + kstat_named_t conn_in_recvttl; + kstat_named_t conn_in_recvhopopts; + kstat_named_t conn_in_recvhoplimit; + kstat_named_t conn_in_recvdstopts; + kstat_named_t conn_in_recvrthdrdstopts; + kstat_named_t conn_in_recvrthdr; + kstat_named_t conn_in_recvpktinfo; + kstat_named_t conn_in_recvtclass; + kstat_named_t conn_in_timestamp; } ip_stat_t; @@ -103,20 +97,22 @@ typedef struct ip_stat { ((ipst)->ips_ip6_statistics.x.value.ui64 += (n)) typedef struct ip6_stat { - kstat_named_t ip6_udp_fast_path; - kstat_named_t ip6_udp_slow_path; kstat_named_t ip6_udp_fannorm; kstat_named_t ip6_udp_fanmb; + kstat_named_t ip6_recv_pullup; + kstat_named_t ip6_db_ref; + kstat_named_t ip6_notaligned; + kstat_named_t ip6_multimblk; + kstat_named_t ipsec_proto_ahesp; kstat_named_t ip6_out_sw_cksum; + kstat_named_t ip6_out_sw_cksum_bytes; kstat_named_t ip6_in_sw_cksum; kstat_named_t ip6_tcp_in_full_hw_cksum_err; kstat_named_t ip6_tcp_in_part_hw_cksum_err; kstat_named_t ip6_tcp_in_sw_cksum_err; - kstat_named_t ip6_tcp_out_sw_cksum_bytes; kstat_named_t ip6_udp_in_full_hw_cksum_err; kstat_named_t ip6_udp_in_part_hw_cksum_err; kstat_named_t ip6_udp_in_sw_cksum_err; - 
kstat_named_t ip6_udp_out_sw_cksum_bytes; kstat_named_t ip6_frag_mdt_pkt_out; kstat_named_t ip6_frag_mdt_discarded; kstat_named_t ip6_frag_mdt_allocfail; @@ -150,6 +146,8 @@ typedef struct srcid_map { struct ip_stack { netstack_t *ips_netstack; /* Common netstack */ + uint_t ips_src_generation; /* Both IPv4 and IPv6 */ + struct ipparam_s *ips_param_arr; /* ndd variable table */ struct ipndp_s *ips_ndp_arr; @@ -178,10 +176,6 @@ struct ip_stack { kmutex_t ips_ip_mi_lock; kmutex_t ips_ip_addr_avail_lock; krwlock_t ips_ill_g_lock; - krwlock_t ips_ipsec_capab_ills_lock; - /* protects the list of IPsec capable ills */ - struct ipsec_capab_ill_s *ips_ipsec_capab_ills_ah; - struct ipsec_capab_ill_s *ips_ipsec_capab_ills_esp; krwlock_t ips_ill_g_usesrc_lock; @@ -198,10 +192,10 @@ struct ip_stack { struct connf_s *ips_rts_clients; struct connf_s *ips_ipcl_conn_fanout; struct connf_s *ips_ipcl_bind_fanout; - struct connf_s *ips_ipcl_proto_fanout; + struct connf_s *ips_ipcl_proto_fanout_v4; struct connf_s *ips_ipcl_proto_fanout_v6; struct connf_s *ips_ipcl_udp_fanout; - struct connf_s *ips_ipcl_raw_fanout; + struct connf_s *ips_ipcl_raw_fanout; /* RAW SCTP sockets */ struct connf_s *ips_ipcl_iptun_fanout; uint_t ips_ipcl_conn_fanout_size; uint_t ips_ipcl_bind_fanout_size; @@ -237,31 +231,47 @@ struct ip_stack { /* IPv4 forwarding table */ struct radix_node_head *ips_ip_ftable; - /* This is dynamically allocated in ip_ire_init */ - struct irb *ips_ip_cache_table; - #define IPV6_ABITS 128 #define IP6_MASK_TABLE_SIZE (IPV6_ABITS + 1) /* 129 ptrs */ - struct irb *ips_ip_forwarding_table_v6[IP6_MASK_TABLE_SIZE]; - /* This is dynamically allocated in ip_ire_init */ - struct irb *ips_ip_cache_table_v6; - uint32_t ips_ire_handle; /* * ire_ft_init_lock is used while initializing ip_forwarding_table * dynamically in ire_add. 
*/ kmutex_t ips_ire_ft_init_lock; - kmutex_t ips_ire_handle_lock; /* Protects ire_handle */ - uint32_t ips_ip_cache_table_size; - uint32_t ips_ip6_cache_table_size; + /* + * This is the IPv6 counterpart of RADIX_NODE_HEAD_LOCK. It is used + * to prevent adds and deletes while we are doing a ftable_lookup + * and extracting the ire_generation. + */ + krwlock_t ips_ip6_ire_head_lock; + uint32_t ips_ip6_ftable_hash_size; ire_stats_t ips_ire_stats_v4; /* IPv4 ire statistics */ ire_stats_t ips_ire_stats_v6; /* IPv6 ire statistics */ + /* Count how many condemned objects for kmem_cache callbacks */ + uint32_t ips_num_ire_condemned; + uint32_t ips_num_nce_condemned; + uint32_t ips_num_dce_condemned; + + struct ire_s *ips_ire_reject_v4; /* For unreachable dests */ + struct ire_s *ips_ire_reject_v6; /* For unreachable dests */ + struct ire_s *ips_ire_blackhole_v4; /* For temporary failures */ + struct ire_s *ips_ire_blackhole_v6; /* For temporary failures */ + + /* ips_ire_dep_lock protects ire_dep_* relationship between IREs */ + krwlock_t ips_ire_dep_lock; + + /* Destination Cache Entries */ + struct dce_s *ips_dce_default; + uint_t ips_dce_hashsize; + struct dcb_s *ips_dce_hash_v4; + struct dcb_s *ips_dce_hash_v6; + /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; mblk_t *ips_ip6_asp_pending_ops_tail; @@ -293,9 +303,10 @@ struct ip_stack { uint_t ips_icmp_pkt_err_sent; /* Protected by ip_mi_lock */ - void *ips_ip_g_head; /* Instance Data List Head */ + void *ips_ip_g_head; /* IP Instance Data List Head */ + void *ips_arp_g_head; /* ARP Instance Data List Head */ - caddr_t ips_ip_g_nd; /* Named Dispatch List Head */ + caddr_t ips_ip_g_nd; /* Named Dispatch List Head */ /* Multirouting stuff */ /* Interval (in ms) between consecutive 'bad MTU' warnings */ @@ -306,27 +317,11 @@ struct ip_stack { struct cgtp_filter_ops *ips_ip_cgtp_filter_ops; /* CGTP hooks */ boolean_t ips_ip_cgtp_filter; /* Enable/disable CGTP hooks */ - kmutex_t ips_ip_trash_timer_lock; - timeout_id_t 
ips_ip_ire_expire_id; /* IRE expiration timer. */ struct ipsq_s *ips_ipsq_g_head; uint_t ips_ill_index; /* Used to assign interface indicies */ /* When set search for unused index */ boolean_t ips_ill_index_wrap; - clock_t ips_ip_ire_arp_time_elapsed; - /* Time since IRE cache last flushed */ - clock_t ips_ip_ire_rd_time_elapsed; - /* ... redirect IREs last flushed */ - clock_t ips_ip_ire_pmtu_time_elapsed; - /* Time since path mtu increase */ - - uint_t ips_ip_redirect_cnt; - /* Num of redirect routes in ftable */ - uint_t ips_ipv6_ire_default_count; - /* Number of IPv6 IRE_DEFAULT entries */ - uint_t ips_ipv6_ire_default_index; - /* Walking IPv6 index used to mod in */ - uint_t ips_loopback_packets; /* NDP/NCE structures for IPv4 and IPv6 */ @@ -379,15 +374,17 @@ struct ip_stack { struct srcid_map *ips_srcid_head; krwlock_t ips_srcid_lock; - uint64_t ips_ipif_g_seqid; + uint64_t ips_ipif_g_seqid; /* Used only for sctp_addr.c */ union phyint_list_u *ips_phyint_g_list; /* start of phyint list */ -/* ip_neti.c */ +/* ip_netinfo.c */ hook_family_t ips_ipv4root; hook_family_t ips_ipv6root; + hook_family_t ips_arproot; net_handle_t ips_ipv4_net_data; net_handle_t ips_ipv6_net_data; + net_handle_t ips_arp_net_data; /* * Hooks for firewalling @@ -397,17 +394,23 @@ struct ip_stack { hook_event_t ips_ip4_forwarding_event; hook_event_t ips_ip4_loopback_in_event; hook_event_t ips_ip4_loopback_out_event; + hook_event_t ips_ip6_physical_in_event; hook_event_t ips_ip6_physical_out_event; hook_event_t ips_ip6_forwarding_event; hook_event_t ips_ip6_loopback_in_event; hook_event_t ips_ip6_loopback_out_event; + hook_event_t ips_arp_physical_in_event; + hook_event_t ips_arp_physical_out_event; + hook_event_t ips_arp_nic_events; + hook_event_token_t ips_ipv4firewall_physical_in; hook_event_token_t ips_ipv4firewall_physical_out; hook_event_token_t ips_ipv4firewall_forwarding; hook_event_token_t ips_ipv4firewall_loopback_in; hook_event_token_t ips_ipv4firewall_loopback_out; + 
hook_event_token_t ips_ipv6firewall_physical_in; hook_event_token_t ips_ipv6firewall_physical_out; hook_event_token_t ips_ipv6firewall_forwarding; @@ -419,6 +422,10 @@ struct ip_stack { hook_event_token_t ips_ipv4nicevents; hook_event_token_t ips_ipv6nicevents; + hook_event_token_t ips_arp_physical_in; + hook_event_token_t ips_arp_physical_out; + hook_event_token_t ips_arpnicevents; + net_handle_t ips_ip4_observe_pr; net_handle_t ips_ip6_observe_pr; hook_event_t ips_ip4_observe; @@ -432,13 +439,6 @@ struct ip_stack { krwlock_t ips_ipmp_lock; mod_hash_t *ips_ipmp_grp_hash; -/* igmp.c */ - /* multicast restart timers thread logic */ - kmutex_t ips_mrt_lock; - uint_t ips_mrt_flags; - kcondvar_t ips_mrt_cv; - kcondvar_t ips_mrt_done_cv; - kthread_t *ips_mrt_thread; }; typedef struct ip_stack ip_stack_t; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index e24bcd9a73..15a7c32376 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -41,8 +41,11 @@ extern "C" { #include <sys/sunddi.h> #include <sys/sunldi.h> -typedef void (*edesc_spf)(void *, mblk_t *, void *, int); -typedef void (*edesc_rpf)(void *, mblk_t *, void *); +typedef void (*edesc_rpf)(void *, mblk_t *, void *, ip_recv_attr_t *); +struct icmph_s; +struct icmp6_hdr; +typedef boolean_t (*edesc_vpf)(conn_t *, void *, struct icmph_s *, + struct icmp6_hdr *, ip_recv_attr_t *); /* * ============================== @@ -53,7 +56,7 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); /* * The connection structure contains the common information/flags/ref needed. * Implementation will keep the connection struct, the layers (with their - * respective data for event i.e. tcp_t if event was tcp_input) all in one + * respective data for event i.e. tcp_t if event was tcp_input_data) all in one * contiguous memory location. 
*/ @@ -61,14 +64,14 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); /* Unused 0x00020000 */ /* Unused 0x00040000 */ #define IPCL_FULLY_BOUND 0x00080000 /* Bound to correct squeue */ -#define IPCL_CHECK_POLICY 0x00100000 /* Needs policy checking */ -#define IPCL_SOCKET 0x00200000 /* Sockfs connection */ -#define IPCL_ACCEPTOR 0x00400000 /* Sockfs priv acceptor */ +/* Unused 0x00100000 */ +/* Unused 0x00200000 */ +/* Unused 0x00400000 */ #define IPCL_CL_LISTENER 0x00800000 /* Cluster listener */ -#define IPCL_EAGER 0x01000000 /* Incoming connection */ +/* Unused 0x01000000 */ /* Unused 0x02000000 */ -#define IPCL_TCP6 0x04000000 /* AF_INET6 TCP */ -#define IPCL_TCP4 0x08000000 /* IPv4 packet format TCP */ +/* Unused 0x04000000 */ +/* Unused 0x08000000 */ /* Unused 0x10000000 */ /* Unused 0x20000000 */ #define IPCL_CONNECTED 0x40000000 /* Conn in connected table */ @@ -83,41 +86,21 @@ typedef void (*edesc_rpf)(void *, mblk_t *, void *); #define IPCL_RTSCONN 0x00000020 /* From rts_conn_cache */ /* Unused 0x00000040 */ #define IPCL_IPTUN 0x00000080 /* iptun module above us */ + #define IPCL_NONSTR 0x00001000 /* A non-STREAMS socket */ -#define IPCL_IN_SQUEUE 0x10000000 /* Waiting squeue to finish */ +/* Unused 0x10000000 */ -/* Conn Masks */ -#define IPCL_TCP (IPCL_TCP4|IPCL_TCP6) #define IPCL_REMOVED 0x00000100 #define IPCL_REUSED 0x00000200 -/* The packet format is IPv4; could be an AF_INET or AF_INET6 socket */ -#define IPCL_IS_TCP4(connp) \ - (((connp)->conn_flags & IPCL_TCP4)) - -/* Connected AF_INET with no IPsec policy */ -#define IPCL_IS_TCP4_CONNECTED_NO_POLICY(connp) \ - (((connp)->conn_flags & \ - (IPCL_TCP4|IPCL_CONNECTED|IPCL_CHECK_POLICY|IPCL_TCP6)) \ - == (IPCL_TCP4|IPCL_CONNECTED)) - #define IPCL_IS_CONNECTED(connp) \ ((connp)->conn_flags & IPCL_CONNECTED) #define IPCL_IS_BOUND(connp) \ ((connp)->conn_flags & IPCL_BOUND) -/* AF_INET TCP that is bound */ -#define IPCL_IS_TCP4_BOUND(connp) \ - (((connp)->conn_flags & \ - 
(IPCL_TCP4|IPCL_BOUND|IPCL_TCP6)) == \ - (IPCL_TCP4|IPCL_BOUND)) - -#define IPCL_IS_FULLY_BOUND(connp) \ - ((connp)->conn_flags & IPCL_FULLY_BOUND) - /* - * Can't use conn_protocol since we need to tell difference + * Can't use conn_proto since we need to tell difference * between a real TCP socket and a SOCK_RAW, IPPROTO_TCP. */ #define IPCL_IS_TCP(connp) \ @@ -180,22 +163,80 @@ typedef struct ip_helper_stream_info_s { #define CONN_MAC_IMPLICIT 2 /* + * conn receive ancillary definition. + * + * These are the set of socket options that make the receive side + * potentially pass up ancillary data items. + * We have a union with an integer so that we can quickly check whether + * any ancillary data items need to be added. + */ +typedef struct crb_s { + union { + uint32_t crbu_all; + struct { + uint32_t + crbb_recvdstaddr : 1, /* IP_RECVDSTADDR option */ + crbb_recvopts : 1, /* IP_RECVOPTS option */ + crbb_recvif : 1, /* IP_RECVIF option */ + crbb_recvslla : 1, /* IP_RECVSLLA option */ + + crbb_recvttl : 1, /* IP_RECVTTL option */ + crbb_ip_recvpktinfo : 1, /* IP*_RECVPKTINFO option */ + crbb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ + crbb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ + + crbb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ + crbb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ + crbb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */ + crbb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ + + crbb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ + crbb_recvucred : 1, /* IP_RECVUCRED option */ + crbb_timestamp : 1; /* SO_TIMESTAMP "socket" option */ + + } crbb; + } crbu; +} crb_t; + +#define crb_all crbu.crbu_all +#define crb_recvdstaddr crbu.crbb.crbb_recvdstaddr +#define crb_recvopts crbu.crbb.crbb_recvopts +#define crb_recvif crbu.crbb.crbb_recvif +#define crb_recvslla crbu.crbb.crbb_recvslla +#define crb_recvttl crbu.crbb.crbb_recvttl +#define crb_ip_recvpktinfo crbu.crbb.crbb_ip_recvpktinfo +#define 
crb_ipv6_recvhoplimit crbu.crbb.crbb_ipv6_recvhoplimit +#define crb_ipv6_recvhopopts crbu.crbb.crbb_ipv6_recvhopopts +#define crb_ipv6_recvdstopts crbu.crbb.crbb_ipv6_recvdstopts +#define crb_ipv6_recvrthdr crbu.crbb.crbb_ipv6_recvrthdr +#define crb_old_ipv6_recvdstopts crbu.crbb.crbb_old_ipv6_recvdstopts +#define crb_ipv6_recvrthdrdstopts crbu.crbb.crbb_ipv6_recvrthdrdstopts +#define crb_ipv6_recvtclass crbu.crbb.crbb_ipv6_recvtclass +#define crb_recvucred crbu.crbb.crbb_recvucred +#define crb_timestamp crbu.crbb.crbb_timestamp + +/* * The initial fields in the conn_t are setup by the kmem_cache constructor, * and are preserved when it is freed. Fields after that are bzero'ed when * the conn_t is freed. + * + * Much of the conn_t is protected by conn_lock. + * + * conn_lock is also used by some ULPs (like UDP and RAWIP) to protect + * their state. */ struct conn_s { kmutex_t conn_lock; uint32_t conn_ref; /* Reference counter */ uint32_t conn_flags; /* Conn Flags */ - union { tcp_t *cp_tcp; /* Pointer to the tcp struct */ struct udp_s *cp_udp; /* Pointer to the udp struct */ struct icmp_s *cp_icmp; /* Pointer to rawip struct */ struct rts_s *cp_rts; /* Pointer to rts struct */ struct iptun_s *cp_iptun; /* Pointer to iptun_t */ + struct sctp_s *cp_sctp; /* For IPCL_SCTPCONN */ void *cp_priv; } conn_proto_priv; #define conn_tcp conn_proto_priv.cp_tcp @@ -203,71 +244,68 @@ struct conn_s { #define conn_icmp conn_proto_priv.cp_icmp #define conn_rts conn_proto_priv.cp_rts #define conn_iptun conn_proto_priv.cp_iptun +#define conn_sctp conn_proto_priv.cp_sctp #define conn_priv conn_proto_priv.cp_priv kcondvar_t conn_cv; - uint8_t conn_ulp; /* protocol type */ + uint8_t conn_proto; /* protocol type */ edesc_rpf conn_recv; /* Pointer to recv routine */ + edesc_rpf conn_recvicmp; /* For ICMP error */ + edesc_vpf conn_verifyicmp; /* Verify ICMP error */ + + ip_xmit_attr_t *conn_ixa; /* Options if no ancil data */ /* Fields after this are bzero'ed when the conn_t is freed. 
*/ +#define conn_start_clr conn_recv_ancillary + + /* Options for receive-side ancillary data */ + crb_t conn_recv_ancillary; squeue_t *conn_sqp; /* Squeue for processing */ uint_t conn_state_flags; /* IP state flags */ -#define conn_start_clr conn_state_flags - ire_t *conn_ire_cache; /* outbound ire cache */ + int conn_lingertime; /* linger time (in seconds) */ + unsigned int conn_on_sqp : 1, /* Conn is being processed */ - conn_dontroute : 1, /* SO_DONTROUTE state */ - conn_loopback : 1, /* SO_LOOPBACK state */ + conn_linger : 1, /* SO_LINGER state */ + conn_useloopback : 1, /* SO_USELOOPBACK state */ conn_broadcast : 1, /* SO_BROADCAST state */ conn_reuseaddr : 1, /* SO_REUSEADDR state */ - conn_multicast_loop : 1, /* IP_MULTICAST_LOOP */ + conn_keepalive : 1, /* SO_KEEPALIVE state */ conn_multi_router : 1, /* Wants all multicast pkts */ - conn_draining : 1, /* ip_wsrv running */ - conn_did_putbq : 1, /* ip_wput did a putbq */ + conn_unspec_src : 1, /* IP_UNSPEC_SRC */ conn_policy_cached : 1, /* Is policy cached/latched ? 
*/ conn_in_enforce_policy : 1, /* Enforce Policy on inbound */ - conn_out_enforce_policy : 1, /* Enforce Policy on outbound */ - conn_af_isv6 : 1, /* ip address family ver 6 */ - conn_pkt_isv6 : 1, /* ip packet format ver 6 */ - conn_ip_recvpktinfo : 1, /* IPV*_RECVPKTINFO option */ - - conn_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ - conn_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - conn_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - conn_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - conn_ipv6_recvrtdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ + conn_debug : 1, /* SO_DEBUG */ conn_ipv6_v6only : 1, /* IPV6_V6ONLY */ - conn_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ + conn_oobinline : 1, /* SO_OOBINLINE state */ + conn_dgram_errind : 1, /* SO_DGRAM_ERRIND state */ + + conn_exclbind : 1, /* SO_EXCLBIND state */ + conn_mdt_ok : 1, /* MDT is permitted */ + conn_allzones : 1, /* SO_ALLZONES */ conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ - conn_pathmtu_valid : 1, /* The cached mtu is valid. 
*/ - conn_ipv6_dontfrag : 1, /* IPV6_DONTFRAG */ - conn_fully_bound : 1, /* Fully bound connection */ - conn_recvif : 1, /* IP_RECVIF option */ + conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_recvslla : 1, /* IP_RECVSLLA option */ - conn_mdt_ok : 1, /* MDT is permitted */ - conn_nexthop_set : 1, - conn_allzones : 1; /* SO_ALLZONES */ + conn_pad_to_bit_31 : 11; - unsigned int - conn_lso_ok : 1; /* LSO is usable */ boolean_t conn_direct_blocked; /* conn is flow-controlled */ squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ ill_t *conn_dhcpinit_ill; /* IP_DHCPINIT_IF */ - ipsec_latch_t *conn_latch; /* latched state */ - ill_t *conn_outgoing_ill; /* IP{,V6}_BOUND_IF */ - edesc_spf conn_send; /* Pointer to send routine */ + ipsec_latch_t *conn_latch; /* latched IDS */ + struct ipsec_policy_s *conn_latch_in_policy; /* latched policy (in) */ + struct ipsec_action_s *conn_latch_in_action; /* latched action (in) */ + uint_t conn_bound_if; /* IP*_BOUND_IF */ queue_t *conn_rq; /* Read queue */ queue_t *conn_wq; /* Write queue */ dev_t conn_dev; /* Minor number */ @@ -275,80 +313,137 @@ struct conn_s { ip_helper_stream_info_t *conn_helper_info; cred_t *conn_cred; /* Credentials */ + pid_t conn_cpid; /* pid from open/connect */ + uint64_t conn_open_time; /* time when this was opened */ + connf_t *conn_g_fanout; /* Global Hash bucket head */ struct conn_s *conn_g_next; /* Global Hash chain next */ struct conn_s *conn_g_prev; /* Global Hash chain prev */ struct ipsec_policy_head_s *conn_policy; /* Configured policy */ - in6_addr_t conn_bound_source_v6; -#define conn_bound_source V4_PART_OF_V6(conn_bound_source_v6) - + in6_addr_t conn_bound_addr_v6; /* Address in bind() */ +#define conn_bound_addr_v4 V4_PART_OF_V6(conn_bound_addr_v6) connf_t *conn_fanout; /* Hash bucket we're part of */ struct conn_s *conn_next; /* Hash chain next */ struct conn_s *conn_prev; /* Hash chain prev */ + struct { - in6_addr_t 
connua_laddr; /* Local address */ + in6_addr_t connua_laddr; /* Local address - match */ in6_addr_t connua_faddr; /* Remote address */ } connua_v6addr; -#define conn_src V4_PART_OF_V6(connua_v6addr.connua_laddr) -#define conn_rem V4_PART_OF_V6(connua_v6addr.connua_faddr) -#define conn_srcv6 connua_v6addr.connua_laddr -#define conn_remv6 connua_v6addr.connua_faddr +#define conn_laddr_v4 V4_PART_OF_V6(connua_v6addr.connua_laddr) +#define conn_faddr_v4 V4_PART_OF_V6(connua_v6addr.connua_faddr) +#define conn_laddr_v6 connua_v6addr.connua_laddr +#define conn_faddr_v6 connua_v6addr.connua_faddr + in6_addr_t conn_saddr_v6; /* Local address - source */ +#define conn_saddr_v4 V4_PART_OF_V6(conn_saddr_v6) + union { /* Used for classifier match performance */ - uint32_t conn_ports2; + uint32_t connu_ports2; struct { - in_port_t tcpu_fport; /* Remote port */ - in_port_t tcpu_lport; /* Local port */ - } tcpu_ports; + in_port_t connu_fport; /* Remote port */ + in_port_t connu_lport; /* Local port */ + } connu_ports; } u_port; -#define conn_fport u_port.tcpu_ports.tcpu_fport -#define conn_lport u_port.tcpu_ports.tcpu_lport -#define conn_ports u_port.conn_ports2 -#define conn_upq conn_rq - uint8_t conn_unused_byte; - - uint_t conn_proto; /* SO_PROTOTYPE state */ - ill_t *conn_incoming_ill; /* IP{,V6}_BOUND_IF */ +#define conn_fport u_port.connu_ports.connu_fport +#define conn_lport u_port.connu_ports.connu_lport +#define conn_ports u_port.connu_ports2 + + uint_t conn_incoming_ifindex; /* IP{,V6}_BOUND_IF, scopeid */ ill_t *conn_oper_pending_ill; /* pending shared ioctl */ - ilg_t *conn_ilg; /* Group memberships */ - int conn_ilg_allocated; /* Number allocated */ - int conn_ilg_inuse; /* Number currently used */ - int conn_ilg_walker_cnt; /* No of ilg walkers */ - /* XXXX get rid of this, once ilg_delete_all is fixed */ - kcondvar_t conn_refcv; - - struct ipif_s *conn_multicast_ipif; /* IP_MULTICAST_IF */ - ill_t *conn_multicast_ill; /* IPV6_MULTICAST_IF */ - struct conn_s 
*conn_drain_next; /* Next conn in drain list */ - struct conn_s *conn_drain_prev; /* Prev conn in drain list */ + krwlock_t conn_ilg_lock; /* Protects conn_ilg_* */ + ilg_t *conn_ilg; /* Group memberships */ + + kcondvar_t conn_refcv; /* For conn_oper_pending_ill */ + + struct conn_s *conn_drain_next; /* Next conn in drain list */ + struct conn_s *conn_drain_prev; /* Prev conn in drain list */ idl_t *conn_idl; /* Ptr to the drain list head */ mblk_t *conn_ipsec_opt_mp; /* ipsec option mblk */ - uint32_t conn_src_preferences; /* prefs for src addr select */ - /* mtuinfo from IPV6_PACKET_TOO_BIG conditional on conn_pathmtu_valid */ - struct ip6_mtuinfo mtuinfo; zoneid_t conn_zoneid; /* zone connection is in */ - in6_addr_t conn_nexthop_v6; /* nexthop IP address */ - uchar_t conn_broadcast_ttl; /* IP_BROADCAST_TTL */ -#define conn_nexthop_v4 V4_PART_OF_V6(conn_nexthop_v6) - cred_t *conn_effective_cred; /* Effective TX credentials */ int conn_rtaware; /* RT_AWARE sockopt value */ kcondvar_t conn_sq_cv; /* For non-STREAMS socket IO */ - kthread_t *conn_sq_caller; /* Caller of squeue sync ops */ sock_upcalls_t *conn_upcalls; /* Upcalls to sockfs */ sock_upper_handle_t conn_upper_handle; /* Upper handle: sonode * */ unsigned int - conn_ulp_labeled : 1, /* ULP label is synced */ conn_mlp_type : 2, /* mlp_type_t; tsol/tndb.h */ conn_anon_mlp : 1, /* user wants anon MLP */ - conn_anon_port : 1, /* user bound anonymously */ + conn_mac_mode : 2, /* normal/loose/implicit MAC */ - conn_spare : 26; + conn_anon_priv_bind : 1, /* *_ANON_PRIV_BIND state */ + conn_zone_is_global : 1, /* GLOBAL_ZONEID */ + conn_spare : 24; boolean_t conn_flow_cntrld; netstack_t *conn_netstack; /* Corresponds to a netstack_hold */ + + /* + * IP format that packets received for this struct should use. + * Value can be IP4_VERSION or IPV6_VERSION. + * The sending version is encoded using IXAF_IS_IPV4. 
+ */ + ushort_t conn_ipversion; + + /* Written to only once at the time of opening the endpoint */ + sa_family_t conn_family; /* Family from socket() call */ + uint_t conn_so_type; /* Type from socket() call */ + + uint_t conn_sndbuf; /* SO_SNDBUF state */ + uint_t conn_rcvbuf; /* SO_RCVBUF state */ + uint_t conn_wroff; /* Current write offset */ + + uint_t conn_sndlowat; /* Send buffer low water mark */ + uint_t conn_rcvlowat; /* Recv buffer low water mark */ + + uint8_t conn_default_ttl; /* Default TTL/hoplimit */ + + uint32_t conn_flowinfo; /* Connected flow id and tclass */ + + /* + * The most recent address for sendto. Initially set to zero + * which is always different than then the destination address + * since the send interprets zero as the loopback address. + */ + in6_addr_t conn_v6lastdst; +#define conn_v4lastdst V4_PART_OF_V6(conn_v6lastdst) + ushort_t conn_lastipversion; + in_port_t conn_lastdstport; + uint32_t conn_lastflowinfo; /* IPv6-only */ + uint_t conn_lastscopeid; /* IPv6-only */ + uint_t conn_lastsrcid; /* Only for AF_INET6 */ + /* + * When we are not connected conn_saddr might be unspecified. + * We track the source that was used with conn_v6lastdst here. + */ + in6_addr_t conn_v6lastsrc; +#define conn_v4lastsrc V4_PART_OF_V6(conn_v6lastsrc) + + /* Templates for transmitting packets */ + ip_pkt_t conn_xmit_ipp; /* Options if no ancil data */ + + /* + * Header template - conn_ht_ulp is a pointer into conn_ht_iphc. + * Note that ixa_ip_hdr_length indicates the offset of ht_ulp in + * ht_iphc + * + * The header template is maintained for connected endpoints (and + * updated when sticky options are changed) and also for the lastdst. + * There is no conflict between those usages since SOCK_DGRAM and + * SOCK_RAW can not be used to specify a destination address (with + * sendto/sendmsg) if the socket has been connected. 
+ */ + uint8_t *conn_ht_iphc; /* Start of IP header */ + uint_t conn_ht_iphc_allocated; /* Allocated buffer size */ + uint_t conn_ht_iphc_len; /* IP+ULP size */ + uint8_t *conn_ht_ulp; /* Upper-layer header */ + uint_t conn_ht_ulp_len; /* ULP header len */ + + /* Checksum to compensate for source routed packets. Host byte order */ + uint32_t conn_sum; + #ifdef CONN_DEBUG #define CONN_TRACE_MAX 10 int conn_trace_last; /* ndx of last used tracebuf */ @@ -357,18 +452,6 @@ struct conn_s { }; /* - * These two macros are used by TX. First priority is SCM_UCRED having - * set the label in the mblk. Second priority is the open credentials with - * peer's label (aka conn_effective_cred). Last priority is the open - * credentials. BEST_CRED takes all three into account in the above order. - * CONN_CRED is for connection-oriented cases when we don't need to look - * at the mblk. - */ -#define CONN_CRED(connp) ((connp)->conn_effective_cred == NULL ? \ - (connp)->conn_cred : (connp)->conn_effective_cred) -#define BEST_CRED(mp, connp, pidp) ip_best_cred(mp, connp, pidp) - -/* * connf_t - connection fanout data. * * The hash tables and their linkage (conn_t.{hashnextp, hashprevp} are @@ -461,29 +544,22 @@ struct connf_s { /* - * IPCL_PROTO_MATCH() only matches conns with the specified zoneid, while - * IPCL_PROTO_MATCH_V6() can match other conns in the multicast case, see - * ip_fanout_proto(). + * IPCL_PROTO_MATCH() and IPCL_PROTO_MATCH_V6() only matches conns with + * the specified ira_zoneid or conn_allzones by calling conn_wantpacket. 
*/ -#define IPCL_PROTO_MATCH(connp, protocol, ipha, ill, \ - fanout_flags, zoneid) \ - ((((connp)->conn_src == INADDR_ANY) || \ - (((connp)->conn_src == ((ipha)->ipha_dst)) && \ - (((connp)->conn_rem == INADDR_ANY) || \ - ((connp)->conn_rem == ((ipha)->ipha_src))))) && \ - IPCL_ZONE_MATCH(connp, zoneid) && \ - (conn_wantpacket((connp), (ill), (ipha), (fanout_flags), \ - (zoneid)) || ((protocol) == IPPROTO_PIM) || \ - ((protocol) == IPPROTO_RSVP))) - -#define IPCL_PROTO_MATCH_V6(connp, protocol, ip6h, ill, \ - fanout_flags, zoneid) \ - ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &((ip6h)->ip6_dst)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &((ip6h)->ip6_src))))) && \ - (conn_wantpacket_v6((connp), (ill), (ip6h), \ - (fanout_flags), (zoneid)) || ((protocol) == IPPROTO_RSVP))) +#define IPCL_PROTO_MATCH(connp, ira, ipha) \ + ((((connp)->conn_laddr_v4 == INADDR_ANY) || \ + (((connp)->conn_laddr_v4 == ((ipha)->ipha_dst)) && \ + (((connp)->conn_faddr_v4 == INADDR_ANY) || \ + ((connp)->conn_faddr_v4 == ((ipha)->ipha_src))))) && \ + conn_wantpacket((connp), (ira), (ipha))) + +#define IPCL_PROTO_MATCH_V6(connp, ira, ip6h) \ + ((IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &((ip6h)->ip6_dst)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &((ip6h)->ip6_src))))) && \ + (conn_wantpacket_v6((connp), (ira), (ip6h)))) #define IPCL_CONN_HASH(src, ports, ipst) \ ((unsigned)(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ @@ -493,31 +569,17 @@ struct connf_s { IPCL_CONN_HASH(V4_PART_OF_V6((src)), (ports), (ipst)) #define IPCL_CONN_MATCH(connp, proto, src, dst, ports) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_ports == (ports) && \ - _IPCL_V4_MATCH((connp)->conn_remv6, (src)) && \ - 
_IPCL_V4_MATCH((connp)->conn_srcv6, (dst)) && \ + _IPCL_V4_MATCH((connp)->conn_faddr_v6, (src)) && \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (dst)) && \ !(connp)->conn_ipv6_v6only) #define IPCL_CONN_MATCH_V6(connp, proto, src, dst, ports) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_ports == (ports) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(src)) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(dst))) - -#define IPCL_CONN_INIT(connp, protocol, src, rem, ports) { \ - (connp)->conn_ulp = protocol; \ - IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \ - IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \ - (connp)->conn_ports = ports; \ -} - -#define IPCL_CONN_INIT_V6(connp, protocol, src, rem, ports) { \ - (connp)->conn_ulp = protocol; \ - (connp)->conn_srcv6 = src; \ - (connp)->conn_remv6 = rem; \ - (connp)->conn_ports = ports; \ -} + IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(src)) && \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(dst))) #define IPCL_PORT_HASH(port, size) \ ((((port) >> 8) ^ (port)) & ((size) - 1)) @@ -527,33 +589,45 @@ struct connf_s { (ipst)->ips_ipcl_bind_fanout_size) #define IPCL_BIND_MATCH(connp, proto, laddr, lport) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == (lport) && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr))) && \ + (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr))) && \ !(connp)->conn_ipv6_v6only) #define IPCL_BIND_MATCH_V6(connp, proto, laddr, lport) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == (lport) && \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) || \ - IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6))) + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) || \ + IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6))) +/* + * We compare conn_laddr 
since it captures both connected and a bind to + * a multicast or broadcast address. + * The caller needs to match the zoneid and also call conn_wantpacket + * for multicast, broadcast, or when conn_incoming_ifindex is set. + */ #define IPCL_UDP_MATCH(connp, lport, laddr, fport, faddr) \ (((connp)->conn_lport == (lport)) && \ - ((_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_remv6) || \ - (_IPCL_V4_MATCH((connp)->conn_remv6, (faddr)) && \ + ((_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \ + (_IPCL_V4_MATCH_ANY((connp)->conn_faddr_v6) || \ + (_IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr)) && \ (connp)->conn_fport == (fport)))))) && \ !(connp)->conn_ipv6_v6only) +/* + * We compare conn_laddr since it captures both connected and a bind to + * a multicast or broadcast address. + * The caller needs to match the zoneid and also call conn_wantpacket_v6 + * for multicast or when conn_incoming_ifindex is set. 
+ */ #define IPCL_UDP_MATCH_V6(connp, lport, laddr, fport, faddr) \ (((connp)->conn_lport == (lport)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_remv6) || \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, &(faddr)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_faddr_v6) || \ + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, &(faddr)) && \ (connp)->conn_fport == (fport)))))) #define IPCL_IPTUN_HASH(laddr, faddr) \ @@ -567,32 +641,12 @@ struct connf_s { (laddr)->s6_addr32[2] ^ (laddr)->s6_addr32[3]) #define IPCL_IPTUN_MATCH(connp, laddr, faddr) \ - (_IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)) && \ - _IPCL_V4_MATCH((connp)->conn_remv6, (faddr))) + (_IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)) && \ + _IPCL_V4_MATCH((connp)->conn_faddr_v6, (faddr))) #define IPCL_IPTUN_MATCH_V6(connp, laddr, faddr) \ - (IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, (laddr)) && \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_remv6, (faddr))) - -#define IPCL_TCP_EAGER_INIT(connp, protocol, src, rem, ports) { \ - (connp)->conn_flags |= (IPCL_TCP4|IPCL_EAGER); \ - IN6_IPADDR_TO_V4MAPPED(src, &(connp)->conn_srcv6); \ - IN6_IPADDR_TO_V4MAPPED(rem, &(connp)->conn_remv6); \ - (connp)->conn_ports = ports; \ - (connp)->conn_send = ip_output; \ - (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ - (connp)->conn_initial_sqp = (connp)->conn_sqp; \ -} - -#define IPCL_TCP_EAGER_INIT_V6(connp, protocol, src, rem, ports) { \ - (connp)->conn_flags |= (IPCL_TCP6|IPCL_EAGER); \ - (connp)->conn_srcv6 = src; \ - (connp)->conn_remv6 = rem; \ - (connp)->conn_ports = ports; \ - (connp)->conn_send = ip_output_v6; \ - (connp)->conn_sqp = IP_SQUEUE_GET(lbolt); \ - (connp)->conn_initial_sqp = (connp)->conn_sqp; \ -} + (IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, (laddr)) && \ + 
IN6_ARE_ADDR_EQUAL(&(connp)->conn_faddr_v6, (faddr))) #define IPCL_UDP_HASH(lport, ipst) \ IPCL_PORT_HASH(lport, (ipst)->ips_ipcl_udp_fanout_size) @@ -606,18 +660,20 @@ struct connf_s { /* * This is similar to IPCL_BIND_MATCH except that the local port check * is changed to a wildcard port check. + * We compare conn_laddr since it captures both connected and a bind to + * a multicast or broadcast address. */ #define IPCL_RAW_MATCH(connp, proto, laddr) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == 0 && \ - (_IPCL_V4_MATCH_ANY((connp)->conn_srcv6) || \ - _IPCL_V4_MATCH((connp)->conn_srcv6, (laddr)))) + (_IPCL_V4_MATCH_ANY((connp)->conn_laddr_v6) || \ + _IPCL_V4_MATCH((connp)->conn_laddr_v6, (laddr)))) #define IPCL_RAW_MATCH_V6(connp, proto, laddr) \ - ((connp)->conn_ulp == (proto) && \ + ((connp)->conn_proto == (proto) && \ (connp)->conn_lport == 0 && \ - (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_srcv6) || \ - IN6_ARE_ADDR_EQUAL(&(connp)->conn_srcv6, &(laddr)))) + (IN6_IS_ADDR_UNSPECIFIED(&(connp)->conn_laddr_v6) || \ + IN6_ARE_ADDR_EQUAL(&(connp)->conn_laddr_v6, &(laddr)))) /* Function prototypes */ extern void ipcl_g_init(void); @@ -631,28 +687,27 @@ void ipcl_hash_insert_wildcard(connf_t *, conn_t *); void ipcl_hash_remove(conn_t *); void ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp); -extern int ipcl_bind_insert(conn_t *, uint8_t, ipaddr_t, uint16_t); -extern int ipcl_bind_insert_v6(conn_t *, uint8_t, const in6_addr_t *, - uint16_t); -extern int ipcl_conn_insert(conn_t *, uint8_t, ipaddr_t, ipaddr_t, - uint32_t); -extern int ipcl_conn_insert_v6(conn_t *, uint8_t, const in6_addr_t *, - const in6_addr_t *, uint32_t, uint_t); +extern int ipcl_bind_insert(conn_t *); +extern int ipcl_bind_insert_v4(conn_t *); +extern int ipcl_bind_insert_v6(conn_t *); +extern int ipcl_conn_insert(conn_t *); +extern int ipcl_conn_insert_v4(conn_t *); +extern int ipcl_conn_insert_v6(conn_t *); extern conn_t 
*ipcl_get_next_conn(connf_t *, conn_t *, uint32_t); -void ipcl_proto_insert(conn_t *, uint8_t); -void ipcl_proto_insert_v6(conn_t *, uint8_t); -conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify(mblk_t *, zoneid_t, ip_stack_t *); -conn_t *ipcl_classify_raw(mblk_t *, uint8_t, zoneid_t, uint32_t, ipha_t *, +conn_t *ipcl_classify_v4(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *, + ip_stack_t *); +conn_t *ipcl_classify_v6(mblk_t *, uint8_t, uint_t, ip_recv_attr_t *, ip_stack_t *); +conn_t *ipcl_classify(mblk_t *, ip_recv_attr_t *, ip_stack_t *); +conn_t *ipcl_classify_raw(mblk_t *, uint8_t, uint32_t, ipha_t *, + ip6_t *, ip_recv_attr_t *, ip_stack_t *); conn_t *ipcl_iptun_classify_v4(ipaddr_t *, ipaddr_t *, ip_stack_t *); conn_t *ipcl_iptun_classify_v6(in6_addr_t *, in6_addr_t *, ip_stack_t *); void ipcl_globalhash_insert(conn_t *); void ipcl_globalhash_remove(conn_t *); void ipcl_walk(pfv_t, void *, ip_stack_t *); -conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcph_t *, int, ip_stack_t *); +conn_t *ipcl_tcp_lookup_reversed_ipv4(ipha_t *, tcpha_t *, int, ip_stack_t *); conn_t *ipcl_tcp_lookup_reversed_ipv6(ip6_t *, tcpha_t *, int, uint_t, ip_stack_t *); conn_t *ipcl_lookup_listener_v4(uint16_t, ipaddr_t, zoneid_t, ip_stack_t *); @@ -661,17 +716,19 @@ conn_t *ipcl_lookup_listener_v6(uint16_t, in6_addr_t *, uint_t, zoneid_t, int conn_trace_ref(conn_t *); int conn_untrace_ref(conn_t *); void ipcl_conn_cleanup(conn_t *); -conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcph_t *, +extern uint_t conn_recvancillary_size(conn_t *, crb_t, ip_recv_attr_t *, + mblk_t *, ip_pkt_t *); +extern void conn_recvancillary_add(conn_t *, crb_t, ip_recv_attr_t *, + ip_pkt_t *, uchar_t *, uint_t); +conn_t *ipcl_conn_tcp_lookup_reversed_ipv4(conn_t *, ipha_t *, tcpha_t *, ip_stack_t *); -conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, 
tcph_t *, +conn_t *ipcl_conn_tcp_lookup_reversed_ipv6(conn_t *, ip6_t *, tcpha_t *, ip_stack_t *); -extern int ip_create_helper_stream(conn_t *connp, ldi_ident_t li); -extern void ip_free_helper_stream(conn_t *connp); - -extern int ip_get_options(conn_t *, int, int, void *, t_uscalar_t *, cred_t *); -extern int ip_set_options(conn_t *, int, int, const void *, t_uscalar_t, - cred_t *); +extern int ip_create_helper_stream(conn_t *, ldi_ident_t); +extern void ip_free_helper_stream(conn_t *); +extern int ip_helper_stream_setup(queue_t *, dev_t *, int, int, + cred_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/ipdrop.h b/usr/src/uts/common/inet/ipdrop.h index 153c9c1925..74fe8cfd94 100644 --- a/usr/src/uts/common/inet/ipdrop.h +++ b/usr/src/uts/common/inet/ipdrop.h @@ -41,8 +41,10 @@ typedef struct ipdropper_s { void ip_drop_register(ipdropper_t *, char *); void ip_drop_unregister(ipdropper_t *); -void ip_drop_packet(mblk_t *, boolean_t, ill_t *, ire_t *, struct kstat_named *, +void ip_drop_packet(mblk_t *, boolean_t, ill_t *, struct kstat_named *, ipdropper_t *); +void ip_drop_input(char *, mblk_t *, ill_t *); +void ip_drop_output(char *, mblk_t *, ill_t *); /* * ip_dropstats - When a protocol developer comes up with a new reason to diff --git a/usr/src/uts/common/inet/ipp_common.h b/usr/src/uts/common/inet/ipp_common.h index 9ac9837f66..d7380896b6 100644 --- a/usr/src/uts/common/inet/ipp_common.h +++ b/usr/src/uts/common/inet/ipp_common.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPP_COMMON_H #define _INET_IPP_COMMON_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -49,14 +47,6 @@ extern uint32_t ipp_action_count; #define IPP_ENABLED(proc, ipst) ((ipp_action_count != 0) && \ (~((ipst)->ips_ip_policy_mask) & (proc))) -/* Apply IPQoS policies for inbound traffic? */ -#define IP6_IN_IPP(flags, ipst) (IPP_ENABLED(IPP_LOCAL_IN, ipst) && \ - (!((flags) & IP6_NO_IPPOLICY))) - -/* Apply IPQoS policies for oubound traffic? */ -#define IP6_OUT_IPP(flags, ipst) \ - (IPP_ENABLED(IPP_LOCAL_OUT, ipst) && (!((flags) & IP6_NO_IPPOLICY))) - /* Extracts 8 bit traffic class from IPV6 flow label field */ #ifdef _BIG_ENDIAN #define __IPV6_TCLASS_FROM_FLOW(n) (((n)>>20) & 0xff) @@ -78,7 +68,9 @@ typedef struct ip_priv { } ip_priv_t; /* The entry point for ip policy processing */ -extern void ip_process(ip_proc_t, mblk_t **, uint32_t); +#ifdef ILL_CONDEMNED +extern mblk_t *ip_process(ip_proc_t, mblk_t *, ill_t *, ill_t *); +#endif extern void ip_priv_free(void *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipsec_impl.h b/usr/src/uts/common/inet/ipsec_impl.h index c5fa9367fe..228e01008d 100644 --- a/usr/src/uts/common/inet/ipsec_impl.h +++ b/usr/src/uts/common/inet/ipsec_impl.h @@ -410,24 +410,25 @@ struct ipsec_policy_s uint32_t ipsp_refs; ipsec_sel_t *ipsp_sel; /* selector set (shared) */ ipsec_action_t *ipsp_act; /* action (may be shared) */ + netstack_t *ipsp_netstack; /* No netstack_hold */ }; #define IPPOL_REFHOLD(ipp) { \ atomic_add_32(&(ipp)->ipsp_refs, 1); \ ASSERT((ipp)->ipsp_refs != 0); \ } -#define IPPOL_REFRELE(ipp, ns) { \ +#define IPPOL_REFRELE(ipp) { \ ASSERT((ipp)->ipsp_refs != 0); \ membar_exit(); \ if (atomic_add_32_nv(&(ipp)->ipsp_refs, -1) == 0) \ - ipsec_policy_free(ipp, ns); \ + ipsec_policy_free(ipp); \ (ipp) = 0; \ } -#define IPPOL_UNCHAIN(php, ip, ns) \ - HASHLIST_UNCHAIN((ip), ipsp_hash); \ - avl_remove(&(php)->iph_rulebyid, (ip)); \ - IPPOL_REFRELE(ip, ns); +#define 
IPPOL_UNCHAIN(php, ip) \ + HASHLIST_UNCHAIN((ip), ipsp_hash); \ + avl_remove(&(php)->iph_rulebyid, (ip)); \ + IPPOL_REFRELE(ip); /* * Policy ruleset. One per (protocol * direction) for system policy. @@ -590,8 +591,6 @@ typedef struct ipsid_s atomic_add_32(&(ipsid)->ipsid_refcnt, -1); \ } -struct ipsec_out_s; - /* * Following are the estimates of what the maximum AH and ESP header size * would be. This is used to tell the upper layer the right value of MSS @@ -708,6 +707,17 @@ typedef struct ipsif_s kmutex_t ipsif_lock; } ipsif_t; +/* + * For call to the kernel crypto framework. State needed during + * the execution of a crypto request. + */ +typedef struct ipsec_crypto_s { + size_t ic_skip_len; /* len to skip for AH auth */ + crypto_data_t ic_crypto_data; /* single op crypto data */ + crypto_dual_data_t ic_crypto_dual_data; /* for dual ops */ + crypto_data_t ic_crypto_mac; /* to store the MAC */ + ipsa_cm_mech_t ic_cmm; +} ipsec_crypto_t; /* * IPsec stack instances @@ -826,45 +836,40 @@ extern boolean_t ipsec_loaded(ipsec_stack_t *); extern boolean_t ipsec_failed(ipsec_stack_t *); /* - * callback from ipsec_loader to ip - */ -extern void ip_ipsec_load_complete(ipsec_stack_t *); - -/* * ipsec policy entrypoints (spd.c) */ extern void ipsec_policy_g_destroy(void); extern void ipsec_policy_g_init(void); +extern mblk_t *ipsec_add_crypto_data(mblk_t *, ipsec_crypto_t **); +extern mblk_t *ipsec_remove_crypto_data(mblk_t *, ipsec_crypto_t **); +extern mblk_t *ipsec_free_crypto_data(mblk_t *); extern int ipsec_alloc_table(ipsec_policy_head_t *, int, int, boolean_t, netstack_t *); extern void ipsec_polhead_init(ipsec_policy_head_t *, int); extern void ipsec_polhead_destroy(ipsec_policy_head_t *); extern void ipsec_polhead_free_table(ipsec_policy_head_t *); extern mblk_t *ipsec_check_global_policy(mblk_t *, conn_t *, ipha_t *, - ip6_t *, boolean_t, netstack_t *); + ip6_t *, ip_recv_attr_t *, netstack_t *ns); extern mblk_t *ipsec_check_inbound_policy(mblk_t *, conn_t *, 
ipha_t *, ip6_t *, - boolean_t); + ip_recv_attr_t *); -extern boolean_t ipsec_in_to_out(mblk_t *, ipha_t *, ip6_t *, zoneid_t); +extern boolean_t ipsec_in_to_out(ip_recv_attr_t *, ip_xmit_attr_t *, + mblk_t *, ipha_t *, ip6_t *); +extern void ipsec_in_release_refs(ip_recv_attr_t *); +extern void ipsec_out_release_refs(ip_xmit_attr_t *); extern void ipsec_log_policy_failure(int, char *, ipha_t *, ip6_t *, boolean_t, - netstack_t *); + netstack_t *); extern boolean_t ipsec_inbound_accept_clear(mblk_t *, ipha_t *, ip6_t *); extern int ipsec_conn_cache_policy(conn_t *, boolean_t); -extern mblk_t *ipsec_alloc_ipsec_out(netstack_t *); -extern mblk_t *ipsec_attach_ipsec_out(mblk_t **, conn_t *, ipsec_policy_t *, - uint8_t, netstack_t *); -extern mblk_t *ipsec_init_ipsec_out(mblk_t *, mblk_t **, conn_t *, - ipsec_policy_t *, uint8_t, netstack_t *); -struct ipsec_in_s; -extern ipsec_action_t *ipsec_in_to_out_action(struct ipsec_in_s *); -extern boolean_t ipsec_check_ipsecin_latch(struct ipsec_in_s *, mblk_t *, - struct ipsec_latch_s *, ipha_t *, ip6_t *, const char **, kstat_named_t **, - conn_t *); -extern void ipsec_latch_inbound(ipsec_latch_t *ipl, struct ipsec_in_s *ii); - -extern void ipsec_policy_free(ipsec_policy_t *, netstack_t *); +extern void ipsec_cache_outbound_policy(const conn_t *, const in6_addr_t *, + const in6_addr_t *, in_port_t, ip_xmit_attr_t *); +extern boolean_t ipsec_outbound_policy_current(ip_xmit_attr_t *); +extern ipsec_action_t *ipsec_in_to_out_action(ip_recv_attr_t *); +extern void ipsec_latch_inbound(conn_t *connp, ip_recv_attr_t *ira); + +extern void ipsec_policy_free(ipsec_policy_t *); extern void ipsec_action_free(ipsec_action_t *); extern void ipsec_polhead_free(ipsec_policy_head_t *, netstack_t *); extern ipsec_policy_head_t *ipsec_polhead_split(ipsec_policy_head_t *, @@ -894,12 +899,8 @@ extern void ipsec_actvec_free(ipsec_act_t *, uint_t); extern int ipsec_req_from_head(ipsec_policy_head_t *, ipsec_req_t *, int); extern mblk_t 
*ipsec_construct_inverse_acquire(sadb_msg_t *, sadb_ext_t **, netstack_t *); -extern mblk_t *ip_wput_attach_policy(mblk_t *, ipha_t *, ip6_t *, ire_t *, - conn_t *, boolean_t, zoneid_t); -extern mblk_t *ip_wput_ire_parse_ipsec_out(mblk_t *, ipha_t *, ip6_t *, - ire_t *, conn_t *, boolean_t, zoneid_t); -extern ipsec_policy_t *ipsec_find_policy(int, conn_t *, - struct ipsec_out_s *, ipsec_selector_t *, netstack_t *); +extern ipsec_policy_t *ipsec_find_policy(int, const conn_t *, + ipsec_selector_t *, netstack_t *); extern ipsid_t *ipsid_lookup(int, char *, netstack_t *); extern boolean_t ipsid_equal(ipsid_t *, ipsid_t *); extern void ipsid_gc(netstack_t *); @@ -912,29 +913,29 @@ extern void ipsec_enter_policy(ipsec_policy_head_t *, ipsec_policy_t *, int, netstack_t *); extern boolean_t ipsec_check_action(ipsec_act_t *, int *, netstack_t *); -extern mblk_t *ipsec_out_tag(mblk_t *, mblk_t *, netstack_t *); -extern mblk_t *ipsec_in_tag(mblk_t *, mblk_t *, netstack_t *); -extern mblk_t *ip_copymsg(mblk_t *mp); - -extern void iplatch_free(ipsec_latch_t *, netstack_t *); +extern void iplatch_free(ipsec_latch_t *); extern ipsec_latch_t *iplatch_create(void); extern int ipsec_set_req(cred_t *, conn_t *, ipsec_req_t *); extern void ipsec_insert_always(avl_tree_t *tree, void *new_node); extern int32_t ipsec_act_ovhd(const ipsec_act_t *act); -extern int sadb_whack_label(mblk_t **, ipsa_t *); -extern int sadb_whack_label_v6(mblk_t **, ipsa_t *); +extern mblk_t *sadb_whack_label(mblk_t *, ipsa_t *, ip_xmit_attr_t *, + kstat_named_t *, ipdropper_t *); +extern mblk_t *sadb_whack_label_v4(mblk_t *, ipsa_t *, kstat_named_t *, + ipdropper_t *); +extern mblk_t *sadb_whack_label_v6(mblk_t *, ipsa_t *, kstat_named_t *, + ipdropper_t *); extern boolean_t update_iv(uint8_t *, queue_t *, ipsa_t *, ipsecesp_stack_t *); /* * Tunnel-support SPD functions and variables. */ struct iptun_s; /* Defined in inet/iptun/iptun_impl.h. 
*/ -extern boolean_t ipsec_tun_inbound(mblk_t *, mblk_t **, ipsec_tun_pol_t *, +extern mblk_t *ipsec_tun_inbound(ip_recv_attr_t *, mblk_t *, ipsec_tun_pol_t *, ipha_t *, ip6_t *, ipha_t *, ip6_t *, int, netstack_t *); extern mblk_t *ipsec_tun_outbound(mblk_t *, struct iptun_s *, ipha_t *, - ip6_t *, ipha_t *, ip6_t *, int); + ip6_t *, ipha_t *, ip6_t *, int, ip_xmit_attr_t *); extern void itp_free(ipsec_tun_pol_t *, netstack_t *); extern ipsec_tun_pol_t *create_tunnel_policy(char *, int *, uint64_t *, netstack_t *); @@ -951,9 +952,9 @@ extern ipsec_tun_pol_t *itp_get_byaddr(uint32_t *, uint32_t *, int, */ extern void ipsecah_in_assocfailure(mblk_t *, char, ushort_t, char *, - uint32_t, void *, int, ipsecah_stack_t *); + uint32_t, void *, int, ip_recv_attr_t *ira); extern void ipsecesp_in_assocfailure(mblk_t *, char, ushort_t, char *, - uint32_t, void *, int, ipsecesp_stack_t *); + uint32_t, void *, int, ip_recv_attr_t *ira); extern void ipsecesp_send_keepalive(ipsa_t *); /* @@ -987,13 +988,8 @@ extern void ipsecah_algs_changed(netstack_t *); extern void ipsecesp_algs_changed(netstack_t *); extern void ipsecesp_init_funcs(ipsa_t *); extern void ipsecah_init_funcs(ipsa_t *); -extern ipsec_status_t ipsecah_icmp_error(mblk_t *); -extern ipsec_status_t ipsecesp_icmp_error(mblk_t *); - -/* - * Wrapper for putnext() to ipsec accelerated interface. - */ -extern void ipsec_hw_putnext(queue_t *, mblk_t *); +extern mblk_t *ipsecah_icmp_error(mblk_t *, ip_recv_attr_t *); +extern mblk_t *ipsecesp_icmp_error(mblk_t *, ip_recv_attr_t *); /* * spdsock functions that are called directly by IP. @@ -1003,11 +999,11 @@ extern void spdsock_update_pending_algs(netstack_t *); /* * IP functions that are called from AH and ESP. 
*/ -extern boolean_t ipsec_outbound_sa(mblk_t *, uint_t); -extern esph_t *ipsec_inbound_esp_sa(mblk_t *, netstack_t *); -extern ah_t *ipsec_inbound_ah_sa(mblk_t *, netstack_t *); +extern boolean_t ipsec_outbound_sa(mblk_t *, ip_xmit_attr_t *, uint_t); +extern mblk_t *ipsec_inbound_esp_sa(mblk_t *, ip_recv_attr_t *, esph_t **); +extern mblk_t *ipsec_inbound_ah_sa(mblk_t *, ip_recv_attr_t *, ah_t **); extern ipsec_policy_t *ipsec_find_policy_head(ipsec_policy_t *, - ipsec_policy_head_t *, int, ipsec_selector_t *, netstack_t *); + ipsec_policy_head_t *, int, ipsec_selector_t *); /* * IP dropper init/destroy. @@ -1019,7 +1015,7 @@ void ip_drop_destroy(ipsec_stack_t *); * Common functions */ extern boolean_t ip_addr_match(uint8_t *, int, in6_addr_t *); -extern boolean_t ipsec_label_match(cred_t *, cred_t *); +extern boolean_t ipsec_label_match(ts_label_t *, ts_label_t *); /* * AH and ESP counters types. diff --git a/usr/src/uts/common/inet/ipsec_info.h b/usr/src/uts/common/inet/ipsec_info.h index 3c7ede8405..c1bde9fcb7 100644 --- a/usr/src/uts/common/inet/ipsec_info.h +++ b/usr/src/uts/common/inet/ipsec_info.h @@ -34,22 +34,12 @@ extern "C" { /* * IPsec informational messages. These are M_CTL STREAMS messages, which - * convey IPsec information between various IP and related modules. The - * messages come in a few flavors: - * - * * IPSEC_{IN,OUT} - These show what IPsec action have been taken (for - * inbound datagrams), or need to be taken (for outbound datagrams). - * They flow between AH/ESP and IP. + * convey IPsec information between various IP and related modules. Most + * have been deprecated by the de-STREAMS-ing of TCP/IP. What remains is: * * * Keysock consumer interface - These messages are wrappers for * PF_KEY messages. They flow between AH/ESP and keysock. * - * Some of these messages include pointers such as a netstack_t pointer. 
- * We do not explicitly reference count those with netstack_hold/rele, - * since we depend on IP's ability to discard all of the IPSEC_{IN,OUT} - * messages in order to handle the ipsa pointers. - * We have special logic when doing asynch callouts to kEF for which we - * verify netstack_t pointer using the netstackid_t. */ /* @@ -69,223 +59,11 @@ extern "C" { * M_CTL types for IPsec messages. Remember, the values 0x40 - 0x4f and 0x60 * - 0x6f are not to be used because of potential little-endian confusion. * - * Offsets 1-25 (decimal) are in use, spread through this file. + * Offsets 3-7 (decimal) are in use, spread through this file. * Check for duplicates through the whole file before adding. */ /* - * IPSEC_{IN,OUT} policy expressors. - */ -#define IPSEC_IN (IPSEC_M_CTL + 1) -#define IPSEC_OUT (IPSEC_M_CTL + 2) -#define MAXSALTSIZE 8 - -/* - * For combined mode ciphers, store the crypto_mechanism_t in the - * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS - * and nonce values change for each packet. For non-combined mode - * ciphers, these values are constant for the life of the SA. 
- */ -typedef struct ipsa_cm_mech_s { - crypto_mechanism_t combined_mech; - union { - CK_AES_CCM_PARAMS paramu_ccm; - CK_AES_GCM_PARAMS paramu_gcm; - } paramu; - uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)]; -#define param_ulMACSize paramu.paramu_ccm.ulMACSize -#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize -#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize -#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize -#define param_nonce paramu.paramu_ccm.nonce -#define param_authData paramu.paramu_ccm.authData -#define param_pIv paramu.paramu_gcm.ipsa_pIv -#define param_ulIvLen paramu.paramu_gcm.ulIvLen -#define param_ulIvBits paramu.paramu_gcm.ulIvBits -#define param_pAAD paramu.paramu_gcm.pAAD -#define param_ulAADLen paramu.paramu_gcm.ulAADLen -#define param_ulTagBits paramu.paramu_gcm.ulTagBits -} ipsa_cm_mech_t; - -/* - * This is used for communication between IP and IPSEC (AH/ESP) - * for Inbound datagrams. IPSEC_IN is allocated by IP before IPSEC - * processing begins. On return spi fields are initialized so that - * IP can locate the security associations later on for doing policy - * checks. For loopback case, IPSEC processing is not done. But the - * attributes of the security are reflected in <foo>_done fields below. - * The code in policy check infers that it is a loopback case and - * would not try to get the associations. - * - * The comment below (and for other netstack_t references) refers - * to the fact that we only do netstack_hold in particular cases, - * such as the references from open streams (ill_t and conn_t's - * pointers). Internally within IP we rely on IP's ability to cleanup e.g. - * ire_t's when an ill goes away. 
- */ -typedef struct ipsec_in_s { - uint32_t ipsec_in_type; - uint32_t ipsec_in_len; - frtn_t ipsec_in_frtn; /* for esballoc() callback */ - struct ipsa_s *ipsec_in_ah_sa; /* SA for AH */ - struct ipsa_s *ipsec_in_esp_sa; /* SA for ESP */ - - struct ipsec_policy_head_s *ipsec_in_policy; - struct ipsec_action_s *ipsec_in_action; /* how we made it in.. */ - unsigned int - ipsec_in_secure : 1, /* Is the message attached secure ? */ - ipsec_in_v4 : 1, /* Is this an ipv4 packet ? */ - ipsec_in_loopback : 1, /* Is this a loopback request ? */ - ipsec_in_dont_check : 1, /* Used by TCP to avoid policy check */ - - ipsec_in_decaps : 1, /* Was this packet decapsulated from */ - /* a matching inner packet? */ - ipsec_in_accelerated : 1, /* hardware accelerated packet */ - - ipsec_in_icmp_loopback : 1, /* Looped-back ICMP packet, */ - /* all should trust this. */ - ipsec_in_pad_bits : 25; - - int ipsec_in_ill_index; /* interface on which ipha_dst was */ - /* configured when pkt was recv'd */ - int ipsec_in_rill_index; /* interface on which pkt was recv'd */ - uint32_t ipsec_in_esp_udp_ports; /* For an ESP-in-UDP packet. */ - mblk_t *ipsec_in_da; /* data attr. for accelerated pkts */ - - /* - * For call to the kernel crypto framework. State needed during - * the execution of a crypto request. Storing these here - * allow us to avoid a separate allocation before calling the - * crypto framework. - */ - size_t ipsec_in_skip_len; /* len to skip for AH auth */ - crypto_data_t ipsec_in_crypto_data; /* single op crypto data */ - crypto_dual_data_t ipsec_in_crypto_dual_data; /* for dual ops */ - crypto_data_t ipsec_in_crypto_mac; /* to store the MAC */ - - zoneid_t ipsec_in_zoneid; /* target zone for the datagram */ - netstack_t *ipsec_in_ns; /* Does not have a netstack_hold */ - ipsa_cm_mech_t ipsec_in_cmm; /* PARAMS for Combined mode mechs */ - netstackid_t ipsec_in_stackid; /* Used while waing for kEF callback */ -} ipsec_in_t; - -#define IPSECOUT_MAX_ADDRLEN 4 /* Max addr len. 
(in 32-bit words) */ -/* - * This is used for communication between IP and IPSEC (AH/ESP) - * for Outbound datagrams. IPSEC_OUT is allocated by IP before IPSEC - * processing begins. On return SA fields are initialized so that - * IP can locate the security associations later on for doing policy - * checks. The policy and the actions associated with this packet are - * stored in the ipsec_out_policy and ipsec_out_act fields respectively. - * IPSEC_OUT is also used to carry non-ipsec information when conn is - * absent or the conn information is lost across the calls to ARP. - * example: message from ARP or from ICMP error routines. - */ -typedef struct ipsec_out_s { - uint32_t ipsec_out_type; - uint32_t ipsec_out_len; - frtn_t ipsec_out_frtn; /* for esballoc() callback */ - struct ipsec_policy_head_s *ipsec_out_polhead; - ipsec_latch_t *ipsec_out_latch; - struct ipsec_policy_s *ipsec_out_policy; /* why are we here? */ - struct ipsec_action_s *ipsec_out_act; /* what do we want? */ - struct ipsa_s *ipsec_out_ah_sa; /* AH SA used for the packet */ - struct ipsa_s *ipsec_out_esp_sa; /* ESP SA used for the packet */ - /* - * NOTE: "Source" and "Dest" are w.r.t. outbound datagrams. Ports can - * be zero, and the protocol number is needed to make the ports - * significant. - */ - uint16_t ipsec_out_src_port; /* Source port number of d-gram. */ - uint16_t ipsec_out_dst_port; /* Destination port number of d-gram. */ - uint8_t ipsec_out_icmp_type; /* ICMP type of d-gram */ - uint8_t ipsec_out_icmp_code; /* ICMP code of d-gram */ - - sa_family_t ipsec_out_inaf; /* Inner address family */ - uint32_t ipsec_out_insrc[IPSECOUT_MAX_ADDRLEN]; /* Inner src address */ - uint32_t ipsec_out_indst[IPSECOUT_MAX_ADDRLEN]; /* Inner dest address */ - uint8_t ipsec_out_insrcpfx; /* Inner source prefix */ - uint8_t ipsec_out_indstpfx; /* Inner destination prefix */ - - uint_t ipsec_out_ill_index; /* ill index used for multicast etc. 
*/ - uint8_t ipsec_out_proto; /* IP protocol number for d-gram. */ - unsigned int - ipsec_out_tunnel : 1, /* Tunnel mode? */ - ipsec_out_use_global_policy : 1, /* Inherit global policy ? */ - ipsec_out_secure : 1, /* Is this secure ? */ - ipsec_out_proc_begin : 1, /* IPSEC processing begun */ - /* - * Following five values reflects the values stored - * in conn. - */ - ipsec_out_multicast_loop : 1, - ipsec_out_dontroute : 1, - ipsec_out_reserved : 1, - ipsec_out_v4 : 1, - - ipsec_out_unspec_src : 1, /* IPv6 ip6i_t info */ - ipsec_out_reachable : 1, /* NDP reachability info */ - ipsec_out_failed: 1, - ipsec_out_se_done: 1, - - ipsec_out_esp_done: 1, - ipsec_out_ah_done: 1, - ipsec_out_need_policy: 1, - - /* - * To indicate that packet must be accelerated, i.e. - * ICV or encryption performed, by Provider. - */ - ipsec_out_accelerated : 1, - /* - * Used by IP to tell IPsec that the outbound ill for this - * packet supports acceleration of the AH or ESP prototocol. - * If set, ipsec_out_capab_ill_index contains the - * index of the ill. - */ - ipsec_out_is_capab_ill : 1, - /* - * Indicates ICMP message destined for self. These - * messages are to be trusted by all receivers. - */ - ipsec_out_icmp_loopback: 1, - ipsec_out_ip_nexthop : 1, /* IP_NEXTHOP option is set */ - ipsec_out_pad_bits : 13; - cred_t *ipsec_out_cred; - uint32_t ipsec_out_capab_ill_index; - - /* - * For call to the kernel crypto framework. State needed during - * the execution of a crypto request. Storing these here - * allow us to avoid a separate allocation before calling the - * crypto framework. 
- */ - size_t ipsec_out_skip_len; /* len to skip for AH auth */ - crypto_data_t ipsec_out_crypto_data; /* single op crypto data */ - crypto_dual_data_t ipsec_out_crypto_dual_data; /* for dual ops */ - crypto_data_t ipsec_out_crypto_mac; /* to store the MAC */ - - zoneid_t ipsec_out_zoneid; /* source zone for the datagram */ - in6_addr_t ipsec_out_nexthop_v6; /* nexthop IP address */ -#define ipsec_out_nexthop_addr V4_PART_OF_V6(ipsec_out_nexthop_v6) - netstack_t *ipsec_out_ns; /* Does not have a netstack_hold */ - netstackid_t ipsec_out_stackid; /* Used while waing for kEF callback */ - ipsa_cm_mech_t ipsec_out_cmm; /* PARAMS for Combined mode mechs */ -} ipsec_out_t; - -/* - * This is used to mark the ipsec_out_t *req* fields - * when the operation is done without affecting the - * requests. - */ -#define IPSEC_REQ_DONE 0x80000000 -/* - * Operation could not be performed by the AH/ESP - * module. - */ -#define IPSEC_REQ_FAILED 0x40000000 - -/* * Keysock consumer interface. * * The driver/module keysock (which is a driver to PF_KEY sockets, but is @@ -368,32 +146,6 @@ typedef struct keysock_out_err_s { } keysock_out_err_t; /* - * M_CTL message type for sending inbound pkt information between IP & ULP. - * These are _not_ related to IPsec in any way, but are here so that there is - * one place where all these values are defined which makes it easier to track. - * The choice of this value has the same rationale as explained above. - */ -#define IN_PKTINFO (IPSEC_M_CTL + 24) - - -/* - * IPSEC_CTL messages are used by IPsec to send control type requests - * to IP. Such a control message is currently used by IPsec to request - * that IP send the contents of an IPsec SA or the entire SADB to - * every IPsec hardware acceleration capable provider. 
- */ - -#define IPSEC_CTL (IPSEC_M_CTL + 25) - -typedef struct ipsec_ctl_s { - uint32_t ipsec_ctl_type; - uint32_t ipsec_ctl_len; - uint_t ipsec_ctl_sa_type; - void *ipsec_ctl_sa; -} ipsec_ctl_t; - - -/* * All IPsec informational messages are placed into the ipsec_info_t * union, so that allocation can be done once, and IPsec informational * messages can be recycled. @@ -403,13 +155,10 @@ typedef union ipsec_info_u { uint32_t ipsec_allu_type; uint32_t ipsec_allu_len; /* In bytes */ } ipsec_allu; - ipsec_in_t ipsec_in; - ipsec_out_t ipsec_out; keysock_hello_ack_t keysock_hello_ack; keysock_in_t keysock_in; keysock_out_t keysock_out; keysock_out_err_t keysock_out_err; - ipsec_ctl_t ipsec_ctl; } ipsec_info_t; #define ipsec_info_type ipsec_allu.ipsec_allu_type #define ipsec_info_len ipsec_allu.ipsec_allu_len diff --git a/usr/src/uts/common/inet/ipsecah.h b/usr/src/uts/common/inet/ipsecah.h index c389664164..cde745da88 100644 --- a/usr/src/uts/common/inet/ipsecah.h +++ b/usr/src/uts/common/inet/ipsecah.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPSECAH_H #define _INET_IPSECAH_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <inet/ip.h> #include <inet/ipdrop.h> @@ -62,9 +60,6 @@ typedef struct ah_kstats_s kstat_named_t ah_stat_acquire_requests; kstat_named_t ah_stat_bytes_expired; kstat_named_t ah_stat_out_discards; - kstat_named_t ah_stat_in_accelerated; - kstat_named_t ah_stat_out_accelerated; - kstat_named_t ah_stat_noaccel; kstat_named_t ah_stat_crypto_sync; kstat_named_t ah_stat_crypto_async; kstat_named_t ah_stat_crypto_failures; @@ -116,8 +111,6 @@ struct ipsecah_stack { */ queue_t *ah_pfkey_q; timeout_id_t ah_event; - - mblk_t *ah_ip_unbind; }; typedef struct ipsecah_stack ipsecah_stack_t; diff --git a/usr/src/uts/common/inet/ipsecesp.h b/usr/src/uts/common/inet/ipsecesp.h index 2dfb73c667..7be35276aa 100644 --- a/usr/src/uts/common/inet/ipsecesp.h +++ b/usr/src/uts/common/inet/ipsecesp.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_IPSECESP_H #define _INET_IPSECESP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <inet/ip.h> #include <inet/ipdrop.h> @@ -70,10 +68,7 @@ struct ipsecesp_stack { queue_t *esp_pfkey_q; timeout_id_t esp_event; - mblk_t *esp_ip_unbind; - sadbp_t esp_sadb; - }; typedef struct ipsecesp_stack ipsecesp_stack_t; diff --git a/usr/src/uts/common/inet/iptun/iptun.c b/usr/src/uts/common/inet/iptun/iptun.c index bc2f1d64d5..505aaccb31 100644 --- a/usr/src/uts/common/inet/iptun/iptun.c +++ b/usr/src/uts/common/inet/iptun/iptun.c @@ -76,6 +76,8 @@ #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ipsec_impl.h> +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> #include <inet/iptun.h> #include "iptun_impl.h" @@ -87,8 +89,6 @@ #define IPTUN_HASH_KEY(key) ((mod_hash_key_t)(uintptr_t)(key)) -#define IPTUNQ_DEV "/dev/iptunq" - #define IPTUN_MIN_IPV4_MTU 576 /* ip.h still uses 68 (!) */ #define IPTUN_MIN_IPV6_MTU IPV6_MIN_MTU #define IPTUN_MAX_IPV4_MTU (IP_MAXPACKET - sizeof (ipha_t)) @@ -113,15 +113,18 @@ static iptun_encaplim_t iptun_encaplim_init = { 0 }; -/* Table containing per-iptun-type information. */ +/* + * Table containing per-iptun-type information. + * Since IPv6 can run over all of these we have the IPv6 min as the min MTU. 
+ */ static iptun_typeinfo_t iptun_type_table[] = { - { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, ip_output, - IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE }, - { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, ip_output_v6, + { IPTUN_TYPE_IPV4, MAC_PLUGIN_IDENT_IPV4, IPV4_VERSION, + IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_TRUE }, + { IPTUN_TYPE_IPV6, MAC_PLUGIN_IDENT_IPV6, IPV6_VERSION, IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV6_MTU, B_TRUE }, - { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, ip_output, - IPTUN_MIN_IPV4_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE }, - { IPTUN_TYPE_UNKNOWN, NULL, 0, NULL, 0, 0, B_FALSE } + { IPTUN_TYPE_6TO4, MAC_PLUGIN_IDENT_6TO4, IPV4_VERSION, + IPTUN_MIN_IPV6_MTU, IPTUN_MAX_IPV4_MTU, B_FALSE }, + { IPTUN_TYPE_UNKNOWN, NULL, 0, 0, 0, B_FALSE } }; /* @@ -140,7 +143,6 @@ kmem_cache_t *iptun_cache; ddi_taskq_t *iptun_taskq; typedef enum { - IPTUN_TASK_PMTU_UPDATE, /* obtain new destination path-MTU */ IPTUN_TASK_MTU_UPDATE, /* tell mac about new tunnel link MTU */ IPTUN_TASK_LADDR_UPDATE, /* tell mac about new local address */ IPTUN_TASK_RADDR_UPDATE, /* tell mac about new remote address */ @@ -158,13 +160,23 @@ static int iptun_enter(iptun_t *); static void iptun_exit(iptun_t *); static void iptun_headergen(iptun_t *, boolean_t); static void iptun_drop_pkt(mblk_t *, uint64_t *); -static void iptun_input(void *, mblk_t *, void *); +static void iptun_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void iptun_input_icmp(void *, mblk_t *, void *, ip_recv_attr_t *); static void iptun_output(iptun_t *, mblk_t *); -static uint32_t iptun_get_maxmtu(iptun_t *, uint32_t); -static uint32_t iptun_update_mtu(iptun_t *, uint32_t); -static uint32_t iptun_get_dst_pmtu(iptun_t *); +static uint32_t iptun_get_maxmtu(iptun_t *, ip_xmit_attr_t *, uint32_t); +static uint32_t iptun_update_mtu(iptun_t *, ip_xmit_attr_t *, uint32_t); +static uint32_t iptun_get_dst_pmtu(iptun_t *, ip_xmit_attr_t *); +static void 
iptun_update_dst_pmtu(iptun_t *, ip_xmit_attr_t *); static int iptun_setladdr(iptun_t *, const struct sockaddr_storage *); +static void iptun_output_6to4(iptun_t *, mblk_t *); +static void iptun_output_common(iptun_t *, ip_xmit_attr_t *, mblk_t *); +static boolean_t iptun_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); + +static void iptun_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); + static mac_callbacks_t iptun_m_callbacks; static int @@ -295,13 +307,6 @@ iptun_m_tx(void *arg, mblk_t *mpchain) return (NULL); } - /* - * Request the destination's path MTU information regularly in case - * path MTU has increased. - */ - if (IPTUN_PMTU_TOO_OLD(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE); - for (mp = mpchain; mp != NULL; mp = nmp) { nmp = mp->b_next; mp->b_next = NULL; @@ -350,7 +355,7 @@ iptun_m_setprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, } break; case MAC_PROP_MTU: { - uint32_t maxmtu = iptun_get_maxmtu(iptun, 0); + uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0); if (value < iptun->iptun_typeinfo->iti_minmtu || value > maxmtu) { @@ -434,7 +439,7 @@ iptun_m_getprop(void *barg, const char *pr_name, mac_prop_id_t pr_num, } break; case MAC_PROP_MTU: { - uint32_t maxmtu = iptun_get_maxmtu(iptun, 0); + uint32_t maxmtu = iptun_get_maxmtu(iptun, NULL, 0); if (is_possible) { range.range_uint32[0].mpur_min = @@ -516,20 +521,11 @@ iptun_enter_by_linkid(datalink_id_t linkid, iptun_t **iptun) } /* - * Handle tasks that were deferred through the iptun_taskq. These fall into - * two categories: - * - * 1. Tasks that were defered because we didn't want to spend time doing them - * while in the data path. Only IPTUN_TASK_PMTU_UPDATE falls into this - * category. - * - * 2. Tasks that were defered because they require calling up to the mac - * module, and we can't call up to the mac module while holding locks. 
+ * Handle tasks that were deferred through the iptun_taskq because they require + * calling up to the mac module, and we can't call up to the mac module while + * holding locks. * - * Handling 1 is easy; we just lookup the iptun_t, perform the task, exit the - * tunnel, and we're done. - * - * Handling 2 is tricky to get right without introducing race conditions and + * This is tricky to get right without introducing race conditions and * deadlocks with the mac module, as we cannot issue an upcall while in the * iptun_t. The reason is that upcalls may try and enter the mac perimeter, * while iptun callbacks (such as iptun_m_setprop()) called from the mac @@ -573,12 +569,6 @@ iptun_task_cb(void *arg) if (iptun_enter_by_linkid(linkid, &iptun) != 0) return; - if (task == IPTUN_TASK_PMTU_UPDATE) { - (void) iptun_update_mtu(iptun, 0); - iptun_exit(iptun); - return; - } - iptun->iptun_flags |= IPTUN_UPCALL_PENDING; switch (task) { @@ -742,53 +732,143 @@ iptun_canbind(iptun_t *iptun) !(iptun->iptun_typeinfo->iti_hasraddr))); } +/* + * Verify that the local address is valid, and insert in the fanout + */ static int iptun_bind(iptun_t *iptun) { - conn_t *connp = iptun->iptun_connp; - int err; + conn_t *connp = iptun->iptun_connp; + int error = 0; + ip_xmit_attr_t *ixa; + iulp_t uinfo; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + + /* Get an exclusive ixa for this thread, and replace conn_ixa */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); + + /* We create PMTU state including for 6to4 */ + ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; ASSERT(iptun_canbind(iptun)); + mutex_enter(&connp->conn_lock); + /* + * Note that conn_proto can't be set since the upper protocol + * can be both 41 and 4 when IPv6 and IPv4 are over the same tunnel. + * ipcl_iptun_classify doesn't use conn_proto. 
+ */ + connp->conn_ipversion = iptun->iptun_typeinfo->iti_ipvers; + switch (iptun->iptun_typeinfo->iti_type) { case IPTUN_TYPE_IPV4: - /* - * When we set a tunnel's destination address, we do not care - * if the destination is reachable. Transient routing issues - * should not inhibit the creation of a tunnel interface, for - * example. For that reason, we pass in B_FALSE for the - * verify_dst argument of ip_proto_bind_connected_v4() (and - * similarly for IPv6 tunnels below). - */ - err = ip_proto_bind_connected_v4(connp, NULL, IPPROTO_ENCAP, - &iptun->iptun_laddr4, 0, iptun->iptun_raddr4, 0, B_TRUE, - B_FALSE, iptun->iptun_cred); + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4, + &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_raddr4, + &connp->conn_faddr_v6); + ixa->ixa_flags |= IXAF_IS_IPV4; + if (ip_laddr_verify_v4(iptun->iptun_laddr4, IPCL_ZONEID(connp), + ipst, B_FALSE) != IPVL_UNICAST_UP) { + mutex_exit(&connp->conn_lock); + error = EADDRNOTAVAIL; + goto done; + } break; case IPTUN_TYPE_IPV6: - err = ip_proto_bind_connected_v6(connp, NULL, IPPROTO_IPV6, - &iptun->iptun_laddr6, 0, &iptun->iptun_raddr6, NULL, 0, - B_TRUE, B_FALSE, iptun->iptun_cred); + connp->conn_laddr_v6 = iptun->iptun_laddr6; + connp->conn_faddr_v6 = iptun->iptun_raddr6; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + /* We use a zero scopeid for now */ + if (ip_laddr_verify_v6(&iptun->iptun_laddr6, IPCL_ZONEID(connp), + ipst, B_FALSE, 0) != IPVL_UNICAST_UP) { + mutex_exit(&connp->conn_lock); + error = EADDRNOTAVAIL; + goto done; + } break; case IPTUN_TYPE_6TO4: - err = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_IPV6, - iptun->iptun_laddr4, 0, B_TRUE); - break; + IN6_IPADDR_TO_V4MAPPED(iptun->iptun_laddr4, + &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &connp->conn_faddr_v6); + ixa->ixa_flags |= IXAF_IS_IPV4; + mutex_exit(&connp->conn_lock); + + switch (ip_laddr_verify_v4(iptun->iptun_laddr4, + IPCL_ZONEID(connp), ipst, B_FALSE)) { + case IPVL_UNICAST_UP: + case 
IPVL_UNICAST_DOWN: + break; + default: + error = EADDRNOTAVAIL; + goto done; + } + goto insert; } - if (err == 0) { - iptun->iptun_flags |= IPTUN_BOUND; + /* In case previous destination was multirt */ + ip_attr_newdst(ixa); - /* - * Now that we're bound with ip below us, this is a good time - * to initialize the destination path MTU and to re-calculate - * the tunnel's link MTU. - */ - (void) iptun_update_mtu(iptun, 0); + /* + * When we set a tunnel's destination address, we do not + * care if the destination is reachable. Transient routing + * issues should not inhibit the creation of a tunnel + * interface, for example. Thus we pass B_FALSE here. + */ + connp->conn_saddr_v6 = connp->conn_laddr_v6; + mutex_exit(&connp->conn_lock); - if (IS_IPTUN_RUNNING(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE); - } - return (err); + /* As long as the MTU is large we avoid fragmentation */ + ixa->ixa_flags |= IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF; + + /* We handle IPsec in iptun_output_common */ + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0, + &connp->conn_saddr_v6, &uinfo, 0); + + if (error != 0) + goto done; + + /* saddr shouldn't change since it was already set */ + ASSERT(IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &connp->conn_saddr_v6)); + + /* We set IXAF_VERIFY_PMTU to catch PMTU increases */ + ixa->ixa_flags |= IXAF_VERIFY_PMTU; + ASSERT(uinfo.iulp_mtu != 0); + + /* + * Allow setting new policies. + * The addresses/ports are already set, thus the IPsec policy calls + * can handle their passed-in conn's. 
+ */ + connp->conn_policy_cached = B_FALSE; + +insert: + error = ipcl_conn_insert(connp); + if (error != 0) + goto done; + + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + + iptun->iptun_flags |= IPTUN_BOUND; + /* + * Now that we're bound with ip below us, this is a good + * time to initialize the destination path MTU and to + * re-calculate the tunnel's link MTU. + */ + (void) iptun_update_mtu(iptun, ixa, 0); + + if (IS_IPTUN_RUNNING(iptun)) + iptun_task_dispatch(iptun, IPTUN_TASK_LINK_UPDATE); + +done: + ixa_refrele(ixa); + return (error); } static void @@ -986,7 +1066,7 @@ iptun_set_sec_simple(iptun_t *iptun, const ipsec_req_t *ipsr) * Adjust MTU and make sure the DL side knows what's up. */ itp->itp_flags = ITPF_P_ACTIVE; - (void) iptun_update_mtu(iptun, 0); + (void) iptun_update_mtu(iptun, NULL, 0); old_policy = B_FALSE; /* Blank out inactive - we succeeded */ } else { rw_exit(&itp->itp_policy->iph_lock); @@ -1170,8 +1250,16 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp) connp->conn_flags |= IPCL_IPTUN; connp->conn_iptun = iptun; connp->conn_recv = iptun_input; - connp->conn_rq = ns->netstack_iptun->iptuns_g_q; - connp->conn_wq = WR(connp->conn_rq); + connp->conn_recvicmp = iptun_input_icmp; + connp->conn_verifyicmp = iptun_verifyicmp; + + /* + * Register iptun_notify to listen to capability changes detected by IP. + * This upcall is made in the context of the call to conn_ip_output. + */ + connp->conn_ixa->ixa_notify = iptun_notify; + connp->conn_ixa->ixa_notify_cookie = iptun; + /* * For exclusive stacks we set conn_zoneid to GLOBAL_ZONEID as is done * for all other conn_t's. 
@@ -1187,11 +1275,32 @@ iptun_conn_create(iptun_t *iptun, netstack_t *ns, cred_t *credp) connp->conn_cred = credp; /* crfree() is done in ipcl_conn_destroy(), called by CONN_DEC_REF() */ crhold(connp->conn_cred); + connp->conn_cpid = NOPID; - connp->conn_send = iptun->iptun_typeinfo->iti_txfunc; - connp->conn_af_isv6 = iptun->iptun_typeinfo->iti_ipvers == IPV6_VERSION; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = connp->conn_zoneid; ASSERT(connp->conn_ref == 1); + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); + + /* + * Have conn_ip_output drop packets should our outer source + * go invalid + */ + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; + + switch (iptun->iptun_typeinfo->iti_ipvers) { + case IPV4_VERSION: + connp->conn_family = AF_INET6; + break; + case IPV6_VERSION: + connp->conn_family = AF_INET; + break; + } mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; mutex_exit(&connp->conn_lock); @@ -1207,26 +1316,6 @@ iptun_conn_destroy(conn_t *connp) CONN_DEC_REF(connp); } -static int -iptun_create_g_q(iptun_stack_t *iptuns, cred_t *credp) -{ - int err; - conn_t *connp; - - ASSERT(iptuns->iptuns_g_q == NULL); - /* - * The global queue for this stack is set when iptunq_open() calls - * iptun_set_g_q(). 
- */ - err = ldi_open_by_name(IPTUNQ_DEV, FWRITE|FREAD, credp, - &iptuns->iptuns_g_q_lh, iptun_ldi_ident); - if (err == 0) { - connp = iptuns->iptuns_g_q->q_ptr; - connp->conn_recv = iptun_input; - } - return (err); -} - static iptun_t * iptun_alloc(void) { @@ -1289,11 +1378,6 @@ iptun_free(iptun_t *iptun) iptun->iptun_connp = NULL; } - netstack_rele(iptun->iptun_ns); - iptun->iptun_ns = NULL; - crfree(iptun->iptun_cred); - iptun->iptun_cred = NULL; - kmem_cache_free(iptun_cache, iptun); atomic_dec_32(&iptun_tunnelcount); } @@ -1340,19 +1424,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp) ns = netstack_find_by_cred(credp); iptuns = ns->netstack_iptun; - /* - * Before we create any tunnel, we need to ensure that the default - * STREAMS queue (used to satisfy the ip module's requirement for one) - * is created. We only do this once per stack. The stream is closed - * when the stack is destroyed in iptun_stack_fni(). - */ - mutex_enter(&iptuns->iptuns_lock); - if (iptuns->iptuns_g_q == NULL) - err = iptun_create_g_q(iptuns, zone_kcred()); - mutex_exit(&iptuns->iptuns_lock); - if (err != 0) - goto done; - if ((iptun = iptun_alloc()) == NULL) { err = ENOMEM; goto done; @@ -1360,8 +1431,6 @@ iptun_create(iptun_kparams_t *ik, cred_t *credp) iptun->iptun_linkid = ik->iptun_kparam_linkid; iptun->iptun_zoneid = zoneid; - crhold(credp); - iptun->iptun_cred = credp; iptun->iptun_ns = ns; iptun->iptun_typeinfo = iptun_gettypeinfo(ik->iptun_kparam_type); @@ -1668,49 +1737,142 @@ iptun_set_policy(datalink_id_t linkid, ipsec_tun_pol_t *itp) ITP_REFHOLD(itp); iptun->iptun_itp = itp; /* IPsec policy means IPsec overhead, which means lower MTU. */ - (void) iptun_update_mtu(iptun, 0); + (void) iptun_update_mtu(iptun, NULL, 0); } iptun_exit(iptun); } /* * Obtain the path MTU to the tunnel destination. + * Can return zero in some cases. 
*/ static uint32_t -iptun_get_dst_pmtu(iptun_t *iptun) +iptun_get_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa) { - ire_t *ire = NULL; - ip_stack_t *ipst = iptun->iptun_ns->netstack_ip; uint32_t pmtu = 0; + conn_t *connp = iptun->iptun_connp; + boolean_t need_rele = B_FALSE; /* - * We only obtain the destination IRE for tunnels that have a remote - * tunnel address. + * We only obtain the pmtu for tunnels that have a remote tunnel + * address. */ if (!(iptun->iptun_flags & IPTUN_RADDR)) return (0); - switch (iptun->iptun_typeinfo->iti_ipvers) { - case IPV4_VERSION: - ire = ire_route_lookup(iptun->iptun_raddr4, INADDR_ANY, - INADDR_ANY, 0, NULL, NULL, iptun->iptun_connp->conn_zoneid, - NULL, (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - break; - case IPV6_VERSION: - ire = ire_route_lookup_v6(&iptun->iptun_raddr6, NULL, NULL, 0, - NULL, NULL, iptun->iptun_connp->conn_zoneid, NULL, - (MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT), ipst); - break; + if (ixa == NULL) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) + return (0); + need_rele = B_TRUE; } + /* + * Guard against ICMP errors before we have sent, as well as against + * and a thread which held conn_ixa. + */ + if (ixa->ixa_ire != NULL) { + pmtu = ip_get_pmtu(ixa); - if (ire != NULL) { - pmtu = ire->ire_max_frag; - ire_refrele(ire); + /* + * For both IPv4 and IPv6 we can have indication that the outer + * header needs fragmentation. + */ + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) { + /* Must allow fragmentation in ip_output */ + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } else { + /* ip_get_pmtu might have set this - we don't want it */ + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } } + + if (need_rele) + ixa_refrele(ixa); return (pmtu); } /* + * Update the ip_xmit_attr_t to capture the current lower path mtu as known + * by ip. 
+ */ +static void +iptun_update_dst_pmtu(iptun_t *iptun, ip_xmit_attr_t *ixa) +{ + uint32_t pmtu; + conn_t *connp = iptun->iptun_connp; + boolean_t need_rele = B_FALSE; + + /* IXAF_VERIFY_PMTU is not set if we don't have a fixed destination */ + if (!(iptun->iptun_flags & IPTUN_RADDR)) + return; + + if (ixa == NULL) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) + return; + need_rele = B_TRUE; + } + /* + * Guard against ICMP errors before we have sent, as well as against + * and a thread which held conn_ixa. + */ + if (ixa->ixa_ire != NULL) { + pmtu = ip_get_pmtu(ixa); + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * For both IPv4 and IPv6 we can have indication that the outer + * header needs fragmentation. + */ + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) { + /* Must allow fragmentation in ip_output */ + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } else { + /* ip_get_pmtu might have set this - we don't want it */ + ixa->ixa_flags &= ~IXAF_PMTU_IPV4_DF; + } + } + + if (need_rele) + ixa_refrele(ixa); +} + +/* + * There is nothing that iptun can verify in addition to IP having + * verified the IP addresses in the fanout. + */ +/* ARGSUSED */ +static boolean_t +iptun_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + return (B_TRUE); +} + +/* + * Notify function registered with ip_xmit_attr_t. + */ +static void +iptun_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + iptun_t *iptun = (iptun_t *)arg; + + switch (ntype) { + case IXAN_PMTU: + (void) iptun_update_mtu(iptun, ixa, narg); + break; + } +} + +/* * Returns the max of old_ovhd and the overhead associated with pol. */ static uint32_t @@ -1765,18 +1927,18 @@ iptun_get_ipsec_overhead(iptun_t *iptun) /* Check for both IPv4 and IPv6. 
*/ sel.ips_protocol = IPPROTO_ENCAP; pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND, - &sel, ns); + &sel); if (pol != NULL) { ipsec_ovhd = ipsec_act_ovhd(&pol->ipsp_act->ipa_act); - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); } sel.ips_protocol = IPPROTO_IPV6; pol = ipsec_find_policy_head(NULL, iph, IPSEC_TYPE_OUTBOUND, - &sel, ns); + &sel); if (pol != NULL) { ipsec_ovhd = max(ipsec_ovhd, ipsec_act_ovhd(&pol->ipsp_act->ipa_act)); - IPPOL_REFRELE(pol, ns); + IPPOL_REFRELE(pol); } IPPH_REFRELE(iph, ns); } else { @@ -1802,10 +1964,14 @@ iptun_get_ipsec_overhead(iptun_t *iptun) } /* - * Calculate and return the maximum possible MTU for the given tunnel. + * Calculate and return the maximum possible upper MTU for the given tunnel. + * + * If new_pmtu is set then we also need to update the lower path MTU information + * in the ip_xmit_attr_t. That is needed since we set IXAF_VERIFY_PMTU so that + * we are notified by conn_ip_output() when the path MTU increases. */ static uint32_t -iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) +iptun_get_maxmtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu) { size_t header_size, ipsec_overhead; uint32_t maxmtu, pmtu; @@ -1816,13 +1982,11 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) * iptun_get_dst_pmtu(). */ if (new_pmtu != 0) { - if (iptun->iptun_flags & IPTUN_RADDR) { + if (iptun->iptun_flags & IPTUN_RADDR) iptun->iptun_dpmtu = new_pmtu; - iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt(); - } pmtu = new_pmtu; } else if (iptun->iptun_flags & IPTUN_RADDR) { - if ((pmtu = iptun_get_dst_pmtu(iptun)) == 0) { + if ((pmtu = iptun_get_dst_pmtu(iptun, ixa)) == 0) { /* * We weren't able to obtain the path-MTU of the * destination. Use the previous value. 
@@ -1830,7 +1994,6 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) pmtu = iptun->iptun_dpmtu; } else { iptun->iptun_dpmtu = pmtu; - iptun->iptun_dpmtu_lastupdate = ddi_get_lbolt(); } } else { /* @@ -1866,19 +2029,23 @@ iptun_get_maxmtu(iptun_t *iptun, uint32_t new_pmtu) } /* - * Re-calculate the tunnel's MTU and notify the MAC layer of any change in - * MTU. The new_pmtu argument is the new path MTU to the tunnel destination - * to be used in the tunnel MTU calculation. Passing in 0 for new_pmtu causes - * the path MTU to be dynamically updated using iptun_update_pmtu(). + * Re-calculate the tunnel's MTU as seen from above and notify the MAC layer + * of any change in MTU. The new_pmtu argument is the new lower path MTU to + * the tunnel destination to be used in the tunnel MTU calculation. Passing + * in 0 for new_pmtu causes the lower path MTU to be dynamically updated using + * ip_get_pmtu(). * * If the calculated tunnel MTU is different than its previous value, then we * notify the MAC layer above us of this change using mac_maxsdu_update(). */ static uint32_t -iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu) +iptun_update_mtu(iptun_t *iptun, ip_xmit_attr_t *ixa, uint32_t new_pmtu) { uint32_t newmtu; + /* We always update the ixa since we might have set IXAF_VERIFY_PMTU */ + iptun_update_dst_pmtu(iptun, ixa); + /* * We return the current MTU without updating it if it was pegged to a * static value using the MAC_PROP_MTU link property. @@ -1887,8 +2054,7 @@ iptun_update_mtu(iptun_t *iptun, uint32_t new_pmtu) return (iptun->iptun_mtu); /* If the MTU isn't fixed, then use the maximum possible value. 
*/ - newmtu = iptun_get_maxmtu(iptun, new_pmtu); - + newmtu = iptun_get_maxmtu(iptun, ixa, new_pmtu); /* * We only dynamically adjust the tunnel MTU for tunnels with * destinations because dynamic MTU calculations are based on the @@ -1929,7 +2095,7 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt) { mblk_t *icmperr_mp; - if ((icmperr_mp = allocb_tmpl(hdrs_size, orig_pkt)) != NULL) { + if ((icmperr_mp = allocb(hdrs_size, BPRI_MED)) != NULL) { icmperr_mp->b_wptr += hdrs_size; /* tack on the offending packet */ icmperr_mp->b_cont = orig_pkt; @@ -1942,12 +2108,15 @@ iptun_build_icmperr(size_t hdrs_size, mblk_t *orig_pkt) * the ICMP error. */ static void -iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp) +iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp, + ts_label_t *tsl) { size_t orig_pktsize, hdrs_size; mblk_t *icmperr_mp; ipha_t *new_ipha; icmph_t *new_icmp; + ip_xmit_attr_t ixas; + conn_t *connp = iptun->iptun_connp; orig_pktsize = msgdsize(mp); hdrs_size = sizeof (ipha_t) + sizeof (icmph_t); @@ -1974,17 +2143,35 @@ iptun_sendicmp_v4(iptun_t *iptun, icmph_t *icmp, ipha_t *orig_ipha, mblk_t *mp) new_icmp->icmph_checksum = 0; new_icmp->icmph_checksum = IP_CSUM(icmperr_mp, sizeof (ipha_t), 0); - ip_output(iptun->iptun_connp, icmperr_mp, iptun->iptun_connp->conn_wq, - IP_WPUT); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + if (new_ipha->ipha_src == INADDR_ANY) + ixas.ixa_flags |= IXAF_SET_SOURCE; + + ixas.ixa_zoneid = IPCL_ZONEID(connp); + ixas.ixa_ipst = connp->conn_netstack->netstack_ip; + ixas.ixa_cred = connp->conn_cred; + ixas.ixa_cpid = NOPID; + if (is_system_labeled()) + ixas.ixa_tsl = tsl; + + ixas.ixa_ifindex = 0; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(icmperr_mp, &ixas); + ixa_cleanup(&ixas); } static void -iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp) +iptun_sendicmp_v6(iptun_t *iptun, 
icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp, + ts_label_t *tsl) { size_t orig_pktsize, hdrs_size; mblk_t *icmp6err_mp; ip6_t *new_ip6h; icmp6_t *new_icmp6; + ip_xmit_attr_t ixas; + conn_t *connp = iptun->iptun_connp; orig_pktsize = msgdsize(mp); hdrs_size = sizeof (ip6_t) + sizeof (icmp6_t); @@ -2004,16 +2191,31 @@ iptun_sendicmp_v6(iptun_t *iptun, icmp6_t *icmp6, ip6_t *orig_ip6h, mblk_t *mp) new_ip6h->ip6_dst = orig_ip6h->ip6_src; *new_icmp6 = *icmp6; - /* The checksum is calculated in ip_wput_ire_v6(). */ + /* The checksum is calculated in ip_output_simple and friends. */ new_icmp6->icmp6_cksum = new_ip6h->ip6_plen; - ip_output_v6(iptun->iptun_connp, icmp6err_mp, - iptun->iptun_connp->conn_wq, IP_WPUT); + bzero(&ixas, sizeof (ixas)); + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_UNSPECIFIED(&new_ip6h->ip6_src)) + ixas.ixa_flags |= IXAF_SET_SOURCE; + + ixas.ixa_zoneid = IPCL_ZONEID(connp); + ixas.ixa_ipst = connp->conn_netstack->netstack_ip; + ixas.ixa_cred = connp->conn_cred; + ixas.ixa_cpid = NOPID; + if (is_system_labeled()) + ixas.ixa_tsl = tsl; + + ixas.ixa_ifindex = 0; + ixas.ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + + (void) ip_output_simple(icmp6err_mp, &ixas); + ixa_cleanup(&ixas); } static void iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp, - uint8_t type, uint8_t code) + uint8_t type, uint8_t code, ts_label_t *tsl) { icmph_t icmp; @@ -2021,12 +2223,12 @@ iptun_icmp_error_v4(iptun_t *iptun, ipha_t *orig_ipha, mblk_t *mp, icmp.icmph_type = type; icmp.icmph_code = code; - iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp); + iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl); } static void iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha, - mblk_t *mp) + mblk_t *mp, ts_label_t *tsl) { icmph_t icmp; @@ -2035,12 +2237,12 @@ iptun_icmp_fragneeded_v4(iptun_t *iptun, uint32_t newmtu, ipha_t *orig_ipha, icmp.icmph_du_zero = 0; icmp.icmph_du_mtu = htons(newmtu); - iptun_sendicmp_v4(iptun, &icmp, 
orig_ipha, mp); + iptun_sendicmp_v4(iptun, &icmp, orig_ipha, mp, tsl); } static void iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp, - uint8_t type, uint8_t code, uint32_t offset) + uint8_t type, uint8_t code, uint32_t offset, ts_label_t *tsl) { icmp6_t icmp6; @@ -2050,12 +2252,12 @@ iptun_icmp_error_v6(iptun_t *iptun, ip6_t *orig_ip6h, mblk_t *mp, if (type == ICMP6_PARAM_PROB) icmp6.icmp6_pptr = htonl(offset); - iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp); + iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl); } static void iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h, - mblk_t *mp) + mblk_t *mp, ts_label_t *tsl) { icmp6_t icmp6; @@ -2063,7 +2265,7 @@ iptun_icmp_toobig_v6(iptun_t *iptun, uint32_t newmtu, ip6_t *orig_ip6h, icmp6.icmp6_code = 0; icmp6.icmp6_mtu = htonl(newmtu); - iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp); + iptun_sendicmp_v6(iptun, &icmp6, orig_ip6h, mp, tsl); } /* @@ -2105,13 +2307,15 @@ is_icmp_error(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h) /* * Find inner and outer IP headers from a tunneled packet as setup for calls * into ipsec_tun_{in,out}bound(). + * Note that we need to allow the outer header to be in a separate mblk from + * the inner header. + * If the caller knows the outer_hlen, the caller passes it in. Otherwise zero. 
*/ static size_t -iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, - ip6_t **inner6) +iptun_find_headers(mblk_t *mp, size_t outer_hlen, ipha_t **outer4, + ipha_t **inner4, ip6_t **outer6, ip6_t **inner6) { ipha_t *ipha; - size_t outer_hlen; size_t first_mblkl = MBLKL(mp); mblk_t *inner_mp; @@ -2128,12 +2332,14 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, case IPV4_VERSION: *outer4 = ipha; *outer6 = NULL; - outer_hlen = IPH_HDR_LENGTH(ipha); + if (outer_hlen == 0) + outer_hlen = IPH_HDR_LENGTH(ipha); break; case IPV6_VERSION: *outer4 = NULL; *outer6 = (ip6_t *)ipha; - outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha); + if (outer_hlen == 0) + outer_hlen = ip_hdr_length_v6(mp, (ip6_t *)ipha); break; default: return (0); @@ -2192,8 +2398,8 @@ iptun_find_headers(mblk_t *mp, ipha_t **outer4, ipha_t **inner4, ip6_t **outer6, * whatever the very-inner packet is (IPv4(2) or IPv6). */ static void -iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, - icmph_t *icmph) +iptun_input_icmp_v4(iptun_t *iptun, mblk_t *data_mp, icmph_t *icmph, + ip_recv_attr_t *ira) { uint8_t *orig; ipha_t *outer4, *inner4; @@ -2201,12 +2407,6 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, int outer_hlen; uint8_t type, code; - /* - * Change the db_type to M_DATA because subsequent operations assume - * the ICMP packet is M_DATA again (i.e. calls to msgdsize()). - */ - data_mp->b_datap->db_type = M_DATA; - ASSERT(data_mp->b_cont == NULL); /* * Temporarily move b_rptr forward so that iptun_find_headers() can @@ -2220,13 +2420,12 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * here). 
*/ ASSERT(MBLKL(data_mp) >= 0); - outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, + outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6, &inner6); ASSERT(outer6 == NULL); data_mp->b_rptr = orig; if (outer_hlen == 0) { - iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp), - &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); return; } @@ -2234,10 +2433,9 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP || outer4->ipha_protocol == IPPROTO_IPV6); - /* ipsec_tun_inbound() always frees ipsec_mp. */ - if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp, - inner4, inner6, outer4, outer6, -outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ atomic_inc_64(&iptun->iptun_ierrors); return; @@ -2269,15 +2467,15 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * also have IPsec policy by letting iptun_update_mtu * take care of it. 
*/ - newmtu = - iptun_update_mtu(iptun, ntohs(icmph->icmph_du_mtu)); + newmtu = iptun_update_mtu(iptun, NULL, + ntohs(icmph->icmph_du_mtu)); if (inner4 != NULL) { iptun_icmp_fragneeded_v4(iptun, newmtu, inner4, - data_mp); + data_mp, ira->ira_tsl); } else { iptun_icmp_toobig_v6(iptun, newmtu, inner6, - data_mp); + data_mp, ira->ira_tsl); } return; } @@ -2310,10 +2508,13 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, return; } - if (inner4 != NULL) - iptun_icmp_error_v4(iptun, inner4, data_mp, type, code); - else - iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0); + if (inner4 != NULL) { + iptun_icmp_error_v4(iptun, inner4, data_mp, type, code, + ira->ira_tsl); + } else { + iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0, + ira->ira_tsl); + } } /* @@ -2324,17 +2525,17 @@ iptun_input_icmp_v4(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, static boolean_t iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr) { - ip6_pkt_t pkt; + ip_pkt_t pkt; uint8_t *endptr; ip6_dest_t *destp; struct ip6_opt *optp; pkt.ipp_fields = 0; /* must be initialized */ - (void) ip_find_hdr_v6(mp, ip6h, &pkt, NULL); + (void) ip_find_hdr_v6(mp, ip6h, B_FALSE, &pkt, NULL); if ((pkt.ipp_fields & IPPF_DSTOPTS) != 0) { destp = pkt.ipp_dstopts; - } else if ((pkt.ipp_fields & IPPF_RTDSTOPTS) != 0) { - destp = pkt.ipp_rtdstopts; + } else if ((pkt.ipp_fields & IPPF_RTHDRDSTOPTS) != 0) { + destp = pkt.ipp_rthdrdstopts; } else { return (B_FALSE); } @@ -2370,8 +2571,8 @@ iptun_find_encaplimit(mblk_t *mp, ip6_t *ip6h, uint8_t **encaplim_ptr) * whatever the very-inner packet is (IPv4 or IPv6(2)). 
*/ static void -iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, - icmp6_t *icmp6h) +iptun_input_icmp_v6(iptun_t *iptun, mblk_t *data_mp, icmp6_t *icmp6h, + ip_recv_attr_t *ira) { uint8_t *orig; ipha_t *outer4, *inner4; @@ -2379,12 +2580,6 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, int outer_hlen; uint8_t type, code; - /* - * Change the db_type to M_DATA because subsequent operations assume - * the ICMP packet is M_DATA again (i.e. calls to msgdsize().) - */ - data_mp->b_datap->db_type = M_DATA; - ASSERT(data_mp->b_cont == NULL); /* @@ -2399,19 +2594,18 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * here). */ ASSERT(MBLKL(data_mp) >= 0); - outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, + outer_hlen = iptun_find_headers(data_mp, 0, &outer4, &inner4, &outer6, &inner6); ASSERT(outer4 == NULL); data_mp->b_rptr = orig; /* Restore r_ptr */ if (outer_hlen == 0) { - iptun_drop_pkt((ipsec_mp != NULL ? ipsec_mp : data_mp), - &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); return; } - if (!ipsec_tun_inbound(ipsec_mp, &data_mp, iptun->iptun_itp, - inner4, inner6, outer4, outer6, -outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, -outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ atomic_inc_64(&iptun->iptun_ierrors); return; @@ -2466,13 +2660,15 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, * have IPsec policy by letting iptun_update_mtu take care of * it. 
*/ - newmtu = iptun_update_mtu(iptun, ntohl(icmp6h->icmp6_mtu)); + newmtu = iptun_update_mtu(iptun, NULL, + ntohl(icmp6h->icmp6_mtu)); if (inner4 != NULL) { iptun_icmp_fragneeded_v4(iptun, newmtu, inner4, - data_mp); + data_mp, ira->ira_tsl); } else { - iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp); + iptun_icmp_toobig_v6(iptun, newmtu, inner6, data_mp, + ira->ira_tsl); } return; } @@ -2481,51 +2677,57 @@ iptun_input_icmp_v6(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp, return; } - if (inner4 != NULL) - iptun_icmp_error_v4(iptun, inner4, data_mp, type, code); - else - iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0); + if (inner4 != NULL) { + iptun_icmp_error_v4(iptun, inner4, data_mp, type, code, + ira->ira_tsl); + } else { + iptun_icmp_error_v6(iptun, inner6, data_mp, type, code, 0, + ira->ira_tsl); + } } +/* + * Called as conn_recvicmp from IP for ICMP errors. + */ +/* ARGSUSED2 */ static void -iptun_input_icmp(iptun_t *iptun, mblk_t *ipsec_mp, mblk_t *data_mp) +iptun_input_icmp(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - mblk_t *tmpmp; - size_t hlen; + conn_t *connp = arg; + iptun_t *iptun = connp->conn_iptun; + mblk_t *tmpmp; + size_t hlen; - if (data_mp->b_cont != NULL) { + ASSERT(IPCL_IS_IPTUN(connp)); + + if (mp->b_cont != NULL) { /* * Since ICMP error processing necessitates access to bits * that are within the ICMP error payload (the original packet * that caused the error), pull everything up into a single * block for convenience. */ - data_mp->b_datap->db_type = M_DATA; - if ((tmpmp = msgpullup(data_mp, -1)) == NULL) { - iptun_drop_pkt((ipsec_mp != NULL ? 
ipsec_mp : data_mp), - &iptun->iptun_norcvbuf); + if ((tmpmp = msgpullup(mp, -1)) == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_norcvbuf); return; } - freemsg(data_mp); - data_mp = tmpmp; - if (ipsec_mp != NULL) - ipsec_mp->b_cont = data_mp; + freemsg(mp); + mp = tmpmp; } + hlen = ira->ira_ip_hdr_length; switch (iptun->iptun_typeinfo->iti_ipvers) { case IPV4_VERSION: /* * The outer IP header coming up from IP is always ipha_t * alligned (otherwise, we would have crashed in ip). */ - hlen = IPH_HDR_LENGTH((ipha_t *)data_mp->b_rptr); - iptun_input_icmp_v4(iptun, ipsec_mp, data_mp, - (icmph_t *)(data_mp->b_rptr + hlen)); + iptun_input_icmp_v4(iptun, mp, (icmph_t *)(mp->b_rptr + hlen), + ira); break; case IPV6_VERSION: - hlen = ip_hdr_length_v6(data_mp, (ip6_t *)data_mp->b_rptr); - iptun_input_icmp_v6(iptun, ipsec_mp, data_mp, - (icmp6_t *)(data_mp->b_rptr + hlen)); + iptun_input_icmp_v6(iptun, mp, (icmp6_t *)(mp->b_rptr + hlen), + ira); break; } } @@ -2578,63 +2780,24 @@ iptun_in_6to4_ok(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) * Input function for everything that comes up from the ip module below us. * This is called directly from the ip module via connp->conn_recv(). * - * There are two kinds of packets that can arrive here: (1) IP-in-IP tunneled - * packets and (2) ICMP errors containing IP-in-IP packets transmitted by us. - * They have the following structure: - * - * 1) M_DATA - * 2) M_CTL[->M_DATA] - * - * (2) Is an M_CTL optionally followed by M_DATA, where the M_CTL block is the - * start of the actual ICMP packet (it doesn't contain any special control - * information). - * - * Either (1) or (2) can be IPsec-protected, in which case an M_CTL block - * containing an ipsec_in_t will have been prepended to either (1) or (2), - * making a total of four combinations of possible mblk chains: - * - * A) (1) - * B) (2) - * C) M_CTL(ipsec_in_t)->(1) - * D) M_CTL(ipsec_in_t)->(2) + * We receive M_DATA messages with IP-in-IP tunneled packets. 
*/ -/* ARGSUSED */ +/* ARGSUSED2 */ static void -iptun_input(void *arg, mblk_t *mp, void *arg2) +iptun_input(void *arg, mblk_t *data_mp, void *arg2, ip_recv_attr_t *ira) { conn_t *connp = arg; iptun_t *iptun = connp->conn_iptun; int outer_hlen; ipha_t *outer4, *inner4; ip6_t *outer6, *inner6; - mblk_t *data_mp = mp; ASSERT(IPCL_IS_IPTUN(connp)); - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); - - if (DB_TYPE(mp) == M_CTL) { - if (((ipsec_in_t *)(mp->b_rptr))->ipsec_in_type != IPSEC_IN) { - iptun_input_icmp(iptun, NULL, mp); - return; - } - - data_mp = mp->b_cont; - if (DB_TYPE(data_mp) == M_CTL) { - /* Protected ICMP packet. */ - iptun_input_icmp(iptun, mp, data_mp); - return; - } - } - - /* - * Request the destination's path MTU information regularly in case - * path MTU has increased. - */ - if (IPTUN_PMTU_TOO_OLD(iptun)) - iptun_task_dispatch(iptun, IPTUN_TASK_PMTU_UPDATE); + ASSERT(DB_TYPE(data_mp) == M_DATA); - if ((outer_hlen = iptun_find_headers(data_mp, &outer4, &inner4, &outer6, - &inner6)) == 0) + outer_hlen = iptun_find_headers(data_mp, ira->ira_ip_hdr_length, + &outer4, &inner4, &outer6, &inner6); + if (outer_hlen == 0) goto drop; /* @@ -2644,25 +2807,22 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) * the more involved tsol_receive_local() since the tunnel link itself * cannot be assigned to shared-stack non-global zones. */ - if (is_system_labeled()) { - cred_t *msg_cred; - - if ((msg_cred = msg_getcred(data_mp, NULL)) == NULL) + if (ira->ira_flags & IRAF_SYSTEM_LABELED) { + if (ira->ira_tsl == NULL) goto drop; - if (tsol_check_dest(msg_cred, (outer4 != NULL ? + if (tsol_check_dest(ira->ira_tsl, (outer4 != NULL ? (void *)&outer4->ipha_dst : (void *)&outer6->ip6_dst), (outer4 != NULL ? IPV4_VERSION : IPV6_VERSION), - CONN_MAC_DEFAULT, NULL) != 0) + CONN_MAC_DEFAULT, B_FALSE, NULL) != 0) goto drop; } - if (!ipsec_tun_inbound((mp == data_mp ? 
NULL : mp), &data_mp, - iptun->iptun_itp, inner4, inner6, outer4, outer6, outer_hlen, - iptun->iptun_ns)) { + data_mp = ipsec_tun_inbound(ira, data_mp, iptun->iptun_itp, + inner4, inner6, outer4, outer6, outer_hlen, iptun->iptun_ns); + if (data_mp == NULL) { /* Callee did all of the freeing. */ return; } - mp = data_mp; if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 && !iptun_in_6to4_ok(iptun, outer4, inner6)) @@ -2673,6 +2833,8 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) * we might as well split up any b_next chains here. */ do { + mblk_t *mp; + mp = data_mp->b_next; data_mp->b_next = NULL; @@ -2684,7 +2846,7 @@ iptun_input(void *arg, mblk_t *mp, void *arg2) } while (data_mp != NULL); return; drop: - iptun_drop_pkt(mp, &iptun->iptun_ierrors); + iptun_drop_pkt(data_mp, &iptun->iptun_ierrors); } /* @@ -2744,6 +2906,10 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) /* destination is a 6to4 router */ IN6_6TO4_TO_V4ADDR(&inner6->ip6_dst, (struct in_addr *)&outer4->ipha_dst); + + /* Reject attempts to send to INADDR_ANY */ + if (outer4->ipha_dst == INADDR_ANY) + return (B_FALSE); } else { /* * The destination is a native IPv6 address. If output to a @@ -2770,12 +2936,11 @@ iptun_out_process_6to4(iptun_t *iptun, ipha_t *outer4, ip6_t *inner6) */ static mblk_t * iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, - ipha_t *inner4, ip6_t *inner6) + ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa) { uint8_t *innerptr = (inner4 != NULL ? (uint8_t *)inner4 : (uint8_t *)inner6); - size_t minmtu = (inner4 != NULL ? 
- IPTUN_MIN_IPV4_MTU : IPTUN_MIN_IPV6_MTU); + size_t minmtu = iptun->iptun_typeinfo->iti_minmtu; if (inner4 != NULL) { ASSERT(outer4->ipha_protocol == IPPROTO_ENCAP); @@ -2791,13 +2956,11 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, } else { ASSERT(outer4->ipha_protocol == IPPROTO_IPV6 && inner6 != NULL); - - if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4 && - !iptun_out_process_6to4(iptun, outer4, inner6)) { - iptun_drop_pkt(mp, &iptun->iptun_oerrors); - return (NULL); - } } + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) + outer4->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + else + outer4->ipha_fragment_offset_and_flags &= ~IPH_DF_HTONS; /* * As described in section 3.2.2 of RFC4213, if the packet payload is @@ -2807,11 +2970,19 @@ iptun_out_process_ipv4(iptun_t *iptun, mblk_t *mp, ipha_t *outer4, * won't be allowed to drop its MTU as a result, since the packet was * already smaller than the smallest allowable MTU for that interface. */ - if (mp->b_wptr - innerptr <= minmtu) + if (mp->b_wptr - innerptr <= minmtu) { outer4->ipha_fragment_offset_and_flags = 0; + ixa->ixa_flags &= ~IXAF_DONTFRAG; + } else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) && + (iptun->iptun_typeinfo->iti_type != IPTUN_TYPE_6TO4)) { + ixa->ixa_flags |= IXAF_DONTFRAG; + } - outer4->ipha_length = htons(msgdsize(mp)); + ixa->ixa_ip_hdr_length = IPH_HDR_LENGTH(outer4); + ixa->ixa_pktlen = msgdsize(mp); + ixa->ixa_protocol = outer4->ipha_protocol; + outer4->ipha_length = htons(ixa->ixa_pktlen); return (mp); } @@ -2830,7 +3001,7 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ASSERT(mp->b_cont == NULL); mp->b_rptr += sizeof (ip6_t); - newmp = allocb_tmpl(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), mp); + newmp = allocb(sizeof (iptun_ipv6hdrs_t) + MBLKL(mp), BPRI_MED); if (newmp == NULL) { iptun_drop_pkt(mp, &iptun->iptun_noxmtbuf); return (NULL); @@ -2861,8 +3032,12 @@ iptun_insert_encaplimit(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, * on error. 
*/ static mblk_t * -iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) +iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, + ipha_t *inner4, ip6_t *inner6, ip_xmit_attr_t *ixa) { + uint8_t *innerptr = (inner4 != NULL ? + (uint8_t *)inner4 : (uint8_t *)inner6); + size_t minmtu = iptun->iptun_typeinfo->iti_minmtu; uint8_t *limit, *configlimit; uint32_t offset; iptun_ipv6hdrs_t *v6hdrs; @@ -2887,7 +3062,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) mp->b_rptr = (uint8_t *)inner6; offset = limit - mp->b_rptr; iptun_icmp_error_v6(iptun, inner6, mp, ICMP6_PARAM_PROB, - 0, offset); + 0, offset, ixa->ixa_tsl); atomic_inc_64(&iptun->iptun_noxmtbuf); return (NULL); } @@ -2900,6 +3075,7 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) if ((mp = iptun_insert_encaplimit(iptun, mp, outer6, (*limit - 1))) == NULL) return (NULL); + v6hdrs = (iptun_ipv6hdrs_t *)mp->b_rptr; } else { /* * There is an existing encapsulation limit option in @@ -2914,9 +3090,23 @@ iptun_out_process_ipv6(iptun_t *iptun, mblk_t *mp, ip6_t *outer6, ip6_t *inner6) if ((*limit - 1) < *configlimit) *configlimit = (*limit - 1); } + ixa->ixa_ip_hdr_length = sizeof (iptun_ipv6hdrs_t); + ixa->ixa_protocol = v6hdrs->it6h_encaplim.iel_destopt.ip6d_nxt; + } else { + ixa->ixa_ip_hdr_length = sizeof (ip6_t); + ixa->ixa_protocol = outer6->ip6_nxt; } + /* + * See iptun_output_process_ipv4() why we allow fragmentation for + * small packets + */ + if (mp->b_wptr - innerptr <= minmtu) + ixa->ixa_flags &= ~IXAF_DONTFRAG; + else if (!(ixa->ixa_flags & IXAF_PMTU_TOO_SMALL)) + ixa->ixa_flags |= IXAF_DONTFRAG; - outer6->ip6_plen = htons(msgdsize(mp) - sizeof (ip6_t)); + ixa->ixa_pktlen = msgdsize(mp); + outer6->ip6_plen = htons(ixa->ixa_pktlen - sizeof (ip6_t)); return (mp); } @@ -2929,11 +3119,9 @@ static void iptun_output(iptun_t *iptun, mblk_t *mp) { conn_t *connp = iptun->iptun_connp; - int outer_hlen; 
mblk_t *newmp; - ipha_t *outer4, *inner4; - ip6_t *outer6, *inner6; - ipsec_tun_pol_t *itp = iptun->iptun_itp; + int error; + ip_xmit_attr_t *ixa; ASSERT(mp->b_datap->db_type == M_DATA); @@ -2946,17 +3134,262 @@ iptun_output(iptun_t *iptun, mblk_t *mp) mp = newmp; } - outer_hlen = iptun_find_headers(mp, &outer4, &inner4, &outer6, &inner6); + if (iptun->iptun_typeinfo->iti_type == IPTUN_TYPE_6TO4) { + iptun_output_6to4(iptun, mp); + return; + } + + if (is_system_labeled()) { + /* + * Since the label can be different meaning a potentially + * different IRE,we always use a unique ip_xmit_attr_t. + */ + ixa = conn_get_ixa_exclusive(connp); + } else { + /* + * If no other thread is using conn_ixa this just gets a + * reference to conn_ixa. Otherwise we get a safe copy of + * conn_ixa. + */ + ixa = conn_get_ixa(connp, B_FALSE); + } + if (ixa == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + /* + * In case we got a safe copy of conn_ixa, then we need + * to fill in any pointers in it. + */ + if (ixa->ixa_ire == NULL) { + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, &connp->conn_faddr_v6, 0, + NULL, NULL, 0); + if (error != 0) { + if (ixa->ixa_ire != NULL && + (error == EHOSTUNREACH || error == ENETUNREACH)) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + } else { + ixa_refrele(ixa); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + } + + iptun_output_common(iptun, ixa, mp); + ixa_refrele(ixa); +} + +/* + * We use an ixa based on the last destination. 
+ */ +static void +iptun_output_6to4(iptun_t *iptun, mblk_t *mp) +{ + conn_t *connp = iptun->iptun_connp; + ipha_t *outer4, *inner4; + ip6_t *outer6, *inner6; + ip_xmit_attr_t *ixa; + ip_xmit_attr_t *oldixa; + int error; + boolean_t need_connect; + in6_addr_t v6dst; + + ASSERT(mp->b_cont == NULL); /* Verified by iptun_output */ + + /* Make sure we set ipha_dst before we look at ipha_dst */ + + (void) iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, &inner6); + ASSERT(outer4 != NULL); + if (!iptun_out_process_6to4(iptun, outer4, inner6)) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + if (is_system_labeled()) { + /* + * Since the label can be different meaning a potentially + * different IRE,we always use a unique ip_xmit_attr_t. + */ + ixa = conn_get_ixa_exclusive(connp); + } else { + /* + * If no other thread is using conn_ixa this just gets a + * reference to conn_ixa. Otherwise we get a safe copy of + * conn_ixa. + */ + ixa = conn_get_ixa(connp, B_FALSE); + } + if (ixa == NULL) { + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + + mutex_enter(&connp->conn_lock); + if (connp->conn_v4lastdst == outer4->ipha_dst) { + need_connect = (ixa->ixa_ire == NULL); + } else { + /* In case previous destination was multirt */ + ip_attr_newdst(ixa); + + /* + * We later update conn_ixa when we update conn_v4lastdst + * which enables subsequent packets to avoid redoing + * ip_attr_connect + */ + need_connect = B_TRUE; + } + mutex_exit(&connp->conn_lock); + + /* + * In case we got a safe copy of conn_ixa, or otherwise we don't + * have a current ixa_ire, then we need to fill in any pointers in + * the ixa. 
+ */ + if (need_connect) { + IN6_IPADDR_TO_V4MAPPED(outer4->ipha_dst, &v6dst); + + /* We handle IPsec in iptun_output_common */ + error = ip_attr_connect(connp, ixa, &connp->conn_saddr_v6, + &v6dst, &v6dst, 0, NULL, NULL, 0); + if (error != 0) { + if (ixa->ixa_ire != NULL && + (error == EHOSTUNREACH || error == ENETUNREACH)) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + } else { + ixa_refrele(ixa); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + } + + iptun_output_common(iptun, ixa, mp); + + /* Atomically replace conn_ixa and conn_v4lastdst */ + mutex_enter(&connp->conn_lock); + if (connp->conn_v4lastdst != outer4->ipha_dst) { + /* Remember the dst which corresponds to conn_ixa */ + connp->conn_v6lastdst = v6dst; + oldixa = conn_replace_ixa(connp, ixa); + } else { + oldixa = NULL; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + if (oldixa != NULL) + ixa_refrele(oldixa); +} + +/* + * Check the destination/label. Modifies *mpp by adding/removing CIPSO. + * + * We get the label from the message in order to honor the + * ULPs/IPs choice of label. This will be NULL for forwarded + * packets, neighbor discovery packets and some others. + */ +static int +iptun_output_check_label(mblk_t **mpp, ip_xmit_attr_t *ixa) +{ + cred_t *cr; + int adjust; + int iplen; + int err; + ts_label_t *effective_tsl = NULL; + + + ASSERT(is_system_labeled()); + + cr = msg_getcred(*mpp, NULL); + if (cr == NULL) + return (0); + + /* + * We need to start with a label based on the IP/ULP above us + */ + ip_xmit_attr_restore_tsl(ixa, cr); + + /* + * Need to update packet with any CIPSO option since + * conn_ip_output doesn't do that. 
+ */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)(*mpp)->b_rptr; + iplen = ntohs(ipha->ipha_length); + err = tsol_check_label_v4(ixa->ixa_tsl, + ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE, + ixa->ixa_ipst, &effective_tsl); + if (err != 0) + return (err); + + ipha = (ipha_t *)(*mpp)->b_rptr; + adjust = (int)ntohs(ipha->ipha_length) - iplen; + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)(*mpp)->b_rptr; + iplen = ntohs(ip6h->ip6_plen); + + err = tsol_check_label_v6(ixa->ixa_tsl, + ixa->ixa_zoneid, mpp, CONN_MAC_DEFAULT, B_FALSE, + ixa->ixa_ipst, &effective_tsl); + if (err != 0) + return (err); + + ip6h = (ip6_t *)(*mpp)->b_rptr; + adjust = (int)ntohs(ip6h->ip6_plen) - iplen; + } + + if (effective_tsl != NULL) { + /* Update the label */ + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } + ixa->ixa_pktlen += adjust; + ixa->ixa_ip_hdr_length += adjust; + return (0); +} + + +static void +iptun_output_common(iptun_t *iptun, ip_xmit_attr_t *ixa, mblk_t *mp) +{ + ipsec_tun_pol_t *itp = iptun->iptun_itp; + int outer_hlen; + mblk_t *newmp; + ipha_t *outer4, *inner4; + ip6_t *outer6, *inner6; + int error; + boolean_t update_pktlen; + + ASSERT(ixa->ixa_ire != NULL); + + outer_hlen = iptun_find_headers(mp, 0, &outer4, &inner4, &outer6, + &inner6); if (outer_hlen == 0) { iptun_drop_pkt(mp, &iptun->iptun_oerrors); return; } /* Perform header processing. */ - if (outer4 != NULL) - mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6); - else - mp = iptun_out_process_ipv6(iptun, mp, outer6, inner6); + if (outer4 != NULL) { + mp = iptun_out_process_ipv4(iptun, mp, outer4, inner4, inner6, + ixa); + } else { + mp = iptun_out_process_ipv6(iptun, mp, outer6, inner4, inner6, + ixa); + } if (mp == NULL) return; @@ -2964,27 +3397,57 @@ iptun_output(iptun_t *iptun, mblk_t *mp) * Let's hope the compiler optimizes this with "branch taken". 
*/ if (itp != NULL && (itp->itp_flags & ITPF_P_ACTIVE)) { - if ((mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4, - outer6, outer_hlen)) == NULL) { - /* ipsec_tun_outbound() frees mp on error. */ + /* This updates the ip_xmit_attr_t */ + mp = ipsec_tun_outbound(mp, iptun, inner4, inner6, outer4, + outer6, outer_hlen, ixa); + if (mp == NULL) { atomic_inc_64(&iptun->iptun_oerrors); return; } + if (is_system_labeled()) { + /* + * Might change the packet by adding/removing CIPSO. + * After this caller inner* and outer* and outer_hlen + * might be invalid. + */ + error = iptun_output_check_label(&mp, ixa); + if (error != 0) { + ip2dbg(("label check failed (%d)\n", error)); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } + /* * ipsec_tun_outbound() returns a chain of tunneled IP * fragments linked with b_next (or a single message if the - * tunneled packet wasn't a fragment). Each message in the - * chain is prepended by an IPSEC_OUT M_CTL block with + * tunneled packet wasn't a fragment). + * If fragcache returned a list then we need to update + * ixa_pktlen for all packets in the list. + */ + update_pktlen = (mp->b_next != NULL); + + /* + * Otherwise, we're good to go. The ixa has been updated with * instructions for outbound IPsec processing. 
*/ for (newmp = mp; newmp != NULL; newmp = mp) { - ASSERT(newmp->b_datap->db_type == M_CTL); atomic_inc_64(&iptun->iptun_opackets); - atomic_add_64(&iptun->iptun_obytes, - msgdsize(newmp->b_cont)); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); mp = mp->b_next; newmp->b_next = NULL; - connp->conn_send(connp, newmp, connp->conn_wq, IP_WPUT); + + if (update_pktlen) + ixa->ixa_pktlen = msgdsize(mp); + + atomic_inc_64(&iptun->iptun_opackets); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); + + error = conn_ip_output(newmp, ixa); + if (error == EMSGSIZE) { + /* IPsec policy might have changed */ + (void) iptun_update_mtu(iptun, ixa, 0); + } } } else { /* @@ -2992,30 +3455,37 @@ iptun_output(iptun_t *iptun, mblk_t *mp) * packet in its output path if there's no active tunnel * policy. */ - atomic_inc_64(&iptun->iptun_opackets); - atomic_add_64(&iptun->iptun_obytes, msgdsize(mp)); - connp->conn_send(connp, mp, connp->conn_wq, IP_WPUT); - } -} + ASSERT(ixa->ixa_ipsec_policy == NULL); + mp = ip_output_attach_policy(mp, outer4, outer6, NULL, ixa); + if (mp == NULL) { + atomic_inc_64(&iptun->iptun_oerrors); + return; + } + if (is_system_labeled()) { + /* + * Might change the packet by adding/removing CIPSO. + * After this caller inner* and outer* and outer_hlen + * might be invalid. + */ + error = iptun_output_check_label(&mp, ixa); + if (error != 0) { + ip2dbg(("label check failed (%d)\n", error)); + iptun_drop_pkt(mp, &iptun->iptun_oerrors); + return; + } + } -/* - * Note that the setting or clearing iptun_{set,get}_g_q() is serialized via - * iptuns_lock and iptunq_open(), so we must never be in a situation where - * iptun_set_g_q() is called if the queue has already been set or vice versa - * (hence the ASSERT()s.) 
- */ -void -iptun_set_g_q(netstack_t *ns, queue_t *q) -{ - ASSERT(ns->netstack_iptun->iptuns_g_q == NULL); - ns->netstack_iptun->iptuns_g_q = q; -} + atomic_inc_64(&iptun->iptun_opackets); + atomic_add_64(&iptun->iptun_obytes, ixa->ixa_pktlen); -void -iptun_clear_g_q(netstack_t *ns) -{ - ASSERT(ns->netstack_iptun->iptuns_g_q != NULL); - ns->netstack_iptun->iptuns_g_q = NULL; + error = conn_ip_output(mp, ixa); + if (error == EMSGSIZE) { + /* IPsec policy might have changed */ + (void) iptun_update_mtu(iptun, ixa, 0); + } + } + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) + ipsec_out_release_refs(ixa); } static mac_callbacks_t iptun_m_callbacks = { diff --git a/usr/src/uts/common/inet/iptun/iptun_dev.c b/usr/src/uts/common/inet/iptun/iptun_dev.c index 52218bdc18..5043063690 100644 --- a/usr/src/uts/common/inet/iptun/iptun_dev.c +++ b/usr/src/uts/common/inet/iptun/iptun_dev.c @@ -91,11 +91,9 @@ iptun_stack_shutdown(netstackid_t stackid, void *arg) /* note that iptun_delete() removes iptun from the list */ while ((iptun = list_head(&iptuns->iptuns_iptunlist)) != NULL) { linkid = iptun->iptun_linkid; - (void) iptun_delete(linkid, iptun->iptun_cred); + (void) iptun_delete(linkid, iptun->iptun_connp->conn_cred); (void) dls_mgmt_destroy(linkid, B_FALSE); } - if (iptuns->iptuns_g_q != NULL) - (void) ldi_close(iptuns->iptuns_g_q_lh, FWRITE|FREAD, CRED()); } /* diff --git a/usr/src/uts/common/inet/iptun/iptun_impl.h b/usr/src/uts/common/inet/iptun/iptun_impl.h index 593adb7d9c..07e168a423 100644 --- a/usr/src/uts/common/inet/iptun/iptun_impl.h +++ b/usr/src/uts/common/inet/iptun/iptun_impl.h @@ -80,7 +80,6 @@ typedef struct iptun_typeinfo { iptun_type_t iti_type; const char *iti_ident; /* MAC-Type plugin identifier */ uint_t iti_ipvers; /* outer header IP version */ - edesc_spf iti_txfunc; /* function used to transmit to ip */ uint32_t iti_minmtu; /* minimum possible tunnel MTU */ uint32_t iti_maxmtu; /* maximum possible tunnel MTU */ boolean_t iti_hasraddr; /* has a remote adress 
*/ @@ -95,13 +94,6 @@ typedef struct iptun_typeinfo { * * The datapath reads certain fields without locks for performance reasons. * - * - IPTUN_PMTU_TOO_OLD() is used without a lock to determine if the - * destination path-MTU should be queried. This reads iptun_flags - * IPTUN_RADDR, IPTUN_FIXED_MTU, and iptun_dpmtu_lastupdate. All of these - * can change without adversely affecting the tunnel, as the worst case - * scenario is that we launch a task that will ultimately either do nothing - * or needlessly query the destination path-MTU. - * * - IPTUN_IS_RUNNING() is used (read access to iptun_flags IPTUN_BOUND and * IPTUN_MAC_STARTED) to drop packets if they're sent while the tunnel is * not running. This is harmless as the worst case scenario is that a @@ -119,12 +111,10 @@ typedef struct iptun_s { conn_t *iptun_connp; zoneid_t iptun_zoneid; netstack_t *iptun_ns; - cred_t *iptun_cred; struct ipsec_tun_pol_s *iptun_itp; iptun_typeinfo_t *iptun_typeinfo; uint32_t iptun_mtu; uint32_t iptun_dpmtu; /* destination path MTU */ - clock_t iptun_dpmtu_lastupdate; uint8_t iptun_hoplimit; uint8_t iptun_encaplimit; iptun_addr_t iptun_laddr; /* local address */ @@ -172,37 +162,12 @@ typedef struct iptun_s { (IPTUN_BOUND | IPTUN_MAC_STARTED)) /* - * We request ire information for the tunnel destination in order to obtain - * its path MTU information. We use that to calculate the initial link MTU of - * a tunnel. - * - * After that, if the path MTU of the tunnel destination becomes smaller - * than the link MTU of the tunnel, then we will receive a packet too big - * (aka fragmentation needed) ICMP error when we transmit a packet larger - * than the path MTU, and we will adjust the tunne's MTU based on the ICMP - * error's MTU information. 
- * - * In addition to that, we also need to request the ire information - * periodically to make sure the link MTU of a tunnel doesn't become stale - * if the path MTU of the tunnel destination becomes larger than the link - * MTU of the tunnel. The period for the requests is ten minutes in - * accordance with rfc1191. - */ -#define IPTUN_PMTU_AGE SEC_TO_TICK(600) -#define IPTUN_PMTU_TOO_OLD(ipt) \ - (((ipt)->iptun_flags & IPTUN_RADDR) && \ - !((ipt)->iptun_flags & IPTUN_FIXED_MTU) && \ - (ddi_get_lbolt() - (ipt)->iptun_dpmtu_lastupdate) > IPTUN_PMTU_AGE) - -/* - * iptuns_lock protects iptuns_iptunlist and iptuns_g_q. + * iptuns_lock protects iptuns_iptunlist. */ typedef struct iptun_stack { netstack_t *iptuns_netstack; /* Common netstack */ kmutex_t iptuns_lock; list_t iptuns_iptunlist; /* list of tunnels in this stack. */ - queue_t *iptuns_g_q; /* read-side IP queue */ - ldi_handle_t iptuns_g_q_lh; ipaddr_t iptuns_relay_rtr_addr; } iptun_stack_t; @@ -222,8 +187,6 @@ extern int iptun_info(iptun_kparams_t *, cred_t *); extern int iptun_set_6to4relay(netstack_t *, ipaddr_t); extern void iptun_get_6to4relay(netstack_t *, ipaddr_t *); extern void iptun_set_policy(datalink_id_t, ipsec_tun_pol_t *); -extern void iptun_set_g_q(netstack_t *, queue_t *); -extern void iptun_clear_g_q(netstack_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/keysock.h b/usr/src/uts/common/inet/keysock.h index 50189666c7..cb618cedaf 100644 --- a/usr/src/uts/common/inet/keysock.h +++ b/usr/src/uts/common/inet/keysock.h @@ -19,22 +19,20 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_KEYSOCK_H #define _INET_KEYSOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif extern int keysock_opt_get(queue_t *, int, int, uchar_t *); extern int keysock_opt_set(queue_t *, uint_t, int, int, uint_t, - uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr, mblk_t *mblk); + uchar_t *, uint_t *, uchar_t *, void *, cred_t *cr); /* * Object to represent database of options to search passed to diff --git a/usr/src/uts/common/inet/kssl/ksslrec.c b/usr/src/uts/common/inet/kssl/ksslrec.c index 14a285b4ab..6b7ce0ad42 100644 --- a/usr/src/uts/common/inet/kssl/ksslrec.c +++ b/usr/src/uts/common/inet/kssl/ksslrec.c @@ -239,7 +239,7 @@ kssl_compute_record_mac( * context when called from strsock_kssl_input(). During the * SSL handshake, we are called for client_finished message * handling from a squeue worker thread that gets scheduled - * by an squeue_fill() call. This thread is not in interrupt + * by an SQ_FILL call. This thread is not in interrupt * context and so can block. */ rv = crypto_mac(&spec->hmac_mech, &dd, &spec->hmac_key, diff --git a/usr/src/uts/common/inet/mi.c b/usr/src/uts/common/inet/mi.c index f88fe3709b..9fe77e88c4 100644 --- a/usr/src/uts/common/inet/mi.c +++ b/usr/src/uts/common/inet/mi.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -1359,7 +1359,7 @@ mi_tpi_addr_and_opt(MBLKP mp, char *addr, t_scalar_t addr_length, * This code is used more than just for unitdata ind * (also for T_CONN_IND and T_CONN_CON) and * relies on correct functioning on the happy - * coincidence that the the address and option buffers + * coincidence that the address and option buffers * represented by length/offset in all these primitives * are isomorphic in terms of offset from start of data * structure diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h index 16bed4ec2c..06db81ea74 100644 --- a/usr/src/uts/common/inet/mib2.h +++ b/usr/src/uts/common/inet/mib2.h @@ -66,8 +66,8 @@ extern "C" { * "get all" is supported, so all modules get a copy of the request to * return everything it knows. In general, we use MIB2_IP. There is * one exception: in general, IP will not report information related to - * IRE_MARK_TESTHIDDEN routes (e.g., in the MIB2_IP_ROUTE table). - * However, using the special value EXPER_IP_AND_TESTHIDDEN will cause + * ire_testhidden and IRE_IF_CLONE routes (e.g., in the MIB2_IP_ROUTE + * table). However, using the special value EXPER_IP_AND_ALL_IRES will cause * all information to be reported. This special value should only be * used by IPMP-aware low-level utilities (e.g. in.mpathd). * @@ -109,7 +109,7 @@ extern "C" { #define EXPER_IGMP (EXPER+1) #define EXPER_DVMRP (EXPER+2) #define EXPER_RAWIP (EXPER+3) -#define EXPER_IP_AND_TESTHIDDEN (EXPER+4) +#define EXPER_IP_AND_ALL_IRES (EXPER+4) /* * Define range of levels for experimental use @@ -170,6 +170,7 @@ typedef uint32_t DeviceIndex; /* Interface index */ #define EXPER_IP_GROUP_SOURCES 102 #define EXPER_IP6_GROUP_SOURCES 103 #define EXPER_IP_RTATTR 104 +#define EXPER_IP_DCE 105 /* * There can be one of each of these tables per transport (MIB2_* above). 
@@ -267,15 +268,13 @@ typedef struct mib2_ip { int ipMemberEntrySize; /* Size of ip_member_t */ int ipGroupSourceEntrySize; /* Size of ip_grpsrc_t */ - /* # of IPv6 packets received by IPv4 and dropped */ - Counter ipInIPv6; - /* # of IPv6 packets transmitted by ip_wput */ - Counter ipOutIPv6; - /* # of times ip_wput has switched to become ip_wput_v6 */ - Counter ipOutSwitchIPv6; + Counter ipInIPv6; /* # of IPv6 packets received by IPv4 and dropped */ + Counter ipOutIPv6; /* No longer used */ + Counter ipOutSwitchIPv6; /* No longer used */ int ipRouteAttributeSize; /* Size of mib2_ipAttributeEntry_t */ int transportMLPSize; /* Size of mib2_transportMLPEntry_t */ + int ipDestEntrySize; /* Size of dest_cache_entry_t */ } mib2_ip_t; /* @@ -503,14 +502,11 @@ typedef struct mib2_ipIfStatsEntry { */ Counter ipIfStatsInWrongIPVersion; /* - * Depending on the value of ipIfStatsIPVersion, this counter tracks - * v4: # of IPv6 packets transmitted by ip_wput or, - * v6: # of IPv4 packets transmitted by ip_wput_v6. + * This counter is no longer used */ Counter ipIfStatsOutWrongIPVersion; /* - * Depending on the value of ipIfStatsIPVersion, this counter tracks - * # of times ip_wput has switched to become ip_wput_v6, or vice versa. + * This counter is no longer used */ Counter ipIfStatsOutSwitchIPVersion; @@ -981,6 +977,21 @@ typedef struct ipv6_grpsrc { /* + * List of destination cache entries + */ +typedef struct dest_cache_entry { + /* IP Multicast address */ + IpAddress DestIpv4Address; + Ip6Address DestIpv6Address; + uint_t DestFlags; /* DCEF_* */ + uint32_t DestPmtu; /* Path MTU if DCEF_PMTU */ + uint32_t DestIdent; /* Per destination IP ident. 
*/ + DeviceIndex DestIfindex; /* For IPv6 link-locals */ + uint32_t DestAge; /* Age of MTU info in seconds */ +} dest_cache_entry_t; + + +/* * ICMP Group */ typedef struct mib2_icmp { diff --git a/usr/src/uts/common/inet/optcom.c b/usr/src/uts/common/inet/optcom.c index e35b7f6af5..e4d1abff4c 100644 --- a/usr/src/uts/common/inet/optcom.c +++ b/usr/src/uts/common/inet/optcom.c @@ -58,21 +58,21 @@ * Function prototypes */ static t_scalar_t process_topthdrs_first_pass(mblk_t *, cred_t *, optdb_obj_t *, - boolean_t *, size_t *); + size_t *); static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *, optdb_obj_t *dbobjp, - mblk_t *first_mp, boolean_t is_restart, boolean_t *queued_statusp); + t_uscalar_t *worst_statusp); static t_uscalar_t get_worst_status(t_uscalar_t, t_uscalar_t); static int do_opt_default(queue_t *, struct T_opthdr *, uchar_t **, t_uscalar_t *, cred_t *, optdb_obj_t *); static void do_opt_current(queue_t *, struct T_opthdr *, uchar_t **, t_uscalar_t *, cred_t *cr, optdb_obj_t *); -static int do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, +static void do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, - cred_t *, optdb_obj_t *dbobjp, mblk_t *first_mp); + cred_t *, optdb_obj_t *dbobjp); static boolean_t opt_level_valid(t_uscalar_t, optlevel_t *, uint_t); static size_t opt_level_allopts_lengths(t_uscalar_t, opdes_t *, uint_t); -static boolean_t opt_length_ok(opdes_t *, struct T_opthdr *); +static boolean_t opt_length_ok(opdes_t *, t_uscalar_t optlen); static t_uscalar_t optcom_max_optbuf_len(opdes_t *, uint_t); static boolean_t opt_bloated_maxsize(opdes_t *); @@ -176,35 +176,15 @@ optcom_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) * job requested. * XXX Code below needs some restructuring after we have some more * macros to support 'struct opthdr' in the headers. 
- * - * IP-MT notes: The option management framework functions svr4_optcom_req() and - * tpi_optcom_req() allocate and prepend an M_CTL mblk to the actual - * T_optmgmt_req mblk and pass the chain as an additional parameter to the - * protocol set functions. If a protocol set function (such as ip_opt_set) - * cannot process the option immediately it can return EINPROGRESS. ip_opt_set - * enqueues the message in the appropriate sq and returns EINPROGRESS. Later - * the sq framework arranges to restart this operation and passes control to - * the restart function ip_restart_optmgmt() which in turn calls - * svr4_optcom_req() or tpi_optcom_req() to restart the option processing. - * - * XXX Remove the asynchronous behavior of svr_optcom_req() and - * tpi_optcom_req(). */ -int -svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t pass_to_ip) +void +svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp) { pfi_t deffn = dbobjp->odb_deffn; pfi_t getfn = dbobjp->odb_getfn; opt_set_fn setfn = dbobjp->odb_setfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - opt_restart_t *or; - struct opthdr *restart_opt; - boolean_t is_restart = B_FALSE; - mblk_t *first_mp; - t_uscalar_t max_optbuf_len; int len; mblk_t *mp1 = NULL; @@ -214,33 +194,10 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, struct opthdr *opt_end; struct opthdr *opt_start; opdes_t *optd; - boolean_t pass_to_next = B_FALSE; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor; int error; - /* - * Allocate M_CTL and prepend to the packet for restarting this - * option if needed. IP may need to queue and restart the option - * if it cannot obtain exclusive conditions immediately. 
Please see - * IP-MT notes before the start of svr4_optcom_req - */ - if (mp->b_datap->db_type == M_CTL) { - is_restart = B_TRUE; - first_mp = mp; - mp = mp->b_cont; - ASSERT(mp->b_wptr - mp->b_rptr >= - sizeof (struct T_optmgmt_req)); - tor = (struct T_optmgmt_req *)mp->b_rptr; - ASSERT(tor->MGMT_flags == T_NEGOTIATE); - - or = (opt_restart_t *)first_mp->b_rptr; - opt_start = or->or_start; - opt_end = or->or_end; - restart_opt = or->or_ropt; - goto restart; - } - tor = (struct T_optmgmt_req *)mp->b_rptr; /* Verify message integrity. */ if (mp->b_wptr - mp->b_rptr < sizeof (struct T_optmgmt_req)) @@ -255,7 +212,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } if (tor->MGMT_flags == T_DEFAULT) { /* Is it a request for default option settings? */ @@ -278,7 +235,6 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * ----historical comment end ------- */ /* T_DEFAULT not passed down */ - ASSERT(topmost_tpiprovider == B_TRUE); freemsg(mp); max_optbuf_len = optcom_max_optbuf_len(opt_arr, opt_arr_cnt); @@ -286,7 +242,7 @@ svr4_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, if (!mp) { no_mem:; optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); + return; } /* Initialize the T_optmgmt_ack header. */ @@ -362,7 +318,7 @@ no_mem:; mp->b_datap->db_type = M_PCPROTO; /* Ship it back. 
*/ qreply(q, mp); - return (0); + return; } /* T_DEFAULT processing complete - no more T_DEFAULT */ @@ -414,15 +370,15 @@ no_mem:; goto bad_opt; error = proto_opt_check(opt->level, opt->name, opt->len, NULL, - opt_arr, opt_arr_cnt, topmost_tpiprovider, + opt_arr, opt_arr_cnt, tor->MGMT_flags == T_NEGOTIATE, tor->MGMT_flags == T_CHECK, cr); if (error < 0) { optcom_err_ack(q, mp, -error, 0); - return (0); + return; } else if (error > 0) { optcom_err_ack(q, mp, TSYSERR, error); - return (0); + return; } } /* end for loop scanning option buffer */ @@ -491,24 +447,9 @@ no_mem:; /* Ditch the input buffer. */ freemsg(mp); mp = mp1; - /* Always let the next module look at the option. */ - pass_to_next = B_TRUE; break; case T_NEGOTIATE: - first_mp = allocb(sizeof (opt_restart_t), BPRI_LO); - if (first_mp == NULL) { - optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); - } - first_mp->b_datap->db_type = M_CTL; - or = (opt_restart_t *)first_mp->b_rptr; - or->or_start = opt_start; - or->or_end = opt_end; - or->or_type = T_SVR4_OPTMGMT_REQ; - or->or_private = 0; - first_mp->b_cont = mp; -restart: /* * Here we are expecting that the response buffer is exactly * the same size as the input buffer. We pass each opthdr @@ -523,22 +464,16 @@ restart: */ toa = (struct T_optmgmt_ack *)tor; - for (opt = is_restart ? restart_opt: opt_start; opt < opt_end; - opt = next_opt) { + for (opt = opt_start; opt < opt_end; opt = next_opt) { int error; - /* - * Point to the current option in or, in case this - * option has to be restarted later on - */ - or->or_ropt = opt; next_opt = (struct opthdr *)((uchar_t *)&opt[1] + _TPI_ALIGN_OPT(opt->len)); error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE, opt->level, opt->name, opt->len, (uchar_t *)&opt[1], - &opt->len, (uchar_t *)&opt[1], NULL, cr, first_mp); + &opt->len, (uchar_t *)&opt[1], NULL, cr); /* * Treat positive "errors" as real. 
* Note: negative errors are to be treated as @@ -549,99 +484,48 @@ restart: * it is valid but was either handled upstream * or will be handled downstream. */ - if (error == EINPROGRESS) { - /* - * The message is queued and will be - * reprocessed later. Typically ip queued - * the message to get some exclusive conditions - * and later on calls this func again. - */ - return (EINPROGRESS); - } else if (error > 0) { + if (error > 0) { optcom_err_ack(q, mp, TSYSERR, error); - freeb(first_mp); - return (0); + return; } /* * error < 0 means option is not recognized. - * But with OP_PASSNEXT the next module - * might recognize it. */ } - /* Done with the restart control mp. */ - freeb(first_mp); - pass_to_next = B_TRUE; break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } - if (pass_to_next && (q->q_next != NULL || pass_to_ip)) { - /* Send it down to the next module and let it reply */ - toa->PRIM_type = T_SVR4_OPTMGMT_REQ; /* Changed by IP to ACK */ - if (q->q_next != NULL) - putnext(q, mp); - else - ip_output(Q_TO_CONN(q), mp, q, IP_WPUT); - } else { - /* Set common fields in the header. */ - toa->MGMT_flags = T_SUCCESS; - mp->b_datap->db_type = M_PCPROTO; - toa->PRIM_type = T_OPTMGMT_ACK; - qreply(q, mp); - } - return (0); + /* Set common fields in the header. 
*/ + toa->MGMT_flags = T_SUCCESS; + mp->b_datap->db_type = M_PCPROTO; + toa->PRIM_type = T_OPTMGMT_ACK; + qreply(q, mp); + return; bad_opt:; optcom_err_ack(q, mp, TBADOPT, 0); - return (0); } /* * New optcom_req inspired by TPI/XTI semantics */ -int -tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t pass_to_ip) +void +tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp) { t_scalar_t t_error; mblk_t *toa_mp; - boolean_t pass_to_next; size_t toa_len; struct T_optmgmt_ack *toa; struct T_optmgmt_req *tor = (struct T_optmgmt_req *)mp->b_rptr; - - opt_restart_t *or; - boolean_t is_restart = B_FALSE; - mblk_t *first_mp = NULL; t_uscalar_t worst_status; - boolean_t queued_status; - - /* - * Allocate M_CTL and prepend to the packet for restarting this - * option if needed. IP may need to queue and restart the option - * if it cannot obtain exclusive conditions immediately. Please see - * IP-MT notes before the start of svr4_optcom_req - */ - if (mp->b_datap->db_type == M_CTL) { - is_restart = B_TRUE; - first_mp = mp; - toa_mp = mp->b_cont; - mp = toa_mp->b_cont; - ASSERT(mp->b_wptr - mp->b_rptr >= - sizeof (struct T_optmgmt_req)); - tor = (struct T_optmgmt_req *)mp->b_rptr; - ASSERT(tor->MGMT_flags == T_NEGOTIATE); - - or = (opt_restart_t *)first_mp->b_rptr; - goto restart; - } /* Verify message integrity. */ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_optmgmt_req)) { optcom_err_ack(q, mp, TBADOPT, 0); - return (0); + return; } /* Verify MGMT_flags legal */ @@ -654,7 +538,7 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, break; default: optcom_err_ack(q, mp, TBADFLAG, 0); - return (0); + return; } /* @@ -669,7 +553,6 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * T_ALLOPT mean that length can be different for output buffer). 
*/ - pass_to_next = B_FALSE; /* initial value */ toa_len = 0; /* initial value */ /* @@ -677,13 +560,11 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * - estimate cumulative length needed for results * - set "status" field based on permissions, option header check * etc. - * - determine "pass_to_next" whether we need to send request to - * downstream module/driver. */ if ((t_error = process_topthdrs_first_pass(mp, cr, dbobjp, - &pass_to_next, &toa_len)) != 0) { + &toa_len)) != 0) { optcom_err_ack(q, mp, t_error, 0); - return (0); + return; } /* @@ -697,26 +578,14 @@ tpi_optcom_req(queue_t *q, mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, toa_mp = allocb_tmpl(toa_len, mp); if (!toa_mp) { optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); + return; } - first_mp = allocb(sizeof (opt_restart_t), BPRI_LO); - if (first_mp == NULL) { - freeb(toa_mp); - optcom_err_ack(q, mp, TSYSERR, ENOMEM); - return (0); - } - first_mp->b_datap->db_type = M_CTL; - or = (opt_restart_t *)first_mp->b_rptr; /* * Set initial values for generating output. */ - or->or_worst_status = T_SUCCESS; - or->or_type = T_OPTMGMT_REQ; - or->or_private = 0; - /* remaining fields fileed in do_options_second_pass */ + worst_status = T_SUCCESS; /* initial value */ -restart: /* * This routine makes another pass through the option buffer this * time acting on the request based on "status" result in the @@ -724,19 +593,11 @@ restart: * all options of a certain level and acts on each for this request. 
*/ if ((t_error = do_options_second_pass(q, mp, toa_mp, cr, dbobjp, - first_mp, is_restart, &queued_status)) != 0) { + &worst_status)) != 0) { freemsg(toa_mp); optcom_err_ack(q, mp, t_error, 0); - return (0); - } - if (queued_status) { - /* Option will be restarted */ - return (EINPROGRESS); + return; } - worst_status = or->or_worst_status; - /* Done with the first mp */ - freeb(first_mp); - toa_mp->b_cont = NULL; /* * Following code relies on the coincidence that T_optmgmt_req @@ -749,34 +610,12 @@ restart: toa->MGMT_flags = tor->MGMT_flags; - freemsg(mp); /* free input mblk */ - /* - * If there is atleast one option that requires a downstream - * forwarding and if it is possible, we forward the message - * downstream. Else we ack it. - */ - if (pass_to_next && (q->q_next != NULL || pass_to_ip)) { - /* - * We pass it down as T_OPTMGMT_REQ. This code relies - * on the happy coincidence that T_optmgmt_req and - * T_optmgmt_ack are identical data structures - * at the binary representation level. - */ - toa_mp->b_datap->db_type = M_PROTO; - toa->PRIM_type = T_OPTMGMT_REQ; - if (q->q_next != NULL) - putnext(q, toa_mp); - else - ip_output(Q_TO_CONN(q), toa_mp, q, IP_WPUT); - } else { - toa->PRIM_type = T_OPTMGMT_ACK; - toa_mp->b_datap->db_type = M_PCPROTO; - toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */ - qreply(q, toa_mp); - } - return (0); + toa->PRIM_type = T_OPTMGMT_ACK; + toa_mp->b_datap->db_type = M_PCPROTO; + toa->MGMT_flags |= worst_status; /* XXX "worst" or "OR" TPI ? */ + qreply(q, toa_mp); } @@ -786,17 +625,14 @@ restart: * - estimate cumulative length needed for results * - set "status" field based on permissions, option header check * etc. - * - determine "pass_to_next" whether we need to send request to - * downstream module/driver. 
*/ static t_scalar_t process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, - boolean_t *pass_to_nextp, size_t *toa_lenp) + size_t *toa_lenp) { opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; optlevel_t *valid_level_arr = dbobjp->odb_valid_levels_arr; uint_t valid_level_arr_cnt = dbobjp->odb_valid_levels_arr_cnt; struct T_opthdr *opt; @@ -843,18 +679,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * unchanged if they do not understand an * option. */ - if (topmost_tpiprovider) { - if (!opt_level_valid(opt->level, - valid_level_arr, - valid_level_arr_cnt)) - return (TBADOPT); - /* - * level is valid - initialize - * option as not supported - */ - opt->status = T_NOTSUPPORT; - } - + if (!opt_level_valid(opt->level, + valid_level_arr, valid_level_arr_cnt)) + return (TBADOPT); + /* + * level is valid - initialize + * option as not supported + */ + opt->status = T_NOTSUPPORT; *toa_lenp += _TPI_ALIGN_TOPT(opt->len); continue; } @@ -866,7 +698,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, */ allopt_len = 0; if (tor->MGMT_flags == T_CHECK || - !topmost_tpiprovider || ((allopt_len = opt_level_allopts_lengths(opt->level, opt_arr, opt_arr_cnt)) == 0)) { /* @@ -874,11 +705,6 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * It is not valid to to use T_ALLOPT with * T_CHECK flag. * - * T_ALLOPT is assumed "expanded" at the - * topmost_tpiprovider level so it should not - * be there as an "option name" if this is not - * a topmost_tpiprovider call and we fail it. - * * opt_level_allopts_lengths() is used to verify * that "level" associated with the T_ALLOPT is * supported. 
@@ -892,15 +718,8 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, *toa_lenp += allopt_len; opt->status = T_SUCCESS; - /* XXX - always set T_ALLOPT 'pass_to_next' for now */ - *pass_to_nextp = B_TRUE; continue; } - /* - * Check if option wants to flow downstream - */ - if (optd->opdes_props & OP_PASSNEXT) - *pass_to_nextp = B_TRUE; /* Additional checks dependent on operation. */ switch (tor->MGMT_flags) { @@ -972,7 +791,9 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, * Note: This can override anything about this * option request done at a higher level. */ - if (!opt_length_ok(optd, opt)) { + if (opt->len < sizeof (struct T_opthdr) || + !opt_length_ok(optd, + opt->len - sizeof (struct T_opthdr))) { /* bad size */ *toa_lenp += _TPI_ALIGN_TOPT(opt->len); opt->status = T_FAILURE; @@ -1034,23 +855,14 @@ process_topthdrs_first_pass(mblk_t *mp, cred_t *cr, optdb_obj_t *dbobjp, */ static t_scalar_t do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, - optdb_obj_t *dbobjp, mblk_t *first_mp, boolean_t is_restart, - boolean_t *queued_statusp) + optdb_obj_t *dbobjp, t_uscalar_t *worst_statusp) { - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; int failed_option; struct T_opthdr *opt; - struct T_opthdr *opt_start, *opt_end, *restart_opt; + struct T_opthdr *opt_start, *opt_end; uchar_t *optr; uint_t optset_context; struct T_optmgmt_req *tor = (struct T_optmgmt_req *)reqmp->b_rptr; - opt_restart_t *or; - t_uscalar_t *worst_statusp; - int err; - - *queued_statusp = B_FALSE; - or = (opt_restart_t *)first_mp->b_rptr; - worst_statusp = &or->or_worst_status; optr = (uchar_t *)ack_mp->b_rptr + sizeof (struct T_optmgmt_ack); /* assumed int32_t aligned */ @@ -1058,32 +870,16 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, /* * Set initial values for scanning input */ - if (is_restart) { - opt_start = (struct T_opthdr *)or->or_start; - opt_end = (struct 
T_opthdr *)or->or_end; - restart_opt = (struct T_opthdr *)or->or_ropt; - } else { - opt_start = (struct T_opthdr *)mi_offset_param(reqmp, - tor->OPT_offset, tor->OPT_length); - if (opt_start == NULL) - return (TBADOPT); - opt_end = (struct T_opthdr *)((uchar_t *)opt_start + - tor->OPT_length); - or->or_start = (struct opthdr *)opt_start; - or->or_end = (struct opthdr *)opt_end; - /* - * construct the mp chain, in case the setfn needs to - * queue this and restart option processing later on. - */ - first_mp->b_cont = ack_mp; - ack_mp->b_cont = reqmp; - } + opt_start = (struct T_opthdr *)mi_offset_param(reqmp, + tor->OPT_offset, tor->OPT_length); + if (opt_start == NULL) + return (TBADOPT); + opt_end = (struct T_opthdr *)((uchar_t *)opt_start + tor->OPT_length); ASSERT(__TPI_TOPT_ISALIGNED(opt_start)); /* verified in first pass */ - for (opt = is_restart ? restart_opt : opt_start; - opt && (opt < opt_end); + for (opt = opt_start; opt && (opt < opt_end); opt = _TPI_TOPT_NEXTHDR(opt_start, tor->OPT_length, opt)) { - or->or_ropt = (struct opthdr *)opt; + /* verified in first pass */ ASSERT(_TPI_TOPT_VALID(opt, opt_start, opt_end)); @@ -1144,9 +940,7 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, */ if (do_opt_default(q, opt, &optr, worst_statusp, cr, dbobjp) < 0) { - /* fail or pass transparently */ - if (topmost_tpiprovider) - opt->status = T_FAILURE; + opt->status = T_FAILURE; bcopy(opt, optr, opt->len); optr += _TPI_ALIGN_TOPT(opt->len); *worst_statusp = get_worst_status(opt->status, @@ -1166,12 +960,8 @@ do_options_second_pass(queue_t *q, mblk_t *reqmp, mblk_t *ack_mp, cred_t *cr, optset_context = SETFN_OPTCOM_CHECKONLY; else /* T_NEGOTIATE */ optset_context = SETFN_OPTCOM_NEGOTIATE; - err = do_opt_check_or_negotiate(q, opt, optset_context, - &optr, worst_statusp, cr, dbobjp, first_mp); - if (err == EINPROGRESS) { - *queued_statusp = B_TRUE; - return (0); - } + do_opt_check_or_negotiate(q, opt, optset_context, + &optr, 
worst_statusp, cr, dbobjp); break; default: return (TBADFLAG); @@ -1236,7 +1026,6 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, pfi_t deffn = dbobjp->odb_deffn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; struct T_opthdr *topth; opdes_t *optd; @@ -1248,15 +1037,8 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, optd = proto_opt_lookup(reqopt->level, reqopt->name, opt_arr, opt_arr_cnt); - if (optd == NULL) { - /* - * not found - fail this one. Should not happen - * for topmost_tpiprovider as calling routine - * should have verified it. - */ - ASSERT(!topmost_tpiprovider); - return (-1); - } + /* Calling routine should have verified it it exists */ + ASSERT(optd != NULL); topth = (struct T_opthdr *)(*resptrp); topth->level = reqopt->level; @@ -1333,10 +1115,7 @@ do_opt_default(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, * * lookup and stuff default values of all the options of the * level specified - * Note: This expansion of T_ALLOPT should happen in - * a topmost_tpiprovider. 
*/ - ASSERT(topmost_tpiprovider); for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { if (reqopt->level != optd->opdes_level) continue; @@ -1453,8 +1232,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, pfi_t getfn = dbobjp->odb_getfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - struct T_opthdr *topth; opdes_t *optd; int optlen; @@ -1484,7 +1261,6 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, *resptrp -= sizeof (struct T_opthdr); } } else { /* T_ALLOPT processing */ - ASSERT(topmost_tpiprovider == B_TRUE); /* scan and get all options */ for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { /* skip other levels */ @@ -1530,14 +1306,9 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, } if (*resptrp == initptr) { /* - * getfn failed and does not want to handle this option. Maybe - * something downstream will or something upstream did. (If - * topmost_tpiprovider, initialize "status" to failure which - * can possibly change downstream). Copy the input "as is" from - * input option buffer if any to maintain transparency. + * getfn failed and does not want to handle this option. 
*/ - if (topmost_tpiprovider) - reqopt->status = T_FAILURE; + reqopt->status = T_FAILURE; bcopy(reqopt, *resptrp, reqopt->len); *resptrp += _TPI_ALIGN_TOPT(reqopt->len); *worst_statusp = get_worst_status(reqopt->status, @@ -1545,18 +1316,15 @@ do_opt_current(queue_t *q, struct T_opthdr *reqopt, uchar_t **resptrp, } } -/* ARGSUSED */ -static int +static void do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, uint_t optset_context, uchar_t **resptrp, t_uscalar_t *worst_statusp, - cred_t *cr, optdb_obj_t *dbobjp, mblk_t *first_mp) + cred_t *cr, optdb_obj_t *dbobjp) { pfi_t deffn = dbobjp->odb_deffn; opt_set_fn setfn = dbobjp->odb_setfn; opdes_t *opt_arr = dbobjp->odb_opt_des_arr; uint_t opt_arr_cnt = dbobjp->odb_opt_arr_cnt; - boolean_t topmost_tpiprovider = dbobjp->odb_topmost_tpiprovider; - struct T_opthdr *topth; opdes_t *optd; int error; @@ -1572,12 +1340,10 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, error = (*setfn)(q, optset_context, reqopt->level, reqopt->name, reqopt->len - sizeof (struct T_opthdr), _TPI_TOPT_DATA(reqopt), &optlen, _TPI_TOPT_DATA(topth), - NULL, cr, first_mp); + NULL, cr); if (error) { /* failed - reset "*resptrp" */ *resptrp -= sizeof (struct T_opthdr); - if (error == EINPROGRESS) - return (error); } else { /* * success - "value" already filled in setfn() @@ -1594,7 +1360,6 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, } else { /* T_ALLOPT processing */ /* only for T_NEGOTIATE case */ ASSERT(optset_context == SETFN_OPTCOM_NEGOTIATE); - ASSERT(topmost_tpiprovider == B_TRUE); /* scan and set all options to default value */ for (optd = opt_arr; optd < &opt_arr[opt_arr_cnt]; optd++) { @@ -1670,7 +1435,7 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, error = (*setfn)(q, SETFN_OPTCOM_NEGOTIATE, reqopt->level, optd->opdes_name, optsize, (uchar_t *)optd->opdes_defbuf, &optlen, - _TPI_TOPT_DATA(topth), NULL, cr, NULL); + _TPI_TOPT_DATA(topth), NULL, cr); if (error) { /* * 
failed, return as T_FAILURE and null value @@ -1693,20 +1458,14 @@ do_opt_check_or_negotiate(queue_t *q, struct T_opthdr *reqopt, if (*resptrp == initptr) { /* - * setfn failed and does not want to handle this option. Maybe - * something downstream will or something upstream - * did. Copy the input as is from input option buffer if any to - * maintain transparency (maybe something at a level above - * did something. + * setfn failed and does not want to handle this option. */ - if (topmost_tpiprovider) - reqopt->status = T_FAILURE; + reqopt->status = T_FAILURE; bcopy(reqopt, *resptrp, reqopt->len); *resptrp += _TPI_ALIGN_TOPT(reqopt->len); *worst_statusp = get_worst_status(reqopt->status, *worst_statusp); } - return (0); } /* @@ -1886,7 +1645,8 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, */ /* verify length */ - if (!opt_length_ok(optd, opt)) { + if (opt->len < (t_uscalar_t)sizeof (struct T_opthdr) || + !opt_length_ok(optd, opt->len - sizeof (struct T_opthdr))) { /* bad size */ if ((optd->opdes_props & OP_NOT_ABSREQ) == 0) { /* option is absolute requirement */ @@ -1914,7 +1674,7 @@ tpi_optcom_buf(queue_t *q, mblk_t *mp, t_scalar_t *opt_lenp, error = (*setfn)(q, optset_context, opt->level, opt->name, opt->len - (t_uscalar_t)sizeof (struct T_opthdr), _TPI_TOPT_DATA(opt), &olen, _TPI_TOPT_DATA(opt), - thisdg_attrs, cr, NULL); + thisdg_attrs, cr); if (olen > (int)(opt->len - sizeof (struct T_opthdr))) { /* @@ -2113,8 +1873,12 @@ opt_bloated_maxsize(opdes_t *optd) return (B_FALSE); } +/* + * optlen is the length of the option content + * Caller should check the optlen is at least sizeof (struct T_opthdr) + */ static boolean_t -opt_length_ok(opdes_t *optd, struct T_opthdr *opt) +opt_length_ok(opdes_t *optd, t_uscalar_t optlen) { /* * Verify length. @@ -2122,95 +1886,60 @@ opt_length_ok(opdes_t *optd, struct T_opthdr *opt) * less than maxlen of variable length option. 
*/ if (optd->opdes_props & OP_VARLEN) { - if (opt->len <= optd->opdes_size + - (t_uscalar_t)sizeof (struct T_opthdr)) + if (optlen <= optd->opdes_size) return (B_TRUE); } else { /* fixed length option */ - if (opt->len == optd->opdes_size + - (t_uscalar_t)sizeof (struct T_opthdr)) + if (optlen == optd->opdes_size) return (B_TRUE); } return (B_FALSE); } /* - * This routine appends a pssed in hop-by-hop option to the existing - * option (in this case a cipso label encoded in HOPOPT option). The - * passed in option is always padded. The 'reservelen' is the - * length of reserved data (label). New memory will be allocated if - * the current buffer is not large enough. Return failure if memory + * This routine manages the allocation and free of the space for + * an extension header or option. Returns failure if memory * can not be allocated. */ int -optcom_pkt_set(uchar_t *invalp, uint_t inlen, boolean_t sticky, - uchar_t **optbufp, uint_t *optlenp, uint_t reservelen) +optcom_pkt_set(uchar_t *invalp, uint_t inlen, + uchar_t **optbufp, uint_t *optlenp) { uchar_t *optbuf; uchar_t *optp; - if (!sticky) { - *optbufp = invalp; - *optlenp = inlen; - return (0); - } - - if (inlen == *optlenp - reservelen) { + if (inlen == *optlenp) { /* Unchanged length - no need to reallocate */ - optp = *optbufp + reservelen; + optp = *optbufp; bcopy(invalp, optp, inlen); - if (reservelen != 0) { - /* - * Convert the NextHeader and Length of the - * passed in hop-by-hop header to pads - */ - optp[0] = IP6OPT_PADN; - optp[1] = 0; - } return (0); } - if (inlen + reservelen > 0) { + if (inlen > 0) { /* Allocate new buffer before free */ - optbuf = kmem_alloc(inlen + reservelen, KM_NOSLEEP); + optbuf = kmem_alloc(inlen, KM_NOSLEEP); if (optbuf == NULL) return (ENOMEM); } else { optbuf = NULL; } - /* Copy out old reserved data (label) */ - if (reservelen > 0) - bcopy(*optbufp, optbuf, reservelen); - /* Free old buffer */ if (*optlenp != 0) kmem_free(*optbufp, *optlenp); if (inlen > 0) - 
bcopy(invalp, optbuf + reservelen, inlen); + bcopy(invalp, optbuf, inlen); - if (reservelen != 0) { - /* - * Convert the NextHeader and Length of the - * passed in hop-by-hop header to pads - */ - optbuf[reservelen] = IP6OPT_PADN; - optbuf[reservelen + 1] = 0; - /* - * Set the Length of the hop-by-hop header, number of 8 - * byte-words following the 1st 8 bytes - */ - optbuf[1] = (reservelen + inlen - 1) >> 3; - } *optbufp = optbuf; - *optlenp = inlen + reservelen; + *optlenp = inlen; return (0); } int process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen, - void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, uint_t, int, - int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *cr) + void *optbuf, optdb_obj_t *dbobjp, int (*opt_set_fn)(conn_t *, + uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), + cred_t *cr) { struct cmsghdr *cmsg; opdes_t *optd; @@ -2254,7 +1983,7 @@ process_auxiliary_options(conn_t *connp, void *control, t_uscalar_t controllen, } error = opt_set_fn(connp, SETFN_UD_NEGOTIATE, optd->opdes_level, optd->opdes_name, len, (uchar_t *)CMSG_CONTENT(cmsg), - &outlen, (uchar_t *)CMSG_CONTENT(cmsg), (void *)optbuf, cr); + &outlen, (uchar_t *)CMSG_CONTENT(cmsg), optbuf, cr); if (error > 0) { return (error); } else if (outlen > len) { diff --git a/usr/src/uts/common/inet/optcom.h b/usr/src/uts/common/inet/optcom.h index df4f227e95..01ca52a759 100644 --- a/usr/src/uts/common/inet/optcom.h +++ b/usr/src/uts/common/inet/optcom.h @@ -34,6 +34,7 @@ extern "C" { #if defined(_KERNEL) && defined(__STDC__) #include <inet/ipclassifier.h> + /* Options Description Structure */ typedef struct opdes_s { t_uscalar_t opdes_name; /* option name */ @@ -138,20 +139,15 @@ typedef struct opdes_s { #define OA_NO_PERMISSION(x, c) (OA_MATCHED_PRIV((x), (c)) ? 
\ ((x)->opdes_access_priv == 0) : ((x)->opdes_access_nopriv == 0)) -#define PASS_OPT_TO_IP(connp) \ - if (IPCL_IS_NONSTR(connp)) \ - return (-EINVAL) - /* * Other properties set in opdes_props field. */ -#define OP_PASSNEXT 0x1 /* to pass option to next module or not */ -#define OP_VARLEN 0x2 /* option is varible length */ -#define OP_NOT_ABSREQ 0x4 /* option is not a "absolute requirement" */ +#define OP_VARLEN 0x1 /* option is varible length */ +#define OP_NOT_ABSREQ 0x2 /* option is not a "absolute requirement" */ /* i.e. failure to negotiate does not */ /* abort primitive ("ignore" semantics ok) */ -#define OP_NODEFAULT 0x8 /* no concept of "default value" */ -#define OP_DEF_FN 0x10 /* call a "default function" to get default */ +#define OP_NODEFAULT 0x4 /* no concept of "default value" */ +#define OP_DEF_FN 0x8 /* call a "default function" to get default */ /* value, not from static table */ @@ -165,13 +161,12 @@ typedef t_uscalar_t optlevel_t; typedef int (*opt_def_fn)(queue_t *, int, int, uchar_t *); typedef int (*opt_get_fn)(queue_t *, int, int, uchar_t *); typedef int (*opt_set_fn)(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); typedef struct optdb_obj { opt_def_fn odb_deffn; /* default value function */ opt_get_fn odb_getfn; /* get function */ opt_set_fn odb_setfn; /* set function */ - boolean_t odb_topmost_tpiprovider; /* whether topmost tpi */ /* provider or downstream */ uint_t odb_opt_arr_cnt; /* count of number of options in db */ opdes_t *odb_opt_des_arr; /* option descriptors in db */ @@ -182,22 +177,6 @@ typedef struct optdb_obj { } optdb_obj_t; /* - * This is used to restart option processing. This goes inside an M_CTL - * which is prepended to the packet. IP may need to become exclusive on - * an ill for setting some options. For dg. IP_ADD_MEMBERSHIP. 
Since - * there can be more than 1 option packed in an option buffer, we need to - * remember where to restart option processing after resuming from a wait - * for exclusive condition in IP. - */ -typedef struct opt_restart_s { - struct opthdr *or_start; /* start of option buffer */ - struct opthdr *or_end; /* end of option buffer */ - struct opthdr *or_ropt; /* restart option here */ - t_uscalar_t or_worst_status; /* Used by tpi_optcom_req */ - t_uscalar_t or_type; /* svr4 or tpi optcom variant */ - int or_private; /* currently used by CGTP */ -} opt_restart_t; -/* * Values for "optset_context" parameter passed to * transport specific "setfn()" routines */ @@ -210,16 +189,12 @@ typedef struct opt_restart_s { * Function prototypes */ extern void optcom_err_ack(queue_t *, mblk_t *, t_scalar_t, int); -extern int svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *, - boolean_t); -extern int tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *, - boolean_t); +extern void svr4_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *); +extern void tpi_optcom_req(queue_t *, mblk_t *, cred_t *, optdb_obj_t *); extern int tpi_optcom_buf(queue_t *, mblk_t *, t_scalar_t *, t_scalar_t, cred_t *, optdb_obj_t *, void *, int *); extern t_uscalar_t optcom_max_optsize(opdes_t *, uint_t); -extern int optcom_pkt_set(uchar_t *, uint_t, boolean_t, uchar_t **, uint_t *, - uint_t); - +extern int optcom_pkt_set(uchar_t *, uint_t, uchar_t **, uint_t *); extern int process_auxiliary_options(conn_t *, void *, t_uscalar_t, void *, optdb_obj_t *, int (*)(conn_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *), cred_t *); diff --git a/usr/src/uts/common/inet/proto_set.c b/usr/src/uts/common/inet/proto_set.c index 45f07d2ed3..499f046f6d 100644 --- a/usr/src/uts/common/inet/proto_set.c +++ b/usr/src/uts/common/inet/proto_set.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -348,27 +348,21 @@ proto_opt_lookup(t_uscalar_t level, t_uscalar_t name, opdes_t *opt_arr, /* * Do a lookup of the options in the array and do permission and length checking * Returns zero if there is no error (note: for non-tpi-providers not being able - * to find the option is not an error). TPI errors are returned as -ve. + * to find the option is not an error). TPI errors are returned as negative + * numbers and errnos as positive numbers. + * If max_len is set we update it based on the max length of the option. */ int proto_opt_check(int level, int name, int len, t_uscalar_t *max_len, - opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t topmost_tpiprovider, - boolean_t negotiate, boolean_t check, cred_t *cr) + opdes_t *opt_arr, uint_t opt_arr_cnt, boolean_t negotiate, boolean_t check, + cred_t *cr) { opdes_t *optd; /* Find the option in the opt_arr. */ - if ((optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt)) == - NULL) { - /* - * Not found, that is a bad thing if - * the caller is a tpi provider - */ - if (topmost_tpiprovider) - return (-TBADOPT); - else - return (0); /* skip unmodified */ - } + optd = proto_opt_lookup(level, name, opt_arr, opt_arr_cnt); + if (optd == NULL) + return (-TBADOPT); /* Additional checks dependent on operation. */ if (negotiate) { @@ -409,15 +403,12 @@ proto_opt_check(int level, int name, int len, t_uscalar_t *max_len, return (-TBADOPT); } /* - * XXX Change the comments. - * * XXX Since T_CURRENT was not there in TLI and the * official TLI inspired TPI standard, getsockopt() * API uses T_CHECK (for T_CURRENT semantics) - * The following fallthru makes sense because of its - * historical use as semantic equivalent to T_CURRENT. + * Thus T_CHECK includes the T_CURRENT semantics due to that + * historical use. 
*/ - /* FALLTHRU */ if (!OA_READ_PERMISSION(optd, cr)) { /* can't read option value */ if (!(OA_MATCHED_PRIV(optd, cr)) && diff --git a/usr/src/uts/common/inet/proto_set.h b/usr/src/uts/common/inet/proto_set.h index 8e714c7c05..488cf4d478 100644 --- a/usr/src/uts/common/inet/proto_set.h +++ b/usr/src/uts/common/inet/proto_set.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,7 +48,7 @@ extern int proto_tlitosyserr(int); extern int proto_verify_ip_addr(int, const struct sockaddr *, socklen_t); extern int proto_opt_check(int, int, int, t_uscalar_t *, opdes_t *, - uint_t, boolean_t, boolean_t, boolean_t, cred_t *); + uint_t, boolean_t, boolean_t, cred_t *); extern opdes_t *proto_opt_lookup(t_uscalar_t, t_uscalar_t, opdes_t *, uint_t); #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 5635bb0f01..348c4f5239 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -69,87 +69,25 @@ typedef struct icmp_stack icmp_stack_t; /* Internal icmp control structure, one per open stream */ typedef struct icmp_s { - krwlock_t icmp_rwlock; /* Protects most of icmp_t */ - t_scalar_t icmp_pending_op; /* The current TPI operation */ /* - * Following fields up to icmp_ipversion protected by conn_lock. + * The addresses and ports in the conn_t and icmp_state are protected by + * conn_lock. conn_lock also protects the content of icmp_t. */ uint_t icmp_state; /* TPI state */ - in6_addr_t icmp_v6src; /* Source address of this stream */ - in6_addr_t icmp_bound_v6src; /* Explicitely bound to address */ - sin6_t icmp_v6dst; /* Connected destination */ - /* - * IP format that packets transmitted from this struct should use. - * Value can be IP4_VERSION or IPV6_VERSION. 
- */ - uchar_t icmp_ipversion; - - /* Written to only once at the time of opening the endpoint */ - sa_family_t icmp_family; /* Family from socket() call */ - - /* Following protected by icmp_rwlock */ - uint32_t icmp_max_hdr_len; /* For write offset in stream head */ - uint_t icmp_proto; - uint_t icmp_ip_snd_options_len; /* Len of IPv4 options */ - uint8_t *icmp_ip_snd_options; /* Ptr to IPv4 options */ - uint8_t icmp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */ - ipaddr_t icmp_multicast_if_addr; /* IP_MULTICAST_IF option */ - uint_t icmp_multicast_if_index; /* IPV6_MULTICAST_IF option */ - int icmp_bound_if; /* IP*_BOUND_IF option */ /* Written to only once at the time of opening the endpoint */ conn_t *icmp_connp; - /* Following protected by icmp_rwlock */ uint_t - icmp_debug : 1, /* SO_DEBUG "socket" option. */ - icmp_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - icmp_broadcast : 1, /* SO_BROADCAST "socket" option. */ - icmp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - - icmp_useloopback : 1, /* SO_USELOOPBACK "socket" option. 
*/ icmp_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */ - icmp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - icmp_unspec_source : 1, /* IP*_UNSPEC_SRC option */ - icmp_raw_checksum : 1, /* raw checksum per IPV6_CHECKSUM */ - icmp_no_tp_cksum : 1, /* icmp_proto is UDP or TCP */ - icmp_ip_recvpktinfo : 1, /* IPV[4,6]_RECVPKTINFO option */ - icmp_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ + icmp_pad_to_bit_31: 31; - icmp_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - icmp_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - icmp_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - icmp_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU option */ - - icmp_recvif:1, /* IP_RECVIF for raw sockets option */ - icmp_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS option */ - icmp_ipv6_recvrtdstopts : 1, /* Obsolete IPV6_RECVRTHDRDSTOPTS */ - icmp_old_ipv6_recvdstopts : 1, /* Old ver of IPV6_RECVDSTOPTS */ - - icmp_timestamp : 1, /* SO_TIMESTAMP "socket" option */ - - icmp_pad_to_bit_31: 11; - - uint8_t icmp_type_of_service; - uint8_t icmp_ttl; /* TTL or hoplimit */ - uint32_t icmp_checksum_off; /* user supplied checksum offset */ icmp6_filter_t *icmp_filter; /* ICMP6_FILTER option */ - ip6_pkt_t icmp_sticky_ipp; /* Sticky options */ - uint8_t *icmp_sticky_hdrs; /* Prebuilt IPv6 hdrs */ - uint_t icmp_sticky_hdrs_len; /* Incl. ip6h and any ip6i */ - zoneid_t icmp_zoneid; /* ID of owning zone */ - uint_t icmp_label_len; /* length of security label */ - uint_t icmp_label_len_v6; /* sec. 
part of sticky opt */ - in6_addr_t icmp_v6lastdst; /* most recent destination */ - cred_t *icmp_last_cred; /* most recent credentials */ - cred_t *icmp_effective_cred; /* cred with effective label */ + /* Set at open time and never changed */ icmp_stack_t *icmp_is; /* Stack instance */ - size_t icmp_xmit_hiwat; - size_t icmp_xmit_lowat; - size_t icmp_recv_hiwat; - size_t icmp_recv_lowat; + int icmp_delayed_error; kmutex_t icmp_recv_lock; mblk_t *icmp_fallback_queue_head; @@ -165,6 +103,10 @@ typedef struct icmp_s { extern optdb_obj_t icmp_opt_obj; extern uint_t icmp_max_optsize; +extern int icmp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int icmp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int icmp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *); extern mblk_t *icmp_snmp_get(queue_t *q, mblk_t *mpctl); extern void icmp_ddi_g_init(void); diff --git a/usr/src/uts/common/inet/rts_impl.h b/usr/src/uts/common/inet/rts_impl.h index de7cd8970b..b2b9080e9e 100644 --- a/usr/src/uts/common/inet/rts_impl.h +++ b/usr/src/uts/common/inet/rts_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -71,13 +71,7 @@ typedef struct rts_s { uint_t rts_state; /* Provider interface state */ uint_t rts_error; /* Routing socket error code */ uint_t rts_flag; /* Pending I/O state */ - uint_t rts_proto; /* SO_PROTOTYPE "socket" option. */ - uint_t rts_debug : 1, /* SO_DEBUG "socket" option. */ - rts_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - rts_broadcast : 1, /* SO_BROADCAST "socket" option. */ - rts_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - rts_useloopback : 1, /* SO_USELOOPBACK "socket" option. 
*/ - rts_multicast_loop : 1, /* IP_MULTICAST_LOOP option */ + uint_t rts_hdrincl : 1, /* IP_HDRINCL option + RAW and IGMP */ : 0; @@ -86,30 +80,16 @@ typedef struct rts_s { /* Written to only once at the time of opening the endpoint */ conn_t *rts_connp; - /* Outbound flow control */ - size_t rts_xmit_hiwat; - size_t rts_xmit_lowat; - - /* Inbound flow control */ - size_t rts_recv_hiwat; - size_t rts_recv_lowat; - - kmutex_t rts_send_mutex; - kmutex_t rts_recv_mutex; - kcondvar_t rts_send_cv; - kcondvar_t rts_io_cv; + kmutex_t rts_recv_mutex; /* For recv flow control */ } rts_t; #define RTS_WPUT_PENDING 0x1 /* Waiting for write-side to complete */ -#define RTS_REQ_PENDING 0x1 /* For direct sockets */ #define RTS_WRW_PENDING 0x2 /* Routing socket write in progress */ -#define RTS_REQ_INPROG 0x2 /* For direct sockets */ /* * Object to represent database of options to search passed to * {sock,tpi}optcom_req() interface routine to take care of option * management and associated methods. - * XXX. These and other externs should really move to a rts header. */ extern optdb_obj_t rts_opt_obj; extern uint_t rts_max_optsize; @@ -119,7 +99,7 @@ extern void rts_ddi_g_destroy(void); extern int rts_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int rts_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); extern int rts_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr); diff --git a/usr/src/uts/common/inet/sadb.h b/usr/src/uts/common/inet/sadb.h index 6d3b9b5b27..7a45a41b85 100644 --- a/usr/src/uts/common/inet/sadb.h +++ b/usr/src/uts/common/inet/sadb.h @@ -37,14 +37,34 @@ extern "C" { #define IPSA_MAX_ADDRLEN 4 /* Max address len. (in 32-bits) for an SA. */ -/* - * Return codes of IPsec processing functions. 
- */ -typedef enum { - IPSEC_STATUS_SUCCESS = 1, - IPSEC_STATUS_FAILED = 2, - IPSEC_STATUS_PENDING = 3 -} ipsec_status_t; +#define MAXSALTSIZE 8 + +/* + * For combined mode ciphers, store the crypto_mechanism_t in the + * per-packet ipsec_in_t/ipsec_out_t structures. This is because the PARAMS + * and nonce values change for each packet. For non-combined mode + * ciphers, these values are constant for the life of the SA. + */ +typedef struct ipsa_cm_mech_s { + crypto_mechanism_t combined_mech; + union { + CK_AES_CCM_PARAMS paramu_ccm; + CK_AES_GCM_PARAMS paramu_gcm; + } paramu; + uint8_t nonce[MAXSALTSIZE + sizeof (uint64_t)]; +#define param_ulMACSize paramu.paramu_ccm.ulMACSize +#define param_ulNonceSize paramu.paramu_ccm.ipsa_ulNonceSize +#define param_ulAuthDataSize paramu.paramu_ccm.ipsa_ulAuthDataSize +#define param_ulDataSize paramu.paramu_ccm.ipsa_ulDataSize +#define param_nonce paramu.paramu_ccm.nonce +#define param_authData paramu.paramu_ccm.authData +#define param_pIv paramu.paramu_gcm.ipsa_pIv +#define param_ulIvLen paramu.paramu_gcm.ulIvLen +#define param_ulIvBits paramu.paramu_gcm.ulIvBits +#define param_pAAD paramu.paramu_gcm.pAAD +#define param_ulAADLen paramu.paramu_gcm.ulAADLen +#define param_ulTagBits paramu.paramu_gcm.ulTagBits +} ipsa_cm_mech_t; /* * The Initialization Vector (also known as IV or Nonce) used to @@ -280,9 +300,13 @@ typedef struct ipsa_s { /* * Input and output processing functions called from IP. + * The mblk_t is the data; the IPsec information is in the attributes + * Returns NULL if the mblk is consumed which it is if there was + * a failure or if pending. If failure then + * the ipIfInDiscards/OutDiscards counters are increased. 
*/ - ipsec_status_t (*ipsa_output_func)(mblk_t *); - ipsec_status_t (*ipsa_input_func)(mblk_t *, void *); + mblk_t *(*ipsa_output_func)(mblk_t *, ip_xmit_attr_t *); + mblk_t *(*ipsa_input_func)(mblk_t *, void *, ip_recv_attr_t *); /* * Soft reference to paired SA @@ -290,8 +314,8 @@ typedef struct ipsa_s { uint32_t ipsa_otherspi; netstack_t *ipsa_netstack; /* Does not have a netstack_hold */ - cred_t *ipsa_cred; /* MLS: cred_t attributes */ - cred_t *ipsa_ocred; /* MLS: outer label */ + ts_label_t *ipsa_tsl; /* MLS: label attributes */ + ts_label_t *ipsa_otsl; /* MLS: outer label */ uint8_t ipsa_mac_exempt; /* MLS: mac exempt flag */ uchar_t ipsa_opt_storage[IP_MAX_OPT_LENGTH]; } ipsa_t; @@ -382,7 +406,7 @@ typedef struct ipsa_s { #define IPSA_F_EALG1 SADB_X_SAFLAGS_EALG1 /* Encrypt alg flag 1 */ #define IPSA_F_EALG2 SADB_X_SAFLAGS_EALG2 /* Encrypt alg flag 2 */ -#define IPSA_F_HW 0x200000 /* hwaccel capable SA */ +#define IPSA_F_ASYNC 0x200000 /* Call KCF asynchronously? */ #define IPSA_F_NATT_LOC SADB_X_SAFLAGS_NATT_LOC #define IPSA_F_NATT_REM SADB_X_SAFLAGS_NATT_REM #define IPSA_F_BEHIND_NAT SADB_X_SAFLAGS_NATTED @@ -503,8 +527,8 @@ typedef struct ipsacq_s { uint8_t ipsacq_icmp_type; uint8_t ipsacq_icmp_code; - /* credentials associated with triggering packet */ - cred_t *ipsacq_cred; + /* label associated with triggering packet */ + ts_label_t *ipsacq_tsl; } ipsacq_t; /* @@ -529,7 +553,7 @@ typedef struct iacqf_s { * A (network protocol, ipsec protocol) specific SADB. * (i.e., one each for {ah, esp} and {v4, v6}. * - * Keep outbound assocs about the same as ire_cache entries for now. + * Keep outbound assocs in a simple hash table for now. * One danger point, multiple SAs for a single dest will clog a bucket. * For the future, consider two-level hashing (2nd hash on IPC?), then probe. 
*/ @@ -550,7 +574,6 @@ typedef struct sadb_s typedef struct sadbp_s { uint32_t s_satype; - queue_t *s_ip_q; uint32_t *s_acquire_timeout; void (*s_acqfn)(ipsacq_t *, mblk_t *, netstack_t *); sadb_t s_v4; @@ -583,14 +606,16 @@ typedef struct templist_s #define ALL_ZEROES_PTR ((uint32_t *)&ipv6_all_zeros) /* - * Form unique id from ipsec_out_t + * Form unique id from ip_xmit_attr_t. */ - -#define SA_FORM_UNIQUE_ID(io) \ - SA_UNIQUE_ID((io)->ipsec_out_src_port, (io)->ipsec_out_dst_port, \ - ((io)->ipsec_out_tunnel ? ((io)->ipsec_out_inaf == AF_INET6 ? \ - IPPROTO_IPV6 : IPPROTO_ENCAP) : (io)->ipsec_out_proto), \ - ((io)->ipsec_out_tunnel ? (io)->ipsec_out_proto : 0)) +#define SA_FORM_UNIQUE_ID(ixa) \ + SA_UNIQUE_ID((ixa)->ixa_ipsec_src_port, (ixa)->ixa_ipsec_dst_port, \ + (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \ + ((ixa)->ixa_ipsec_inaf == AF_INET6 ? \ + IPPROTO_IPV6 : IPPROTO_ENCAP) : \ + (ixa)->ixa_ipsec_proto), \ + (((ixa)->ixa_flags & IXAF_IPSEC_TUNNEL) ? \ + (ixa)->ixa_ipsec_proto : 0)) /* * This macro is used to generate unique ids (along with the addresses, both @@ -698,8 +723,8 @@ boolean_t sadb_match_query(ipsa_query_t *q, ipsa_t *sa); /* SA retrieval (inbound and outbound) */ ipsa_t *ipsec_getassocbyspi(isaf_t *, uint32_t, uint32_t *, uint32_t *, sa_family_t); -ipsa_t *ipsec_getassocbyconn(isaf_t *, ipsec_out_t *, uint32_t *, uint32_t *, - sa_family_t, uint8_t, cred_t *); +ipsa_t *ipsec_getassocbyconn(isaf_t *, ip_xmit_attr_t *, uint32_t *, uint32_t *, + sa_family_t, uint8_t, ts_label_t *); /* SA insertion. 
*/ int sadb_insertassoc(ipsa_t *, isaf_t *); @@ -727,9 +752,9 @@ boolean_t sadb_addrfix(keysock_in_t *, queue_t *, mblk_t *, netstack_t *); int sadb_addrset(ire_t *); int sadb_delget_sa(mblk_t *, keysock_in_t *, sadbp_t *, int *, queue_t *, uint8_t); -int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *, - queue_t *); -int sadb_common_add(queue_t *, queue_t *, mblk_t *, sadb_msg_t *, + +int sadb_purge_sa(mblk_t *, keysock_in_t *, sadb_t *, int *, queue_t *); +int sadb_common_add(queue_t *, mblk_t *, sadb_msg_t *, keysock_in_t *, isaf_t *, isaf_t *, ipsa_t *, boolean_t, boolean_t, int *, netstack_t *, sadbp_t *); void sadb_set_usetime(ipsa_t *); @@ -737,7 +762,7 @@ boolean_t sadb_age_bytes(queue_t *, ipsa_t *, uint64_t, boolean_t); int sadb_update_sa(mblk_t *, keysock_in_t *, mblk_t **, sadbp_t *, int *, queue_t *, int (*)(mblk_t *, keysock_in_t *, int *, netstack_t *), netstack_t *, uint8_t); -void sadb_acquire(mblk_t *, ipsec_out_t *, boolean_t, boolean_t); +void sadb_acquire(mblk_t *, ip_xmit_attr_t *, boolean_t, boolean_t); void gcm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *, crypto_data_t *); void ccm_params_init(ipsa_t *, uchar_t *, uint_t, uchar_t *, ipsa_cm_mech_t *, @@ -754,16 +779,17 @@ boolean_t sadb_replay_check(ipsa_t *, uint32_t); boolean_t sadb_replay_peek(ipsa_t *, uint32_t); int sadb_dump(queue_t *, mblk_t *, keysock_in_t *, sadb_t *); void sadb_replay_delete(ipsa_t *); -void sadb_ager(sadb_t *, queue_t *, queue_t *, int, netstack_t *); +void sadb_ager(sadb_t *, queue_t *, int, netstack_t *); timeout_id_t sadb_retimeout(hrtime_t, queue_t *, void (*)(void *), void *, uint_t *, uint_t, short); void sadb_sa_refrele(void *target); -boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, netstack_t *); +boolean_t sadb_set_lpkt(ipsa_t *, mblk_t *, ip_recv_attr_t *); mblk_t *sadb_clear_lpkt(ipsa_t *); -void sadb_buf_pkt(ipsa_t *, mblk_t *, netstack_t *); +void sadb_buf_pkt(ipsa_t *, mblk_t *, ip_recv_attr_t *); void 
sadb_clear_buf_pkt(void *ipkt); +/* Note that buf_pkt is the product of ip_recv_attr_to_mblk() */ #define HANDLE_BUF_PKT(taskq, stack, dropper, buf_pkt) \ { \ if (buf_pkt != NULL) { \ @@ -774,8 +800,9 @@ void sadb_clear_buf_pkt(void *ipkt); while (buf_pkt != NULL) { \ tmp = buf_pkt->b_next; \ buf_pkt->b_next = NULL; \ + buf_pkt = ip_recv_attr_free_mblk(buf_pkt); \ ip_drop_packet(buf_pkt, B_TRUE, NULL, \ - NULL, DROPPER(stack, \ + DROPPER(stack, \ ipds_sadb_inidle_timeout), \ &dropper); \ buf_pkt = tmp; \ @@ -785,24 +812,8 @@ void sadb_clear_buf_pkt(void *ipkt); } \ /* - * Hw accel-related calls (downloading sadb to driver) - */ -void sadb_ill_download(ill_t *, uint_t); -mblk_t *sadb_fmt_sa_req(uint_t, uint_t, ipsa_t *, boolean_t); -/* - * Sub-set of the IPsec hardware acceleration capabilities functions - * implemented by ip_if.c - */ -extern boolean_t ipsec_capab_match(ill_t *, uint_t, boolean_t, ipsa_t *, - netstack_t *); -extern void ill_ipsec_capab_send_all(uint_t, mblk_t *, ipsa_t *, - netstack_t *); - - -/* - * One IPsec -> IP linking routine, and two IPsec rate-limiting routines. + * Two IPsec rate-limiting routines. */ -extern boolean_t sadb_t_bind_req(queue_t *, int); /*PRINTFLIKE6*/ extern void ipsec_rl_strlog(netstack_t *, short, short, char, ushort_t, char *, ...) 
@@ -818,7 +829,8 @@ extern void ipsec_assocfailure(short, short, char, ushort_t, char *, uint32_t, typedef enum ipsec_algtype { IPSEC_ALG_AUTH = 0, - IPSEC_ALG_ENCR = 1 + IPSEC_ALG_ENCR = 1, + IPSEC_ALG_ALL = 2 } ipsec_algtype_t; /* @@ -886,11 +898,10 @@ extern void ipsec_alg_fix_min_max(ipsec_alginfo_t *, ipsec_algtype_t, extern void alg_flag_check(ipsec_alginfo_t *); extern void ipsec_alg_free(ipsec_alginfo_t *); extern void ipsec_register_prov_update(void); -extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t, - netstack_t *); +extern void sadb_alg_update(ipsec_algtype_t, uint8_t, boolean_t, netstack_t *); -extern int sadb_sens_len_from_cred(cred_t *); -extern void sadb_sens_from_cred(sadb_sens_t *, int, cred_t *, int); +extern int sadb_sens_len_from_label(ts_label_t *); +extern void sadb_sens_from_label(sadb_sens_t *, int, ts_label_t *, int); /* * Context templates management. diff --git a/usr/src/uts/common/inet/sctp/sctp.c b/usr/src/uts/common/inet/sctp/sctp.c index 00fc6cda42..d444e1f10e 100644 --- a/usr/src/uts/common/inet/sctp/sctp.c +++ b/usr/src/uts/common/inet/sctp/sctp.c @@ -56,6 +56,8 @@ #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/mi.h> #include <inet/mib2.h> @@ -74,12 +76,6 @@ int sctpdebug; sin6_t sctp_sin6_null; /* Zero address for quick clears */ -/* - * Have to ensure that sctp_g_q_close is not done by an - * interrupt thread. 
- */ -static taskq_t *sctp_taskq; - static void sctp_closei_local(sctp_t *sctp); static int sctp_init_values(sctp_t *, sctp_t *, int); static void sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp); @@ -91,12 +87,10 @@ static void sctp_conn_cache_fini(); static int sctp_conn_cache_constructor(); static void sctp_conn_cache_destructor(); static void sctp_conn_clear(conn_t *); -void sctp_g_q_setup(sctp_stack_t *); -void sctp_g_q_create(sctp_stack_t *); -void sctp_g_q_destroy(sctp_stack_t *); +static void sctp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); static void *sctp_stack_init(netstackid_t stackid, netstack_t *ns); -static void sctp_stack_shutdown(netstackid_t stackid, void *arg); static void sctp_stack_fini(netstackid_t stackid, void *arg); /* @@ -178,8 +172,8 @@ sctp_create_eager(sctp_t *psctp) { sctp_t *sctp; mblk_t *ack_mp, *hb_mp; - conn_t *connp, *pconnp; - cred_t *credp; + conn_t *connp; + cred_t *credp; sctp_stack_t *sctps = psctp->sctp_sctps; if ((connp = ipcl_conn_create(IPCL_SCTPCONN, KM_NOSLEEP, @@ -187,8 +181,6 @@ sctp_create_eager(sctp_t *psctp) return (NULL); } - connp->conn_ulp_labeled = is_system_labeled(); - sctp = CONN2SCTP(connp); sctp->sctp_sctps = sctps; @@ -200,7 +192,6 @@ sctp_create_eager(sctp_t *psctp) freeb(ack_mp); sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); kmem_cache_free(sctp_conn_cache, connp); return (NULL); } @@ -208,43 +199,20 @@ sctp_create_eager(sctp_t *psctp) sctp->sctp_ack_mp = ack_mp; sctp->sctp_heartbeat_mp = hb_mp; - /* Inherit information from the "parent" */ - sctp->sctp_ipversion = psctp->sctp_ipversion; - sctp->sctp_family = psctp->sctp_family; - pconnp = psctp->sctp_connp; - connp->conn_af_isv6 = pconnp->conn_af_isv6; - connp->conn_pkt_isv6 = pconnp->conn_pkt_isv6; - connp->conn_ipv6_v6only = pconnp->conn_ipv6_v6only; if (sctp_init_values(sctp, psctp, KM_NOSLEEP) != 0) { freeb(ack_mp); freeb(hb_mp); sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - 
SCTP_G_Q_REFRELE(sctps); kmem_cache_free(sctp_conn_cache, connp); return (NULL); } - /* - * If the parent is multilevel, then we'll fix up the remote cred - * when we do sctp_accept_comm. - */ - if ((credp = pconnp->conn_cred) != NULL) { + if ((credp = psctp->sctp_connp->conn_cred) != NULL) { connp->conn_cred = credp; crhold(credp); - /* - * If the caller has the process-wide flag set, then default to - * MAC exempt mode. This allows read-down to unlabeled hosts. - */ - if (getpflags(NET_MAC_AWARE, credp) != 0) - connp->conn_mac_mode = CONN_MAC_AWARE; } - connp->conn_allzones = pconnp->conn_allzones; - connp->conn_zoneid = pconnp->conn_zoneid; - sctp->sctp_cpid = psctp->sctp_cpid; - sctp->sctp_open_time = lbolt64; - sctp->sctp_mss = psctp->sctp_mss; sctp->sctp_detached = B_TRUE; /* @@ -263,11 +231,6 @@ void sctp_clean_death(sctp_t *sctp, int err) { ASSERT(sctp != NULL); - ASSERT((sctp->sctp_family == AF_INET && - sctp->sctp_ipversion == IPV4_VERSION) || - (sctp->sctp_family == AF_INET6 && - (sctp->sctp_ipversion == IPV4_VERSION || - sctp->sctp_ipversion == IPV6_VERSION))); dprint(3, ("sctp_clean_death %p, state %d\n", (void *)sctp, sctp->sctp_state)); @@ -328,7 +291,8 @@ sctp_clean_death(sctp_t *sctp, int err) int sctp_disconnect(sctp_t *sctp) { - int error = 0; + int error = 0; + conn_t *connp = sctp->sctp_connp; dprint(3, ("sctp_disconnect %p, state %d\n", (void *)sctp, sctp->sctp_state)); @@ -358,7 +322,7 @@ sctp_disconnect(sctp_t *sctp) * If SO_LINGER has set a zero linger time, terminate the * association and send an ABORT. 
*/ - if (sctp->sctp_linger && sctp->sctp_lingertime == 0) { + if (connp->conn_linger && connp->conn_lingertime == 0) { sctp_user_abort(sctp, NULL); WAKE_SCTP(sctp); return (error); @@ -382,7 +346,7 @@ sctp_disconnect(sctp_t *sctp) sctp_send_shutdown(sctp, 0); /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); /* * If lingering on close then wait until the shutdown @@ -391,21 +355,15 @@ sctp_disconnect(sctp_t *sctp) * can be called more than once. Make sure that only * one thread waits. */ - if (sctp->sctp_linger && sctp->sctp_lingertime > 0 && + if (connp->conn_linger && connp->conn_lingertime > 0 && sctp->sctp_state >= SCTPS_ESTABLISHED && !sctp->sctp_lingering) { clock_t stoptime; /* in ticks */ clock_t ret; - /* - * Process the sendq to send the SHUTDOWN out - * before waiting. - */ - sctp_process_sendq(sctp); - sctp->sctp_lingering = 1; sctp->sctp_client_errno = 0; - stoptime = lbolt + sctp->sctp_lingertime; + stoptime = lbolt + connp->conn_lingertime * hz; mutex_enter(&sctp->sctp_lock); sctp->sctp_running = B_FALSE; @@ -429,7 +387,6 @@ sctp_disconnect(sctp_t *sctp) } WAKE_SCTP(sctp); - sctp_process_sendq(sctp); return (error); } @@ -493,7 +450,6 @@ static void sctp_closei_local(sctp_t *sctp) { mblk_t *mp; - ire_t *ire = NULL; conn_t *connp = sctp->sctp_connp; /* Sanity check, don't do the same thing twice. */ @@ -516,11 +472,7 @@ sctp_closei_local(sctp_t *sctp) /* Set the CONN_CLOSING flag so that IP will not cache IRE again. */ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CLOSING; - ire = connp->conn_ire_cache; - connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); - if (ire != NULL) - IRE_REFRELE_NOTR(ire); /* Remove from all hashes. 
*/ sctp_bind_hash_remove(sctp); @@ -534,14 +486,12 @@ sctp_closei_local(sctp_t *sctp) */ mutex_enter(&sctp->sctp_recvq_lock); while ((mp = sctp->sctp_recvq) != NULL) { - mblk_t *ipsec_mp; - sctp->sctp_recvq = mp->b_next; mp->b_next = NULL; - if ((ipsec_mp = mp->b_prev) != NULL) { - freeb(ipsec_mp); - mp->b_prev = NULL; - } + + if (ip_recv_attr_is_mblk(mp)) + mp = ip_recv_attr_free_mblk(mp); + freemsg(mp); } mutex_exit(&sctp->sctp_recvq_lock); @@ -668,7 +618,7 @@ sctp_free(conn_t *connp) SCTP_UNLINK(sctp, sctps); ASSERT(connp->conn_ref == 0); - ASSERT(connp->conn_ulp == IPPROTO_SCTP); + ASSERT(connp->conn_proto == IPPROTO_SCTP); ASSERT(!MUTEX_HELD(&sctp->sctp_reflock)); ASSERT(sctp->sctp_refcnt == 0); @@ -723,8 +673,6 @@ sctp_free(conn_t *connp) list_destroy(&sctp->sctp_saddrs[cnt].sctp_ipif_list); } - ip6_pkt_free(&sctp->sctp_sticky_ipp); - if (sctp->sctp_hopopts != NULL) { mi_free(sctp->sctp_hopopts); sctp->sctp_hopopts = NULL; @@ -737,12 +685,12 @@ sctp_free(conn_t *connp) sctp->sctp_dstoptslen = 0; } ASSERT(sctp->sctp_dstoptslen == 0); - if (sctp->sctp_rtdstopts != NULL) { - mi_free(sctp->sctp_rtdstopts); - sctp->sctp_rtdstopts = NULL; - sctp->sctp_rtdstoptslen = 0; + if (sctp->sctp_rthdrdstopts != NULL) { + mi_free(sctp->sctp_rthdrdstopts); + sctp->sctp_rthdrdstopts = NULL; + sctp->sctp_rthdrdstoptslen = 0; } - ASSERT(sctp->sctp_rtdstoptslen == 0); + ASSERT(sctp->sctp_rthdrdstoptslen == 0); if (sctp->sctp_rthdr != NULL) { mi_free(sctp->sctp_rthdr); sctp->sctp_rthdr = NULL; @@ -806,9 +754,7 @@ sctp_free(conn_t *connp) sctp->sctp_v6label_len = 0; sctp->sctp_v4label_len = 0; - /* Every sctp_t holds one reference on the default queue */ sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); sctp_conn_clear(connp); kmem_cache_free(sctp_conn_cache, connp); @@ -822,10 +768,12 @@ sctp_display(sctp_t *sctp, char *sup_buf) char buf1[30]; static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; char *cp; + conn_t *connp; if (sctp == NULL) return ("NULL_SCTP"); + connp = 
sctp->sctp_connp; buf = (sup_buf != NULL) ? sup_buf : priv_buf; switch (sctp->sctp_state) { @@ -865,7 +813,7 @@ sctp_display(sctp_t *sctp, char *sup_buf) break; } (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(sctp->sctp_lport), ntohs(sctp->sctp_fport), cp); + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); return (buf); } @@ -880,13 +828,9 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) int err; int cnt; sctp_stack_t *sctps = sctp->sctp_sctps; - conn_t *connp, *pconnp; + conn_t *connp; - ASSERT((sctp->sctp_family == AF_INET && - sctp->sctp_ipversion == IPV4_VERSION) || - (sctp->sctp_family == AF_INET6 && - (sctp->sctp_ipversion == IPV4_VERSION || - sctp->sctp_ipversion == IPV6_VERSION))); + connp = sctp->sctp_connp; sctp->sctp_nsaddrs = 0; for (cnt = 0; cnt < SCTP_IPIF_HASH; cnt++) { @@ -895,7 +839,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sizeof (sctp_saddr_ipif_t), offsetof(sctp_saddr_ipif_t, saddr_ipif)); } - sctp->sctp_ports = 0; + connp->conn_ports = 0; sctp->sctp_running = B_FALSE; sctp->sctp_state = SCTPS_IDLE; @@ -925,51 +869,16 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) if (psctp != NULL) { /* * Inherit from parent + * + * Start by inheriting from the conn_t, including conn_ixa and + * conn_xmit_ipp. 
*/ - sctp->sctp_iphc = kmem_zalloc(psctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc == NULL) { - sctp->sctp_iphc_len = 0; - err = ENOMEM; - goto failure; - } - sctp->sctp_iphc_len = psctp->sctp_iphc_len; - sctp->sctp_hdr_len = psctp->sctp_hdr_len; - - sctp->sctp_iphc6 = kmem_zalloc(psctp->sctp_iphc6_len, sleep); - if (sctp->sctp_iphc6 == NULL) { - sctp->sctp_iphc6_len = 0; - err = ENOMEM; + err = conn_inherit_parent(psctp->sctp_connp, connp); + if (err != 0) goto failure; - } - sctp->sctp_iphc6_len = psctp->sctp_iphc6_len; - sctp->sctp_hdr6_len = psctp->sctp_hdr6_len; - - sctp->sctp_ip_hdr_len = psctp->sctp_ip_hdr_len; - sctp->sctp_ip_hdr6_len = psctp->sctp_ip_hdr6_len; - - /* - * Copy the IP+SCTP header templates from listener - */ - bcopy(psctp->sctp_iphc, sctp->sctp_iphc, - psctp->sctp_hdr_len); - sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc; - sctp->sctp_sctph = (sctp_hdr_t *)(sctp->sctp_iphc + - sctp->sctp_ip_hdr_len); - - bcopy(psctp->sctp_iphc6, sctp->sctp_iphc6, - psctp->sctp_hdr6_len); - if (((ip6i_t *)(sctp->sctp_iphc6))->ip6i_nxt == IPPROTO_RAW) { - sctp->sctp_ip6h = (ip6_t *)(sctp->sctp_iphc6 + - sizeof (ip6i_t)); - } else { - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; - } - sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 + - sctp->sctp_ip_hdr6_len); sctp->sctp_cookie_lifetime = psctp->sctp_cookie_lifetime; - sctp->sctp_xmit_lowater = psctp->sctp_xmit_lowater; - sctp->sctp_xmit_hiwater = psctp->sctp_xmit_hiwater; + sctp->sctp_cwnd_max = psctp->sctp_cwnd_max; sctp->sctp_rwnd = psctp->sctp_rwnd; sctp->sctp_irwnd = psctp->sctp_rwnd; @@ -996,43 +905,23 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sctp->sctp_tx_adaptation_code = psctp->sctp_tx_adaptation_code; /* xxx should be a better way to copy these flags xxx */ - sctp->sctp_debug = psctp->sctp_debug; sctp->sctp_bound_to_all = psctp->sctp_bound_to_all; sctp->sctp_cansleep = psctp->sctp_cansleep; sctp->sctp_send_adaptation = psctp->sctp_send_adaptation; sctp->sctp_ndelay = 
psctp->sctp_ndelay; sctp->sctp_events = psctp->sctp_events; - sctp->sctp_ipv6_recvancillary = psctp->sctp_ipv6_recvancillary; - - /* Copy IP-layer options */ - connp = sctp->sctp_connp; - pconnp = psctp->sctp_connp; - - connp->conn_broadcast = pconnp->conn_broadcast; - connp->conn_loopback = pconnp->conn_loopback; - connp->conn_dontroute = pconnp->conn_dontroute; - connp->conn_reuseaddr = pconnp->conn_reuseaddr; - } else { /* - * Initialize the header template - */ - if ((err = sctp_header_init_ipv4(sctp, sleep)) != 0) { - goto failure; - } - if ((err = sctp_header_init_ipv6(sctp, sleep)) != 0) { - goto failure; - } - - /* * Set to system defaults */ sctp->sctp_cookie_lifetime = MSEC_TO_TICK(sctps->sctps_cookie_life); - sctp->sctp_xmit_lowater = sctps->sctps_xmit_lowat; - sctp->sctp_xmit_hiwater = sctps->sctps_xmit_hiwat; + connp->conn_sndlowat = sctps->sctps_xmit_lowat; + connp->conn_sndbuf = sctps->sctps_xmit_hiwat; + connp->conn_rcvbuf = sctps->sctps_recv_hiwat; + sctp->sctp_cwnd_max = sctps->sctps_cwnd_max_; - sctp->sctp_rwnd = sctps->sctps_recv_hiwat; + sctp->sctp_rwnd = connp->conn_rcvbuf; sctp->sctp_irwnd = sctp->sctp_rwnd; sctp->sctp_pd_point = sctp->sctp_rwnd; sctp->sctp_rto_max = MSEC_TO_TICK(sctps->sctps_rto_maxg); @@ -1049,13 +938,28 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) sctp->sctp_hb_interval = MSEC_TO_TICK(sctps->sctps_heartbeat_interval); + + if (connp->conn_family == AF_INET) + connp->conn_default_ttl = sctps->sctps_ipv4_ttl; + else + connp->conn_default_ttl = sctps->sctps_ipv6_hoplimit; + + connp->conn_xmit_ipp.ipp_unicast_hops = + connp->conn_default_ttl; + + /* + * Initialize the header template + */ + if ((err = sctp_build_hdrs(sctp, sleep)) != 0) { + goto failure; + } } + sctp->sctp_understands_asconf = B_TRUE; sctp->sctp_understands_addip = B_TRUE; sctp->sctp_prsctp_aware = B_FALSE; sctp->sctp_connp->conn_ref = 1; - sctp->sctp_connp->conn_fully_bound = B_FALSE; sctp->sctp_prsctpdrop = 0; sctp->sctp_msgcount = 0; @@ 
-1063,14 +967,7 @@ sctp_init_values(sctp_t *sctp, sctp_t *psctp, int sleep) return (0); failure: - if (sctp->sctp_iphc != NULL) { - kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len); - sctp->sctp_iphc = NULL; - } - if (sctp->sctp_iphc6 != NULL) { - kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len); - sctp->sctp_iphc6 = NULL; - } + sctp_headers_free(sctp); return (err); } @@ -1102,8 +999,122 @@ sctp_icmp_verf(sctp_t *sctp, sctp_hdr_t *sh, mblk_t *mp) } /* + * Update the SCTP state according to change of PMTU. + * + * Path MTU might have changed by either increase or decrease, so need to + * adjust the MSS based on the value of ixa_pmtu. + */ +static void +sctp_update_pmtu(sctp_t *sctp, sctp_faddr_t *fp, boolean_t decrease_only) +{ + uint32_t pmtu; + int32_t mss; + ip_xmit_attr_t *ixa = fp->ixa; + + if (sctp->sctp_state < SCTPS_ESTABLISHED) + return; + + /* + * Always call ip_get_pmtu() to make sure that IP has updated + * ixa_flags properly. + */ + pmtu = ip_get_pmtu(ixa); + + /* + * Calculate the MSS by decreasing the PMTU by sctp_hdr_len and + * IPsec overhead if applied. Make sure to use the most recent + * IPsec information. + */ + mss = pmtu - conn_ipsec_length(sctp->sctp_connp); + if (ixa->ixa_flags & IXAF_IS_IPV4) + mss -= sctp->sctp_hdr_len; + else + mss -= sctp->sctp_hdr6_len; + + /* + * Nothing to change, so just return. + */ + if (mss == fp->sfa_pmss) + return; + + /* + * Currently, for ICMP errors, only PMTU decrease is handled. + */ + if (mss > fp->sfa_pmss && decrease_only) + return; + +#ifdef DEBUG + (void) printf("sctp_update_pmtu mss from %d to %d\n", + fp->sfa_pmss, mss); +#endif + DTRACE_PROBE2(sctp_update_pmtu, int32_t, fp->sfa_pmss, uint32_t, mss); + + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * Make sure that sfa_pmss is a multiple of + * SCTP_ALIGN. 
+ */ + fp->sfa_pmss = mss & ~(SCTP_ALIGN - 1); + fp->pmtu_discovered = 1; + +#ifdef notyet + if (mss < sctp->sctp_sctps->sctps_mss_min) + ixa->ixa_flags |= IXAF_PMTU_TOO_SMALL; +#endif + if (ixa->ixa_flags & IXAF_PMTU_TOO_SMALL) + ixa->ixa_flags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + + /* + * If below the min size then ip_get_pmtu cleared IXAF_PMTU_IPV4_DF. + * Make sure to clear IXAF_DONTFRAG, which is used by IP to decide + * whether to fragment the packet. + */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (!(ixa->ixa_flags & IXAF_PMTU_IPV4_DF)) { + fp->df = B_FALSE; + if (fp == sctp->sctp_current) { + sctp->sctp_ipha-> + ipha_fragment_offset_and_flags = 0; + } + } + } +} + +/* + * Notify function registered with ip_xmit_attr_t. It's called in the context + * of conn_ip_output so it's safe to update the SCTP state. + * Currently only used for pmtu changes. + */ +/* ARGSUSED1 */ +static void +sctp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + sctp_t *sctp = (sctp_t *)arg; + sctp_faddr_t *fp; + + switch (ntype) { + case IXAN_PMTU: + /* Find the faddr based on the ip_xmit_attr_t pointer */ + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { + if (fp->ixa == ixa) + break; + } + if (fp != NULL) + sctp_update_pmtu(sctp, fp, B_FALSE); + break; + default: + break; + } +} + +/* * sctp_icmp_error is called by sctp_input() to process ICMP error messages - * passed up by IP. The queue is the default queue. We need to find a sctp_t + * passed up by IP. We need to find a sctp_t * that corresponds to the returned datagram. Passes the message back in on * the correct queue once it has located the connection. 
* Assumes that IP has pulled up everything up to and including @@ -1116,8 +1127,6 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) ipha_t *ipha; int iph_hdr_length; sctp_hdr_t *sctph; - mblk_t *first_mp; - uint32_t new_mtu; in6_addr_t dst; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -1125,12 +1134,10 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) dprint(1, ("sctp_icmp_error: sctp=%p, mp=%p\n", (void *)sctp, (void *)mp)); - first_mp = mp; - ipha = (ipha_t *)mp->b_rptr; if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - sctp_icmp_error_ipv6(sctp, first_mp); + sctp_icmp_error_ipv6(sctp, mp); return; } @@ -1144,7 +1151,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) /* first_mp must expose the full sctp header. */ if ((uchar_t *)(sctph + 1) >= mp->b_wptr) { /* not enough data for SCTP header */ - freemsg(first_mp); + freemsg(mp); return; } @@ -1175,19 +1182,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) if (fp == NULL) { break; } - - new_mtu = ntohs(icmph->icmph_du_mtu); - - if (new_mtu - sctp->sctp_hdr_len >= fp->sfa_pmss) - break; - - /* - * Make sure that sfa_pmss is a multiple of - * SCTP_ALIGN. - */ - fp->sfa_pmss = (new_mtu - sctp->sctp_hdr_len) & - ~(SCTP_ALIGN - 1); - fp->pmtu_discovered = 1; + sctp_update_pmtu(sctp, fp, B_TRUE); /* * It is possible, even likely that a fast retransmit * attempt has been dropped by ip as a result of this @@ -1229,7 +1224,7 @@ sctp_icmp_error(sctp_t *sctp, mblk_t *mp) break; } } - freemsg(first_mp); + freemsg(mp); } /* @@ -1246,7 +1241,6 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) uint16_t iph_hdr_length; sctp_hdr_t *sctpha; uint8_t *nexthdrp; - uint32_t new_mtu; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -1294,16 +1288,16 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) break; } - new_mtu = ntohs(icmp6->icmp6_mtu); - - if (new_mtu - sctp->sctp_hdr6_len >= fp->sfa_pmss) - break; - - /* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. 
*/ - fp->sfa_pmss = (new_mtu - sctp->sctp_hdr6_len) & - ~(SCTP_ALIGN - 1); - fp->pmtu_discovered = 1; - + sctp_update_pmtu(sctp, fp, B_TRUE); + /* + * It is possible, even likely that a fast retransmit + * attempt has been dropped by ip as a result of this + * error, retransmission bundles as much as possible. + * A retransmit here prevents significant delays waiting + * on the timer. Analogous to behaviour of TCP after + * ICMP too big. + */ + sctp_rexmit(sctp, fp); break; case ICMP6_DST_UNREACH: @@ -1366,12 +1360,12 @@ sctp_icmp_error_ipv6(sctp_t *sctp, mblk_t *mp) * If parent pointer is passed in, inherit settings from it. */ sctp_t * -sctp_create(void *ulpd, sctp_t *parent, int family, int flags, +sctp_create(void *ulpd, sctp_t *parent, int family, int type, int flags, sock_upcalls_t *upcalls, sctp_sockbuf_limits_t *sbl, cred_t *credp) { sctp_t *sctp, *psctp; - conn_t *sctp_connp; + conn_t *connp; mblk_t *ack_mp, *hb_mp; int sleep = flags & SCTP_CAN_BLOCK ? KM_SLEEP : KM_NOSLEEP; zoneid_t zoneid; @@ -1403,18 +1397,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, zoneid = GLOBAL_ZONEID; else zoneid = crgetzoneid(credp); - - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (sctps->sctps_g_q == NULL && - sctps->sctps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - sctp_g_q_setup(sctps); - } } - if ((sctp_connp = ipcl_conn_create(IPCL_SCTPCONN, sleep, + if ((connp = ipcl_conn_create(IPCL_SCTPCONN, sleep, sctps->sctps_netstack)) == NULL) { netstack_rele(sctps->sctps_netstack); SCTP_KSTAT(sctps, sctp_conn_create); @@ -1425,49 +1409,38 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, * done at top of sctp_create. 
*/ netstack_rele(sctps->sctps_netstack); - sctp = CONN2SCTP(sctp_connp); + sctp = CONN2SCTP(connp); sctp->sctp_sctps = sctps; - sctp_connp->conn_ulp_labeled = is_system_labeled(); if ((ack_mp = sctp_timer_alloc(sctp, sctp_ack_timer, sleep)) == NULL || (hb_mp = sctp_timer_alloc(sctp, sctp_heartbeat_timer, sleep)) == NULL) { if (ack_mp != NULL) freeb(ack_mp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } sctp->sctp_ack_mp = ack_mp; sctp->sctp_heartbeat_mp = hb_mp; - switch (family) { - case AF_INET6: - sctp_connp->conn_af_isv6 = B_TRUE; - sctp->sctp_ipversion = IPV6_VERSION; - sctp->sctp_family = AF_INET6; - break; + /* + * Have conn_ip_output drop packets should our outer source + * go invalid, and tell us about mtu changes. + */ + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU; + connp->conn_family = family; + connp->conn_so_type = type; - case AF_INET: - sctp_connp->conn_af_isv6 = B_FALSE; - sctp_connp->conn_pkt_isv6 = B_FALSE; - sctp->sctp_ipversion = IPV4_VERSION; - sctp->sctp_family = AF_INET; - break; - default: - ASSERT(0); - break; - } if (sctp_init_values(sctp, psctp, sleep) != 0) { freeb(ack_mp); freeb(hb_mp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } sctp->sctp_cansleep = ((flags & SCTP_CAN_BLOCK) == SCTP_CAN_BLOCK); @@ -1476,6 +1449,8 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, sctp->sctp_hdr6_len : sctp->sctp_hdr_len); if (psctp != NULL) { + conn_t *pconnp = psctp->sctp_connp; + RUN_SCTP(psctp); /* * Inherit local address list, local port. 
Parent is either @@ -1488,10 +1463,9 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, freeb(ack_mp); freeb(hb_mp); sctp_headers_free(sctp); - sctp_conn_clear(sctp_connp); + sctp_conn_clear(connp); sctp->sctp_sctps = NULL; - SCTP_G_Q_REFRELE(sctps); - kmem_cache_free(sctp_conn_cache, sctp_connp); + kmem_cache_free(sctp_conn_cache, connp); return (NULL); } @@ -1500,28 +1474,32 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, * followed by sctp_connect(). So don't add this guy to * bind hash. */ - sctp->sctp_lport = psctp->sctp_lport; + connp->conn_lport = pconnp->conn_lport; sctp->sctp_state = SCTPS_BOUND; - sctp->sctp_allzones = psctp->sctp_allzones; - sctp->sctp_zoneid = psctp->sctp_zoneid; WAKE_SCTP(psctp); } else { - sctp->sctp_zoneid = zoneid; - } - - sctp->sctp_cpid = curproc->p_pid; - sctp->sctp_open_time = lbolt64; + ASSERT(connp->conn_cred == NULL); + connp->conn_zoneid = zoneid; + /* + * conn_allzones can not be set this early, hence + * no IPCL_ZONEID + */ + connp->conn_ixa->ixa_zoneid = zoneid; + connp->conn_open_time = lbolt64; + connp->conn_cred = credp; + crhold(credp); + connp->conn_cpid = curproc->p_pid; - ASSERT(sctp_connp->conn_cred == NULL); - sctp_connp->conn_cred = credp; - crhold(credp); + /* + * If the caller has the process-wide flag set, then default to + * MAC exempt mode. This allows read-down to unlabeled hosts. + */ + if (getpflags(NET_MAC_AWARE, credp) != 0) + connp->conn_mac_mode = CONN_MAC_AWARE; - /* - * If the caller has the process-wide flag set, then default to MAC - * exempt mode. This allows read-down to unlabeled hosts. 
- */ - if (getpflags(NET_MAC_AWARE, credp) != 0) - sctp_connp->conn_mac_mode = CONN_MAC_AWARE; + connp->conn_zone_is_global = + (crgetzoneid(credp) == GLOBAL_ZONEID); + } /* Initialize SCTP instance values, our verf tag must never be 0 */ (void) random_get_pseudo_bytes((uint8_t *)&sctp->sctp_lvtag, @@ -1536,20 +1514,17 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, sctp->sctp_adv_pap = sctp->sctp_lastack_rxd; /* Information required by upper layer */ - if (ulpd != NULL) { - sctp->sctp_ulpd = ulpd; - - ASSERT(upcalls != NULL); - sctp->sctp_upcalls = upcalls; - ASSERT(sbl != NULL); - /* Fill in the socket buffer limits for sctpsockfs */ - sbl->sbl_txlowat = sctp->sctp_xmit_lowater; - sbl->sbl_txbuf = sctp->sctp_xmit_hiwater; - sbl->sbl_rxbuf = sctp->sctp_rwnd; - sbl->sbl_rxlowat = SCTP_RECV_LOWATER; - } - /* If no ulpd, must be creating the default sctp */ - ASSERT(ulpd != NULL || sctps->sctps_gsctp == NULL); + ASSERT(ulpd != NULL); + sctp->sctp_ulpd = ulpd; + + ASSERT(upcalls != NULL); + sctp->sctp_upcalls = upcalls; + ASSERT(sbl != NULL); + /* Fill in the socket buffer limits for sctpsockfs */ + sbl->sbl_txlowat = connp->conn_sndlowat; + sbl->sbl_txbuf = connp->conn_sndbuf; + sbl->sbl_rxbuf = sctp->sctp_rwnd; + sbl->sbl_rxlowat = SCTP_RECV_LOWATER; /* Insert this in the global list. */ SCTP_LINK(sctp, sctps); @@ -1557,232 +1532,6 @@ sctp_create(void *ulpd, sctp_t *parent, int family, int flags, return (sctp); } -/* - * Make sure we wait until the default queue is setup, yet allow - * sctp_g_q_create() to open a SCTP stream. - * We need to allow sctp_g_q_create() do do an open - * of sctp, hence we compare curhread. - * All others have to wait until the sctps_g_q has been - * setup. 
- */ -void -sctp_g_q_setup(sctp_stack_t *sctps) -{ - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_g_q != NULL) { - mutex_exit(&sctps->sctps_g_q_lock); - return; - } - if (sctps->sctps_g_q_creator == NULL) { - /* This thread will set it up */ - sctps->sctps_g_q_creator = curthread; - mutex_exit(&sctps->sctps_g_q_lock); - sctp_g_q_create(sctps); - mutex_enter(&sctps->sctps_g_q_lock); - ASSERT(sctps->sctps_g_q_creator == curthread); - sctps->sctps_g_q_creator = NULL; - cv_signal(&sctps->sctps_g_q_cv); - ASSERT(sctps->sctps_g_q != NULL); - mutex_exit(&sctps->sctps_g_q_lock); - return; - } - /* Everybody but the creator has to wait */ - if (sctps->sctps_g_q_creator != curthread) { - while (sctps->sctps_g_q == NULL) - cv_wait(&sctps->sctps_g_q_cv, &sctps->sctps_g_q_lock); - } - mutex_exit(&sctps->sctps_g_q_lock); -} - -#define IP "ip" - -#define SCTP6DEV "/devices/pseudo/sctp6@0:sctp6" - -/* - * Create a default sctp queue here instead of in strplumb - */ -void -sctp_g_q_create(sctp_stack_t *sctps) -{ - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - int rval; - cred_t *cr; - major_t IP_MAJ; - -#ifdef NS_DEBUG - (void) printf("sctp_g_q_create()for stack %d\n", - sctps->sctps_netstack->netstack_stackid); -#endif - - IP_MAJ = ddi_name_to_major(IP); - - ASSERT(sctps->sctps_g_q_creator == curthread); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = zone_get_kcred(netstackid_to_zoneid( - sctps->sctps_netstack->netstack_stackid)); - ASSERT(cr != NULL); - /* - * We set the sctp default queue to IPv6 because IPv4 falls - * back to IPv6 when it can't find a client, but - * IPv6 does not fall back to IPv4. 
- */ - error = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, cr, &lh, li); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: open of SCTP6DEV failed error %d\n", - error); -#endif - goto out; - } - - /* - * This ioctl causes the sctp framework to cache a pointer to - * this stream, so we don't want to close the stream after - * this operation. - * Use the kernel credentials that are for the zone we're in. - */ - error = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q, - (intptr_t)0, FKIOCTL, cr, &rval); - if (error) { -#ifdef DEBUG - printf("sctp_g_q_create: ioctl SCTP_IOC_DEFAULT_Q failed " - "error %d\n", error); -#endif - goto out; - } - sctps->sctps_g_q_lh = lh; /* For sctp_g_q_inactive */ - lh = NULL; -out: - /* Close layered handles */ - if (li) - ldi_ident_release(li); - /* Keep cred around until _inactive needs it */ - sctps->sctps_g_q_cr = cr; -} - -/* - * Remove the sctp_default queue so that new connections will not find it. - * SCTP uses sctp_g_q for all transmission, so all sctp'ts implicitly - * refer to it. Hence have each one have a reference on sctp_g_q_ref! - * - * We decrement the refcnt added in sctp_g_q_create. Once all the - * sctp_t's which use the default go away, sctp_g_q_close will be called - * and close the sctp_g_q. Once sctp_g_q is closed, sctp_close() will drop the - * last reference count on the stack by calling netstack_rele(). - */ -void -sctp_g_q_destroy(sctp_stack_t *sctps) -{ - if (sctps->sctps_g_q == NULL) { - return; /* Nothing to cleanup */ - } - /* - * Keep sctps_g_q and sctps_gsctp until the last reference has - * dropped, since the output is always done using those. - * Need to decrement twice to take sctp_g_q_create and - * the gsctp reference into account so that sctp_g_q_inactive is called - * when all but the default queue remains. 
- */ -#ifdef NS_DEBUG - (void) printf("sctp_g_q_destroy: ref %d\n", - sctps->sctps_g_q_ref); -#endif - SCTP_G_Q_REFRELE(sctps); -} - -/* - * Called when last user (could be sctp_g_q_destroy) drops reference count - * using SCTP_G_Q_REFRELE. - * Run by sctp_q_q_inactive using a taskq. - */ -static void -sctp_g_q_close(void *arg) -{ - sctp_stack_t *sctps = arg; - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - cred_t *cr; - major_t IP_MAJ; - - IP_MAJ = ddi_name_to_major(IP); - - lh = sctps->sctps_g_q_lh; - if (lh == NULL) - return; /* Nothing to cleanup */ - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef NS_DEBUG - printf("sctp_g_q_inactive: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = sctps->sctps_g_q_cr; - sctps->sctps_g_q_cr = NULL; - ASSERT(cr != NULL); - - /* - * Make sure we can break the recursion when sctp_close decrements - * the reference count causing g_q_inactive to be called again. - */ - sctps->sctps_g_q_lh = NULL; - - /* close the default queue */ - (void) ldi_close(lh, FREAD|FWRITE, cr); - - /* Close layered handles */ - ldi_ident_release(li); - crfree(cr); - - ASSERT(sctps->sctps_g_q != NULL); - sctps->sctps_g_q = NULL; - /* - * Now free sctps_gsctp. - */ - ASSERT(sctps->sctps_gsctp != NULL); - sctp_closei_local(sctps->sctps_gsctp); - SCTP_CONDEMNED(sctps->sctps_gsctp); - SCTP_REFRELE(sctps->sctps_gsctp); - sctps->sctps_gsctp = NULL; -} - -/* - * Called when last sctp_t drops reference count using SCTP_G_Q_REFRELE. - * - * Have to ensure that the ldi routines are not used by an - * interrupt thread by using a taskq. 
- */ -void -sctp_g_q_inactive(sctp_stack_t *sctps) -{ - if (sctps->sctps_g_q_lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(sctps->sctps_g_q_ref == 0); - SCTP_G_Q_REFHOLD(sctps); /* Compensate for what g_q_destroy did */ - - if (servicing_interrupt()) { - (void) taskq_dispatch(sctp_taskq, sctp_g_q_close, - (void *) sctps, TQ_SLEEP); - } else { - sctp_g_q_close(sctps); - } -} - /* Run at module load time */ void sctp_ddi_g_init(void) @@ -1802,16 +1551,12 @@ sctp_ddi_g_init(void) /* Initialize tables used for CRC calculation */ sctp_crc32_init(); - sctp_taskq = taskq_create("sctp_taskq", 1, minclsyspri, 1, 1, - TASKQ_PREPOPULATE); - /* * We want to be informed each time a stack is created or * destroyed in the kernel, so we can maintain the * set of sctp_stack_t's. */ - netstack_register(NS_SCTP, sctp_stack_init, sctp_stack_shutdown, - sctp_stack_fini); + netstack_register(NS_SCTP, sctp_stack_init, NULL, sctp_stack_fini); } static void * @@ -1823,8 +1568,6 @@ sctp_stack_init(netstackid_t stackid, netstack_t *ns) sctps->sctps_netstack = ns; /* Initialize locks */ - mutex_init(&sctps->sctps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&sctps->sctps_g_q_cv, NULL, CV_DEFAULT, NULL); mutex_init(&sctps->sctps_g_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctps->sctps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); sctps->sctps_g_num_epriv_ports = SCTP_NUM_EPRIV_PORTS; @@ -1875,19 +1618,6 @@ sctp_ddi_g_destroy(void) sctp_ftsn_sets_fini(); netstack_unregister(NS_SCTP); - taskq_destroy(sctp_taskq); -} - -/* - * Shut down the SCTP stack instance. 
- */ -/* ARGSUSED */ -static void -sctp_stack_shutdown(netstackid_t stackid, void *arg) -{ - sctp_stack_t *sctps = (sctp_stack_t *)arg; - - sctp_g_q_destroy(sctps); } /* @@ -1922,8 +1652,6 @@ sctp_stack_fini(netstackid_t stackid, void *arg) mutex_destroy(&sctps->sctps_g_lock); mutex_destroy(&sctps->sctps_epriv_port_lock); - mutex_destroy(&sctps->sctps_g_q_lock); - cv_destroy(&sctps->sctps_g_q_cv); kmem_free(sctps, sizeof (*sctps)); } @@ -1934,7 +1662,8 @@ sctp_display_all(sctp_stack_t *sctps) sctp_t *sctp_walker; mutex_enter(&sctps->sctps_g_lock); - for (sctp_walker = sctps->sctps_gsctp; sctp_walker != NULL; + for (sctp_walker = list_head(&sctps->sctps_g_list); + sctp_walker != NULL; sctp_walker = (sctp_t *)list_next(&sctps->sctps_g_list, sctp_walker)) { (void) sctp_display(sctp_walker, NULL); @@ -2009,81 +1738,6 @@ sctp_inc_taskq(sctp_stack_t *sctps) } #ifdef DEBUG -uint32_t sendq_loop_cnt = 0; -uint32_t sendq_collision = 0; -uint32_t sendq_empty = 0; -#endif - -void -sctp_add_sendq(sctp_t *sctp, mblk_t *mp) -{ - mutex_enter(&sctp->sctp_sendq_lock); - if (sctp->sctp_sendq == NULL) { - sctp->sctp_sendq = mp; - sctp->sctp_sendq_tail = mp; - } else { - sctp->sctp_sendq_tail->b_next = mp; - sctp->sctp_sendq_tail = mp; - } - mutex_exit(&sctp->sctp_sendq_lock); -} - -void -sctp_process_sendq(sctp_t *sctp) -{ - mblk_t *mp; -#ifdef DEBUG - uint32_t loop_cnt = 0; -#endif - - mutex_enter(&sctp->sctp_sendq_lock); - if (sctp->sctp_sendq == NULL || sctp->sctp_sendq_sending) { -#ifdef DEBUG - if (sctp->sctp_sendq == NULL) - sendq_empty++; - else - sendq_collision++; -#endif - mutex_exit(&sctp->sctp_sendq_lock); - return; - } - sctp->sctp_sendq_sending = B_TRUE; - - /* - * Note that while we are in this loop, other thread can put - * new packets in the receive queue. We may be looping for - * quite a while. This is OK even for an interrupt thread. - * The reason is that SCTP should only able to send a limited - * number of packets out in a burst. 
So the number of times - * we go through this loop should not be many. - */ - while ((mp = sctp->sctp_sendq) != NULL) { - sctp->sctp_sendq = mp->b_next; - ASSERT(sctp->sctp_connp->conn_ref > 0); - mutex_exit(&sctp->sctp_sendq_lock); - mp->b_next = NULL; - CONN_INC_REF(sctp->sctp_connp); - mp->b_flag |= MSGHASREF; - /* If we don't have sctp_current, default to IPv4 */ - IP_PUT(mp, sctp->sctp_connp, sctp->sctp_current == NULL ? - B_TRUE : sctp->sctp_current->isv4); - BUMP_LOCAL(sctp->sctp_opkts); -#ifdef DEBUG - loop_cnt++; -#endif - mutex_enter(&sctp->sctp_sendq_lock); - } - - sctp->sctp_sendq_tail = NULL; - sctp->sctp_sendq_sending = B_FALSE; -#ifdef DEBUG - if (loop_cnt > sendq_loop_cnt) - sendq_loop_cnt = loop_cnt; -#endif - mutex_exit(&sctp->sctp_sendq_lock); -} - -#ifdef DEBUG uint32_t recvq_loop_cnt = 0; uint32_t recvq_call = 0; #endif @@ -2144,10 +1798,19 @@ sctp_find_next_tq(sctp_t *sctp) * If the try_harder argument is B_TRUE, this routine sctp_find_next_tq() * will try very hard to dispatch the task. Refer to the comment * for that routine on how it does that. + * + * On failure the message has been freed i.e., this routine always consumes the + * message. It bumps ipIfStatsInDiscards and and uses ip_drop_input to drop. */ -boolean_t -sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) +void +sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock, + ip_recv_attr_t *ira) { + mblk_t *attrmp; + ip_stack_t *ipst = sctp->sctp_sctps->sctps_netstack->netstack_ip; + + ASSERT(ira->ira_ill == NULL); + if (!caller_hold_lock) mutex_enter(&sctp->sctp_recvq_lock); @@ -2157,12 +1820,28 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) if (!sctp_find_next_tq(sctp)) { if (!caller_hold_lock) mutex_exit(&sctp->sctp_recvq_lock); - return (B_FALSE); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + return; } /* Make sure the sctp_t will not go away. 
*/ SCTP_REFHOLD(sctp); } + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + if (!caller_hold_lock) + mutex_exit(&sctp->sctp_recvq_lock); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + mp = attrmp; + if (sctp->sctp_recvq == NULL) { sctp->sctp_recvq = mp; sctp->sctp_recvq_tail = mp; @@ -2173,7 +1852,6 @@ sctp_add_recvq(sctp_t *sctp, mblk_t *mp, boolean_t caller_hold_lock) if (!caller_hold_lock) mutex_exit(&sctp->sctp_recvq_lock); - return (B_TRUE); } static void @@ -2181,10 +1859,10 @@ sctp_process_recvq(void *arg) { sctp_t *sctp = (sctp_t *)arg; mblk_t *mp; - mblk_t *ipsec_mp; #ifdef DEBUG uint32_t loop_cnt = 0; #endif + ip_recv_attr_t iras; #ifdef _BIG_ENDIAN #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) @@ -2204,16 +1882,31 @@ sctp_process_recvq(void *arg) * quite a while. */ while ((mp = sctp->sctp_recvq) != NULL) { + mblk_t *data_mp; + sctp->sctp_recvq = mp->b_next; mutex_exit(&sctp->sctp_recvq_lock); mp->b_next = NULL; #ifdef DEBUG loop_cnt++; #endif - ipsec_mp = mp->b_prev; mp->b_prev = NULL; - sctp_input_data(sctp, mp, ipsec_mp); + data_mp = mp->b_cont; + mp->b_cont = NULL; + if (!ip_recv_attr_from_mblk(mp, &iras)) { + ip_drop_input("ip_recv_attr_from_mblk", mp, NULL); + freemsg(mp); + ira_cleanup(&iras, B_TRUE); + continue; + } + + if (iras.ira_flags & IRAF_ICMP_ERROR) + sctp_icmp_error(sctp, data_mp); + else + sctp_input_data(sctp, data_mp, &iras); + + ira_cleanup(&iras, B_TRUE); mutex_enter(&sctp->sctp_recvq_lock); } @@ -2224,8 +1917,6 @@ sctp_process_recvq(void *arg) WAKE_SCTP(sctp); - /* We may have sent something when processing the receive queue. 
*/ - sctp_process_sendq(sctp); #ifdef DEBUG if (loop_cnt > recvq_loop_cnt) recvq_loop_cnt = loop_cnt; @@ -2238,18 +1929,32 @@ sctp_process_recvq(void *arg) static int sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags) { - conn_t *sctp_connp = (conn_t *)buf; - sctp_t *sctp = (sctp_t *)&sctp_connp[1]; + conn_t *connp = (conn_t *)buf; + sctp_t *sctp = (sctp_t *)&connp[1]; + bzero(connp, sizeof (conn_t)); bzero(buf, (char *)&sctp[1] - (char *)buf); - sctp->sctp_connp = sctp_connp; mutex_init(&sctp->sctp_reflock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctp->sctp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&sctp->sctp_recvq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&sctp->sctp_cv, NULL, CV_DEFAULT, NULL); - mutex_init(&sctp->sctp_sendq_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); + connp->conn_flags = IPCL_SCTPCONN; + connp->conn_proto = IPPROTO_SCTP; + connp->conn_sctp = sctp; + sctp->sctp_connp = connp; + rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); + + connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); + if (connp->conn_ixa == NULL) { + return (ENOMEM); + } + connp->conn_ixa->ixa_refcnt = 1; + connp->conn_ixa->ixa_protocol = connp->conn_proto; + connp->conn_ixa->ixa_xmit_hint = CONN_TO_XMIT_HINT(connp); return (0); } @@ -2257,14 +1962,13 @@ sctp_conn_cache_constructor(void *buf, void *cdrarg, int kmflags) static void sctp_conn_cache_destructor(void *buf, void *cdrarg) { - conn_t *sctp_connp = (conn_t *)buf; - sctp_t *sctp = (sctp_t *)&sctp_connp[1]; + conn_t *connp = (conn_t *)buf; + sctp_t *sctp = (sctp_t *)&connp[1]; + ASSERT(sctp->sctp_connp == connp); ASSERT(!MUTEX_HELD(&sctp->sctp_lock)); ASSERT(!MUTEX_HELD(&sctp->sctp_reflock)); ASSERT(!MUTEX_HELD(&sctp->sctp_recvq_lock)); - ASSERT(!MUTEX_HELD(&sctp->sctp_sendq_lock)); - ASSERT(!MUTEX_HELD(&sctp->sctp_connp->conn_lock)); ASSERT(sctp->sctp_conn_hash_next == NULL); 
ASSERT(sctp->sctp_conn_hash_prev == NULL); @@ -2317,16 +2021,6 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg) ASSERT(sctp->sctp_recvq_tail == NULL); ASSERT(sctp->sctp_recvq_tq == NULL); - ASSERT(sctp->sctp_sendq == NULL); - ASSERT(sctp->sctp_sendq_tail == NULL); - ASSERT(sctp->sctp_sendq_sending == B_FALSE); - - ASSERT(sctp->sctp_ipp_hopopts == NULL); - ASSERT(sctp->sctp_ipp_rtdstopts == NULL); - ASSERT(sctp->sctp_ipp_rthdr == NULL); - ASSERT(sctp->sctp_ipp_dstopts == NULL); - ASSERT(sctp->sctp_ipp_pathmtu == NULL); - /* * sctp_pad_mp can be NULL if the memory allocation fails * in sctp_init_values() and the conn_t is freed. @@ -2340,8 +2034,18 @@ sctp_conn_cache_destructor(void *buf, void *cdrarg) mutex_destroy(&sctp->sctp_lock); mutex_destroy(&sctp->sctp_recvq_lock); cv_destroy(&sctp->sctp_cv); - mutex_destroy(&sctp->sctp_sendq_lock); + mutex_destroy(&connp->conn_lock); + cv_destroy(&connp->conn_cv); + rw_destroy(&connp->conn_ilg_lock); + + /* Can be NULL if constructor failed */ + if (connp->conn_ixa != NULL) { + ASSERT(connp->conn_ixa->ixa_refcnt == 1); + ASSERT(connp->conn_ixa->ixa_ire == NULL); + ASSERT(connp->conn_ixa->ixa_nce == NULL); + ixa_refrele(connp->conn_ixa); + } } static void @@ -2361,31 +2065,53 @@ sctp_conn_cache_fini() void sctp_conn_init(conn_t *connp) { - connp->conn_flags = IPCL_SCTPCONN; + ASSERT(connp->conn_flags == IPCL_SCTPCONN); connp->conn_rq = connp->conn_wq = NULL; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - connp->conn_ulp = IPPROTO_SCTP; + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU; + + ASSERT(connp->conn_proto == IPPROTO_SCTP); + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); connp->conn_state_flags |= CONN_INCIPIENT; - mutex_init(&connp->conn_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&connp->conn_cv, NULL, CV_DEFAULT, NULL); + + ASSERT(connp->conn_sctp != NULL); + + /* + * Register sctp_notify to listen to capability changes detected by IP. 
+ * This upcall is made in the context of the call to conn_ip_output + * thus it holds whatever locks sctp holds across conn_ip_output. + */ + connp->conn_ixa->ixa_notify = sctp_notify; + connp->conn_ixa->ixa_notify_cookie = connp->conn_sctp; } static void sctp_conn_clear(conn_t *connp) { /* Clean up conn_t stuff */ - if (connp->conn_latch != NULL) - IPLATCH_REFRELE(connp->conn_latch, connp->conn_netstack); - if (connp->conn_policy != NULL) + if (connp->conn_latch != NULL) { + IPLATCH_REFRELE(connp->conn_latch); + connp->conn_latch = NULL; + } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } + if (connp->conn_policy != NULL) { IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); - if (connp->conn_ipsec_opt_mp != NULL) + connp->conn_policy = NULL; + } + if (connp->conn_ipsec_opt_mp != NULL) { freemsg(connp->conn_ipsec_opt_mp); - if (connp->conn_cred != NULL) - crfree(connp->conn_cred); - if (connp->conn_effective_cred != NULL) - crfree(connp->conn_effective_cred); - mutex_destroy(&connp->conn_lock); - cv_destroy(&connp->conn_cv); + connp->conn_ipsec_opt_mp = NULL; + } netstack_rele(connp->conn_netstack); - bzero(connp, sizeof (struct conn_s)); + connp->conn_netstack = NULL; + + /* Leave conn_ixa and other constructed fields in place */ + ipcl_conn_cleanup(connp); } diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.c b/usr/src/uts/common/inet/sctp/sctp_addr.c index b347d30dda..306362211d 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.c +++ b/usr/src/uts/common/inet/sctp/sctp_addr.c @@ -41,6 +41,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ip_ire.h> #include <inet/ip_if.h> #include <inet/ipclassifier.h> #include <inet/sctp_ip.h> @@ -236,6 +237,7 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep) int error 
= 0; sctp_stack_t *sctps = sctp->sctp_sctps; boolean_t isv6; + conn_t *connp = sctp->sctp_connp; rw_enter(&sctps->sctps_g_ipifs_lock, RW_READER); for (i = 0; i < SCTP_IPIF_HASH; i++) { @@ -250,8 +252,8 @@ sctp_get_all_ipifs(sctp_t *sctp, int sleep) !SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) || SCTP_IS_ADDR_UNSPEC(!isv6, sctp_ipif->sctp_ipif_saddr) || - (sctp->sctp_ipversion == IPV4_VERSION && isv6) || - (sctp->sctp_connp->conn_ipv6_v6only && !isv6)) { + (connp->conn_family == AF_INET && isv6) || + (connp->conn_ipv6_v6only && !isv6)) { rw_exit(&sctp_ipif->sctp_ipif_lock); sctp_ipif = list_next( &sctps->sctps_g_ipifs[i].sctp_ipif_list, @@ -303,6 +305,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, boolean_t check_addrs = B_FALSE; boolean_t check_lport = B_FALSE; uchar_t *p = list; + conn_t *connp = sctp->sctp_connp; /* * Need to check for port and address depending on the state. @@ -325,11 +328,11 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, boolean_t lookup_saddr = B_TRUE; uint_t ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + cnt; if (sin4->sin_family != AF_INET || (check_lport && - sin4->sin_port != sctp->sctp_lport)) { + sin4->sin_port != connp->conn_lport)) { err = EINVAL; goto free_ret; } @@ -351,14 +354,14 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, case AF_INET6: sin6 = (struct sockaddr_in6 *)addrs + cnt; if (sin6->sin6_family != AF_INET6 || (check_lport && - sin6->sin6_port != sctp->sctp_lport)) { + sin6->sin6_port != connp->conn_lport)) { err = EINVAL; goto free_ret; } addr = sin6->sin6_addr; /* Contains the interface index */ ifindex = sin6->sin6_scope_id; - if (sctp->sctp_connp->conn_ipv6_v6only && + if (connp->conn_ipv6_v6only && IN6_IS_ADDR_V4MAPPED(&addr)) { err = EAFNOSUPPORT; goto free_ret; @@ -382,7 +385,7 @@ sctp_valid_addr_list(sctp_t *sctp, const void *addrs, uint32_t addrcnt, } if 
(lookup_saddr) { ipif = sctp_lookup_ipif_addr(&addr, B_TRUE, - sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones, + IPCL_ZONEID(connp), !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); if (ipif == NULL) { /* Address not in the list */ @@ -495,6 +498,8 @@ sctp_ipif_hash_insert(sctp_t *sctp, sctp_ipif_t *ipif, int sleep, /* * Given a source address, walk through the peer address list to see * if the source address is being used. If it is, reset that. + * A cleared saddr will then make sctp_make_mp lookup the destination again + * and as part of that look for a new source. */ static void sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr) @@ -504,10 +509,6 @@ sctp_fix_saddr(sctp_t *sctp, in6_addr_t *saddr) for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { if (!IN6_ARE_ADDR_EQUAL(&fp->saddr, saddr)) continue; - if (fp->ire != NULL) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } V6_SET_ZERO(fp->saddr); } } @@ -874,8 +875,8 @@ sctp_update_saddrs(sctp_ipif_t *oipif, sctp_ipif_t *nipif, int idx, sctp_saddr_ipif_t *sobj; int count; - sctp = sctps->sctps_gsctp; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL && oipif->sctp_ipif_refcnt > 0) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned || @@ -1202,7 +1203,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); sctp_ipif->sctp_ipif_state = SCTP_IPIFS_UP; - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); sctp_chk_and_updt_saddr(hindex, sctp_ipif, @@ -1214,7 +1214,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_zoneid = ipif->ipif_zoneid; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); @@ -1226,7 
+1225,6 @@ sctp_update_ipif(ipif_t *ipif, int op) rw_downgrade(&sctps->sctps_g_ipifs_lock); rw_enter(&sctp_ipif->sctp_ipif_lock, RW_WRITER); sctp_ipif->sctp_ipif_state = SCTP_IPIFS_DOWN; - sctp_ipif->sctp_ipif_mtu = ipif->ipif_mtu; sctp_ipif->sctp_ipif_flags = ipif->ipif_flags; rw_exit(&sctp_ipif->sctp_ipif_lock); @@ -1277,6 +1275,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, in6_addr_t addr; sctp_ipif_t *sctp_ipif; int ifindex = 0; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp->sctp_nsaddrs >= addcnt); @@ -1288,7 +1287,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, } for (cnt = 0; cnt < addcnt; cnt++) { - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + cnt; IN6_INADDR_TO_V4MAPPED(&sin4->sin_addr, &addr); @@ -1301,7 +1300,7 @@ sctp_del_saddr_list(sctp_t *sctp, const void *addrs, int addcnt, break; } sctp_ipif = sctp_lookup_ipif_addr(&addr, B_FALSE, - sctp->sctp_zoneid, !sctp->sctp_connp->conn_allzones, + IPCL_ZONEID(connp), !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); ASSERT(sctp_ipif != NULL); sctp_ipif_hash_remove(sctp, sctp_ipif); @@ -1356,10 +1355,10 @@ int sctp_saddr_add_addr(sctp_t *sctp, in6_addr_t *addr, uint_t ifindex) { sctp_ipif_t *sctp_ipif; + conn_t *connp = sctp->sctp_connp; - sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, sctp->sctp_zoneid, - !sctp->sctp_connp->conn_allzones, ifindex, 0, B_TRUE, - sctp->sctp_sctps); + sctp_ipif = sctp_lookup_ipif_addr(addr, B_TRUE, IPCL_ZONEID(connp), + !connp->conn_allzones, ifindex, 0, B_TRUE, sctp->sctp_sctps); if (sctp_ipif == NULL) return (EINVAL); @@ -1386,6 +1385,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete, int scanned = 0; int naddr; int nsaddr; + conn_t *connp = sctp->sctp_connp; ASSERT(!sctp->sctp_loopback && !sctp->sctp_linklocal && supp_af != 0); @@ -1393,7 +1393,7 @@ sctp_check_saddr(sctp_t *sctp, int supp_af, boolean_t delete, * Irregardless of the 
supported address in the INIT, v4 * must be supported. */ - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) supp_af = PARM_SUPP_V4; nsaddr = sctp->sctp_nsaddrs; @@ -1501,13 +1501,15 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt) int l; sctp_saddr_ipif_t *obj; sctp_t *sctp = (sctp_t *)conn; - int family = sctp->sctp_family; + conn_t *connp = sctp->sctp_connp; + int family = connp->conn_family; int max = *addrcnt; size_t added = 0; struct sockaddr_in6 *sin6; struct sockaddr_in *sin4; int scanned = 0; boolean_t skip_lback = B_FALSE; + ip_xmit_attr_t *ixa = connp->conn_ixa; if (sctp->sctp_nsaddrs == 0) return (EINVAL); @@ -1543,15 +1545,27 @@ sctp_getmyaddrs(void *conn, void *myaddrs, int *addrcnt) case AF_INET: sin4 = (struct sockaddr_in *)myaddrs + added; sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); break; case AF_INET6: sin6 = (struct sockaddr_in6 *)myaddrs + added; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; sin6->sin6_addr = addr; + /* + * Note that flowinfo is only returned for + * getpeername just like for TCP and UDP. + */ + sin6->sin6_flowinfo = 0; + + if (IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr) && + (ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = ixa->ixa_scopeid; + else + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; break; } added++; @@ -1700,6 +1714,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt, uchar_t *p; int err = 0; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; *addrlist = NULL; *size = 0; @@ -1707,7 +1722,7 @@ sctp_get_addrlist(sctp_t *sctp, const void *addrs, uint32_t *addrcnt, /* * Create a list of sockaddr_in[6] structs using the input list. 
*/ - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { *size = sizeof (struct sockaddr_in) * *addrcnt; *addrlist = kmem_zalloc(*size, KM_SLEEP); p = *addrlist; @@ -1772,7 +1787,7 @@ get_all_addrs: * We allocate upfront so that the clustering module need to bother * re-sizing the list. */ - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { *size = sizeof (struct sockaddr_in) * sctps->sctps_g_ipifs_count; } else { @@ -1805,7 +1820,7 @@ get_all_addrs: SCTP_IS_IPIF_LOOPBACK(sctp_ipif) || SCTP_IS_IPIF_LINKLOCAL(sctp_ipif) || !SCTP_IPIF_ZONE_MATCH(sctp, sctp_ipif) || - (sctp->sctp_ipversion == IPV4_VERSION && + (connp->conn_family == AF_INET && sctp_ipif->sctp_ipif_isv6) || (sctp->sctp_connp->conn_ipv6_v6only && !sctp_ipif->sctp_ipif_isv6)) { @@ -1816,7 +1831,7 @@ get_all_addrs: continue; } rw_exit(&sctp_ipif->sctp_ipif_lock); - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { s4 = (struct sockaddr_in *)p; IN6_V4MAPPED_TO_INADDR(&addr, &s4->sin_addr); s4->sin_family = AF_INET; diff --git a/usr/src/uts/common/inet/sctp/sctp_addr.h b/usr/src/uts/common/inet/sctp/sctp_addr.h index 9408c452d4..35e8300958 100644 --- a/usr/src/uts/common/inet/sctp/sctp_addr.h +++ b/usr/src/uts/common/inet/sctp/sctp_addr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SCTP_ADDR_H #define _SCTP_ADDR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/list.h> #include <sys/zone.h> #include <inet/ip.h> @@ -54,7 +52,6 @@ extern "C" { typedef struct sctp_ipif_s { list_node_t sctp_ipifs; /* Used by the global list */ struct sctp_ill_s *sctp_ipif_ill; - uint_t sctp_ipif_mtu; uint_t sctp_ipif_id; in6_addr_t sctp_ipif_saddr; int sctp_ipif_state; diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.c b/usr/src/uts/common/inet/sctp/sctp_asconf.c index 859faab0b8..fd7e34f7ba 100644 --- a/usr/src/uts/common/inet/sctp/sctp_asconf.c +++ b/usr/src/uts/common/inet/sctp/sctp_asconf.c @@ -571,7 +571,8 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) * it is the clustering module's responsibility to free the lists. */ if (cl_sctp_assoc_change != NULL) { - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize, + (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family, + alist, asize, acount, dlist, dsize, dcount, SCTP_CL_PADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -586,9 +587,10 @@ sctp_input_asconf(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr_len); else ach->sch_len = htons(msgdsize(hmp) - sctp->sctp_hdr6_len); - sctp_set_iplen(sctp, hmp); - sctp_add_sendq(sctp, hmp); + sctp_set_iplen(sctp, hmp, fp->ixa); + (void) conn_ip_output(hmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp_validate_peer(sctp); } @@ -809,7 +811,7 @@ sctp_input_asconf_ack(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp) mp->b_prev = NULL; ainfo->sctp_cl_alist = NULL; ainfo->sctp_cl_dlist = NULL; - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, + (*cl_sctp_assoc_change)(sctp->sctp_connp->conn_family, alist, ainfo->sctp_cl_asize, acount, dlist, ainfo->sctp_cl_dsize, dcount, SCTP_CL_LADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -1010,12 +1012,13 @@ sctp_wput_asconf(sctp_t 
*sctp, sctp_faddr_t *fp) fp->suna += MBLKL(mp); /* Attach the header and send the chunk */ ipmp->b_cont = mp; - sctp_set_iplen(sctp, ipmp); sctp->sctp_cchunk_pend = 1; SCTP_SET_SENT_FLAG(sctp->sctp_cxmit_list); SCTP_SET_CHUNK_DEST(sctp->sctp_cxmit_list, fp); - sctp_add_sendq(sctp, ipmp); + sctp_set_iplen(sctp, ipmp, fp->ixa); + (void) conn_ip_output(ipmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); SCTP_FADDR_RC_TIMER_RESTART(sctp, fp, fp->rto); #undef SCTP_SET_SENT_FLAG } @@ -1418,6 +1421,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt) uint16_t type = htons(PARM_ADD_IP); boolean_t v4mapped = B_FALSE; sctp_cl_ainfo_t *ainfo = NULL; + conn_t *connp = sctp->sctp_connp; /* Does the peer understand ASCONF and Add-IP? */ if (!sctp->sctp_understands_asconf || !sctp->sctp_understands_addip) @@ -1453,7 +1457,7 @@ sctp_add_ip(sctp_t *sctp, const void *addrs, uint32_t cnt) * o Must be part of the association */ for (i = 0; i < cnt; i++) { - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; v4mapped = B_TRUE; @@ -1538,6 +1542,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, uchar_t *p = ulist; boolean_t check_lport = B_FALSE; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; /* Does the peer understand ASCONF and Add-IP? 
*/ if (sctp->sctp_state <= SCTPS_LISTEN || !sctps->sctps_addip_enabled || @@ -1577,10 +1582,11 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, for (i = 0; i < cnt; i++) { ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; - if (check_lport && sin4->sin_port != sctp->sctp_lport) { + if (check_lport && + sin4->sin_port != connp->conn_lport) { error = EINVAL; goto fail; } @@ -1591,7 +1597,7 @@ sctp_del_ip(sctp_t *sctp, const void *addrs, uint32_t cnt, uchar_t *ulist, case AF_INET6: sin6 = (struct sockaddr_in6 *)addrs + i; if (check_lport && - sin6->sin6_port != sctp->sctp_lport) { + sin6->sin6_port != connp->conn_lport) { error = EINVAL; goto fail; } @@ -1675,7 +1681,7 @@ fail: for (i = 0; i < addrcnt; i++) { ifindex = 0; - switch (sctp->sctp_family) { + switch (connp->conn_family) { case AF_INET: sin4 = (struct sockaddr_in *)addrs + i; IN6_INADDR_TO_V4MAPPED(&(sin4->sin_addr), &addr); @@ -1697,7 +1703,7 @@ fail: } int -sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen) +sctp_set_peerprim(sctp_t *sctp, const void *inp) { const struct sctp_setprim *prim = inp; const struct sockaddr_storage *ss; @@ -1717,9 +1723,6 @@ sctp_set_peerprim(sctp_t *sctp, const void *inp, uint_t inlen) return (EOPNOTSUPP); } - if (inlen < sizeof (*prim)) - return (EINVAL); - /* Don't do anything if we are not connected */ if (sctp->sctp_state != SCTPS_ESTABLISHED) return (EINVAL); diff --git a/usr/src/uts/common/inet/sctp/sctp_asconf.h b/usr/src/uts/common/inet/sctp/sctp_asconf.h index 8940aa00bc..221172d7bb 100644 --- a/usr/src/uts/common/inet/sctp/sctp_asconf.h +++ b/usr/src/uts/common/inet/sctp/sctp_asconf.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _INET_SCTP_SCTP_ASCONF_H #define _INET_SCTP_SCTP_ASCONF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,7 +55,7 @@ extern int sctp_del_ip(sctp_t *, const void *, uint32_t, uchar_t *, size_t); extern void sctp_asconf_free_cxmit(sctp_t *, sctp_chunk_hdr_t *); extern void sctp_input_asconf(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *); extern void sctp_input_asconf_ack(sctp_t *, sctp_chunk_hdr_t *, sctp_faddr_t *); -extern int sctp_set_peerprim(sctp_t *, const void *, uint_t); +extern int sctp_set_peerprim(sctp_t *, const void *); extern void sctp_wput_asconf(sctp_t *, sctp_faddr_t *); #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/sctp/sctp_bind.c b/usr/src/uts/common/inet/sctp/sctp_bind.c index c0c1c7556e..9e0b0e7418 100644 --- a/usr/src/uts/common/inet/sctp/sctp_bind.c +++ b/usr/src/uts/common/inet/sctp/sctp_bind.c @@ -56,6 +56,7 @@ static int sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) { sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; /* * Get a valid port (within the anonymous range and should not @@ -68,7 +69,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) if (*requested_port == 0) { *requested_port = sctp_update_next_port( sctps->sctps_next_port_to_try, - crgetzone(sctp->sctp_credp), sctps); + crgetzone(connp->conn_cred), sctps); if (*requested_port == 0) return (EACCES); *user_specified = 0; @@ -101,7 +102,7 @@ sctp_select_port(sctp_t *sctp, in_port_t *requested_port, int *user_specified) * sctp_bind() should take a cred_t argument so that * we can use it here. 
*/ - if (secpolicy_net_privaddr(sctp->sctp_credp, + if (secpolicy_net_privaddr(connp->conn_cred, *requested_port, IPPROTO_SCTP) != 0) { dprint(1, ("sctp_bind(x): no prive for port %d", @@ -120,6 +121,7 @@ sctp_listen(sctp_t *sctp) { sctp_tf_t *tf; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; RUN_SCTP(sctp); /* @@ -138,7 +140,7 @@ sctp_listen(sctp_t *sctp) int ret; bzero(&ss, sizeof (ss)); - ss.ss_family = sctp->sctp_family; + ss.ss_family = connp->conn_family; WAKE_SCTP(sctp); if ((ret = sctp_bind(sctp, (struct sockaddr *)&ss, @@ -147,12 +149,18 @@ sctp_listen(sctp_t *sctp) RUN_SCTP(sctp) } + /* Cache things in the ixa without any refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); + sctp->sctp_state = SCTPS_LISTEN; (void) random_get_pseudo_bytes(sctp->sctp_secret, SCTP_SECRET_LEN); sctp->sctp_last_secret_update = lbolt64; bzero(sctp->sctp_old_secret, SCTP_SECRET_LEN); tf = &sctps->sctps_listen_fanout[SCTP_LISTEN_HASH( - ntohs(sctp->sctp_lport))]; + ntohs(connp->conn_lport))]; sctp_listen_hash_insert(tf, sctp); WAKE_SCTP(sctp); return (0); @@ -170,6 +178,10 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len) in_port_t requested_port; in_port_t allocated_port; int err = 0; + conn_t *connp = sctp->sctp_connp; + uint_t scope_id; + sin_t *sin; + sin6_t *sin6; ASSERT(sctp != NULL); @@ -188,25 +200,35 @@ sctp_bind(sctp_t *sctp, struct sockaddr *sa, socklen_t len) switch (sa->sa_family) { case AF_INET: + sin = (sin_t *)sa; if (len < sizeof (struct sockaddr_in) || - sctp->sctp_family == AF_INET6) { + connp->conn_family == AF_INET6) { err = EINVAL; goto done; } - requested_port = ntohs(((struct sockaddr_in *)sa)->sin_port); + requested_port = ntohs(sin->sin_port); break; case AF_INET6: + sin6 = (sin6_t *)sa; if (len < sizeof (struct sockaddr_in6) || - sctp->sctp_family == AF_INET) { + 
connp->conn_family == AF_INET) { err = EINVAL; goto done; } - requested_port = ntohs(((struct sockaddr_in6 *)sa)->sin6_port); + requested_port = ntohs(sin6->sin6_port); /* Set the flowinfo. */ - sctp->sctp_ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (((struct sockaddr_in6 *)sa)->sin6_flowinfo & - ~IPV6_VERS_AND_FLOW_MASK); + connp->conn_flowinfo = + sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK; + + scope_id = sin6->sin6_scope_id; + if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(&sin6->sin6_addr)) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scope_id; + connp->conn_incoming_ifindex = scope_id; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } break; default: err = EAFNOSUPPORT; @@ -247,7 +269,7 @@ sctp_bindx(sctp_t *sctp, const void *addrs, int addrcnt, int bindop) switch (bindop) { case SCTP_BINDX_ADD_ADDR: return (sctp_bind_add(sctp, addrs, addrcnt, B_FALSE, - sctp->sctp_lport)); + sctp->sctp_connp->conn_lport)); case SCTP_BINDX_REM_ADDR: return (sctp_bind_del(sctp, addrs, addrcnt, B_FALSE)); default: @@ -265,6 +287,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, int err = 0; boolean_t do_asconf = B_FALSE; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (!caller_hold_lock) RUN_SCTP(sctp); @@ -329,7 +352,7 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, return (err); } ASSERT(addrlist != NULL); - (*cl_sctp_check_addrs)(sctp->sctp_family, port, &addrlist, + (*cl_sctp_check_addrs)(connp->conn_family, port, &addrlist, size, &addrcnt, unspec == 1); if (addrcnt == 0) { /* We free the list */ @@ -345,8 +368,8 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, err = sctp_valid_addr_list(sctp, addrlist, addrcnt, llist, lsize); if (err == 0 && do_listen) { - (*cl_sctp_listen)(sctp->sctp_family, llist, - addrcnt, sctp->sctp_lport); + 
(*cl_sctp_listen)(connp->conn_family, llist, + addrcnt, connp->conn_lport); /* list will be freed by the clustering module */ } else if (err != 0 && llist != NULL) { kmem_free(llist, lsize); @@ -373,8 +396,6 @@ sctp_bind_add(sctp_t *sctp, const void *addrs, uint32_t addrcnt, } if (!caller_hold_lock) WAKE_SCTP(sctp); - if (do_asconf) - sctp_process_sendq(sctp); return (0); } @@ -390,6 +411,7 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt, uchar_t *ulist = NULL; size_t usize = 0; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (!caller_hold_lock) RUN_SCTP(sctp); @@ -439,14 +461,12 @@ sctp_bind_del(sctp_t *sctp, const void *addrs, uint32_t addrcnt, /* ulist will be non-NULL only if cl_sctp_unlisten is non-NULL */ if (ulist != NULL) { ASSERT(cl_sctp_unlisten != NULL); - (*cl_sctp_unlisten)(sctp->sctp_family, ulist, addrcnt, - sctp->sctp_lport); + (*cl_sctp_unlisten)(connp->conn_family, ulist, addrcnt, + connp->conn_lport); /* ulist will be freed by the clustering module */ } if (!caller_hold_lock) WAKE_SCTP(sctp); - if (do_asconf) - sctp_process_sendq(sctp); return (error); } @@ -473,9 +493,10 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, int count = 0; /* maximum number of times to run around the loop */ int loopmax; - zoneid_t zoneid = sctp->sctp_zoneid; - zone_t *zone = crgetzone(sctp->sctp_credp); sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; + zone_t *zone = crgetzone(connp->conn_cred); + zoneid_t zoneid = connp->conn_zoneid; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -523,8 +544,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, mutex_enter(&tbf->tf_lock); for (lsctp = tbf->tf_sctp; lsctp != NULL; lsctp = lsctp->sctp_bind_hash) { + conn_t *lconnp = lsctp->sctp_connp; - if (lport != lsctp->sctp_lport || + if (lport != lconnp->conn_lport || lsctp->sctp_state < SCTPS_BOUND) continue; @@ -534,14 +556,14 @@ 
sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. */ - if (lsctp->sctp_zoneid != zoneid && - lsctp->sctp_mac_mode == CONN_MAC_DEFAULT && - sctp->sctp_mac_mode == CONN_MAC_DEFAULT) + if (lconnp->conn_zoneid != zoneid && + lconnp->conn_mac_mode == CONN_MAC_DEFAULT && + connp->conn_mac_mode == CONN_MAC_DEFAULT) continue; addrcmp = sctp_compare_saddrs(sctp, lsctp); if (addrcmp != SCTP_ADDR_DISJOINT) { - if (!sctp->sctp_reuseaddr) { + if (!connp->conn_reuseaddr) { /* in use */ break; } else if (lsctp->sctp_state == SCTPS_BOUND || @@ -563,10 +585,9 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { - conn_t *connp = sctp->sctp_connp; - if (is_system_labeled()) { mlp_type_t addrtype, mlptype; + uint_t ipversion; /* * On a labeled system we must check the type @@ -575,11 +596,16 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * and that the user's requested binding * is permitted. */ + if (connp->conn_family == AF_INET) + ipversion = IPV4_VERSION; + else + ipversion = IPV6_VERSION; + addrtype = tsol_mlp_addr_type( connp->conn_allzones ? ALL_ZONES : zone->zone_id, - sctp->sctp_ipversion, - sctp->sctp_ipversion == IPV4_VERSION ? + ipversion, + connp->conn_family == AF_INET ? (void *)&sctp->sctp_ipha->ipha_src : (void *)&sctp->sctp_ip6h->ip6_src, sctps->sctps_netstack->netstack_ip); @@ -631,8 +657,7 @@ sctp_bindi(sctp_t *sctp, in_port_t port, boolean_t bind_to_req_port_only, * number. 
*/ sctp->sctp_state = SCTPS_BOUND; - sctp->sctp_lport = lport; - sctp->sctp_sctph->sh_sport = lport; + connp->conn_lport = lport; ASSERT(&sctps->sctps_bind_fanout[ SCTP_BIND_HASH(port)] == tbf); diff --git a/usr/src/uts/common/inet/sctp/sctp_common.c b/usr/src/uts/common/inet/sctp/sctp_common.c index 3486ba1150..b518eb3981 100644 --- a/usr/src/uts/common/inet/sctp/sctp_common.c +++ b/usr/src/uts/common/inet/sctp/sctp_common.c @@ -44,6 +44,8 @@ #include <inet/ip.h> #include <inet/ip6.h> #include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <inet/ip_ndp.h> #include <inet/mib2.h> #include <inet/nd.h> #include <inet/optcom.h> @@ -57,7 +59,7 @@ static struct kmem_cache *sctp_kmem_faddr_cache; static void sctp_init_faddr(sctp_t *, sctp_faddr_t *, in6_addr_t *, mblk_t *); -/* Set the source address. Refer to comments in sctp_get_ire(). */ +/* Set the source address. Refer to comments in sctp_get_dest(). */ void sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) { @@ -68,7 +70,7 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) /* * If there is no source address avaialble, mark this peer address * as unreachable for now. When the heartbeat timer fires, it will - * call sctp_get_ire() to re-check if there is any source address + * call sctp_get_dest() to re-check if there is any source address * available. */ if (!addr_set) @@ -76,25 +78,31 @@ sctp_set_saddr(sctp_t *sctp, sctp_faddr_t *fp) } /* - * Call this function to update the cached IRE of a peer addr fp. + * Call this function to get information about a peer addr fp. + * + * Uses ip_attr_connect to avoid explicit use of ire and source address + * selection. 
*/ void -sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) +sctp_get_dest(sctp_t *sctp, sctp_faddr_t *fp) { - ire_t *ire; - ipaddr_t addr4; in6_addr_t laddr; + in6_addr_t nexthop; sctp_saddr_ipif_t *sp; int hdrlen; - ts_label_t *tsl; sctp_stack_t *sctps = sctp->sctp_sctps; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; + conn_t *connp = sctp->sctp_connp; + iulp_t uinfo; + uint_t pmtu; + int error; + uint32_t flags = IPDF_VERIFY_DST | IPDF_IPSEC | + IPDF_SELECT_SRC | IPDF_UNIQUE_DCE; - /* Remove the previous cache IRE */ - if ((ire = fp->ire) != NULL) { - IRE_REFRELE_NOTR(ire); - fp->ire = NULL; - } + /* + * Tell sctp_make_mp it needs to call us again should we not + * complete and set the saddr. + */ + fp->saddr = ipv6_all_zeros; /* * If this addr is not reachable, mark it as unconfirmed for now, the @@ -105,29 +113,28 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) fp->state = SCTP_FADDRS_UNCONFIRMED; } - tsl = crgetlabel(CONN_CRED(sctp->sctp_connp)); + /* + * Socket is connected - enable PMTU discovery. 
+ */ + if (!sctps->sctps_ignore_path_mtu) + fp->ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; - if (fp->isv4) { - IN6_V4MAPPED_TO_IPADDR(&fp->faddr, addr4); - ire = ire_cache_lookup(addr4, sctp->sctp_zoneid, tsl, ipst); - if (ire != NULL) - IN6_IPADDR_TO_V4MAPPED(ire->ire_src_addr, &laddr); - } else { - ire = ire_cache_lookup_v6(&fp->faddr, sctp->sctp_zoneid, tsl, - ipst); - if (ire != NULL) - laddr = ire->ire_src_addr_v6; - } + ip_attr_nexthop(&connp->conn_xmit_ipp, fp->ixa, &fp->faddr, + &nexthop); - if (ire == NULL) { - dprint(3, ("ire2faddr: no ire for %x:%x:%x:%x\n", + laddr = fp->saddr; + error = ip_attr_connect(connp, fp->ixa, &laddr, &fp->faddr, &nexthop, + connp->conn_fport, &laddr, &uinfo, flags); + + if (error != 0) { + dprint(3, ("sctp_get_dest: no ire for %x:%x:%x:%x\n", SCTP_PRINTADDR(fp->faddr))); /* * It is tempting to just leave the src addr * unspecified and let IP figure it out, but we * *cannot* do this, since IP may choose a src addr * that is not part of this association... unless - * this sctp has bound to all addrs. So if the ire + * this sctp has bound to all addrs. So if the dest * lookup fails, try to find one in our src addr * list, unless the sctp has bound to all addrs, in * which case we change the src addr to unspec. 
@@ -144,56 +151,44 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) return; goto check_current; } + ASSERT(fp->ixa->ixa_ire != NULL); + ASSERT(!(fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))); + + if (!sctp->sctp_loopback) + sctp->sctp_loopback = uinfo.iulp_loopback; /* Make sure the laddr is part of this association */ - if ((sp = sctp_saddr_lookup(sctp, &ire->ire_ipif->ipif_v6lcl_addr, - 0)) != NULL && !sp->saddr_ipif_dontsrc) { + if ((sp = sctp_saddr_lookup(sctp, &laddr, 0)) != NULL && + !sp->saddr_ipif_dontsrc) { if (sp->saddr_ipif_unconfirmed == 1) sp->saddr_ipif_unconfirmed = 0; + /* We did IPsec policy lookup for laddr already */ fp->saddr = laddr; } else { - dprint(2, ("ire2faddr: src addr is not part of assc\n")); + dprint(2, ("sctp_get_dest: src addr is not part of assoc " + "%x:%x:%x:%x\n", SCTP_PRINTADDR(laddr))); /* * Set the src to the first saddr and hope for the best. - * Note that we will still do the ire caching below. - * Otherwise, whenever we send a packet, we need to do - * the ire lookup again and still may not get the correct - * source address. Note that this case should very seldomly + * Note that this case should very seldomly * happen. One scenario this can happen is an app * explicitly bind() to an address. But that address is * not the preferred source address to send to the peer. */ sctp_set_saddr(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { - IRE_REFRELE(ire); return; } } /* - * Note that ire_cache_lookup_*() returns an ire with the tracing - * bits enabled. This requires the thread holding the ire also - * do the IRE_REFRELE(). Thus we need to do IRE_REFHOLD_NOTR() - * and then IRE_REFRELE() the ire here to make the tracing bits - * work. - */ - IRE_REFHOLD_NOTR(ire); - IRE_REFRELE(ire); - - /* Cache the IRE */ - fp->ire = ire; - if (fp->ire->ire_type == IRE_LOOPBACK && !sctp->sctp_loopback) - sctp->sctp_loopback = 1; - - /* * Pull out RTO information for this faddr and use it if we don't * have any yet. 
*/ - if (fp->srtt == -1 && ire->ire_uinfo.iulp_rtt != 0) { + if (fp->srtt == -1 && uinfo.iulp_rtt != 0) { /* The cached value is in ms. */ - fp->srtt = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt); - fp->rttvar = MSEC_TO_TICK(ire->ire_uinfo.iulp_rtt_sd); + fp->srtt = MSEC_TO_TICK(uinfo.iulp_rtt); + fp->rttvar = MSEC_TO_TICK(uinfo.iulp_rtt_sd); fp->rto = 3 * fp->srtt; /* Bound the RTO by configured min and max values */ @@ -205,6 +200,7 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) } SCTP_MAX_RTO(sctp, fp); } + pmtu = uinfo.iulp_mtu; /* * Record the MTU for this faddr. If the MTU for this faddr has @@ -215,9 +211,9 @@ sctp_get_ire(sctp_t *sctp, sctp_faddr_t *fp) } else { hdrlen = sctp->sctp_hdr6_len; } - if ((fp->sfa_pmss + hdrlen) != ire->ire_max_frag) { + if ((fp->sfa_pmss + hdrlen) != pmtu) { /* Make sure that sfa_pmss is a multiple of SCTP_ALIGN. */ - fp->sfa_pmss = (ire->ire_max_frag - hdrlen) & ~(SCTP_ALIGN - 1); + fp->sfa_pmss = (pmtu - hdrlen) & ~(SCTP_ALIGN - 1); if (fp->cwnd < (fp->sfa_pmss * 2)) { SET_CWND(fp, fp->sfa_pmss, sctps->sctps_slow_start_initial); @@ -230,28 +226,16 @@ check_current: } void -sctp_update_ire(sctp_t *sctp) +sctp_update_dce(sctp_t *sctp) { - ire_t *ire; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; + iulp_t uinfo; + ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; + uint_t ifindex; for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - if ((ire = fp->ire) == NULL) - continue; - mutex_enter(&ire->ire_lock); - - /* - * If the cached IRE is going away, there is no point to - * update it. - */ - if (ire->ire_marks & IRE_MARK_CONDEMNED) { - mutex_exit(&ire->ire_lock); - IRE_REFRELE_NOTR(ire); - fp->ire = NULL; - continue; - } - + bzero(&uinfo, sizeof (uinfo)); /* * Only record the PMTU for this faddr if we actually have * done discovery. 
This prevents initialized default from @@ -259,70 +243,60 @@ sctp_update_ire(sctp_t *sctp) */ if (fp->pmtu_discovered) { if (fp->isv4) { - ire->ire_max_frag = fp->sfa_pmss + + uinfo.iulp_mtu = fp->sfa_pmss + sctp->sctp_hdr_len; } else { - ire->ire_max_frag = fp->sfa_pmss + + uinfo.iulp_mtu = fp->sfa_pmss + sctp->sctp_hdr6_len; } } - if (sctps->sctps_rtt_updates != 0 && fp->rtt_updates >= sctps->sctps_rtt_updates) { /* - * If there is no old cached values, initialize them - * conservatively. Set them to be (1.5 * new value). - * This code copied from ip_ire_advise(). The cached - * value is in ms. + * dce_update_uinfo() merges these values with the + * old values. */ - if (ire->ire_uinfo.iulp_rtt != 0) { - ire->ire_uinfo.iulp_rtt = - (ire->ire_uinfo.iulp_rtt + - TICK_TO_MSEC(fp->srtt)) >> 1; - } else { - ire->ire_uinfo.iulp_rtt = - TICK_TO_MSEC(fp->srtt + (fp->srtt >> 1)); - } - if (ire->ire_uinfo.iulp_rtt_sd != 0) { - ire->ire_uinfo.iulp_rtt_sd = - (ire->ire_uinfo.iulp_rtt_sd + - TICK_TO_MSEC(fp->rttvar)) >> 1; + uinfo.iulp_rtt = TICK_TO_MSEC(fp->srtt); + uinfo.iulp_rtt_sd = TICK_TO_MSEC(fp->rttvar); + fp->rtt_updates = 0; + } + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&fp->faddr)) { + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (fp->ixa->ixa_nce != NULL) { + ifindex = fp->ixa->ixa_nce->nce_common-> + ncec_ill->ill_phyint->phyint_ifindex; } else { - ire->ire_uinfo.iulp_rtt_sd = - TICK_TO_MSEC(fp->rttvar + - (fp->rttvar >> 1)); + continue; } - fp->rtt_updates = 0; } - mutex_exit(&ire->ire_lock); + + (void) dce_update_uinfo(&fp->faddr, ifindex, &uinfo, ipst); } } /* - * The sender must set the total length in the IP header. - * If sendto == NULL, the current will be used. + * The sender must later set the total length in the IP header. 
*/ mblk_t * -sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) +sctp_make_mp(sctp_t *sctp, sctp_faddr_t *fp, int trailer) { mblk_t *mp; size_t ipsctplen; int isv4; - sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; boolean_t src_changed = B_FALSE; - ASSERT(sctp->sctp_current != NULL || sendto != NULL); - if (sendto == NULL) { - fp = sctp->sctp_current; - } else { - fp = sendto; - } + ASSERT(fp != NULL); isv4 = fp->isv4; - /* Try to look for another IRE again. */ - if (fp->ire == NULL) { - sctp_get_ire(sctp, fp); + if (SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr) || + (fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + /* Need to pick a source */ + sctp_get_dest(sctp, fp); /* * Although we still may not get an IRE, the source address * may be changed in sctp_get_ire(). Set src_changed to @@ -334,7 +308,9 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) /* There is no suitable source address to use, return. */ if (fp->state == SCTP_FADDRS_UNREACH) return (NULL); - ASSERT(!SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)); + + ASSERT(fp->ixa->ixa_ire != NULL); + ASSERT(!SCTP_IS_ADDR_UNSPEC(isv4, fp->saddr)); if (isv4) { ipsctplen = sctp->sctp_hdr_len; @@ -342,8 +318,7 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) ipsctplen = sctp->sctp_hdr6_len; } - mp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra + trailer, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + mp = allocb(ipsctplen + sctps->sctps_wroff_xtra + trailer, BPRI_MED); if (mp == NULL) { ip1dbg(("sctp_make_mp: error making mp..\n")); return (NULL); @@ -377,18 +352,6 @@ sctp_make_mp(sctp_t *sctp, sctp_faddr_t *sendto, int trailer) } } ASSERT(sctp->sctp_connp != NULL); - - /* - * IP will not free this IRE if it is condemned. SCTP needs to - * free it. - */ - if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } - /* Stash the conn and ire ptr info. 
for IP */ - SCTP_STASH_IPINFO(mp, fp->ire); - return (mp); } @@ -410,17 +373,22 @@ sctp_set_ulp_prop(sctp_t *sctp) } ASSERT(sctp->sctp_ulpd); + sctp->sctp_connp->conn_wroff = sctps->sctps_wroff_xtra + hdrlen + + sizeof (sctp_data_hdr_t); + ASSERT(sctp->sctp_current->sfa_pmss == sctp->sctp_mss); bzero(&sopp, sizeof (sopp)); sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; - sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + - sizeof (sctp_data_hdr_t); + sopp.sopp_wroff = sctp->sctp_connp->conn_wroff; sopp.sopp_maxblk = sctp->sctp_mss - sizeof (sctp_data_hdr_t); sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); } +/* + * Set the lengths in the packet and the transmit attributes. + */ void -sctp_set_iplen(sctp_t *sctp, mblk_t *mp) +sctp_set_iplen(sctp_t *sctp, mblk_t *mp, ip_xmit_attr_t *ixa) { uint16_t sum = 0; ipha_t *iph; @@ -432,19 +400,15 @@ sctp_set_iplen(sctp_t *sctp, mblk_t *mp) for (; pmp; pmp = pmp->b_cont) sum += pmp->b_wptr - pmp->b_rptr; + ixa->ixa_pktlen = sum; if (isv4) { iph = (ipha_t *)mp->b_rptr; iph->ipha_length = htons(sum); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { ip6h = (ip6_t *)mp->b_rptr; - /* - * If an ip6i_t is present, the real IPv6 header - * immediately follows. - */ - if (ip6h->ip6_nxt == IPPROTO_RAW) - ip6h = (ip6_t *)&ip6h[1]; - ip6h->ip6_plen = htons(sum - ((char *)&sctp->sctp_ip6h[1] - - sctp->sctp_iphc6)); + ip6h->ip6_plen = htons(sum - IPV6_HDR_LEN); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } } @@ -501,21 +465,21 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first) sctp_faddr_t *faddr; mblk_t *timer_mp; int err; + conn_t *connp = sctp->sctp_connp; if (is_system_labeled()) { - cred_t *effective_cred; + ip_xmit_attr_t *ixa = connp->conn_ixa; + ts_label_t *effective_tsl = NULL; + + ASSERT(ixa->ixa_tsl != NULL); /* * Verify the destination is allowed to receive packets * at the security label of the connection we are initiating. 
* - * tsol_check_dest() will create a new effective cred for + * tsol_check_dest() will create a new effective label for * this connection with a modified label or label flags only - * if there are changes from the original cred. - * - * conn_effective_cred may be non-NULL if a previous - * faddr was already added or if this is a server - * accepting a connection on a multi-label port. + * if there are changes from the original label. * * Accept whatever label we get if this is the first * destination address for this connection. The security @@ -525,27 +489,28 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int sleep, boolean_t first) if (IN6_IS_ADDR_V4MAPPED(addr)) { uint32_t dst; IN6_V4MAPPED_TO_IPADDR(addr, dst); - err = tsol_check_dest(CONN_CRED(sctp->sctp_connp), - &dst, IPV4_VERSION, sctp->sctp_mac_mode, - &effective_cred); + err = tsol_check_dest(ixa->ixa_tsl, + &dst, IPV4_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); } else { - err = tsol_check_dest(CONN_CRED(sctp->sctp_connp), - addr, IPV6_VERSION, sctp->sctp_mac_mode, - &effective_cred); + err = tsol_check_dest(ixa->ixa_tsl, + addr, IPV6_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); } if (err != 0) return (err); - if (sctp->sctp_faddrs == NULL && - sctp->sctp_connp->conn_effective_cred == NULL) { - sctp->sctp_connp->conn_effective_cred = effective_cred; - } else if (effective_cred != NULL) { - crfree(effective_cred); + + if (sctp->sctp_faddrs == NULL && effective_tsl != NULL) { + ip_xmit_attr_replace_tsl(ixa, effective_tsl); + } else if (effective_tsl != NULL) { + label_rele(effective_tsl); return (EHOSTUNREACH); } } if ((faddr = kmem_cache_alloc(sctp_kmem_faddr_cache, sleep)) == NULL) return (ENOMEM); + bzero(faddr, sizeof (*faddr)); timer_mp = sctp_timer_alloc((sctp), sctp_rexmit_timer, sleep); if (timer_mp == NULL) { kmem_cache_free(sctp_kmem_faddr_cache, faddr); @@ -553,16 +518,19 @@ sctp_add_faddr(sctp_t *sctp, in6_addr_t *addr, int 
sleep, boolean_t first) } ((sctpt_t *)(timer_mp->b_rptr))->sctpt_faddr = faddr; - sctp_init_faddr(sctp, faddr, addr, timer_mp); - - /* Check for subnet broadcast. */ - if (faddr->ire != NULL && faddr->ire->ire_type & IRE_BROADCAST) { - IRE_REFRELE_NOTR(faddr->ire); - sctp_timer_free(timer_mp); - faddr->timer_mp = NULL; + /* Start with any options set on the conn */ + faddr->ixa = conn_get_ixa_exclusive(connp); + if (faddr->ixa == NULL) { + freemsg(timer_mp); kmem_cache_free(sctp_kmem_faddr_cache, faddr); - return (EADDRNOTAVAIL); + return (ENOMEM); } + faddr->ixa->ixa_notify_cookie = connp->conn_sctp; + + sctp_init_faddr(sctp, faddr, addr, timer_mp); + ASSERT(faddr->ixa->ixa_cred != NULL); + + /* ip_attr_connect didn't allow broadcats/multicast dest */ ASSERT(faddr->next == NULL); if (sctp->sctp_faddrs == NULL) { @@ -644,7 +612,7 @@ sctp_redo_faddr_srcs(sctp_t *sctp) sctp_faddr_t *fp; for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); } } @@ -662,15 +630,17 @@ sctp_faddr_alive(sctp_t *sctp, sctp_faddr_t *fp) fp->state = SCTP_FADDRS_ALIVE; sctp_intf_event(sctp, fp->faddr, SCTP_ADDR_AVAILABLE, 0); /* Should have a full IRE now */ - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); /* * If this is the primary, switch back to it now. And * we probably want to reset the source addr used to reach * it. + * Note that if we didn't find a source in sctp_get_dest + * then we'd be unreachable at this point in time. 
*/ - if (fp == sctp->sctp_primary) { - ASSERT(fp->state != SCTP_FADDRS_UNREACH); + if (fp == sctp->sctp_primary && + fp->state != SCTP_FADDRS_UNREACH) { sctp_set_faddr_current(sctp, fp); return; } @@ -816,9 +786,9 @@ sctp_unlink_faddr(sctp_t *sctp, sctp_faddr_t *fp) fp->rc_timer_mp = NULL; fp->rc_timer_running = 0; } - if (fp->ire != NULL) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; } if (fp == sctp->sctp_faddrs) { @@ -837,7 +807,6 @@ gotit: fpp->next = fp->next; } mutex_exit(&sctp->sctp_conn_tfp->tf_lock); - /* XXX faddr2ire? */ kmem_cache_free(sctp_kmem_faddr_cache, fp); sctp->sctp_nfaddrs--; } @@ -866,8 +835,10 @@ sctp_zap_faddrs(sctp_t *sctp, int caller_holds_lock) for (fp = sctp->sctp_faddrs; fp; fp = fpn) { fpn = fp->next; - if (fp->ire != NULL) - IRE_REFRELE_NOTR(fp->ire); + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; + } kmem_cache_free(sctp_kmem_faddr_cache, fp); sctp->sctp_nfaddrs--; } @@ -888,242 +859,177 @@ sctp_zap_addrs(sctp_t *sctp) } /* - * Initialize the IPv4 header. Loses any record of any IP options. + * Build two SCTP header templates; one for IPv4 and one for IPv6. + * Store them in sctp_iphc and sctp_iphc6 respectively (and related fields). + * There are no IP addresses in the templates, but the port numbers and + * verifier are field in from the conn_t and sctp_t. + * + * Returns failure if can't allocate memory, or if there is a problem + * with a routing header/option. + * + * We allocate space for the minimum sctp header (sctp_hdr_t). + * + * We massage an routing option/header. There is no checksum implication + * for a routing header for sctp. + * + * Caller needs to update conn_wroff if desired. + * + * TSol notes: This assumes that a SCTP association has a single peer label + * since we only track a single pair of ipp_label_v4/v6 and not a separate one + * for each faddr. 
*/ int -sctp_header_init_ipv4(sctp_t *sctp, int sleep) +sctp_build_hdrs(sctp_t *sctp, int sleep) { + conn_t *connp = sctp->sctp_connp; + ip_pkt_t *ipp = &connp->conn_xmit_ipp; + uint_t ip_hdr_length; + uchar_t *hdrs; + uint_t hdrs_len; + uint_t ulp_hdr_length = sizeof (sctp_hdr_t); + ipha_t *ipha; + ip6_t *ip6h; sctp_hdr_t *sctph; - sctp_stack_t *sctps = sctp->sctp_sctps; + in6_addr_t v6src, v6dst; + ipaddr_t v4src, v4dst; - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - */ - if (sctp->sctp_iphc != NULL) { - ASSERT(sctp->sctp_iphc_len >= SCTP_MAX_COMBINED_HEADER_LENGTH); - bzero(sctp->sctp_iphc, sctp->sctp_iphc_len); - } else { - sctp->sctp_iphc_len = SCTP_MAX_COMBINED_HEADER_LENGTH; - sctp->sctp_iphc = kmem_zalloc(sctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc == NULL) { - sctp->sctp_iphc_len = 0; - return (ENOMEM); - } - } + v4src = connp->conn_saddr_v4; + v4dst = connp->conn_faddr_v4; + v6src = connp->conn_saddr_v6; + v6dst = connp->conn_faddr_v6; - sctp->sctp_ipha = (ipha_t *)sctp->sctp_iphc; + /* First do IPv4 header */ + ip_hdr_length = ip_total_hdrs_len_v4(ipp); - sctp->sctp_hdr_len = sizeof (ipha_t) + sizeof (sctp_hdr_t); - sctp->sctp_ip_hdr_len = sizeof (ipha_t); - sctp->sctp_ipha->ipha_length = htons(sizeof (ipha_t) + - sizeof (sctp_hdr_t)); - sctp->sctp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; + /* In case of TX label and IP options it can be too much */ + if (ip_hdr_length > IP_MAX_HDR_LENGTH) { + /* Preserves existing TX errno for this */ + return (EHOSTUNREACH); + } + hdrs_len = ip_hdr_length + ulp_hdr_length; + ASSERT(hdrs_len != 0); - /* - * These two fields should be zero, and are already set above. - * - * sctp->sctp_ipha->ipha_ident, - * sctp->sctp_ipha->ipha_fragment_offset_and_flags. 
- */ + if (hdrs_len != sctp->sctp_iphc_len) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, sleep); + if (hdrs == NULL) + return (ENOMEM); - sctp->sctp_ipha->ipha_ttl = sctps->sctps_ipv4_ttl; - sctp->sctp_ipha->ipha_protocol = IPPROTO_SCTP; + if (sctp->sctp_iphc != NULL) + kmem_free(sctp->sctp_iphc, sctp->sctp_iphc_len); + sctp->sctp_iphc = hdrs; + sctp->sctp_iphc_len = hdrs_len; + } else { + hdrs = sctp->sctp_iphc; + } + sctp->sctp_hdr_len = sctp->sctp_iphc_len; + sctp->sctp_ip_hdr_len = ip_hdr_length; - sctph = (sctp_hdr_t *)(sctp->sctp_iphc + sizeof (ipha_t)); + sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length); sctp->sctp_sctph = sctph; - - return (0); -} - -/* - * Update sctp_sticky_hdrs based on sctp_sticky_ipp. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension - * headers, and the maximum size sctp header (to avoid reallocation - * on the fly for additional sctp options). - * Returns failure if can't allocate memory. - */ -int -sctp_build_hdrs(sctp_t *sctp) -{ - char *hdrs; - uint_t hdrs_len; - ip6i_t *ip6i; - char buf[SCTP_MAX_HDR_LENGTH]; - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; - in6_addr_t src; - in6_addr_t dst; - sctp_stack_t *sctps = sctp->sctp_sctps; - - /* - * save the existing sctp header and source/dest IP addresses - */ - bcopy(sctp->sctp_sctph6, buf, sizeof (sctp_hdr_t)); - src = sctp->sctp_ip6h->ip6_src; - dst = sctp->sctp_ip6h->ip6_dst; - hdrs_len = ip_total_hdrs_len_v6(ipp) + SCTP_MAX_HDR_LENGTH; + sctph->sh_sport = connp->conn_lport; + sctph->sh_dport = connp->conn_fport; + sctph->sh_verf = sctp->sctp_fvtag; + sctph->sh_chksum = 0; + + ipha = (ipha_t *)hdrs; + sctp->sctp_ipha = ipha; + + ipha->ipha_src = v4src; + ipha->ipha_dst = v4dst; + ip_build_hdrs_v4(hdrs, ip_hdr_length, ipp, connp->conn_proto); + ipha->ipha_length = htons(hdrs_len); + ipha->ipha_fragment_offset_and_flags = 0; + + if (ipp->ipp_fields & IPPF_IPV4_OPTIONS) + (void) ip_massage_options(ipha, connp->conn_netstack); + + /* Now 
IPv6 */ + ip_hdr_length = ip_total_hdrs_len_v6(ipp); + hdrs_len = ip_hdr_length + ulp_hdr_length; ASSERT(hdrs_len != 0); - if (hdrs_len > sctp->sctp_iphc6_len) { - /* Need to reallocate */ - hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); + + if (hdrs_len != sctp->sctp_iphc6_len) { + /* Allocate new before we free any old */ + hdrs = kmem_alloc(hdrs_len, sleep); if (hdrs == NULL) return (ENOMEM); - if (sctp->sctp_iphc6_len != 0) + if (sctp->sctp_iphc6 != NULL) kmem_free(sctp->sctp_iphc6, sctp->sctp_iphc6_len); sctp->sctp_iphc6 = hdrs; sctp->sctp_iphc6_len = hdrs_len; - } - ip_build_hdrs_v6((uchar_t *)sctp->sctp_iphc6, - hdrs_len - SCTP_MAX_HDR_LENGTH, ipp, IPPROTO_SCTP); - - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)sctp->sctp_iphc6; - sctp->sctp_ip6h = (ip6_t *)&ip6i[1]; } else { - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; + hdrs = sctp->sctp_iphc6; } - /* - * sctp->sctp_ip_hdr_len will include ip6i_t if there is one. - */ - sctp->sctp_ip_hdr6_len = hdrs_len - SCTP_MAX_HDR_LENGTH; - sctp->sctp_sctph6 = (sctp_hdr_t *)(sctp->sctp_iphc6 + - sctp->sctp_ip_hdr6_len); - sctp->sctp_hdr6_len = sctp->sctp_ip_hdr6_len + sizeof (sctp_hdr_t); - - bcopy(buf, sctp->sctp_sctph6, sizeof (sctp_hdr_t)); + sctp->sctp_hdr6_len = sctp->sctp_iphc6_len; + sctp->sctp_ip_hdr6_len = ip_hdr_length; - sctp->sctp_ip6h->ip6_src = src; - sctp->sctp_ip6h->ip6_dst = dst; - /* - * If the hoplimit was not set by ip_build_hdrs_v6(), we need to - * set it to the default value for SCTP. - */ - if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit; - /* - * If we're setting extension headers after a connection - * has been established, and if we have a routing header - * among the extension headers, call ip_massage_options_v6 to - * manipulate the routing header/ip6_dst set the checksum - * difference in the sctp header template. 
- * (This happens in sctp_connect_ipv6 if the routing header - * is set prior to the connect.) - */ - - if ((sctp->sctp_state >= SCTPS_COOKIE_WAIT) && - (sctp->sctp_sticky_ipp.ipp_fields & IPPF_RTHDR)) { - ip6_rthdr_t *rth; - - rth = ip_find_rthdr_v6(sctp->sctp_ip6h, - (uint8_t *)sctp->sctp_sctph6); + sctph = (sctp_hdr_t *)(hdrs + ip_hdr_length); + sctp->sctp_sctph6 = sctph; + sctph->sh_sport = connp->conn_lport; + sctph->sh_dport = connp->conn_fport; + sctph->sh_verf = sctp->sctp_fvtag; + sctph->sh_chksum = 0; + + ip6h = (ip6_t *)hdrs; + sctp->sctp_ip6h = ip6h; + + ip6h->ip6_src = v6src; + ip6h->ip6_dst = v6dst; + ip_build_hdrs_v6(hdrs, ip_hdr_length, ipp, connp->conn_proto, + connp->conn_flowinfo); + ip6h->ip6_plen = htons(hdrs_len - IPV6_HDR_LEN); + + if (ipp->ipp_fields & IPPF_RTHDR) { + uint8_t *end; + ip6_rthdr_t *rth; + + end = (uint8_t *)ip6h + ip_hdr_length; + rth = ip_find_rthdr_v6(ip6h, end); if (rth != NULL) { - (void) ip_massage_options_v6(sctp->sctp_ip6h, rth, - sctps->sctps_netstack); + (void) ip_massage_options_v6(ip6h, rth, + connp->conn_netstack); } - } - return (0); -} -/* - * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. - */ -int -sctp_header_init_ipv6(sctp_t *sctp, int sleep) -{ - sctp_hdr_t *sctph; - sctp_stack_t *sctps = sctp->sctp_sctps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - * Ensure that there is enough space to "downgrade" the sctp_t - * to an IPv4 sctp_t. 
This requires having space for a full load - * of IPv4 options - */ - if (sctp->sctp_iphc6 != NULL) { - ASSERT(sctp->sctp_iphc6_len >= - SCTP_MAX_COMBINED_HEADER_LENGTH); - bzero(sctp->sctp_iphc6, sctp->sctp_iphc6_len); - } else { - sctp->sctp_iphc6_len = SCTP_MAX_COMBINED_HEADER_LENGTH; - sctp->sctp_iphc6 = kmem_zalloc(sctp->sctp_iphc_len, sleep); - if (sctp->sctp_iphc6 == NULL) { - sctp->sctp_iphc6_len = 0; - return (ENOMEM); - } + /* + * Verify that the first hop isn't a mapped address. + * Routers along the path need to do this verification + * for subsequent hops. + */ + if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) + return (EADDRNOTAVAIL); } - sctp->sctp_hdr6_len = IPV6_HDR_LEN + sizeof (sctp_hdr_t); - sctp->sctp_ip_hdr6_len = IPV6_HDR_LEN; - sctp->sctp_ip6h = (ip6_t *)sctp->sctp_iphc6; - - /* Initialize the header template */ - - sctp->sctp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - sctp->sctp_ip6h->ip6_plen = ntohs(sizeof (sctp_hdr_t)); - sctp->sctp_ip6h->ip6_nxt = IPPROTO_SCTP; - sctp->sctp_ip6h->ip6_hops = sctps->sctps_ipv6_hoplimit; - - sctph = (sctp_hdr_t *)(sctp->sctp_iphc6 + IPV6_HDR_LEN); - sctp->sctp_sctph6 = sctph; - return (0); } static int -sctp_v4_label(sctp_t *sctp) +sctp_v4_label(sctp_t *sctp, sctp_faddr_t *fp) { - uchar_t optbuf[IP_MAX_OPT_LENGTH]; - const cred_t *cr = CONN_CRED(sctp->sctp_connp); - int added; + conn_t *connp = sctp->sctp_connp; - if (tsol_compute_label(cr, sctp->sctp_ipha->ipha_dst, optbuf, - sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0) - return (EACCES); - - added = tsol_remove_secopt(sctp->sctp_ipha, sctp->sctp_hdr_len); - if (added == -1) - return (EACCES); - sctp->sctp_hdr_len += added; - sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph + added); - sctp->sctp_ip_hdr_len += added; - if ((sctp->sctp_v4label_len = optbuf[IPOPT_OLEN]) != 0) { - sctp->sctp_v4label_len = (sctp->sctp_v4label_len + 3) & ~3; - added = tsol_prepend_option(optbuf, sctp->sctp_ipha, - sctp->sctp_hdr_len); - if (added == -1) - 
return (EACCES); - sctp->sctp_hdr_len += added; - sctp->sctp_sctph = (sctp_hdr_t *)((uchar_t *)sctp->sctp_sctph + - added); - sctp->sctp_ip_hdr_len += added; - } - return (0); + ASSERT(fp->ixa->ixa_flags & IXAF_IS_IPV4); + return (conn_update_label(connp, fp->ixa, &fp->faddr, + &connp->conn_xmit_ipp)); } static int -sctp_v6_label(sctp_t *sctp) +sctp_v6_label(sctp_t *sctp, sctp_faddr_t *fp) { - uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; - const cred_t *cr = CONN_CRED(sctp->sctp_connp); + conn_t *connp = sctp->sctp_connp; - if (tsol_compute_label_v6(cr, &sctp->sctp_ip6h->ip6_dst, optbuf, - sctp->sctp_sctps->sctps_netstack->netstack_ip) != 0) - return (EACCES); - if (tsol_update_sticky(&sctp->sctp_sticky_ipp, &sctp->sctp_v6label_len, - optbuf) != 0) - return (EACCES); - if (sctp_build_hdrs(sctp) != 0) - return (EACCES); - return (0); + ASSERT(!(fp->ixa->ixa_flags & IXAF_IS_IPV4)); + return (conn_update_label(connp, fp->ixa, &fp->faddr, + &connp->conn_xmit_ipp)); } /* * XXX implement more sophisticated logic + * + * Tsol note: We have already verified the addresses using tsol_check_dest + * in sctp_add_faddr, thus no need to redo that here. + * We do setup ipp_label_v4 and ipp_label_v6 based on which addresses + * we have. 
*/ int sctp_set_hdraddrs(sctp_t *sctp) @@ -1131,50 +1037,43 @@ sctp_set_hdraddrs(sctp_t *sctp) sctp_faddr_t *fp; int gotv4 = 0; int gotv6 = 0; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp->sctp_faddrs != NULL); ASSERT(sctp->sctp_nsaddrs > 0); /* Set up using the primary first */ + connp->conn_faddr_v6 = sctp->sctp_primary->faddr; + /* saddr may be unspec; make_mp() will handle this */ + connp->conn_saddr_v6 = sctp->sctp_primary->saddr; + connp->conn_laddr_v6 = connp->conn_saddr_v6; if (IN6_IS_ADDR_V4MAPPED(&sctp->sctp_primary->faddr)) { - IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->faddr, - sctp->sctp_ipha->ipha_dst); - /* saddr may be unspec; make_mp() will handle this */ - IN6_V4MAPPED_TO_IPADDR(&sctp->sctp_primary->saddr, - sctp->sctp_ipha->ipha_src); - if (!is_system_labeled() || sctp_v4_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v4_label(sctp, sctp->sctp_primary) == 0) { gotv4 = 1; - if (sctp->sctp_ipversion == IPV4_VERSION) { - goto copyports; + if (connp->conn_family == AF_INET) { + goto done; } } } else { - sctp->sctp_ip6h->ip6_dst = sctp->sctp_primary->faddr; - /* saddr may be unspec; make_mp() will handle this */ - sctp->sctp_ip6h->ip6_src = sctp->sctp_primary->saddr; - if (!is_system_labeled() || sctp_v6_label(sctp) == 0) + if (!is_system_labeled() || + sctp_v6_label(sctp, sctp->sctp_primary) == 0) { gotv6 = 1; + } } for (fp = sctp->sctp_faddrs; fp; fp = fp->next) { if (!gotv4 && IN6_IS_ADDR_V4MAPPED(&fp->faddr)) { - IN6_V4MAPPED_TO_IPADDR(&fp->faddr, - sctp->sctp_ipha->ipha_dst); - /* copy in the faddr_t's saddr */ - IN6_V4MAPPED_TO_IPADDR(&fp->saddr, - sctp->sctp_ipha->ipha_src); - if (!is_system_labeled() || sctp_v4_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v4_label(sctp, fp) == 0) { gotv4 = 1; - if (sctp->sctp_ipversion == IPV4_VERSION || - gotv6) { + if (connp->conn_family == AF_INET || gotv6) { break; } } } else if (!gotv6 && !IN6_IS_ADDR_V4MAPPED(&fp->faddr)) { - sctp->sctp_ip6h->ip6_dst = fp->faddr; - /* copy in 
the faddr_t's saddr */ - sctp->sctp_ip6h->ip6_src = fp->saddr; - if (!is_system_labeled() || sctp_v6_label(sctp) == 0) { + if (!is_system_labeled() || + sctp_v6_label(sctp, fp) == 0) { gotv6 = 1; if (gotv4) break; @@ -1182,16 +1081,10 @@ sctp_set_hdraddrs(sctp_t *sctp) } } -copyports: +done: if (!gotv4 && !gotv6) return (EACCES); - /* copy in the ports for good measure */ - sctp->sctp_sctph->sh_sport = sctp->sctp_lport; - sctp->sctp_sctph->sh_dport = sctp->sctp_fport; - - sctp->sctp_sctph6->sh_sport = sctp->sctp_lport; - sctp->sctp_sctph6->sh_dport = sctp->sctp_fport; return (0); } @@ -1343,6 +1236,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, boolean_t check_saddr = B_TRUE; in6_addr_t curaddr; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (sctp_options != NULL) *sctp_options = 0; @@ -1473,8 +1367,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, if (ta == 0 || ta == INADDR_BROADCAST || ta == htonl(INADDR_LOOPBACK) || - CLASSD(ta) || - sctp->sctp_connp->conn_ipv6_v6only) { + CLASSD(ta) || connp->conn_ipv6_v6only) { goto next; } IN6_INADDR_TO_V4MAPPED((struct in_addr *) @@ -1492,7 +1385,7 @@ sctp_get_addrparams(sctp_t *sctp, sctp_t *psctp, mblk_t *pkt, goto next; } } else if (ph->sph_type == htons(PARM_ADDR6) && - sctp->sctp_family == AF_INET6) { + connp->conn_family == AF_INET6) { /* An v4 socket should not take v6 addresses. 
*/ if (remaining >= PARM_ADDR6_LEN) { in6_addr_t *addr6; @@ -1567,7 +1460,7 @@ next: } bcopy(&curaddr, dlist, sizeof (curaddr)); sctp_get_faddr_list(sctp, alist, asize); - (*cl_sctp_assoc_change)(sctp->sctp_family, alist, asize, + (*cl_sctp_assoc_change)(connp->conn_family, alist, asize, sctp->sctp_nfaddrs, dlist, dsize, 1, SCTP_CL_PADDR, (cl_sctp_handle_t)sctp); /* alist and dlist will be freed by the clustering module */ @@ -1581,7 +1474,7 @@ next: */ int sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports, - int sleep, sctp_stack_t *sctps) + int sleep, sctp_stack_t *sctps, ip_recv_attr_t *ira) { sctp_faddr_t *fp, *fphead = NULL; sctp_parm_hdr_t *ph; @@ -1696,7 +1589,7 @@ sctp_secure_restart_check(mblk_t *pkt, sctp_chunk_hdr_t *ich, uint32_t ports, mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) { - if (ports != sctp->sctp_ports) { + if (ports != sctp->sctp_connp->conn_ports) { continue; } compres = sctp_compare_faddrsets(fphead, sctp->sctp_faddrs); @@ -1776,7 +1669,8 @@ done: /* Send off the abort */ sctp_send_abort(sctp, sctp_init2vtag(ich), - SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE); + SCTP_ERR_RESTART_NEW_ADDRS, dtail, dlen, pkt, 0, B_TRUE, + ira); kmem_free(dtail, PARM_ADDR6_LEN * nadded); } @@ -1787,6 +1681,10 @@ cleanup: sctp_faddr_t *fpn; for (fp = fphead; fp; fp = fpn) { fpn = fp->next; + if (fp->ixa != NULL) { + ixa_refrele(fp->ixa); + fp->ixa = NULL; + } kmem_cache_free(sctp_kmem_faddr_cache, fp); } } @@ -1850,6 +1748,8 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, { sctp_stack_t *sctps = sctp->sctp_sctps; + ASSERT(fp->ixa != NULL); + bcopy(addr, &fp->faddr, sizeof (*addr)); if (IN6_IS_ADDR_V4MAPPED(addr)) { fp->isv4 = 1; @@ -1857,11 +1757,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, fp->sfa_pmss = (sctps->sctps_initial_mtu - sctp->sctp_hdr_len) & ~(SCTP_ALIGN - 1); + fp->ixa->ixa_flags |= IXAF_IS_IPV4; } else { 
fp->isv4 = 0; fp->sfa_pmss = (sctps->sctps_initial_mtu - sctp->sctp_hdr6_len) & ~(SCTP_ALIGN - 1); + fp->ixa->ixa_flags &= ~IXAF_IS_IPV4; } fp->cwnd = sctps->sctps_slow_start_initial * fp->sfa_pmss; fp->rto = MIN(sctp->sctp_rto_initial, sctp->sctp_init_rto_max); @@ -1884,14 +1786,13 @@ sctp_init_faddr(sctp_t *sctp, sctp_faddr_t *fp, in6_addr_t *addr, fp->df = 1; fp->pmtu_discovered = 0; fp->next = NULL; - fp->ire = NULL; fp->T3expire = 0; (void) random_get_pseudo_bytes((uint8_t *)&fp->hb_secret, sizeof (fp->hb_secret)); fp->hb_expiry = lbolt64; fp->rxt_unacked = 0; - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); } /*ARGSUSED*/ diff --git a/usr/src/uts/common/inet/sctp/sctp_conn.c b/usr/src/uts/common/inet/sctp/sctp_conn.c index 60c22a3673..7dc048f919 100644 --- a/usr/src/uts/common/inet/sctp/sctp_conn.c +++ b/usr/src/uts/common/inet/sctp/sctp_conn.c @@ -64,38 +64,19 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, uint_t sctp_options; conn_t *aconnp; conn_t *lconnp; - cred_t *credp; - ts_label_t *tslp; sctp_stack_t *sctps = listener->sctp_sctps; sctph = (sctp_hdr_t *)(cr_pkt->b_rptr + ip_hdr_len); ASSERT(OK_32PTR(sctph)); - acceptor->sctp_lport = listener->sctp_lport; - acceptor->sctp_fport = sctph->sh_sport; + aconnp = acceptor->sctp_connp; + lconnp = listener->sctp_connp; + aconnp->conn_lport = lconnp->conn_lport; + aconnp->conn_fport = sctph->sh_sport; ich = (sctp_chunk_hdr_t *)(iack + 1); init = (sctp_init_chunk_t *)(ich + 1); - /* - * If this is an MLP connection, packets are to be - * exchanged using the security label of the received - * Cookie packet instead of the server application's label. - * Create an effective cred for the connection by attaching - * the received packet's security label to the server - * application's cred. 
- */ - aconnp = acceptor->sctp_connp; - lconnp = listener->sctp_connp; - ASSERT(aconnp->conn_effective_cred == NULL); - if (lconnp->conn_mlp_type != mlptSingle && - (credp = msg_getcred(cr_pkt, NULL)) != NULL && - (tslp = crgetlabel(credp)) != NULL) { - if ((aconnp->conn_effective_cred = copycred_from_tslabel( - aconnp->conn_cred, tslp, KM_NOSLEEP)) == NULL) - return (ENOMEM); - } - /* acceptor isn't in any fanouts yet, so don't need to hold locks */ ASSERT(acceptor->sctp_faddrs == NULL); err = sctp_get_addrparams(acceptor, listener, cr_pkt, ich, @@ -106,14 +87,15 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, if ((err = sctp_set_hdraddrs(acceptor)) != 0) return (err); + if ((err = sctp_build_hdrs(acceptor, KM_NOSLEEP)) != 0) + return (err); + if ((sctp_options & SCTP_PRSCTP_OPTION) && listener->sctp_prsctp_aware && sctps->sctps_prsctp_enabled) { acceptor->sctp_prsctp_aware = B_TRUE; } else { acceptor->sctp_prsctp_aware = B_FALSE; } - /* The new sctp_t is fully bound now. 
*/ - acceptor->sctp_connp->conn_fully_bound = B_TRUE; /* Get initial TSNs */ acceptor->sctp_ltsn = ntohl(iack->sic_inittsn); @@ -142,9 +124,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, RUN_SCTP(acceptor); sctp_conn_hash_insert(&sctps->sctps_conn_fanout[ - SCTP_CONN_HASH(sctps, acceptor->sctp_ports)], acceptor, 0); + SCTP_CONN_HASH(sctps, aconnp->conn_ports)], acceptor, 0); sctp_bind_hash_insert(&sctps->sctps_bind_fanout[ - SCTP_BIND_HASH(ntohs(acceptor->sctp_lport))], acceptor, 0); + SCTP_BIND_HASH(ntohs(aconnp->conn_lport))], acceptor, 0); /* * No need to check for multicast destination since ip will only pass @@ -170,10 +152,9 @@ sctp_accept_comm(sctp_t *listener, sctp_t *acceptor, mblk_t *cr_pkt, /* Process the COOKIE packet, mp, directed at the listener 'sctp' */ sctp_t * sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, - sctp_init_chunk_t *iack, mblk_t *ipsec_mp) + sctp_init_chunk_t *iack, ip_recv_attr_t *ira) { sctp_t *eager; - uint_t ipvers; ip6_t *ip6h; int err; conn_t *connp, *econnp; @@ -181,6 +162,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, struct sock_proto_props sopp; cred_t *cr; pid_t cpid; + in6_addr_t faddr, laddr; + ip_xmit_attr_t *ixa; /* * No need to check for duplicate as this is the listener @@ -189,89 +172,116 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, * fanout already done cannot find a match, it means that * there is no duplicate. 
*/ - ipvers = IPH_HDR_VERSION(mp->b_rptr); - ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); ASSERT(OK_32PTR(mp->b_rptr)); if ((eager = sctp_create_eager(sctp)) == NULL) { return (NULL); } - if (ipvers != IPV4_VERSION) { - ip6h = (ip6_t *)mp->b_rptr; - if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) - eager->sctp_linklocal = 1; - /* - * Record ifindex (might be zero) to tie this connection to - * that interface if either the listener was bound or - * if the connection is using link-local addresses. - */ - if (sctp->sctp_bound_if == ifindex || - IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src)) - eager->sctp_bound_if = ifindex; - /* - * XXX broken. bound_if is always overwritten by statement - * below. What is the right thing to do here? - */ - eager->sctp_bound_if = sctp->sctp_bound_if; - } - connp = sctp->sctp_connp; sctps = sctp->sctp_sctps; econnp = eager->sctp_connp; if (connp->conn_policy != NULL) { - ipsec_in_t *ii; - - ASSERT(ipsec_mp != NULL); - ii = (ipsec_in_t *)(ipsec_mp->b_rptr); - ASSERT(ii->ipsec_in_policy == NULL); - IPPH_REFHOLD(connp->conn_policy); - ii->ipsec_in_policy = connp->conn_policy; - - ipsec_mp->b_datap->db_type = IPSEC_POLICY_SET; - if (!ip_bind_ipsec_policy_set(econnp, ipsec_mp)) { + /* Inherit the policy from the listener; use actions from ira */ + if (!ip_ipsec_policy_inherit(econnp, connp, ira)) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } } - if (ipsec_mp != NULL) { + ip6h = (ip6_t *)mp->b_rptr; + if (ira->ira_flags & IXAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)ip6h; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &laddr); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &faddr); + } else { + laddr = ip6h->ip6_dst; + faddr = ip6h->ip6_src; + } + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { /* * XXX need to fix the cached policy issue here. 
- * We temporarily set the conn_src/conn_rem here so + * We temporarily set the conn_laddr/conn_faddr here so * that IPsec can use it for the latched policy * selector. This is obvioursly wrong as SCTP can * use different addresses... */ - if (ipvers == IPV4_VERSION) { - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; - econnp->conn_src = ipha->ipha_dst; - econnp->conn_rem = ipha->ipha_src; - } else { - econnp->conn_srcv6 = ip6h->ip6_dst; - econnp->conn_remv6 = ip6h->ip6_src; - } + econnp->conn_laddr_v6 = laddr; + econnp->conn_faddr_v6 = faddr; + econnp->conn_saddr_v6 = laddr; } - if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { + if (ipsec_conn_cache_policy(econnp, + (ira->ira_flags & IRAF_IS_IPV4) != 0) != 0) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } /* Save for getpeerucred */ - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; + + if (is_system_labeled()) { + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + + if ((connp->conn_mlp_type != mlptSingle || + connp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * Cookie packet instead of the server application's + * label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. 
+ */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); + } + } err = sctp_accept_comm(sctp, eager, mp, ip_hdr_len, iack); - if (err) { + if (err != 0) { sctp_close_eager(eager); BUMP_MIB(&sctps->sctps_mib, sctpListenDrop); return (NULL); } + ASSERT(eager->sctp_current->ixa != NULL); + + ixa = eager->sctp_current->ixa; + if (!(ira->ira_flags & IXAF_IS_IPV4)) { + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); + + if (IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_dst)) { + eager->sctp_linklocal = 1; + + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ifindex; + econnp->conn_incoming_ifindex = ifindex; + } + } + /* * On a clustered note send this notification to the clustering * subsystem. @@ -299,9 +309,9 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, /* The clustering module frees these list */ sctp_get_saddr_list(eager, slist, ssize); sctp_get_faddr_list(eager, flist, fsize); - (*cl_sctp_connect)(eager->sctp_family, slist, - eager->sctp_nsaddrs, eager->sctp_lport, flist, - eager->sctp_nfaddrs, eager->sctp_fport, B_FALSE, + (*cl_sctp_connect)(econnp->conn_family, slist, + eager->sctp_nsaddrs, econnp->conn_lport, flist, + eager->sctp_nfaddrs, econnp->conn_fport, B_FALSE, (cl_sctp_handle_t)eager); } @@ -318,7 +328,7 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, bzero(&sopp, sizeof (sopp)); sopp.sopp_flags = SOCKOPT_MAXBLK|SOCKOPT_WROFF; sopp.sopp_maxblk = strmsgsz; - if (eager->sctp_family == AF_INET) { + if (econnp->conn_family == AF_INET) { sopp.sopp_wroff = sctps->sctps_wroff_xtra + sizeof (sctp_data_hdr_t) + sctp->sctp_hdr_len; } else { @@ -335,7 +345,8 @@ sctp_conn_request(sctp_t *sctp, mblk_t *mp, uint_t ifindex, uint_t ip_hdr_len, * with an OK ack. 
*/ int -sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) +sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen, + cred_t *cr, pid_t pid) { sin_t *sin; sin6_t *sin6; @@ -346,18 +357,18 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) sctp_t *lsctp; char buf[INET6_ADDRSTRLEN]; int sleep = sctp->sctp_cansleep ? KM_SLEEP : KM_NOSLEEP; - int hdrlen; - ip6_rthdr_t *rth; int err; sctp_faddr_t *cur_fp; sctp_stack_t *sctps = sctp->sctp_sctps; - struct sock_proto_props sopp; + conn_t *connp = sctp->sctp_connp; + uint_t scope_id = 0; + ip_xmit_attr_t *ixa; /* * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. */ if (addrlen < sizeof (sin_t)) { return (EINVAL); @@ -372,7 +383,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ip0dbg(("sctp_connect: non-unicast\n")); return (EINVAL); } - if (sctp->sctp_connp->conn_ipv6_v6only) + if (connp->conn_ipv6_v6only) return (EAFNOSUPPORT); /* convert to v6 mapped */ @@ -397,11 +408,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &dstaddr); } dstport = sin->sin_port; - if (sin->sin_family == AF_INET) { - hdrlen = sctp->sctp_hdr_len; - } else { - hdrlen = sctp->sctp_hdr6_len; - } break; case AF_INET6: sin6 = (sin6_t *)dst; @@ -411,7 +417,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ip0dbg(("sctp_connect: non-unicast\n")); return (EINVAL); } - if (sctp->sctp_connp->conn_ipv6_v6only && + if (connp->conn_ipv6_v6only && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { return (EAFNOSUPPORT); } @@ -420,11 +426,13 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) dstaddr = ipv6_loopback; } else { dstaddr = sin6->sin6_addr; - if 
(IN6_IS_ADDR_LINKLOCAL(&dstaddr)) + if (IN6_IS_ADDR_LINKLOCAL(&dstaddr)) { sctp->sctp_linklocal = 1; + scope_id = sin6->sin6_scope_id; + } } dstport = sin6->sin6_port; - hdrlen = sctp->sctp_hdr6_len; + connp->conn_flowinfo = sin6->sin6_flowinfo; break; default: dprint(1, ("sctp_connect: unknown family %d\n", @@ -437,12 +445,29 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) RUN_SCTP(sctp); - if (sctp->sctp_family != dst->sa_family || - (sctp->sctp_connp->conn_state_flags & CONN_CLOSING)) { + if (connp->conn_family != dst->sa_family || + (connp->conn_state_flags & CONN_CLOSING)) { WAKE_SCTP(sctp); return (EINVAL); } + /* We update our cred/cpid based on the caller of connect */ + if (connp->conn_cred != cr) { + crhold(cr); + crfree(connp->conn_cred); + connp->conn_cred = cr; + } + connp->conn_cpid = pid; + + /* Cache things in conn_ixa without any refhold */ + ixa = connp->conn_ixa; + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + switch (sctp->sctp_state) { case SCTPS_IDLE: { struct sockaddr_storage ss; @@ -459,7 +484,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) ASSERT(sctp->sctp_nsaddrs == 0); bzero(&ss, sizeof (ss)); - ss.ss_family = sctp->sctp_family; + ss.ss_family = connp->conn_family; WAKE_SCTP(sctp); if ((err = sctp_bind(sctp, (struct sockaddr *)&ss, sizeof (ss))) != 0) { @@ -474,7 +499,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* do the connect */ /* XXX check for attempt to connect to self */ - sctp->sctp_fport = dstport; + connp->conn_fport = dstport; ASSERT(sctp->sctp_iphc); ASSERT(sctp->sctp_iphc6); @@ -487,9 +512,9 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) */ sctp_conn_hash_remove(sctp); tbf = &sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps, - sctp->sctp_ports)]; + connp->conn_ports)]; 
mutex_enter(&tbf->tf_lock); - lsctp = sctp_lookup(sctp, &dstaddr, tbf, &sctp->sctp_ports, + lsctp = sctp_lookup(sctp, &dstaddr, tbf, &connp->conn_ports, SCTPS_COOKIE_WAIT); if (lsctp != NULL) { /* found a duplicate connection */ @@ -498,6 +523,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) WAKE_SCTP(sctp); return (EADDRINUSE); } + /* * OK; set up the peer addr (this may grow after we get * the INIT ACK from the peer with additional addresses). @@ -509,6 +535,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) return (err); } cur_fp = sctp->sctp_faddrs; + ASSERT(cur_fp->ixa != NULL); /* No valid src addr, return. */ if (cur_fp->state == SCTP_FADDRS_UNREACH) { @@ -523,6 +550,16 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) sctp_conn_hash_insert(tbf, sctp, 1); mutex_exit(&tbf->tf_lock); + ixa = cur_fp->ixa; + ASSERT(ixa->ixa_cred != NULL); + + if (scope_id != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scope_id; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + } + /* initialize composite headers */ if ((err = sctp_set_hdraddrs(sctp)) != 0) { sctp_conn_hash_remove(sctp); @@ -530,15 +567,10 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) return (err); } - /* - * Massage a routing header (if present) putting the first hop - * in ip6_dst. - */ - rth = ip_find_rthdr_v6(sctp->sctp_ip6h, - (uint8_t *)sctp->sctp_sctph6); - if (rth != NULL) { - (void) ip_massage_options_v6(sctp->sctp_ip6h, rth, - sctps->sctps_netstack); + if ((err = sctp_build_hdrs(sctp, KM_SLEEP)) != 0) { + sctp_conn_hash_remove(sctp); + WAKE_SCTP(sctp); + return (err); } /* @@ -556,9 +588,6 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* Mark this address as alive */ cur_fp->state = SCTP_FADDRS_ALIVE; - /* This sctp_t is fully bound now. 
*/ - sctp->sctp_connp->conn_fully_bound = B_TRUE; - /* Send the INIT to the peer */ SCTP_FADDR_TIMER_RESTART(sctp, cur_fp, cur_fp->rto); sctp->sctp_state = SCTPS_COOKIE_WAIT; @@ -567,7 +596,7 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) * address list, so take the hash lock. */ mutex_enter(&tbf->tf_lock); - initmp = sctp_init_mp(sctp); + initmp = sctp_init_mp(sctp, cur_fp); if (initmp == NULL) { mutex_exit(&tbf->tf_lock); /* @@ -605,24 +634,20 @@ sctp_connect(sctp_t *sctp, const struct sockaddr *dst, uint32_t addrlen) /* The clustering module frees the lists */ sctp_get_saddr_list(sctp, slist, ssize); sctp_get_faddr_list(sctp, flist, fsize); - (*cl_sctp_connect)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport, - flist, sctp->sctp_nfaddrs, sctp->sctp_fport, + (*cl_sctp_connect)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport, + flist, sctp->sctp_nfaddrs, connp->conn_fport, B_TRUE, (cl_sctp_handle_t)sctp); } - WAKE_SCTP(sctp); - /* OK to call IP_PUT() here instead of sctp_add_sendq(). 
*/ - CONN_INC_REF(sctp->sctp_connp); - initmp->b_flag |= MSGHASREF; - IP_PUT(initmp, sctp->sctp_connp, sctp->sctp_current->isv4); + ASSERT(ixa->ixa_cred != NULL); + ASSERT(ixa->ixa_ire != NULL); + + (void) conn_ip_output(initmp, ixa); BUMP_LOCAL(sctp->sctp_opkts); + WAKE_SCTP(sctp); notify_ulp: - bzero(&sopp, sizeof (sopp)); - sopp.sopp_flags = SOCKOPT_WROFF; - sopp.sopp_wroff = sctps->sctps_wroff_xtra + hdrlen + - sizeof (sctp_data_hdr_t); - sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); + sctp_set_ulp_prop(sctp); return (0); default: diff --git a/usr/src/uts/common/inet/sctp/sctp_cookie.c b/usr/src/uts/common/inet/sctp/sctp_cookie.c index 601938c928..4baf0a7147 100644 --- a/usr/src/uts/common/inet/sctp/sctp_cookie.c +++ b/usr/src/uts/common/inet/sctp/sctp_cookie.c @@ -40,6 +40,7 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> #include "sctp_impl.h" @@ -156,7 +157,7 @@ hmac_md5(uchar_t *text, size_t text_len, uchar_t *key, size_t key_len, static int validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_init_chunk_t *init, mblk_t *inmp, sctp_parm_hdr_t **want_cookie, - mblk_t **errmp, int *supp_af, uint_t *sctp_options) + mblk_t **errmp, int *supp_af, uint_t *sctp_options, ip_recv_attr_t *ira) { sctp_parm_hdr_t *cph; sctp_init_chunk_t *ic; @@ -168,6 +169,7 @@ validate_init_params(sctp_t *sctp, sctp_chunk_hdr_t *ch, boolean_t got_errchunk = B_FALSE; uint16_t ptype; sctp_mpc_t mpc; + conn_t *connp = sctp->sctp_connp; ASSERT(errmp != NULL); @@ -336,8 +338,8 @@ done: * is NULL. 
*/ if (want_cookie == NULL && - ((sctp->sctp_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) || - (sctp->sctp_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) && + ((connp->conn_family == AF_INET && !(*supp_af & PARM_SUPP_V4)) || + (connp->conn_family == AF_INET6 && !(*supp_af & PARM_SUPP_V6) && sctp->sctp_connp->conn_ipv6_v6only))) { dprint(1, ("sctp:validate_init_params: supp addr\n")); serror = SCTP_ERR_BAD_ADDR; @@ -353,7 +355,7 @@ cookie_abort: dprint(1, ("validate_init_params: cookie absent\n")); sctp_send_abort(sctp, sctp_init2vtag(ch), SCTP_ERR_MISSING_PARM, - (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE); + (char *)&mpc, sizeof (sctp_mpc_t), inmp, 0, B_FALSE, ira); return (0); } @@ -365,7 +367,7 @@ abort: return (0); sctp_send_abort(sctp, sctp_init2vtag(ch), serror, details, - errlen, inmp, 0, B_FALSE); + errlen, inmp, 0, B_FALSE, ira); return (0); } @@ -453,14 +455,17 @@ cl_sctp_cookie_paddr(sctp_chunk_hdr_t *ch, in6_addr_t *addr) sizeof (sctp_parm_hdr_t) + /* param header */ \ 16 /* MD5 hash */ +/* + * Note that sctp is the listener, hence we shouldn't modify it. 
+ */ void sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, - mblk_t *initmp) + mblk_t *initmp, ip_recv_attr_t *ira) { ipha_t *initiph; ip6_t *initip6h; - ipha_t *iackiph; - ip6_t *iackip6h; + ipha_t *iackiph = NULL; + ip6_t *iackip6h = NULL; sctp_chunk_hdr_t *iack_ch; sctp_init_chunk_t *iack; sctp_init_chunk_t *init; @@ -485,10 +490,10 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, mblk_t *errmp = NULL; boolean_t initcollision = B_FALSE; boolean_t linklocal = B_FALSE; - cred_t *cr; - pid_t pid; - ts_label_t *initlabel; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; + int err; + ip_xmit_attr_t *ixa = NULL; BUMP_LOCAL(sctp->sctp_ibchunks); isv4 = (IPH_HDR_VERSION(initmp->b_rptr) == IPV4_VERSION); @@ -501,21 +506,24 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, } else { initip6h = (ip6_t *)initmp->b_rptr; ipsctplen = sctp->sctp_ip_hdr6_len; - if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src)) + if (IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_src) || + IN6_IS_ADDR_LINKLOCAL(&initip6h->ip6_dst)) linklocal = B_TRUE; supp_af |= PARM_SUPP_V6; + if (!sctp->sctp_connp->conn_ipv6_v6only) + supp_af |= PARM_SUPP_V4; } ASSERT(OK_32PTR(initsh)); init = (sctp_init_chunk_t *)((char *)(initsh + 1) + sizeof (*iack_ch)); /* Make sure we like the peer's parameters */ if (validate_init_params(sctp, ch, init, initmp, NULL, &errmp, - &supp_af, &sctp_options) == 0) { + &supp_af, &sctp_options, ira) == 0) { return; } if (errmp != NULL) errlen = msgdsize(errmp); - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { /* * Irregardless of the supported address in the INIT, v4 * must be supported. @@ -580,43 +588,65 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, } /* - * If the listen socket is bound to a trusted extensions - * multi-label port, attach a copy of the listener's cred - * to the new INITACK mblk. 
Modify the cred to contain + * Base the transmission on any routing-related socket options + * that have been set on the listener. + */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + sctp_send_abort(sctp, sctp_init2vtag(ch), + SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira); + return; + } + ixa->ixa_flags &= ~IXAF_VERIFY_PMTU; + + if (isv4) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + /* + * If the listen socket is bound to a trusted extensions multi-label + * port, a MAC-Exempt connection with an unlabeled node, we use the * the security label of the received INIT packet. * If not a multi-label port, attach the unmodified - * listener's cred directly. + * listener's label directly. * * We expect Sun developed kernel modules to properly set * cred labels for sctp connections. We can't be so sure this * will be done correctly when 3rd party kernel modules - * directly use sctp. The initlabel panic guard logic was - * added to cover this possibility. + * directly use sctp. We check for a NULL ira_tsl to cover this + * possibility. 
*/ - if (sctp->sctp_connp->conn_mlp_type != mlptSingle) { - cr = msg_getcred(initmp, &pid); - if (cr == NULL || (initlabel = crgetlabel(cr)) == NULL) { - sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0, B_FALSE); - return; + if (is_system_labeled()) { + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; } - cr = copycred_from_bslabel(CONN_CRED(sctp->sctp_connp), - &initlabel->tsl_label, initlabel->tsl_doi, KM_NOSLEEP); - if (cr == NULL) { - sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE); - return; + + if (connp->conn_mlp_type != mlptSingle || + connp->conn_mac_mode != CONN_MAC_DEFAULT) { + if (ira->ira_tsl == NULL) { + sctp_send_abort(sctp, sctp_init2vtag(ch), + SCTP_ERR_UNKNOWN, NULL, 0, initmp, 0, + B_FALSE, ira); + ixa_refrele(ixa); + return; + } + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + } else { + ixa->ixa_tsl = crgetlabel(connp->conn_cred); } - iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra, - cr, pid); - crfree(cr); - } else { - iackmp = allocb_cred(ipsctplen + sctps->sctps_wroff_xtra, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); } + + iackmp = allocb(ipsctplen + sctps->sctps_wroff_xtra, BPRI_MED); if (iackmp == NULL) { sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE); + SCTP_ERR_NO_RESOURCES, NULL, 0, initmp, 0, B_FALSE, ira); + ixa_refrele(ixa); return; } @@ -632,6 +662,7 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, iackiph->ipha_src = initiph->ipha_dst; iackiph->ipha_length = htons(ipsctplen + errlen); iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr_len); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { bcopy(sctp->sctp_iphc6, p, sctp->sctp_hdr6_len); iackip6h = (ip6_t *)p; @@ -639,10 +670,12 @@ 
sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, /* Copy the peer's IP addr */ iackip6h->ip6_dst = initip6h->ip6_src; iackip6h->ip6_src = initip6h->ip6_dst; - iackip6h->ip6_plen = htons(ipsctplen - sizeof (*iackip6h) + - errlen); + iackip6h->ip6_plen = htons(ipsctplen + errlen - IPV6_HDR_LEN); iacksh = (sctp_hdr_t *)(p + sctp->sctp_ip_hdr6_len); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } + ixa->ixa_pktlen = ipsctplen + errlen; + ASSERT(OK_32PTR(iacksh)); /* Fill in the holes in the SCTP common header */ @@ -776,41 +809,58 @@ sctp_send_initack(sctp_t *sctp, sctp_hdr_t *initsh, sctp_chunk_hdr_t *ch, iackmp->b_cont = errmp; /* OK if NULL */ - if (is_system_labeled() && (cr = msg_getcred(iackmp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - conn_t *connp = sctp->sctp_connp; - int err; - - if (isv4) - err = tsol_check_label(cr, &iackmp, - connp->conn_mac_mode, - sctps->sctps_netstack->netstack_ip, pid); - else - err = tsol_check_label_v6(cr, &iackmp, - connp->conn_mac_mode, - sctps->sctps_netstack->netstack_ip, pid); + if (is_system_labeled()) { + ts_label_t *effective_tsl = NULL; + + ASSERT(ira->ira_tsl != NULL); + + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + } + ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */ + + /* + * We need to check for label-related failures which implies + * an extra call to tsol_check_dest (as ip_output_simple + * also does a tsol_check_dest as part of computing the + * label for the packet, but ip_output_simple doesn't return + * a specific errno for that case so we can't rely on its + * check.) 
+ */ + if (isv4) { + err = tsol_check_dest(ixa->ixa_tsl, &iackiph->ipha_dst, + IPV4_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); + } else { + err = tsol_check_dest(ixa->ixa_tsl, &iackip6h->ip6_dst, + IPV6_VERSION, connp->conn_mac_mode, + connp->conn_zone_is_global, &effective_tsl); + } if (err != 0) { sctp_send_abort(sctp, sctp_init2vtag(ch), - SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE); + SCTP_ERR_AUTH_ERR, NULL, 0, initmp, 0, B_FALSE, + ira); + ixa_refrele(ixa); freemsg(iackmp); return; } + if (effective_tsl != NULL) { + /* + * Since ip_output_simple will redo the + * tsol_check_dest, we just drop the ref. + */ + label_rele(effective_tsl); + } } - /* - * Stash the conn ptr info. for IP only as e don't have any - * cached IRE. - */ - SCTP_STASH_IPINFO(iackmp, (ire_t *)NULL); - - /* XXX sctp == sctp_g_q, so using its obchunks is valid */ BUMP_LOCAL(sctp->sctp_opkts); BUMP_LOCAL(sctp->sctp_obchunks); - /* OK to call IP_PUT() here instead of sctp_add_sendq(). 
*/ - CONN_INC_REF(sctp->sctp_connp); - iackmp->b_flag |= MSGHASREF; - IP_PUT(iackmp, sctp->sctp_connp, isv4); + (void) ip_output_simple(iackmp, ixa); + ixa_refrele(ixa); } void @@ -820,7 +870,7 @@ sctp_send_cookie_ack(sctp_t *sctp) mblk_t *camp; sctp_stack_t *sctps = sctp->sctp_sctps; - camp = sctp_make_mp(sctp, NULL, sizeof (*cach)); + camp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*cach)); if (camp == NULL) { /* XXX should abort, but don't have the inmp anymore */ SCTP_KSTAT(sctps, sctp_send_cookie_ack_failed); @@ -833,11 +883,11 @@ sctp_send_cookie_ack(sctp_t *sctp) cach->sch_flags = 0; cach->sch_len = htons(sizeof (*cach)); - sctp_set_iplen(sctp, camp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, camp); + sctp_set_iplen(sctp, camp, sctp->sctp_current->ixa); + (void) conn_ip_output(camp, sctp->sctp_current->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } static int @@ -859,7 +909,8 @@ sctp_find_al_ind(sctp_parm_hdr_t *sph, ssize_t len, uint32_t *adaptation_code) } void -sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) +sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp, + ip_recv_attr_t *ira) { mblk_t *cemp; mblk_t *mp = NULL; @@ -886,7 +937,7 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) cph = NULL; if (validate_init_params(sctp, iackch, iack, iackmp, &cph, &errmp, - &pad, &sctp_options) == 0) { /* result in 'pad' ignored */ + &pad, &sctp_options, ira) == 0) { /* result in 'pad' ignored */ BUMP_MIB(&sctps->sctps_mib, sctpAborted); sctp_assoc_event(sctp, SCTP_CANT_STR_ASSOC, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -906,8 +957,8 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) else hdrlen = sctp->sctp_hdr6_len; - cemp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + cemp = allocb(sctps->sctps_wroff_xtra + hdrlen + ceclen + pad, + BPRI_MED); if (cemp == 
NULL) { SCTP_FADDR_TIMER_RESTART(sctp, sctp->sctp_current, sctp->sctp_current->rto); @@ -932,11 +983,13 @@ sctp_send_cookie_echo(sctp_t *sctp, sctp_chunk_hdr_t *iackch, mblk_t *iackmp) * in sctp_connect(). */ sctp->sctp_current->df = B_TRUE; + sctp->sctp_ipha->ipha_fragment_offset_and_flags |= IPH_DF_HTONS; + /* * Since IP uses this info during the fanout process, we need to hold * the lock for this hash line while performing this operation. */ - /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, sctp->sctp_ports); */ + /* XXX sctp_conn_fanout + SCTP_CONN_HASH(sctps, connp->conn_ports); */ ASSERT(sctp->sctp_conn_tfp != NULL); tf = sctp->sctp_conn_tfp; /* sctp isn't a listener so only need to hold conn fanout lock */ @@ -1139,14 +1192,15 @@ sendcookie: sctp->sctp_state = SCTPS_COOKIE_ECHOED; SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } int sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, sctp_init_chunk_t **iackpp, sctp_hdr_t *insctph, int *recv_adaptation, - in6_addr_t *peer_addr) + in6_addr_t *peer_addr, ip_recv_attr_t *ira) { int32_t clen; size_t initplen; @@ -1163,6 +1217,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, uint32_t *fttag; uint32_t ports; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; BUMP_LOCAL(sctp->sctp_ibchunks); /* Verify the ICV */ @@ -1232,7 +1287,8 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, staleness = TICK_TO_USEC(diff); staleness = htonl(staleness); sctp_send_abort(sctp, init->sic_inittag, SCTP_ERR_STALE_COOKIE, - (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE); + (char *)&staleness, sizeof (staleness), cmp, 1, B_FALSE, + ira); dprint(1, ("stale cookie %d\n", staleness)); @@ -1242,7 +1298,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, 
/* Check for attack by adding addresses to a restart */ bcopy(insctph, &ports, sizeof (ports)); if (sctp_secure_restart_check(cmp, initch, ports, KM_NOSLEEP, - sctps) != 1) { + sctps, ira) != 1) { return (-1); } @@ -1263,7 +1319,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("duplicate cookie from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (-1); } @@ -1292,7 +1348,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("sctp peer %x:%x:%x:%x (%d) restarted\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); /* reset parameters */ sctp_congest_reset(sctp); @@ -1320,7 +1376,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, dprint(1, ("init collision with %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (0); } else if (iack->sic_inittag != sctp->sctp_lvtag && @@ -1330,7 +1386,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, /* Section 5.2.4 case C: late COOKIE */ dprint(1, ("late cookie from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); return (-1); } else if (init->sic_inittag == sctp->sctp_fvtag && iack->sic_inittag == sctp->sctp_lvtag) { @@ -1341,7 +1397,7 @@ sctp_process_cookie(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *cmp, */ dprint(1, ("cookie tags match from %x:%x:%x:%x (%d)\n", SCTP_PRINTADDR(sctp->sctp_current->faddr), - (int)(sctp->sctp_fport))); + (int)(connp->conn_fport))); if (sctp->sctp_state < SCTPS_ESTABLISHED) { if (!sctp_initialize_params(sctp, init, iack)) return (-1); /* Drop? 
*/ @@ -1412,13 +1468,17 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich, /* * params have been put in host byteorder by * sctp_check_input() + * + * For labeled systems, there's no need to check the + * label here. It's known to be good as we checked + * before allowing the connection to become bound. */ if (ph->sph_type == PARM_ADDR4) { IN6_INADDR_TO_V4MAPPED((struct in_addr *)(ph + 1), &src); sctp = sctp_conn_match(&src, &dst, ports, zoneid, - sctps); + 0, sctps); dprint(1, ("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n", @@ -1431,7 +1491,7 @@ sctp_addrlist2sctp(mblk_t *mp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ich, } else if (ph->sph_type == PARM_ADDR6) { src = *(in6_addr_t *)(ph + 1); sctp = sctp_conn_match(&src, &dst, ports, zoneid, - sctps); + 0, sctps); dprint(1, ("sctp_addrlist2sctp: src=%x:%x:%x:%x, sctp=%p\n", diff --git a/usr/src/uts/common/inet/sctp/sctp_error.c b/usr/src/uts/common/inet/sctp/sctp_error.c index 02d18cf78c..293ff5bd6e 100644 --- a/usr/src/uts/common/inet/sctp/sctp_error.c +++ b/usr/src/uts/common/inet/sctp/sctp_error.c @@ -35,9 +35,11 @@ #include <netinet/in.h> #include <netinet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/mib2.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> @@ -99,6 +101,7 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) int len, hdrlen; char *cause; sctp_faddr_t *fp = sctp->sctp_current; + ip_xmit_attr_t *ixa = fp->ixa; sctp_stack_t *sctps = sctp->sctp_sctps; /* @@ -147,14 +150,15 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) freemsg(mp); return; } - sctp_set_iplen(sctp, mp); BUMP_MIB(&sctps->sctps_mib, sctpAborted); BUMP_LOCAL(sctp->sctp_opkts); BUMP_LOCAL(sctp->sctp_obchunks); - CONN_INC_REF(sctp->sctp_connp); - mp->b_flag |= MSGHASREF; - IP_PUT(mp, sctp->sctp_connp, fp->isv4); + sctp_set_iplen(sctp, mp, ixa); + ASSERT(ixa->ixa_ire != NULL); + ASSERT(ixa->ixa_cred != NULL); + 
+ (void) conn_ip_output(mp, ixa); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -165,29 +169,24 @@ sctp_user_abort(sctp_t *sctp, mblk_t *data) */ void sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, - size_t len, mblk_t *inmp, int iserror, boolean_t tbit) + size_t len, mblk_t *inmp, int iserror, boolean_t tbit, ip_recv_attr_t *ira) { mblk_t *hmp; uint32_t ip_hdr_len; ipha_t *iniph; - ipha_t *ahiph; + ipha_t *ahiph = NULL; ip6_t *inip6h; - ip6_t *ahip6h; + ip6_t *ahip6h = NULL; sctp_hdr_t *sh; sctp_hdr_t *insh; size_t ahlen; uchar_t *p; ssize_t alen; int isv4; - ire_t *ire; - irb_t *irb; - ts_label_t *tsl; - conn_t *connp; - cred_t *cr = NULL; - pid_t pid; + conn_t *connp = sctp->sctp_connp; sctp_stack_t *sctps = sctp->sctp_sctps; - ip_stack_t *ipst; + ip_xmit_attr_t *ixa; isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); if (isv4) { @@ -200,11 +199,10 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, * If this is a labeled system, then check to see if we're allowed to * send a response to this particular sender. If not, then just drop. */ - if (is_system_labeled() && !tsol_can_reply_error(inmp)) + if (is_system_labeled() && !tsol_can_reply_error(inmp, ira)) return; - hmp = allocb_cred(sctps->sctps_wroff_xtra + ahlen, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + hmp = allocb(sctps->sctps_wroff_xtra + ahlen, BPRI_MED); if (hmp == NULL) { /* XXX no resources */ return; @@ -262,75 +260,209 @@ sctp_send_abort(sctp_t *sctp, uint32_t vtag, uint16_t serror, char *details, return; } + /* + * Base the transmission on any routing-related socket options + * that have been set on the listener/connection. 
+ */ + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + freemsg(hmp); + return; + } + ixa->ixa_flags &= ~IXAF_VERIFY_PMTU; + + ixa->ixa_pktlen = ahlen + alen; if (isv4) { - ahiph->ipha_length = htons(ahlen + alen); + ixa->ixa_flags |= IXAF_IS_IPV4; + ahiph->ipha_length = htons(ixa->ixa_pktlen); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr_len; } else { - ahip6h->ip6_plen = htons(alen + sizeof (*sh)); + ixa->ixa_flags &= ~IXAF_IS_IPV4; + ahip6h->ip6_plen = htons(ixa->ixa_pktlen - IPV6_HDR_LEN); + ixa->ixa_ip_hdr_length = sctp->sctp_ip_hdr6_len; } BUMP_MIB(&sctps->sctps_mib, sctpAborted); BUMP_LOCAL(sctp->sctp_obchunks); - ipst = sctps->sctps_netstack->netstack_ip; - connp = sctp->sctp_connp; - if (is_system_labeled() && (cr = msg_getcred(inmp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - int err; - uint_t mode = connp->conn_mac_mode; + if (is_system_labeled() && ixa->ixa_tsl != NULL) { + ASSERT(ira->ira_tsl != NULL); - if (isv4) - err = tsol_check_label(cr, &hmp, mode, ipst, pid); - else - err = tsol_check_label_v6(cr, &hmp, mode, ipst, pid); - if (err != 0) { - freemsg(hmp); - return; - } + ixa->ixa_tsl = ira->ira_tsl; /* A multi-level responder */ } - /* Stash the conn ptr info. for IP */ - SCTP_STASH_IPINFO(hmp, NULL); + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the abort. + */ + if (!ipsec_in_to_out(ira, ixa, hmp, ahiph, ahip6h)) { + ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - CONN_INC_REF(connp); - hmp->b_flag |= MSGHASREF; - IP_PUT(hmp, connp, sctp->sctp_current == NULL ? B_TRUE : - sctp->sctp_current->isv4); - /* - * Let's just mark the IRE for this destination as temporary - * to prevent any DoS attack. - */ - tsl = cr == NULL ? 
NULL : crgetlabel(cr); - if (isv4) { - ire = ire_cache_lookup(iniph->ipha_src, sctp->sctp_zoneid, tsl, - ipst); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + ixa_refrele(ixa); + return; + } } else { - ire = ire_cache_lookup_v6(&inip6h->ip6_src, sctp->sctp_zoneid, - tsl, ipst); + ixa->ixa_flags |= IXAF_NO_IPSEC; } + + BUMP_LOCAL(sctp->sctp_opkts); + BUMP_LOCAL(sctp->sctp_obchunks); + + (void) ip_output_simple(hmp, ixa); + ixa_refrele(ixa); +} + +/* + * OOTB version of the above. + * If iserror == 0, sends an abort. If iserror != 0, sends an error. + */ +void +sctp_ootb_send_abort(uint32_t vtag, uint16_t serror, char *details, + size_t len, const mblk_t *inmp, int iserror, boolean_t tbit, + ip_recv_attr_t *ira, ip_stack_t *ipst) +{ + uint32_t ip_hdr_len; + size_t ahlen; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + sctp_hdr_t *insctph; + int i; + uint16_t port; + ssize_t alen; + int isv4; + mblk_t *mp; + netstack_t *ns = ipst->ips_netstack; + sctp_stack_t *sctps = ns->netstack_sctp; + ip_xmit_attr_t ixas; + + bzero(&ixas, sizeof (ixas)); + + isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); + ip_hdr_len = ira->ira_ip_hdr_length; + ahlen = ip_hdr_len + sizeof (sctp_hdr_t); + /* - * In the normal case the ire would be non-null, however it could be - * null, say, if IP needs to resolve the gateway for this address. We - * only care about IRE_CACHE. + * If this is a labeled system, then check to see if we're allowed to + * send a response to this particular sender. If not, then just drop. 
*/ - if (ire == NULL) + if (is_system_labeled() && !tsol_can_reply_error(inmp, ira)) return; - if (ire->ire_type != IRE_CACHE) { - ire_refrele(ire); + + mp = allocb(ahlen + sctps->sctps_wroff_xtra, BPRI_MED); + if (mp == NULL) { return; } - irb = ire->ire_bucket; - /* ire_lock is not needed, as ire_marks is protected by irb_lock */ - rw_enter(&irb->irb_lock, RW_WRITER); + mp->b_rptr += sctps->sctps_wroff_xtra; + mp->b_wptr = mp->b_rptr + ahlen; + bcopy(inmp->b_rptr, mp->b_rptr, ahlen); + /* - * Only increment the temporary IRE count if the original - * IRE is not already marked temporary. + * We follow the logic in tcp_xmit_early_reset() in that we skip + * reversing source route (i.e. replace all IP options with EOL). */ - if (!(ire->ire_marks & IRE_MARK_TEMPORARY)) { - irb->irb_tmp_ire_cnt++; - ire->ire_marks |= IRE_MARK_TEMPORARY; + if (isv4) { + ipaddr_t v4addr; + + ipha = (ipha_t *)mp->b_rptr; + for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) + mp->b_rptr[i] = IPOPT_EOL; + /* Swap addresses */ + ipha->ipha_length = htons(ahlen); + v4addr = ipha->ipha_src; + ipha->ipha_src = ipha->ipha_dst; + ipha->ipha_dst = v4addr; + ipha->ipha_ident = 0; + ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; + } else { + in6_addr_t v6addr; + + ip6h = (ip6_t *)mp->b_rptr; + /* Remove any extension headers assuming partial overlay */ + if (ip_hdr_len > IPV6_HDR_LEN) { + uint8_t *to; + + to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; + ovbcopy(ip6h, to, IPV6_HDR_LEN); + mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; + ip_hdr_len = IPV6_HDR_LEN; + ip6h = (ip6_t *)mp->b_rptr; + ip6h->ip6_nxt = IPPROTO_SCTP; + ahlen = ip_hdr_len + sizeof (sctp_hdr_t); + } + ip6h->ip6_plen = htons(ahlen - IPV6_HDR_LEN); + v6addr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = v6addr; + ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + 
ixas.ixa_flags |= IXAF_SCOPEID_SET; + ixas.ixa_scopeid = ira->ira_ruifindex; + } + } + insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len); + + /* Swap ports. Verification tag is reused. */ + port = insctph->sh_sport; + insctph->sh_sport = insctph->sh_dport; + insctph->sh_dport = port; + insctph->sh_verf = vtag; + + /* Link in the abort chunk */ + if ((alen = sctp_link_abort(mp, serror, details, len, iserror, tbit)) + < 0) { + freemsg(mp); + return; + } + + ixas.ixa_pktlen = ahlen + alen; + ixas.ixa_ip_hdr_length = ip_hdr_len; + + if (isv4) { + ipha->ipha_length = htons(ixas.ixa_pktlen); + } else { + ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN); } - rw_exit(&irb->irb_lock); - ire_refrele(ire); + + ixas.ixa_protocol = IPPROTO_SCTP; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + + BUMP_MIB(&sctps->sctps_mib, sctpAborted); + + if (is_system_labeled()) { + ASSERT(ira->ira_tsl != NULL); + + ixas.ixa_tsl = ira->ira_tsl; /* A multi-level responder */ + } + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that was out of the blue. + */ + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; + } + } else { + /* + * This is in clear. The abort message we are building + * here should go out in clear, independent of our policy. 
+ */ + ixas.ixa_flags |= IXAF_NO_IPSEC; + } + + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } /*ARGSUSED*/ @@ -418,8 +550,9 @@ sctp_add_err(sctp_t *sctp, uint16_t serror, void *details, size_t len, return; } sendmp->b_cont = sctp->sctp_err_chunks; - sctp_set_iplen(sctp, sendmp); - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_err_chunks = emp; sctp->sctp_err_len = emp_len; @@ -445,17 +578,20 @@ sctp_process_err(sctp_t *sctp) sctp_stack_t *sctps = sctp->sctp_sctps; mblk_t *errmp; mblk_t *sendmp; + sctp_faddr_t *fp; ASSERT(sctp->sctp_err_chunks != NULL); errmp = sctp->sctp_err_chunks; - if ((sendmp = sctp_make_mp(sctp, SCTP_CHUNK_DEST(errmp), 0)) == NULL) { + fp = SCTP_CHUNK_DEST(errmp); + if ((sendmp = sctp_make_mp(sctp, fp, 0)) == NULL) { SCTP_KSTAT(sctps, sctp_send_err_failed); freemsg(errmp); goto done; } sendmp->b_cont = errmp; - sctp_set_iplen(sctp, sendmp); - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); done: sctp->sctp_err_chunks = NULL; sctp->sctp_err_len = 0; @@ -467,7 +603,7 @@ done: */ int sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch, - mblk_t *mp) + mblk_t *mp, ip_recv_attr_t *ira) { sctp_parm_hdr_t *errh; sctp_chunk_hdr_t *uch; @@ -487,11 +623,13 @@ sctp_handle_error(sctp_t *sctp, sctp_hdr_t *sctph, sctp_chunk_hdr_t *ch, */ case SCTP_ERR_BAD_SID: cmn_err(CE_WARN, "BUG! send to invalid SID"); - sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0); + sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0, + ira); return (ECONNABORTED); case SCTP_ERR_NO_USR_DATA: cmn_err(CE_WARN, "BUG! 
no usr data"); - sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0); + sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, 0, mp, 0, 0, + ira); return (ECONNABORTED); case SCTP_ERR_UNREC_CHUNK: /* Pull out the unrecognized chunk type */ diff --git a/usr/src/uts/common/inet/sctp/sctp_hash.c b/usr/src/uts/common/inet/sctp/sctp_hash.c index 289dbc04e7..b5c838d297 100644 --- a/usr/src/uts/common/inet/sctp/sctp_hash.c +++ b/usr/src/uts/common/inet/sctp/sctp_hash.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -82,7 +82,7 @@ sctp_hash_init(sctp_stack_t *sctps) } sctps->sctps_conn_fanout = (sctp_tf_t *)kmem_zalloc(sctps->sctps_conn_hash_size * - sizeof (sctp_tf_t), KM_SLEEP); + sizeof (sctp_tf_t), KM_SLEEP); for (i = 0; i < sctps->sctps_conn_hash_size; i++) { mutex_init(&sctps->sctps_conn_fanout[i].tf_lock, NULL, MUTEX_DEFAULT, NULL); @@ -129,87 +129,6 @@ sctp_hash_destroy(sctp_stack_t *sctps) } /* - * Walk the SCTP global list and refrele the ire for this ipif - * This is called when an address goes down, so that we release any reference - * to the ire associated with this address. Additionally, for any SCTP if - * this was the only/last address in its source list, we don't kill the - * assoc., if there is no address added subsequently, or if this does not - * come up, then the assoc. will die a natural death (i.e. timeout). 
- */ -void -sctp_ire_cache_flush(ipif_t *ipif) -{ - sctp_t *sctp; - sctp_t *sctp_prev = NULL; - sctp_faddr_t *fp; - conn_t *connp; - ire_t *ire; - sctp_stack_t *sctps = ipif->ipif_ill->ill_ipst-> - ips_netstack->netstack_sctp; - - sctp = sctps->sctps_gsctp; - mutex_enter(&sctps->sctps_g_lock); - while (sctp != NULL) { - mutex_enter(&sctp->sctp_reflock); - if (sctp->sctp_condemned) { - mutex_exit(&sctp->sctp_reflock); - sctp = list_next(&sctps->sctps_g_list, sctp); - continue; - } - sctp->sctp_refcnt++; - mutex_exit(&sctp->sctp_reflock); - mutex_exit(&sctps->sctps_g_lock); - if (sctp_prev != NULL) - SCTP_REFRELE(sctp_prev); - - RUN_SCTP(sctp); - connp = sctp->sctp_connp; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - if (ire != NULL && ire->ire_ipif == ipif) { - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - IRE_REFRELE_NOTR(ire); - } else { - mutex_exit(&connp->conn_lock); - } - /* check for ires cached in faddr */ - for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { - /* - * If this ipif is being used as the source address - * we need to update it as well, else we will end - * up using the dead source address. - */ - ire = fp->ire; - if (ire != NULL && ire->ire_ipif == ipif) { - fp->ire = NULL; - IRE_REFRELE_NOTR(ire); - } - /* - * This may result in setting the fp as unreachable, - * i.e. if all the source addresses are down. In - * that case the assoc. would timeout. - */ - if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, - &fp->saddr)) { - sctp_set_saddr(sctp, fp); - if (fp == sctp->sctp_current && - fp->state != SCTP_FADDRS_UNREACH) { - sctp_set_faddr_current(sctp, fp); - } - } - } - WAKE_SCTP(sctp); - sctp_prev = sctp; - mutex_enter(&sctps->sctps_g_lock); - sctp = list_next(&sctps->sctps_g_list, sctp); - } - mutex_exit(&sctps->sctps_g_lock); - if (sctp_prev != NULL) - SCTP_REFRELE(sctp_prev); -} - -/* * Exported routine for extracting active SCTP associations. 
* Like TCP, we terminate the walk if the callback returns non-zero. * @@ -244,9 +163,9 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), uchar_t *slist; uchar_t *flist; - sctp = sctps->sctps_gsctp; sctp_prev = NULL; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { size_t ssize; size_t fsize; @@ -282,11 +201,14 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), sctp_get_faddr_list(sctp, flist, fsize); cl_sctpi.cl_sctpi_nladdr = sctp->sctp_nsaddrs; cl_sctpi.cl_sctpi_nfaddr = sctp->sctp_nfaddrs; - cl_sctpi.cl_sctpi_family = sctp->sctp_family; - cl_sctpi.cl_sctpi_ipversion = sctp->sctp_ipversion; + cl_sctpi.cl_sctpi_family = sctp->sctp_connp->conn_family; + if (cl_sctpi.cl_sctpi_family == AF_INET) + cl_sctpi.cl_sctpi_ipversion = IPV4_VERSION; + else + cl_sctpi.cl_sctpi_ipversion = IPV6_VERSION; cl_sctpi.cl_sctpi_state = sctp->sctp_state; - cl_sctpi.cl_sctpi_lport = sctp->sctp_lport; - cl_sctpi.cl_sctpi_fport = sctp->sctp_fport; + cl_sctpi.cl_sctpi_lport = sctp->sctp_connp->conn_lport; + cl_sctpi.cl_sctpi_fport = sctp->sctp_connp->conn_fport; cl_sctpi.cl_sctpi_handle = (cl_sctp_handle_t)sctp; WAKE_SCTP(sctp); cl_sctpi.cl_sctpi_laddrp = slist; @@ -310,20 +232,26 @@ cl_sctp_walk_list_stack(int (*cl_callback)(cl_sctp_info_t *, void *), sctp_t * sctp_conn_match(in6_addr_t *faddr, in6_addr_t *laddr, uint32_t ports, - zoneid_t zoneid, sctp_stack_t *sctps) + zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps) { sctp_tf_t *tf; sctp_t *sctp; sctp_faddr_t *fp; + conn_t *connp; tf = &(sctps->sctps_conn_fanout[SCTP_CONN_HASH(sctps, ports)]); mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_conn_hash_next) { - if (ports != sctp->sctp_ports || - !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) { + connp = sctp->sctp_connp; + if (ports != connp->conn_ports) + continue; + if (!(connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != 
CONN_MAC_DEFAULT) && + (iraflags & IRAF_TX_MAC_EXEMPTABLE) && + (iraflags & IRAF_TX_SHARED_ADDR)))) continue; - } /* check for faddr match */ for (fp = sctp->sctp_faddrs; fp; fp = fp->next) { @@ -351,11 +279,12 @@ done: static sctp_t * listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid, - sctp_stack_t *sctps) + iaflags_t iraflags, sctp_stack_t *sctps) { sctp_t *sctp; sctp_tf_t *tf; uint16_t lport; + conn_t *connp; lport = ((uint16_t *)&ports)[1]; @@ -363,10 +292,16 @@ listen_match(in6_addr_t *laddr, uint32_t ports, zoneid_t zoneid, mutex_enter(&tf->tf_lock); for (sctp = tf->tf_sctp; sctp; sctp = sctp->sctp_listen_hash_next) { - if (lport != sctp->sctp_lport || - !IPCL_ZONE_MATCH(sctp->sctp_connp, zoneid)) { + connp = sctp->sctp_connp; + if (lport != connp->conn_lport) + continue; + + if (!(connp->conn_zoneid == zoneid || + connp->conn_allzones || + ((connp->conn_mac_mode != CONN_MAC_DEFAULT) && + (iraflags & IRAF_TX_MAC_EXEMPTABLE) && + (iraflags & IRAF_TX_SHARED_ADDR)))) continue; - } if (sctp_saddr_lookup(sctp, laddr, 0) != NULL) { SCTP_REFHOLD(sctp); @@ -383,48 +318,36 @@ done: /* called by ipsec_sctp_pol */ conn_t * sctp_find_conn(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, - zoneid_t zoneid, sctp_stack_t *sctps) + zoneid_t zoneid, iaflags_t iraflags, sctp_stack_t *sctps) { sctp_t *sctp; - if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) { + sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) { /* Not in conn fanout; check listen fanout */ - if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL) + sctp = listen_match(dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) return (NULL); } return (sctp->sctp_connp); } +/* + * Fanout to a sctp instance. 
+ */ conn_t * sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, - zoneid_t zoneid, mblk_t *mp, sctp_stack_t *sctps) - + ip_recv_attr_t *ira, mblk_t *mp, sctp_stack_t *sctps) { + zoneid_t zoneid = ira->ira_zoneid; + iaflags_t iraflags = ira->ira_flags; sctp_t *sctp; - boolean_t shared_addr; - - if ((sctp = sctp_conn_match(src, dst, ports, zoneid, sctps)) == NULL) { - shared_addr = (zoneid == ALL_ZONES); - if (shared_addr) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. - */ - zoneid = tsol_mlp_findzone(IPPROTO_SCTP, - htons(ntohl(ports) & 0xFFFF)); - /* - * If no shared MLP is found, tsol_mlp_findzone returns - * ALL_ZONES. In that case, we assume it's SLP, and - * search for the zone based on the packet label. - * That will also return ALL_ZONES on failure. - */ - if (zoneid == ALL_ZONES) - zoneid = tsol_packet_to_zoneid(mp); - if (zoneid == ALL_ZONES) - return (NULL); - } + + sctp = sctp_conn_match(src, dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) { /* Not in conn fanout; check listen fanout */ - if ((sctp = listen_match(dst, ports, zoneid, sctps)) == NULL) + sctp = listen_match(dst, ports, zoneid, iraflags, sctps); + if (sctp == NULL) return (NULL); /* * On systems running trusted extensions, check if dst @@ -432,9 +355,9 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, * that dst is in 16 byte AF_INET6 format. IPv4-mapped * IPv6 addresses are supported. */ - if (is_system_labeled() && - !tsol_receive_local(mp, dst, IPV6_VERSION, - shared_addr, sctp->sctp_connp)) { + if ((iraflags & IRAF_SYSTEM_LABELED) && + !tsol_receive_local(mp, dst, IPV6_VERSION, ira, + sctp->sctp_connp)) { DTRACE_PROBE3( tx__ip__log__info__classify__sctp, char *, @@ -444,145 +367,84 @@ sctp_fanout(in6_addr_t *src, in6_addr_t *dst, uint32_t ports, return (NULL); } } + /* + * For labeled systems, there's no need to check the + * label here. 
It's known to be good as we checked + * before allowing the connection to become bound. + */ return (sctp->sctp_connp); } /* - * Fanout for SCTP packets + * Fanout for ICMP errors for SCTP * The caller puts <fport, lport> in the ports parameter. */ -/* ARGSUSED */ void -ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, - uint32_t ports, uint_t flags, boolean_t mctl_present, boolean_t ip_policy, - zoneid_t zoneid) +ip_fanout_sctp(mblk_t *mp, ipha_t *ipha, ip6_t *ip6h, uint32_t ports, + ip_recv_attr_t *ira) { - sctp_t *sctp; - boolean_t isv4; - conn_t *connp; - mblk_t *first_mp; - ip6_t *ip6h; - in6_addr_t map_src, map_dst; - in6_addr_t *src, *dst; - ip_stack_t *ipst; - ipsec_stack_t *ipss; - sctp_stack_t *sctps; - - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; - sctps = ipst->ips_netstack->netstack_sctp; - ipss = ipst->ips_netstack->netstack_ipsec; - - first_mp = mp; - if (mctl_present) { - mp = first_mp->b_cont; - ASSERT(mp != NULL); - } + sctp_t *sctp; + conn_t *connp; + in6_addr_t map_src, map_dst; + in6_addr_t *src, *dst; + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + sctp_stack_t *sctps = ns->netstack_sctp; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; + + ASSERT(iraflags & IRAF_ICMP_ERROR); + + secure = iraflags & IRAF_IPSEC_SECURE; /* Assume IP provides aligned packets - otherwise toss */ if (!OK_32PTR(mp->b_rptr)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, ill); + freemsg(mp); return; } - if (IPH_HDR_VERSION(ipha) == IPV6_VERSION) { - ip6h = (ip6_t *)ipha; + if (!(iraflags & IRAF_IS_IPV4)) { src = &ip6h->ip6_src; dst = &ip6h->ip6_dst; - isv4 = B_FALSE; } else { - ip6h = NULL; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &map_src); IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, 
&map_dst); src = &map_src; dst = &map_dst; - isv4 = B_TRUE; } - connp = sctp_fanout(src, dst, ports, zoneid, mp, sctps); + connp = sctp_fanout(src, dst, ports, ira, mp, sctps); if (connp == NULL) { - ip_fanout_sctp_raw(first_mp, recv_ill, ipha, isv4, - ports, mctl_present, flags, ip_policy, zoneid); + ip_fanout_sctp_raw(mp, ipha, ip6h, ports, ira); return; } sctp = CONN2SCTP(connp); - /* Found a client; up it goes */ - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsHCInDelivers); - /* * We check some fields in conn_t without holding a lock. * This should be fine. */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - if (first_mp == NULL) { - SCTP_REFRELE(sctp); - return; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); if (mp == NULL) { SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); return; - } else if (mctl_present) { - /* - * ip_process might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } } - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - int in_flags = 0; - - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); - } else { - mp = ip_add_info_v6(mp, recv_ill, &ip6h->ip6_dst); - } - if (mp == NULL) { - SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + ira->ira_ill = ira->ira_rill = NULL; mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); } else { sctp->sctp_running = B_TRUE; @@ -590,24 +452,22 @@ ip_fanout_sctp(mblk_t *mp, ill_t *recv_ill, ipha_t *ipha, mutex_enter(&sctp->sctp_recvq_lock); if (sctp->sctp_recvq != NULL) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_TRUE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_TRUE, ira); mutex_exit(&sctp->sctp_recvq_lock); WAKE_SCTP(sctp); } else { mutex_exit(&sctp->sctp_recvq_lock); - sctp_input_data(sctp, mp, (mctl_present ? first_mp : - NULL)); + if (ira->ira_flags & IRAF_ICMP_ERROR) { + sctp_icmp_error(sctp, mp); + } else { + sctp_input_data(sctp, mp, ira); + } WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } } SCTP_REFRELE(sctp); + ira->ira_ill = ill; + ira->ira_rill = rill; } void @@ -623,7 +483,7 @@ sctp_conn_hash_remove(sctp_t *sctp) * subsystem. 
*/ if (cl_sctp_disconnect != NULL) { - (*cl_sctp_disconnect)(sctp->sctp_family, + (*cl_sctp_disconnect)(sctp->sctp_connp->conn_family, (cl_sctp_handle_t)sctp); } @@ -683,6 +543,7 @@ void sctp_listen_hash_remove(sctp_t *sctp) { sctp_tf_t *tf = sctp->sctp_listen_tfp; + conn_t *connp = sctp->sctp_connp; if (!tf) { return; @@ -698,8 +559,8 @@ sctp_listen_hash_remove(sctp_t *sctp) ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs; slist = kmem_alloc(ssize, KM_SLEEP); sctp_get_saddr_list(sctp, slist, ssize); - (*cl_sctp_unlisten)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport); + (*cl_sctp_unlisten)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport); /* list will be freed by the clustering module */ } @@ -722,7 +583,10 @@ sctp_listen_hash_remove(sctp_t *sctp) sctp->sctp_listen_hash_next; if (sctp->sctp_listen_hash_next != NULL) { - sctp->sctp_listen_hash_next->sctp_listen_hash_prev = + sctp_t *next = sctp->sctp_listen_hash_next; + + ASSERT(next->sctp_listen_hash_prev == sctp); + next->sctp_listen_hash_prev = sctp->sctp_listen_hash_prev; } } @@ -735,6 +599,8 @@ sctp_listen_hash_remove(sctp_t *sctp) void sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp) { + conn_t *connp = sctp->sctp_connp; + if (sctp->sctp_listen_tfp) { sctp_listen_hash_remove(sctp); } @@ -759,8 +625,8 @@ sctp_listen_hash_insert(sctp_tf_t *tf, sctp_t *sctp) ssize = sizeof (in6_addr_t) * sctp->sctp_nsaddrs; slist = kmem_alloc(ssize, KM_SLEEP); sctp_get_saddr_list(sctp, slist, ssize); - (*cl_sctp_listen)(sctp->sctp_family, slist, - sctp->sctp_nsaddrs, sctp->sctp_lport); + (*cl_sctp_listen)(connp->conn_family, slist, + sctp->sctp_nsaddrs, connp->conn_lport); /* list will be freed by the clustering module */ } } @@ -850,8 +716,8 @@ sctp_lookup(sctp_t *sctp1, in6_addr_t *faddr, sctp_tf_t *tf, uint32_t *ports, for (sctp = tf->tf_sctp; sctp != NULL; sctp = sctp->sctp_conn_hash_next) { - if (*ports != sctp->sctp_ports || sctp->sctp_state < - min_state) { + if (*ports != 
sctp->sctp_connp->conn_ports || + sctp->sctp_state < min_state) { continue; } @@ -886,38 +752,3 @@ done: } return (sctp); } - -boolean_t -ip_fanout_sctp_raw_match(conn_t *connp, uint32_t ports, ipha_t *ipha) -{ - uint16_t lport; - - if (connp->conn_fully_bound) { - return (IPCL_CONN_MATCH(connp, IPPROTO_SCTP, ipha->ipha_src, - ipha->ipha_dst, ports)); - } else { - lport = htons(ntohl(ports) & 0xFFFF); - return (IPCL_BIND_MATCH(connp, IPPROTO_SCTP, ipha->ipha_dst, - lport)); - } -} - -boolean_t -ip_fanout_sctp_raw_match_v6(conn_t *connp, uint32_t ports, ip6_t *ip6h, - boolean_t for_v4) -{ - uint16_t lport; - in6_addr_t v6dst; - - if (!for_v4 && connp->conn_fully_bound) { - return (IPCL_CONN_MATCH_V6(connp, IPPROTO_SCTP, ip6h->ip6_src, - ip6h->ip6_dst, ports)); - } else { - lport = htons(ntohl(ports) & 0xFFFF); - if (for_v4) - v6dst = ipv6_all_zeros; - else - v6dst = ip6h->ip6_dst; - return (IPCL_BIND_MATCH_V6(connp, IPPROTO_SCTP, v6dst, lport)); - } -} diff --git a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c index 914f1cac3f..2fbffee1c3 100644 --- a/usr/src/uts/common/inet/sctp/sctp_heartbeat.c +++ b/usr/src/uts/common/inet/sctp/sctp_heartbeat.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/systm.h> #include <sys/stream.h> @@ -66,8 +64,14 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp) addr = inip6h->ip6_src; } fp = sctp_lookup_faddr(sctp, &addr); - ASSERT(fp != NULL); - + /* If the source address is bogus we silently drop the packet */ + if (fp == NULL) { + dprint(1, + ("sctp_return_heartbeat: %p bogus hb from %x:%x:%x:%x\n", + (void *)sctp, SCTP_PRINTADDR(addr))); + SCTP_KSTAT(sctps, sctp_return_hb_failed); + return; + } dprint(3, ("sctp_return_heartbeat: %p got hb from %x:%x:%x:%x\n", (void *)sctp, SCTP_PRINTADDR(addr))); @@ -98,10 +102,11 @@ sctp_return_heartbeat(sctp_t *sctp, sctp_chunk_hdr_t *hbcp, mblk_t *mp) smp->b_wptr += len; - sctp_set_iplen(sctp, smp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, smp); + + sctp_set_iplen(sctp, smp, fp->ixa); + (void) conn_ip_output(smp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* @@ -126,10 +131,10 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) SCTP_PRINTADDR(fp->faddr), SCTP_PRINTADDR(fp->saddr))); hblen = sizeof (*cp) + - sizeof (*hpp) + - sizeof (*t) + - sizeof (fp->hb_secret) + - sizeof (fp->faddr); + sizeof (*hpp) + + sizeof (*t) + + sizeof (fp->hb_secret) + + sizeof (fp->faddr); hbmp = sctp_make_mp(sctp, fp, hblen); if (hbmp == NULL) { SCTP_KSTAT(sctps, sctp_send_hb_failed); @@ -180,8 +185,6 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) hbmp->b_wptr += hblen; - sctp_set_iplen(sctp, hbmp); - /* Update the faddr's info */ fp->lastactive = now; fp->hb_pending = B_TRUE; @@ -189,7 +192,9 @@ sctp_send_heartbeat(sctp_t *sctp, sctp_faddr_t *fp) BUMP_LOCAL(sctp->sctp_obchunks); BUMP_MIB(&sctps->sctps_mib, sctpTimHeartBeatProbe); - sctp_add_sendq(sctp, hbmp); + sctp_set_iplen(sctp, hbmp, fp->ixa); + (void) conn_ip_output(hbmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* diff --git a/usr/src/uts/common/inet/sctp/sctp_impl.h b/usr/src/uts/common/inet/sctp/sctp_impl.h index 
32268648f6..d84c3762f3 100644 --- a/usr/src/uts/common/inet/sctp/sctp_impl.h +++ b/usr/src/uts/common/inet/sctp/sctp_impl.h @@ -191,7 +191,6 @@ typedef struct sctpparam_s { #define SCTP_MAX_COMBINED_HEADER_LENGTH (60 + 12) /* Maxed out ip + sctp */ #define SCTP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH) #define SCTP_MAX_HDR_LENGTH 60 -#define ICMP_MIN_SCTP_HDR_LEN (ICMP_MIN_TP_HDR_LEN + sizeof (sctp_hdr_t)) #define SCTP_SECRET_LEN 16 @@ -213,27 +212,6 @@ typedef struct sctpparam_s { } \ } -#define SCTP_G_Q_REFHOLD(sctps) { \ - atomic_add_32(&(sctps)->sctps_g_q_ref, 1); \ - ASSERT((sctps)->sctps_g_q_ref != 0); \ - DTRACE_PROBE1(sctp__g__q__refhold, sctp_stack_t, sctps); \ -} - -/* - * Decrement the reference count on sctp_g_q - * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - */ -#define SCTP_G_Q_REFRELE(sctps) { \ - ASSERT((sctps)->sctps_g_q_ref != 0); \ - membar_exit(); \ - DTRACE_PROBE1(sctp__g__q__refrele, sctp_stack_t, sctps); \ - if (atomic_add_32_nv(&(sctps)->sctps_g_q_ref, -1) == 0) \ - sctp_g_q_inactive(sctps); \ -} - #define SCTP_PRINTADDR(a) (a).s6_addr32[0], (a).s6_addr32[1],\ (a).s6_addr32[2], (a).s6_addr32[3] @@ -399,15 +377,6 @@ extern sin6_t sctp_sin6_null; /* Zero address for quick clears */ #define SCTP_IS_DETACHED(sctp) ((sctp)->sctp_detached) -/* - * Object to represent database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. 
- * XXX These and other externs should ideally move to a SCTP header - */ -extern optdb_obj_t sctp_opt_obj; -extern uint_t sctp_max_optbuf_len; - /* Data structure used to track received TSNs */ typedef struct sctp_set_s { struct sctp_set_s *next; @@ -528,7 +497,7 @@ typedef struct sctp_faddr_s { hb_enabled : 1; mblk_t *rc_timer_mp; /* reliable control chunk timer */ - ire_t *ire; /* cached IRE */ + ip_xmit_attr_t *ixa; /* Transmit attributes */ uint32_t T3expire; /* # of times T3 timer expired */ uint64_t hb_secret; /* per addr "secret" in heartbeat */ @@ -600,25 +569,6 @@ typedef struct sctp_s { sctp_ipif_hash_t sctp_saddrs[SCTP_IPIF_HASH]; int sctp_nsaddrs; - /* - * These fields contain the same information as sctp_sctph->th_*port. - * However, the lookup functions can not use the header fields - * since during IP option manipulation the sctp_sctph pointer - * changes. - */ - union { - struct { - in_port_t sctpu_fport; /* Remote port */ - in_port_t sctpu_lport; /* Local port */ - } sctpu_ports1; - uint32_t sctpu_ports2; /* Rem port, */ - /* local port */ - /* Used for SCTP_MATCH performance */ - } sctp_sctpu; -#define sctp_fport sctp_sctpu.sctpu_ports1.sctpu_fport -#define sctp_lport sctp_sctpu.sctpu_ports1.sctpu_lport -#define sctp_ports sctp_sctpu.sctpu_ports2 - kmutex_t sctp_lock; kcondvar_t sctp_cv; boolean_t sctp_running; @@ -637,12 +587,6 @@ typedef struct sctp_s { int32_t sctp_state; conn_t *sctp_connp; /* conn_t stuff */ -#define sctp_zoneid sctp_connp->conn_zoneid -#define sctp_allzones sctp_connp->conn_allzones -#define sctp_mac_mode sctp_connp->conn_mac_mode -#define sctp_credp sctp_connp->conn_cred -#define sctp_reuseaddr sctp_connp->conn_reuseaddr - sctp_stack_t *sctp_sctps; /* Peer address tracking */ @@ -711,9 +655,6 @@ typedef struct sctp_s { uint32_t sctp_T3expire; /* # of times T3timer expired */ uint32_t sctp_assoc_start_time; /* time when assoc was est. 
*/ - /* Outbound flow control */ - int32_t sctp_xmit_hiwater; /* Send high water mark */ - int32_t sctp_xmit_lowater; /* Send low water mark */ uint32_t sctp_frwnd; /* Peer RWND */ uint32_t sctp_cwnd_max; @@ -723,8 +664,8 @@ typedef struct sctp_s { int32_t sctp_rxqueued; /* No. of bytes in RX q's */ /* Pre-initialized composite headers */ - char *sctp_iphc; /* v4 sctp/ip hdr template buffer */ - char *sctp_iphc6; /* v6 sctp/ip hdr template buffer */ + uchar_t *sctp_iphc; /* v4 sctp/ip hdr template buffer */ + uchar_t *sctp_iphc6; /* v6 sctp/ip hdr template buffer */ int32_t sctp_iphc_len; /* actual allocated v4 buffer size */ int32_t sctp_iphc6_len; /* actual allocated v6 buffer size */ @@ -754,17 +695,12 @@ typedef struct sctp_s { uint32_t sctp_understands_asconf : 1, /* Peer handles ASCONF chunks */ - sctp_debug : 1, /* SO_DEBUG "socket" option. */ sctp_cchunk_pend : 1, /* Control chunk in flight. */ - sctp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - - sctp_linger : 1, /* SO_LINGER turned on */ sctp_lingering : 1, /* Lingering in close */ sctp_loopback: 1, /* src and dst are the same machine */ - sctp_force_sack : 1, + sctp_force_sack : 1, sctp_ack_timer_running: 1, /* Delayed ACK timer running */ - sctp_recvdstaddr : 1, /* return T_EXTCONN_IND with dstaddr */ sctp_hwcksum : 1, /* The NIC is capable of hwcksum */ sctp_understands_addip : 1, @@ -802,15 +738,11 @@ typedef struct sctp_s { } sctp_events; #define sctp_priv_stream sctp_bits.sctp_priv_stream #define sctp_understands_asconf sctp_bits.sctp_understands_asconf -#define sctp_debug sctp_bits.sctp_debug #define sctp_cchunk_pend sctp_bits.sctp_cchunk_pend -#define sctp_dgram_errind sctp_bits.sctp_dgram_errind -#define sctp_linger sctp_bits.sctp_linger #define sctp_lingering sctp_bits.sctp_lingering #define sctp_loopback sctp_bits.sctp_loopback #define sctp_force_sack sctp_bits.sctp_force_sack #define sctp_ack_timer_running sctp_bits.sctp_ack_timer_running -#define sctp_recvdstaddr 
sctp_bits.sctp_recvdstaddr #define sctp_hwcksum sctp_bits.sctp_hwcksum #define sctp_understands_addip sctp_bits.sctp_understands_addip #define sctp_bound_to_all sctp_bits.sctp_bound_to_all @@ -853,15 +785,6 @@ typedef struct sctp_s { uint8_t sctp_old_secret[SCTP_SECRET_LEN]; uint32_t sctp_cookie_lifetime; /* cookie lifetime in tick */ - /* - * Address family that app wishes returned addrsses to be in. - * Currently taken from address family used in T_BIND_REQ, but - * should really come from family used in original socket() call. - * Value can be AF_INET or AF_INET6. - */ - uint_t sctp_family; - ushort_t sctp_ipversion; - /* Bind hash tables */ kmutex_t *sctp_bind_lockp; /* Ptr to tf_lock */ struct sctp_s *sctp_bind_hash; @@ -870,14 +793,10 @@ typedef struct sctp_s { /* Shutdown / cleanup */ sctp_faddr_t *sctp_shutdown_faddr; /* rotate faddr during shutd */ int32_t sctp_client_errno; /* How the client screwed up */ - int sctp_lingertime; /* Close linger time (in seconds) */ kmutex_t sctp_reflock; /* Protects sctp_refcnt & timer mp */ ushort_t sctp_refcnt; /* No. of pending upstream msg */ mblk_t *sctp_timer_mp; /* List of fired timers. 
*/ - /* Misc */ - uint_t sctp_bound_if; /* IPV6_BOUND_IF */ - mblk_t *sctp_heartbeat_mp; /* Timer block for heartbeats */ uint32_t sctp_hb_interval; /* Default hb_interval */ @@ -897,47 +816,19 @@ typedef struct sctp_s { mblk_t *sctp_recvq_tail; taskq_t *sctp_recvq_tq; - /* Send queue to IP */ - kmutex_t sctp_sendq_lock; - mblk_t *sctp_sendq; - mblk_t *sctp_sendq_tail; - boolean_t sctp_sendq_sending; - /* IPv6 ancillary data */ - uint_t sctp_ipv6_recvancillary; /* flags */ -#define SCTP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO opt */ -#define SCTP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT opt */ -#define SCTP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS opt */ -#define SCTP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS opt */ -#define SCTP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR opt */ -#define SCTP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS opt */ - uint_t sctp_recvifindex; /* last rcvd IPV6_RCVPKTINFO */ uint_t sctp_recvhops; /* " IPV6_RECVHOPLIMIT */ + uint_t sctp_recvtclass; /* " IPV6_RECVTCLASS */ ip6_hbh_t *sctp_hopopts; /* " IPV6_RECVHOPOPTS */ ip6_dest_t *sctp_dstopts; /* " IPV6_RECVDSTOPTS */ - ip6_dest_t *sctp_rtdstopts; /* " IPV6_RECVRTHDRDSTOPTS */ + ip6_dest_t *sctp_rthdrdstopts; /* " IPV6_RECVRTHDRDSTOPTS */ ip6_rthdr_t *sctp_rthdr; /* " IPV6_RECVRTHDR */ uint_t sctp_hopoptslen; uint_t sctp_dstoptslen; - uint_t sctp_rtdstoptslen; + uint_t sctp_rthdrdstoptslen; uint_t sctp_rthdrlen; - ip6_pkt_t sctp_sticky_ipp; /* Sticky options */ -#define sctp_ipp_fields sctp_sticky_ipp.ipp_fields -#define sctp_ipp_ifindex sctp_sticky_ipp.ipp_ifindex -#define sctp_ipp_addr sctp_sticky_ipp.ipp_addr -#define sctp_ipp_hoplimit sctp_sticky_ipp.ipp_hoplimit -#define sctp_ipp_hopoptslen sctp_sticky_ipp.ipp_hopoptslen -#define sctp_ipp_rtdstoptslen sctp_sticky_ipp.ipp_rtdstoptslen -#define sctp_ipp_rthdrlen sctp_sticky_ipp.ipp_rthdrlen -#define sctp_ipp_dstoptslen sctp_sticky_ipp.ipp_dstoptslen -#define sctp_ipp_hopopts sctp_sticky_ipp.ipp_hopopts -#define 
sctp_ipp_rtdstopts sctp_sticky_ipp.ipp_rtdstopts -#define sctp_ipp_rthdr sctp_sticky_ipp.ipp_rthdr -#define sctp_ipp_dstopts sctp_sticky_ipp.ipp_dstopts -#define sctp_ipp_pathmtu sctp_sticky_ipp.ipp_pathmtu -#define sctp_ipp_nexthop sctp_sticky_ipp.ipp_nexthop /* Stats */ uint64_t sctp_msgcount; uint64_t sctp_prsctpdrop; @@ -951,9 +842,6 @@ typedef struct sctp_s { mblk_t *sctp_err_chunks; /* Error chunks */ uint32_t sctp_err_len; /* Total error chunks length */ - pid_t sctp_cpid; /* Process id when this was opened */ - uint64_t sctp_open_time; /* time when this was opened */ - /* additional source data for per endpoint association statistics */ uint64_t sctp_outseqtsns; /* TSN rx > expected TSN */ uint64_t sctp_osacks; /* total sacks sent */ @@ -988,7 +876,7 @@ typedef struct sctp_s { #define SCTP_TXQ_LEN(sctp) ((sctp)->sctp_unsent + (sctp)->sctp_unacked) #define SCTP_TXQ_UPDATE(sctp) \ if ((sctp)->sctp_txq_full && SCTP_TXQ_LEN(sctp) <= \ - (sctp)->sctp_xmit_lowater) { \ + (sctp)->sctp_connp->conn_sndlowat) { \ (sctp)->sctp_txq_full = 0; \ (sctp)->sctp_ulp_xmitted((sctp)->sctp_ulpd, \ B_FALSE); \ @@ -1004,8 +892,8 @@ extern void sctp_add_err(sctp_t *, uint16_t, void *, size_t, extern int sctp_add_faddr(sctp_t *, in6_addr_t *, int, boolean_t); extern boolean_t sctp_add_ftsn_set(sctp_ftsn_set_t **, sctp_faddr_t *, mblk_t *, uint_t *, uint32_t *); -extern boolean_t sctp_add_recvq(sctp_t *, mblk_t *, boolean_t); -extern void sctp_add_sendq(sctp_t *, mblk_t *); +extern void sctp_add_recvq(sctp_t *, mblk_t *, boolean_t, + ip_recv_attr_t *); extern void sctp_add_unrec_parm(sctp_parm_hdr_t *, mblk_t **, boolean_t); extern size_t sctp_addr_params(sctp_t *, int, uchar_t *, boolean_t); extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int, @@ -1013,7 +901,6 @@ extern mblk_t *sctp_add_proto_hdr(sctp_t *, sctp_faddr_t *, mblk_t *, int, extern void sctp_addr_req(sctp_t *, mblk_t *); extern sctp_t *sctp_addrlist2sctp(mblk_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, 
zoneid_t, sctp_stack_t *); -extern void sctp_add_hdr(sctp_t *, uchar_t *, size_t); extern void sctp_check_adv_ack_pt(sctp_t *, mblk_t *, mblk_t *); extern void sctp_assoc_event(sctp_t *, uint16_t, uint16_t, sctp_chunk_hdr_t *); @@ -1024,7 +911,7 @@ extern int sctp_bindi(sctp_t *, in_port_t, boolean_t, int, in_port_t *); extern int sctp_bind_add(sctp_t *, const void *, uint32_t, boolean_t, in_port_t); extern int sctp_bind_del(sctp_t *, const void *, uint32_t, boolean_t); -extern int sctp_build_hdrs(sctp_t *); +extern int sctp_build_hdrs(sctp_t *, int); extern int sctp_check_abandoned_msg(sctp_t *, mblk_t *); extern void sctp_clean_death(sctp_t *, int); @@ -1035,11 +922,9 @@ extern void sctp_conn_hash_insert(sctp_tf_t *, sctp_t *, int); extern void sctp_conn_hash_remove(sctp_t *); extern void sctp_conn_init(conn_t *); extern sctp_t *sctp_conn_match(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, sctp_stack_t *); + zoneid_t, iaflags_t, sctp_stack_t *); extern sctp_t *sctp_conn_request(sctp_t *, mblk_t *, uint_t, uint_t, - sctp_init_chunk_t *, mblk_t *); -extern int sctp_conprim_opt_process(queue_t *, mblk_t *, int *, int *, - int *); + sctp_init_chunk_t *, ip_recv_attr_t *); extern uint32_t sctp_cumack(sctp_t *, uint32_t, mblk_t **); extern sctp_t *sctp_create_eager(sctp_t *); @@ -1066,10 +951,9 @@ extern void sctp_ftsn_sets_init(void); extern int sctp_get_addrlist(sctp_t *, const void *, uint32_t *, uchar_t **, int *, size_t *); -extern void sctp_g_q_inactive(sctp_stack_t *); extern int sctp_get_addrparams(sctp_t *, sctp_t *, mblk_t *, sctp_chunk_hdr_t *, uint_t *); -extern void sctp_get_ire(sctp_t *, sctp_faddr_t *); +extern void sctp_get_dest(sctp_t *, sctp_faddr_t *); extern void sctp_get_faddr_list(sctp_t *, uchar_t *, size_t); extern mblk_t *sctp_get_first_sent(sctp_t *); extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *, @@ -1077,22 +961,20 @@ extern mblk_t *sctp_get_msg_to_send(sctp_t *, mblk_t **, mblk_t *, int *, extern void 
sctp_get_saddr_list(sctp_t *, uchar_t *, size_t); extern int sctp_handle_error(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, - mblk_t *); + mblk_t *, ip_recv_attr_t *); extern void sctp_hash_destroy(sctp_stack_t *); extern void sctp_hash_init(sctp_stack_t *); -extern int sctp_header_init_ipv4(sctp_t *, int); -extern int sctp_header_init_ipv6(sctp_t *, int); extern void sctp_heartbeat_timer(sctp_t *); extern void sctp_icmp_error(sctp_t *, mblk_t *); extern void sctp_inc_taskq(sctp_stack_t *); extern void sctp_info_req(sctp_t *, mblk_t *); -extern mblk_t *sctp_init_mp(sctp_t *); +extern mblk_t *sctp_init_mp(sctp_t *, sctp_faddr_t *); extern boolean_t sctp_initialize_params(sctp_t *, sctp_init_chunk_t *, sctp_init_chunk_t *); extern uint32_t sctp_init2vtag(sctp_chunk_hdr_t *); extern void sctp_intf_event(sctp_t *, in6_addr_t, int, int); -extern void sctp_input_data(sctp_t *, mblk_t *, mblk_t *); +extern void sctp_input_data(sctp_t *, mblk_t *, ip_recv_attr_t *); extern void sctp_instream_cleanup(sctp_t *, boolean_t); extern int sctp_is_a_faddr_clean(sctp_t *); @@ -1124,7 +1006,8 @@ extern int sctp_nd_getset(queue_t *, MBLKP); extern boolean_t sctp_nd_init(sctp_stack_t *); extern sctp_parm_hdr_t *sctp_next_parm(sctp_parm_hdr_t *, ssize_t *); -extern void sctp_ootb_shutdown_ack(sctp_t *, mblk_t *, uint_t); +extern void sctp_ootb_shutdown_ack(mblk_t *, uint_t, ip_recv_attr_t *, + ip_stack_t *); extern size_t sctp_options_param(const sctp_t *, void *, int); extern size_t sctp_options_param_len(const sctp_t *, int); extern void sctp_output(sctp_t *, uint_t); @@ -1132,10 +1015,10 @@ extern void sctp_output(sctp_t *, uint_t); extern boolean_t sctp_param_register(IDP *, sctpparam_t *, int, sctp_stack_t *); extern void sctp_partial_delivery_event(sctp_t *); extern int sctp_process_cookie(sctp_t *, sctp_chunk_hdr_t *, mblk_t *, - sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *); + sctp_init_chunk_t **, sctp_hdr_t *, int *, in6_addr_t *, + ip_recv_attr_t *); extern void 
sctp_process_err(sctp_t *); extern void sctp_process_heartbeat(sctp_t *, sctp_chunk_hdr_t *); -extern void sctp_process_sendq(sctp_t *); extern void sctp_process_timer(sctp_t *); extern void sctp_redo_faddr_srcs(sctp_t *); @@ -1149,13 +1032,17 @@ extern sctp_faddr_t *sctp_rotate_faddr(sctp_t *, sctp_faddr_t *); extern boolean_t sctp_sack(sctp_t *, mblk_t *); extern int sctp_secure_restart_check(mblk_t *, sctp_chunk_hdr_t *, - uint32_t, int, sctp_stack_t *); + uint32_t, int, sctp_stack_t *, ip_recv_attr_t *); extern void sctp_send_abort(sctp_t *, uint32_t, uint16_t, char *, size_t, - mblk_t *, int, boolean_t); + mblk_t *, int, boolean_t, ip_recv_attr_t *); +extern void sctp_ootb_send_abort(uint32_t, uint16_t, char *, size_t, + const mblk_t *, int, boolean_t, ip_recv_attr_t *, + ip_stack_t *); extern void sctp_send_cookie_ack(sctp_t *); -extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *); +extern void sctp_send_cookie_echo(sctp_t *, sctp_chunk_hdr_t *, mblk_t *, + ip_recv_attr_t *); extern void sctp_send_initack(sctp_t *, sctp_hdr_t *, sctp_chunk_hdr_t *, - mblk_t *); + mblk_t *, ip_recv_attr_t *); extern void sctp_send_shutdown(sctp_t *, int); extern void sctp_send_heartbeat(sctp_t *, sctp_faddr_t *); extern void sctp_sendfail_event(sctp_t *, mblk_t *, int, boolean_t); @@ -1170,7 +1057,7 @@ extern int sctp_shutdown_received(sctp_t *, sctp_chunk_hdr_t *, boolean_t, boolean_t, sctp_faddr_t *); extern void sctp_shutdown_complete(sctp_t *); extern void sctp_set_if_mtu(sctp_t *); -extern void sctp_set_iplen(sctp_t *, mblk_t *); +extern void sctp_set_iplen(sctp_t *, mblk_t *, ip_xmit_attr_t *); extern void sctp_set_ulp_prop(sctp_t *); extern void sctp_ss_rexmit(sctp_t *); extern size_t sctp_supaddr_param_len(sctp_t *); @@ -1183,7 +1070,7 @@ extern void sctp_timer_free(mblk_t *); extern void sctp_timer_stop(mblk_t *); extern void sctp_unlink_faddr(sctp_t *, sctp_faddr_t *); -extern void sctp_update_ire(sctp_t *sctp); +extern void 
sctp_update_dce(sctp_t *sctp); extern in_port_t sctp_update_next_port(in_port_t, zone_t *zone, sctp_stack_t *); extern void sctp_update_rtt(sctp_t *, sctp_faddr_t *, clock_t); extern void sctp_user_abort(sctp_t *, mblk_t *); @@ -1209,17 +1096,6 @@ extern void (*cl_sctp_assoc_change)(sa_family_t, uchar_t *, size_t, uint_t, extern void (*cl_sctp_check_addrs)(sa_family_t, in_port_t, uchar_t **, size_t, uint_t *, boolean_t); -/* Send a mp to IP. */ -#define IP_PUT(mp, conn, isv4) \ -{ \ - sctp_stack_t *sctps = conn->conn_netstack->netstack_sctp; \ - \ - if ((isv4)) \ - ip_output((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT); \ - else \ - ip_output_v6((conn), (mp), WR(sctps->sctps_g_q), IP_WPUT);\ -} - #define RUN_SCTP(sctp) \ { \ mutex_enter(&(sctp)->sctp_lock); \ diff --git a/usr/src/uts/common/inet/sctp/sctp_init.c b/usr/src/uts/common/inet/sctp/sctp_init.c index 5547609c98..ff34147a65 100644 --- a/usr/src/uts/common/inet/sctp/sctp_init.c +++ b/usr/src/uts/common/inet/sctp/sctp_init.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/ddi.h> @@ -45,32 +43,6 @@ #include "sctp_impl.h" #include "sctp_addr.h" -/* - * This will compute the checksum over the SCTP packet, so this - * function should only be called after the whole packet has been - * built. - * - * rptr should point to the IP / SCTP composite header. - * len should be the length of the entire packet, including the IP - * header. 
- */ -void -sctp_add_hdr(sctp_t *sctp, uchar_t *rptr, size_t len) -{ - ipha_t *iphdr; - short iplen; - - ASSERT(len >= sctp->sctp_hdr_len); - - /* Copy the common header from the template */ - bcopy(sctp->sctp_iphc, rptr, sctp->sctp_hdr_len); - - /* Set the total length in the IP hdr */ - iplen = (short)len; - iphdr = (ipha_t *)rptr; - U16_TO_ABE16(iplen, &iphdr->ipha_length); -} - /*ARGSUSED*/ size_t sctp_supaddr_param_len(sctp_t *sctp) @@ -83,17 +55,18 @@ sctp_supaddr_param(sctp_t *sctp, uchar_t *p) { sctp_parm_hdr_t *sph; uint16_t *addrtype; + conn_t *connp = sctp->sctp_connp; sph = (sctp_parm_hdr_t *)p; sph->sph_type = htons(PARM_SUPP_ADDRS); addrtype = (uint16_t *)(sph + 1); - switch (sctp->sctp_ipversion) { - case IPV4_VERSION: + switch (connp->conn_family) { + case AF_INET: *addrtype++ = htons(PARM_ADDR4); *addrtype = 0; sph->sph_len = htons(sizeof (*sph) + sizeof (*addrtype)); break; - case IPV6_VERSION: + case AF_INET6: *addrtype++ = htons(PARM_ADDR6); if (!sctp->sctp_connp->conn_ipv6_v6only) { *addrtype = htons(PARM_ADDR4); @@ -167,7 +140,7 @@ sctp_adaptation_code_param(sctp_t *sctp, uchar_t *p) } mblk_t * -sctp_init_mp(sctp_t *sctp) +sctp_init_mp(sctp_t *sctp, sctp_faddr_t *fp) { mblk_t *mp; uchar_t *p; @@ -176,12 +149,12 @@ sctp_init_mp(sctp_t *sctp) sctp_chunk_hdr_t *chp; uint16_t schlen; int supp_af; - sctp_stack_t *sctps = sctp->sctp_sctps; + sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { supp_af = PARM_SUPP_V4; } else { - /* Assume here that a v6 endpoint supports v4 address. 
*/ if (sctp->sctp_connp->conn_ipv6_v6only) supp_af = PARM_SUPP_V6; else @@ -203,11 +176,17 @@ sctp_init_mp(sctp_t *sctp) sctp->sctp_sctph->sh_verf = 0; sctp->sctp_sctph6->sh_verf = 0; - mp = sctp_make_mp(sctp, NULL, initlen); + mp = sctp_make_mp(sctp, fp, initlen); if (mp == NULL) { SCTP_KSTAT(sctps, sctp_send_init_failed); return (NULL); } + /* sctp_make_mp could have discovered we have no usable sources */ + if (sctp->sctp_nsaddrs == 0) { + freemsg(mp); + SCTP_KSTAT(sctps, sctp_send_init_failed); + return (NULL); + } /* Lay in a new INIT chunk, starting with the chunk header */ chp = (sctp_chunk_hdr_t *)mp->b_wptr; @@ -242,7 +221,7 @@ sctp_init_mp(sctp_t *sctp) BUMP_LOCAL(sctp->sctp_obchunks); - sctp_set_iplen(sctp, mp); + sctp_set_iplen(sctp, mp, fp->ixa); return (mp); } diff --git a/usr/src/uts/common/inet/sctp/sctp_input.c b/usr/src/uts/common/inet/sctp/sctp_input.c index e18bfeacdd..e4a5ef5c5b 100644 --- a/usr/src/uts/common/inet/sctp/sctp_input.c +++ b/usr/src/uts/common/inet/sctp/sctp_input.c @@ -42,6 +42,7 @@ #include <inet/common.h> #include <inet/ip.h> +#include <inet/ip_if.h> #include <inet/ip6.h> #include <inet/mib2.h> #include <inet/ipclassifier.h> @@ -318,7 +319,7 @@ sctp_next_chunk(sctp_chunk_hdr_t *ch, ssize_t *remaining) */ static int sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, - sctp_faddr_t *fp, ip6_pkt_t *ipp) + sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira) { struct T_unitdata_ind *tudi; int optlen; @@ -329,57 +330,61 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, struct sockaddr_in6 sin_buf[1]; struct sockaddr_in6 *sin6; struct sockaddr_in *sin4; - uint_t addflag = 0; + crb_t addflag; /* Which pieces to add */ + conn_t *connp = sctp->sctp_connp; sin4 = NULL; sin6 = NULL; optlen = hdrlen = 0; + addflag.crb_all = 0; /* Figure out address size */ - if (sctp->sctp_ipversion == IPV4_VERSION) { + if (connp->conn_family == AF_INET) { sin4 = (struct sockaddr_in *)sin_buf; 
sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr); hdrlen = sizeof (*tudi) + sizeof (*sin4); } else { sin6 = sin_buf; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; hdrlen = sizeof (*tudi) + sizeof (*sin6); } - /* If app asked to receive send / recv info */ - if (sctp->sctp_recvsndrcvinfo) { + if (sctp->sctp_recvsndrcvinfo) optlen += sizeof (*cmsg) + sizeof (struct sctp_sndrcvinfo); - if (hdrlen == 0) - hdrlen = sizeof (struct T_optdata_ind); - } - if (sctp->sctp_ipv6_recvancillary == 0) + if (connp->conn_recv_ancillary.crb_all == 0) goto noancillary; - if ((ipp->ipp_fields & IPPF_IFINDEX) && - ipp->ipp_ifindex != sctp->sctp_recvifindex && - (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVPKTINFO)) { + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != sctp->sctp_recvifindex) { optlen += sizeof (*cmsg) + sizeof (struct in6_pktinfo); if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVPKTINFO; + addflag.crb_ip_recvpktinfo = 1; } /* If app asked for hoplimit and it has changed ... */ - if ((ipp->ipp_fields & IPPF_HOPLIMIT) && - ipp->ipp_hoplimit != sctp->sctp_recvhops && - (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPLIMIT)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != sctp->sctp_recvhops) { optlen += sizeof (*cmsg) + sizeof (uint_t); if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVHOPLIMIT; + addflag.crb_ipv6_recvhoplimit = 1; + } + /* If app asked for tclass and it has changed ... 
*/ + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != sctp->sctp_recvtclass) { + optlen += sizeof (struct T_opthdr) + sizeof (uint_t); + if (hdrlen == 0) + hdrlen = sizeof (struct T_unitdata_ind); + addflag.crb_ipv6_recvtclass = 1; } /* If app asked for hopbyhop headers and it has changed ... */ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVHOPOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && ip_cmpbuf(sctp->sctp_hopopts, sctp->sctp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { @@ -387,7 +392,7 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, sctp->sctp_v6label_len; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVHOPOPTS; + addflag.crb_ipv6_recvhopopts = 1; if (!ip_allocbuf((void **)&sctp->sctp_hopopts, &sctp->sctp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), @@ -395,45 +400,44 @@ sctp_input_add_ancillary(sctp_t *sctp, mblk_t **mp, sctp_data_hdr_t *dcp, return (-1); } /* If app asked for dst headers before routing headers ... 
*/ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTDSTOPTS) && - ip_cmpbuf(sctp->sctp_rtdstopts, sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { - optlen += sizeof (*cmsg) + ipp->ipp_rtdstoptslen; + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(sctp->sctp_rthdrdstopts, sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { + optlen += sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVRTDSTOPTS; - if (!ip_allocbuf((void **)&sctp->sctp_rtdstopts, - &sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&sctp->sctp_rthdrdstopts, + &sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) return (-1); } /* If app asked for routing headers and it has changed ... 
*/ - if (sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVRTHDR) { - if (ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen, + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && + ip_cmpbuf(sctp->sctp_rthdr, sctp->sctp_rthdrlen, + (ipp->ipp_fields & IPPF_RTHDR), + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { + optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen; + if (hdrlen == 0) + hdrlen = sizeof (struct T_unitdata_ind); + addflag.crb_ipv6_recvrthdr = 1; + if (!ip_allocbuf((void **)&sctp->sctp_rthdr, + &sctp->sctp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { - optlen += sizeof (*cmsg) + ipp->ipp_rthdrlen; - if (hdrlen == 0) - hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVRTHDR; - if (!ip_allocbuf((void **)&sctp->sctp_rthdr, - &sctp->sctp_rthdrlen, - (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) - return (-1); - } + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) + return (-1); } /* If app asked for dest headers and it has changed ... */ - if ((sctp->sctp_ipv6_recvancillary & SCTP_IPV6_RECVDSTOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvdstopts && ip_cmpbuf(sctp->sctp_dstopts, sctp->sctp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { optlen += sizeof (*cmsg) + ipp->ipp_dstoptslen; if (hdrlen == 0) hdrlen = sizeof (struct T_unitdata_ind); - addflag |= SCTP_IPV6_RECVDSTOPTS; + addflag.crb_ipv6_recvdstopts = 1; if (!ip_allocbuf((void **)&sctp->sctp_dstopts, &sctp->sctp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), @@ -499,9 +503,11 @@ noancillary: * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. 
*/ - if (addflag & SCTP_IPV6_RECVPKTINFO) { + if (addflag.crb_ip_recvpktinfo) { struct in6_pktinfo *pkti; + uint_t ifindex; + ifindex = ira->ira_ruifindex; cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_PKTINFO; @@ -509,19 +515,20 @@ noancillary: optptr += sizeof (*cmsg); pkti = (struct in6_pktinfo *)optptr; - if (sctp->sctp_ipversion == IPV6_VERSION) + if (connp->conn_family == AF_INET6) pkti->ipi6_addr = sctp->sctp_ip6h->ip6_src; else IN6_IPADDR_TO_V4MAPPED(sctp->sctp_ipha->ipha_src, &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp->ipp_ifindex; + + pkti->ipi6_ifindex = ifindex; optptr += sizeof (*pkti); ASSERT(OK_32PTR(optptr)); /* Save as "last" value */ - sctp->sctp_recvifindex = ipp->ipp_ifindex; + sctp->sctp_recvifindex = ifindex; } /* If app asked for hoplimit and it has changed ... */ - if (addflag & SCTP_IPV6_RECVHOPLIMIT) { + if (addflag.crb_ipv6_recvhoplimit) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_HOPLIMIT; @@ -534,7 +541,21 @@ noancillary: /* Save as "last" value */ sctp->sctp_recvhops = ipp->ipp_hoplimit; } - if (addflag & SCTP_IPV6_RECVHOPOPTS) { + /* If app asked for tclass and it has changed ... 
*/ + if (addflag.crb_ipv6_recvtclass) { + cmsg = (struct cmsghdr *)optptr; + cmsg->cmsg_level = IPPROTO_IPV6; + cmsg->cmsg_type = IPV6_TCLASS; + cmsg->cmsg_len = sizeof (*cmsg) + sizeof (uint_t); + optptr += sizeof (*cmsg); + + *(uint_t *)optptr = ipp->ipp_tclass; + optptr += sizeof (uint_t); + ASSERT(OK_32PTR(optptr)); + /* Save as "last" value */ + sctp->sctp_recvtclass = ipp->ipp_tclass; + } + if (addflag.crb_ipv6_recvhopopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_HOPOPTS; @@ -550,23 +571,23 @@ noancillary: (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen); } - if (addflag & SCTP_IPV6_RECVRTDSTOPTS) { + if (addflag.crb_ipv6_recvrthdrdstopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_RTHDRDSTOPTS; - cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rtdstoptslen; + cmsg->cmsg_len = sizeof (*cmsg) + ipp->ipp_rthdrdstoptslen; optptr += sizeof (*cmsg); - bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); - optptr += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; ASSERT(OK_32PTR(optptr)); /* Save as last value */ - ip_savebuf((void **)&sctp->sctp_rtdstopts, - &sctp->sctp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); + ip_savebuf((void **)&sctp->sctp_rthdrdstopts, + &sctp->sctp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); } - if (addflag & SCTP_IPV6_RECVRTHDR) { + if (addflag.crb_ipv6_recvrthdr) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = IPV6_RTHDR; @@ -582,7 +603,7 @@ noancillary: (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen); } - if (addflag & SCTP_IPV6_RECVDSTOPTS) { + if (addflag.crb_ipv6_recvdstopts) { cmsg = (struct cmsghdr *)optptr; cmsg->cmsg_level = IPPROTO_IPV6; cmsg->cmsg_type = 
IPV6_DSTOPTS; @@ -778,7 +799,6 @@ static mblk_t * sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp, sctp_data_hdr_t **dc) { - mblk_t *first_mp; mblk_t *mp; mblk_t *dmp; mblk_t *qmp; @@ -791,8 +811,7 @@ sctp_try_partial_delivery(sctp_t *sctp, mblk_t *hmp, sctp_reass_t *srp, dprint(4, ("trypartial: got=%d, needed=%d\n", (int)(srp->got), (int)(srp->needed))); - first_mp = hmp->b_cont; - mp = first_mp; + mp = hmp->b_cont; qdc = (sctp_data_hdr_t *)mp->b_rptr; ASSERT(SCTP_DATA_GET_BBIT(qdc) && srp->hasBchunk); @@ -1175,7 +1194,7 @@ sctp_add_dup(uint32_t tsn, mblk_t **dups) static void sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, - sctp_faddr_t *fp, ip6_pkt_t *ipp) + sctp_faddr_t *fp, ip_pkt_t *ipp, ip_recv_attr_t *ira) { sctp_data_hdr_t *dc; mblk_t *dmp, *pmp; @@ -1419,7 +1438,8 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, if (can_deliver) { dmp->b_rptr = (uchar_t *)(dc + 1); - if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, ipp) == 0) { + if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, + ipp, ira) == 0) { dprint(1, ("sctp_data_chunk: delivering %lu bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; @@ -1507,7 +1527,7 @@ sctp_data_chunk(sctp_t *sctp, sctp_chunk_hdr_t *ch, mblk_t *mp, mblk_t **dups, if (can_deliver) { dmp->b_rptr = (uchar_t *)(dc + 1); if (sctp_input_add_ancillary(sctp, &dmp, dc, fp, - ipp) == 0) { + ipp, ira) == 0) { dprint(1, ("sctp_data_chunk: delivering %lu " "bytes\n", msgdsize(dmp))); sctp->sctp_rwnd -= dlen; @@ -1646,6 +1666,8 @@ sctp_make_sack(sctp_t *sctp, sctp_faddr_t *sendto, mblk_t *dups) uint32_t dups_len; sctp_faddr_t *fp; + ASSERT(sendto != NULL); + if (sctp->sctp_force_sack) { sctp->sctp_force_sack = 0; goto checks_done; @@ -1696,8 +1718,9 @@ checks_done: return (NULL); } smp->b_cont = sctp->sctp_err_chunks; - sctp_set_iplen(sctp, smp); - sctp_add_sendq(sctp, smp); + sctp_set_iplen(sctp, smp, fp->ixa); + (void) conn_ip_output(smp, fp->ixa); + 
BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_err_chunks = NULL; sctp->sctp_err_len = 0; } @@ -1749,8 +1772,6 @@ sctp_sack(sctp_t *sctp, mblk_t *dups) freeb(dups); return (B_FALSE); } - sctp_set_iplen(sctp, smp); - dprint(2, ("sctp_sack: sending to %p %x:%x:%x:%x\n", (void *)sctp->sctp_lastdata, SCTP_PRINTADDR(sctp->sctp_lastdata->faddr))); @@ -1758,7 +1779,10 @@ sctp_sack(sctp_t *sctp, mblk_t *dups) sctp->sctp_active = lbolt64; BUMP_MIB(&sctps->sctps_mib, sctpOutAck); - sctp_add_sendq(sctp, smp); + + sctp_set_iplen(sctp, smp, sctp->sctp_lastdata->ixa); + (void) conn_ip_output(smp, sctp->sctp_lastdata->ixa); + BUMP_LOCAL(sctp->sctp_opkts); return (B_TRUE); } @@ -1813,8 +1837,9 @@ sctp_check_abandoned_msg(sctp_t *sctp, mblk_t *meta) return (ENOMEM); } SCTP_MSG_SET_ABANDONED(meta); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); mp1 = mp1->b_next; @@ -2080,13 +2105,13 @@ sctp_ftsn_check_frag(sctp_t *sctp, uint16_t ssn, sctp_instr_t *sip) * messages, if any, from the instream queue (that were waiting for this * sid-ssn message to show up). Once we are done try to update the SACK * info. We could get a duplicate Forward TSN, in which case just send - * a SACK. If any of the sid values in the the Forward TSN is invalid, + * a SACK. If any of the sid values in the Forward TSN is invalid, * send back an "Invalid Stream Identifier" error and continue processing * the rest. 
*/ static void sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, - ip6_pkt_t *ipp) + ip_pkt_t *ipp, ip_recv_attr_t *ira) { uint32_t *ftsn = (uint32_t *)(ch + 1); ftsn_entry_t *ftsn_entry; @@ -2171,7 +2196,7 @@ sctp_process_forward_tsn(sctp_t *sctp, sctp_chunk_hdr_t *ch, sctp_faddr_t *fp, dmp->b_next = NULL; ASSERT(dmp->b_prev == NULL); if (sctp_input_add_ancillary(sctp, - &dmp, dc, fp, ipp) == 0) { + &dmp, dc, fp, ipp, ira) == 0) { sctp->sctp_rxqueued -= dlen; sctp->sctp_rwnd -= dlen; /* @@ -2280,8 +2305,9 @@ sctp_check_abandoned_data(sctp_t *sctp, sctp_faddr_t *fp) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); return; } - sctp_set_iplen(sctp, nmp); - sctp_add_sendq(sctp, nmp); + sctp_set_iplen(sctp, nmp, fp->ixa); + (void) conn_ip_output(nmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); } @@ -2604,8 +2630,9 @@ sctp_got_sack(sctp_t *sctp, sctp_chunk_hdr_t *sch) sctp->sctp_zero_win_probe = B_FALSE; sctp->sctp_rxt_nxttsn = sctp->sctp_ltsn; sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn; - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } } else { if (sctp->sctp_zero_win_probe) { @@ -3160,97 +3187,15 @@ sctp_check_input(sctp_t *sctp, sctp_chunk_hdr_t *ch, ssize_t len, int first) return (1); } -/* ARGSUSED */ -static sctp_hdr_t * -find_sctp_hdrs(mblk_t *mp, in6_addr_t *src, in6_addr_t *dst, - uint_t *ifindex, uint_t *ip_hdr_len, ip6_pkt_t *ipp, ip_pktinfo_t *pinfo) -{ - uchar_t *rptr; - ipha_t *ip4h; - ip6_t *ip6h; - mblk_t *mp1; - - rptr = mp->b_rptr; - if (IPH_HDR_VERSION(rptr) == IPV4_VERSION) { - *ip_hdr_len = IPH_HDR_LENGTH(rptr); - ip4h = (ipha_t *)rptr; - IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_src, src); - IN6_IPADDR_TO_V4MAPPED(ip4h->ipha_dst, dst); - - ipp->ipp_fields |= IPPF_HOPLIMIT; - ipp->ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; - if (pinfo != NULL && 
(pinfo->ip_pkt_flags & IPF_RECVIF)) { - ipp->ipp_fields |= IPPF_IFINDEX; - ipp->ipp_ifindex = pinfo->ip_pkt_ifindex; - } - } else { - ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); - ip6h = (ip6_t *)rptr; - ipp->ipp_fields = IPPF_HOPLIMIT; - ipp->ipp_hoplimit = ip6h->ip6_hops; - - if (ip6h->ip6_nxt != IPPROTO_SCTP) { - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i = (ip6i_t *)ip6h; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp->ipp_fields |= IPPF_IFINDEX; - ipp->ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - ASSERT(mp->b_wptr - rptr >= - IPV6_HDR_LEN + sizeof (sctp_hdr_t)); - ip6h = (ip6_t *)rptr; - } - /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. - */ - *ip_hdr_len = ip_find_hdr_v6(mp, ip6h, ipp, NULL); - } else { - *ip_hdr_len = IPV6_HDR_LEN; - } - *src = ip6h->ip6_src; - *dst = ip6h->ip6_dst; - } - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - return ((sctp_hdr_t *)&rptr[*ip_hdr_len]); -#undef IPVER -} - static mblk_t * -sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp) +sctp_check_in_policy(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - ipsec_in_t *ii; - boolean_t check = B_TRUE; boolean_t policy_present; ipha_t *ipha; ip6_t *ip6h; - netstack_t *ns; - ipsec_stack_t *ipss; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - ns = ii->ipsec_in_ns; - ipss = ns->netstack_ipsec; - - if (ii->ipsec_in_dont_check) { - check = B_FALSE; - if (!ii->ipsec_in_secure) { - freeb(ipsec_mp); - ipsec_mp = NULL; - } - } + netstack_t *ns = ipst->ips_netstack; + ipsec_stack_t *ipss = ns->netstack_ipsec; + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t 
*)mp->b_rptr; @@ -3261,109 +3206,88 @@ sctp_check_in_policy(mblk_t *mp, mblk_t *ipsec_mp) ip6h = (ip6_t *)mp->b_rptr; } - if (check && policy_present) { + if (policy_present) { /* * The conn_t parameter is NULL because we already know * nobody's home. */ - ipsec_mp = ipsec_check_global_policy(ipsec_mp, (conn_t *)NULL, - ipha, ip6h, B_TRUE, ns); - if (ipsec_mp == NULL) + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, + ipha, ip6h, ira, ns); + if (mp == NULL) return (NULL); } - if (ipsec_mp != NULL) - freeb(ipsec_mp); return (mp); } /* Handle out-of-the-blue packets */ void -sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, - boolean_t mctl_present) +sctp_ootb_input(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { sctp_t *sctp; sctp_chunk_hdr_t *ch; sctp_hdr_t *sctph; in6_addr_t src, dst; - uint_t ip_hdr_len; - uint_t ifindex; - ip6_pkt_t ipp; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; ssize_t mlen; - ip_pktinfo_t *pinfo = NULL; - mblk_t *first_mp; sctp_stack_t *sctps; - ip_stack_t *ipst; + boolean_t secure; + zoneid_t zoneid = ira->ira_zoneid; + uchar_t *rptr; + + ASSERT(ira->ira_ill == NULL); + + secure = ira->ira_flags & IRAF_IPSEC_SECURE; - ASSERT(recv_ill != NULL); - ipst = recv_ill->ill_ipst; sctps = ipst->ips_netstack->netstack_sctp; BUMP_MIB(&sctps->sctps_mib, sctpOutOfBlue); BUMP_MIB(&sctps->sctps_mib, sctpInSCTPPkts); - if (sctps->sctps_gsctp == NULL) { - /* - * For non-zero stackids the default queue isn't created - * until the first open, thus there can be a need to send - * an error before then. But we can't do that, hence we just - * drop the packet. Later during boot, when the default queue - * has been setup, a retransmitted packet from the peer - * will result in a error. - */ - ASSERT(sctps->sctps_netstack->netstack_stackid != - GLOBAL_NETSTACKID); - freemsg(mp); - return; - } - - first_mp = mp; - if (mctl_present) - mp = mp->b_cont; - - /* Initiate IPPf processing, if needed. 
*/ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); - if (mp == NULL) { - if (mctl_present) - freeb(first_mp); - return; - } - } - if (mp->b_cont != NULL) { /* * All subsequent code is vastly simplified if it can * assume a single contiguous chunk of data. */ if (pullupmsg(mp, -1) == 0) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); return; } } - /* - * We don't really need to call this function... Need to - * optimize later. - */ - sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len, - &ipp, pinfo); + rptr = mp->b_rptr; + sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]); + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)rptr; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst); + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)rptr; + src = ip6h->ip6_src; + dst = ip6h->ip6_dst; + } + mlen = mp->b_wptr - (uchar_t *)(sctph + 1); if ((ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen)) == NULL) { dprint(3, ("sctp_ootb_input: invalid packet\n")); - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); return; } switch (ch->sch_id) { case CHUNK_INIT: /* no listener; send abort */ - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_send_abort(sctps->sctps_gsctp, sctp_init2vtag(ch), 0, - NULL, 0, mp, 0, B_TRUE); + sctp_ootb_send_abort(sctp_init2vtag(ch), 0, + NULL, 0, mp, 0, B_TRUE, ira, ipst); break; case CHUNK_INIT_ACK: /* check for changed src addr */ @@ -3372,11 +3296,7 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, /* success; proceed to normal path 
*/ mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); } else { /* @@ -3387,152 +3307,101 @@ sctp_ootb_input(mblk_t *mp, ill_t *recv_ill, zoneid_t zoneid, */ sctp->sctp_running = B_TRUE; mutex_exit(&sctp->sctp_lock); - sctp_input_data(sctp, mp, NULL); + sctp_input_data(sctp, mp, ira); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } SCTP_REFRELE(sctp); return; } - if (mctl_present) - freeb(first_mp); /* else bogus init ack; drop it */ break; case CHUNK_SHUTDOWN_ACK: - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_ootb_shutdown_ack(sctps->sctps_gsctp, mp, ip_hdr_len); - sctp_process_sendq(sctps->sctps_gsctp); + sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst); return; case CHUNK_ERROR: case CHUNK_ABORT: case CHUNK_COOKIE_ACK: case CHUNK_SHUTDOWN_COMPLETE: - if (mctl_present) - freeb(first_mp); break; default: - if (mctl_present && sctp_check_in_policy(mp, first_mp) == NULL) + if (secure && sctp_check_in_policy(mp, ira, ipst) == NULL) return; - sctp_send_abort(sctps->sctps_gsctp, sctph->sh_verf, 0, - NULL, 0, mp, 0, B_TRUE); + sctp_ootb_send_abort(sctph->sh_verf, 0, + NULL, 0, mp, 0, B_TRUE, ira, ipst); break; } - sctp_process_sendq(sctps->sctps_gsctp); freemsg(mp); } +/* + * Handle sctp packets. + * Note that we rele the sctp_t (the caller got a reference on it). 
+ */ void -sctp_input(conn_t *connp, ipha_t *ipha, mblk_t *mp, mblk_t *first_mp, - ill_t *recv_ill, boolean_t isv4, boolean_t mctl_present) +sctp_input(conn_t *connp, ipha_t *ipha, ip6_t *ip6h, mblk_t *mp, + ip_recv_attr_t *ira) { - sctp_t *sctp = CONN2SCTP(connp); - ip_stack_t *ipst = recv_ill->ill_ipst; + sctp_t *sctp = CONN2SCTP(connp); + boolean_t secure; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + iaflags_t iraflags = ira->ira_flags; + ill_t *rill = ira->ira_rill; + + secure = iraflags & IRAF_IPSEC_SECURE; /* * We check some fields in conn_t without holding a lock. * This should be fine. */ - if (CONN_INBOUND_POLICY_PRESENT(connp, ipss) || mctl_present) { - first_mp = ipsec_check_inbound_policy(first_mp, connp, - ipha, NULL, mctl_present); - if (first_mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - SCTP_REFRELE(sctp); - return; - } - } - - /* Initiate IPPF processing for fastpath */ - if (IPP_ENABLED(IPP_LOCAL_IN, ipst)) { - ip_process(IPP_LOCAL_IN, &mp, - recv_ill->ill_phyint->phyint_ifindex); + if (((iraflags & IRAF_IS_IPV4) ? + CONN_INBOUND_POLICY_PRESENT(connp, ipss) : + CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss)) || + secure) { + mp = ipsec_check_inbound_policy(mp, connp, ipha, + ip6h, ira); if (mp == NULL) { + BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, ill); SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); return; - } else if (mctl_present) { - /* - * ip_process might return a new mp. 
- */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; } } - if (connp->conn_recvif || connp->conn_recvslla || - connp->conn_ip_recvpktinfo) { - int in_flags = 0; - - if (connp->conn_recvif || connp->conn_ip_recvpktinfo) { - in_flags = IPF_RECVIF; - } - if (connp->conn_recvslla) { - in_flags |= IPF_RECVSLLA; - } - if (isv4) { - mp = ip_add_info(mp, recv_ill, in_flags, - IPCL_ZONEID(connp), ipst); - } else { - mp = ip_add_info_v6(mp, recv_ill, - &(((ip6_t *)ipha)->ip6_dst)); - } - if (mp == NULL) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - SCTP_REFRELE(sctp); - if (mctl_present) - freeb(first_mp); - return; - } else if (mctl_present) { - /* - * ip_add_info might return a new mp. - */ - ASSERT(first_mp != mp); - first_mp->b_cont = mp; - } else { - first_mp = mp; - } - } + ira->ira_ill = ira->ira_rill = NULL; mutex_enter(&sctp->sctp_lock); if (sctp->sctp_running) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_FALSE)) { - BUMP_MIB(recv_ill->ill_ip_mib, ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_FALSE, ira); mutex_exit(&sctp->sctp_lock); - SCTP_REFRELE(sctp); - return; + goto done; } else { sctp->sctp_running = B_TRUE; mutex_exit(&sctp->sctp_lock); mutex_enter(&sctp->sctp_recvq_lock); if (sctp->sctp_recvq != NULL) { - if (mctl_present) - mp->b_prev = first_mp; - if (!sctp_add_recvq(sctp, mp, B_TRUE)) { - BUMP_MIB(recv_ill->ill_ip_mib, - ipIfStatsInDiscards); - freemsg(first_mp); - } + sctp_add_recvq(sctp, mp, B_TRUE, ira); mutex_exit(&sctp->sctp_recvq_lock); WAKE_SCTP(sctp); - SCTP_REFRELE(sctp); - return; + goto done; } } mutex_exit(&sctp->sctp_recvq_lock); - sctp_input_data(sctp, mp, (mctl_present ? 
first_mp : NULL)); + if (ira->ira_flags & IRAF_ICMP_ERROR) + sctp_icmp_error(sctp, mp); + else + sctp_input_data(sctp, mp, ira); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); + +done: SCTP_REFRELE(sctp); + ira->ira_ill = ill; + ira->ira_rill = rill; } static void @@ -3549,7 +3418,7 @@ sctp_process_abort(sctp_t *sctp, sctp_chunk_hdr_t *ch, int err) } void -sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) +sctp_input_data(sctp_t *sctp, mblk_t *mp, ip_recv_attr_t *ira) { sctp_chunk_hdr_t *ch; ssize_t mlen; @@ -3559,17 +3428,15 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctp_init_chunk_t *iack; uint32_t tsn; sctp_data_hdr_t *sdc; - ip6_pkt_t ipp; + ip_pkt_t ipp; in6_addr_t src; in6_addr_t dst; uint_t ifindex; sctp_hdr_t *sctph; - uint_t ip_hdr_len; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; mblk_t *dups = NULL; int recv_adaptation; boolean_t wake_eager = B_FALSE; - mblk_t *pinfo_mp; - ip_pktinfo_t *pinfo = NULL; in6_addr_t peer_src; int64_t now; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -3577,23 +3444,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) boolean_t hb_already = B_FALSE; cred_t *cr; pid_t cpid; + uchar_t *rptr; + conn_t *connp = sctp->sctp_connp; - if (DB_TYPE(mp) != M_DATA) { - ASSERT(DB_TYPE(mp) == M_CTL); - if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - pinfo = (ip_pktinfo_t *)mp->b_rptr; - pinfo_mp = mp; - mp = mp->b_cont; - } else { - if (ipsec_mp != NULL) - freeb(ipsec_mp); - sctp_icmp_error(sctp, mp); - return; - } - } ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(ira->ira_ill == NULL); if (mp->b_cont != NULL) { /* @@ -3602,32 +3457,72 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) */ if (pullupmsg(mp, -1) == 0) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - if (ipsec_mp != NULL) - freeb(ipsec_mp); - if (pinfo != NULL) - freeb(pinfo_mp); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); freemsg(mp); return; } } 
BUMP_LOCAL(sctp->sctp_ipkts); - sctph = find_sctp_hdrs(mp, &src, &dst, &ifindex, &ip_hdr_len, - &ipp, pinfo); - if (pinfo != NULL) - freeb(pinfo_mp); + ifindex = ira->ira_ruifindex; + + rptr = mp->b_rptr; + + ipp.ipp_fields = 0; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_SCTP); + + /* Could have caused a pullup? */ + rptr = mp->b_rptr; + } + } + + sctph = ((sctp_hdr_t *)&rptr[ip_hdr_len]); + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha; + + ipha = (ipha_t *)rptr; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst); + } else { + ip6_t *ip6h; + + ip6h = (ip6_t *)rptr; + src = ip6h->ip6_src; + dst = ip6h->ip6_dst; + } + mlen = mp->b_wptr - (uchar_t *)(sctph + 1); ch = sctp_first_chunk((uchar_t *)(sctph + 1), mlen); if (ch == NULL) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - if (ipsec_mp != NULL) - freeb(ipsec_mp); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); freemsg(mp); return; } if (!sctp_check_input(sctp, ch, mlen, 1)) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); goto done; } /* @@ -3661,9 +3556,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) if (sctp->sctp_state > SCTPS_BOUND && sctp->sctp_state < SCTPS_ESTABLISHED) { /* treat as OOTB */ - sctp_ootb_shutdown_ack(sctp, mp, ip_hdr_len); - if (ipsec_mp != NULL) - freeb(ipsec_mp); + sctp_ootb_shutdown_ack(mp, ip_hdr_len, ira, ipst); return; } /* else fallthru */ @@ -3717,7 +3610,7 @@ 
sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) tsn = sdc->sdh_tsn; sctp_send_abort(sctp, sctp->sctp_fvtag, SCTP_ERR_NO_USR_DATA, (char *)&tsn, - sizeof (tsn), mp, 0, B_FALSE); + sizeof (tsn), mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, ECONNABORTED); @@ -3726,7 +3619,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) ASSERT(fp != NULL); sctp->sctp_lastdata = fp; - sctp_data_chunk(sctp, ch, mp, &dups, fp, &ipp); + sctp_data_chunk(sctp, ch, mp, &dups, fp, + &ipp, ira); gotdata = 1; /* Restart shutdown timer if shutting down */ if (sctp->sctp_state == SCTPS_SHUTDOWN_SENT) { @@ -3743,7 +3637,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctps->sctps_shutack_wait_bound) { sctp_send_abort(sctp, sctp->sctp_fvtag, 0, NULL, - 0, mp, 0, B_FALSE); + 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -3764,7 +3658,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = sctp_got_sack(sctp, ch); if (trysend < 0) { sctp_send_abort(sctp, sctph->sh_verf, - 0, NULL, 0, mp, 0, B_FALSE); + 0, NULL, 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -3820,11 +3714,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) goto done; } case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) != -1) { + sctph, &recv_adaptation, NULL, ira) != -1) { sctp_send_cookie_ack(sctp); sctp_assoc_event(sctp, SCTP_RESTART, 0, NULL); @@ -3841,7 +3735,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) int error; BUMP_LOCAL(sctp->sctp_ibchunks); - error = sctp_handle_error(sctp, sctph, ch, mp); + error = sctp_handle_error(sctp, sctph, ch, mp, + ira); if (error != 0) { sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); @@ 
-3864,7 +3759,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case CHUNK_FORWARD_TSN: ASSERT(fp != NULL); sctp->sctp_lastdata = fp; - sctp_process_forward_tsn(sctp, ch, fp, &ipp); + sctp_process_forward_tsn(sctp, ch, fp, + &ipp, ira); gotdata = 1; BUMP_LOCAL(sctp->sctp_ibchunks); break; @@ -3879,13 +3775,14 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case SCTPS_LISTEN: switch (ch->sch_id) { case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: { sctp_t *eager; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, &peer_src) == -1) { + sctph, &recv_adaptation, &peer_src, + ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); goto done; @@ -3900,11 +3797,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) goto done; eager = sctp_conn_request(sctp, mp, ifindex, - ip_hdr_len, iack, ipsec_mp); + ip_hdr_len, iack, ira); if (eager == NULL) { sctp_send_abort(sctp, sctph->sh_verf, SCTP_ERR_NO_RESOURCES, NULL, 0, mp, - 0, B_FALSE); + 0, B_FALSE, ira); goto done; } @@ -3933,9 +3830,6 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) BUMP_MIB(&sctps->sctps_mib, sctpPassiveEstab); if (mlen > ntohs(ch->sch_len)) { eager->sctp_cookie_mp = dupb(mp); - mblk_setcred(eager->sctp_cookie_mp, - CONN_CRED(eager->sctp_connp), - eager->sctp_cpid); /* * If no mem, just let * the peer retransmit. 
@@ -3986,7 +3880,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) default: BUMP_LOCAL(sctp->sctp_ibchunks); sctp_send_abort(sctp, sctph->sh_verf, 0, NULL, - 0, mp, 0, B_TRUE); + 0, mp, 0, B_TRUE, ira); goto done; } break; @@ -3996,20 +3890,21 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case CHUNK_INIT_ACK: sctp_stop_faddr_timers(sctp); sctp_faddr_alive(sctp, sctp->sctp_current); - sctp_send_cookie_echo(sctp, ch, mp); + sctp_send_cookie_echo(sctp, ch, mp, ira); BUMP_LOCAL(sctp->sctp_ibchunks); break; case CHUNK_ABORT: sctp_process_abort(sctp, ch, ECONNREFUSED); goto done; case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_COOKIE: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) == -1) { + sctph, &recv_adaptation, NULL, ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); break; @@ -4053,7 +3948,8 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) case SCTPS_COOKIE_ECHOED: switch (ch->sch_id) { case CHUNK_COOKIE_ACK: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (!SCTP_IS_DETACHED(sctp)) { sctp->sctp_ulp_connected( @@ -4084,10 +3980,11 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) sctp_process_abort(sctp, ch, ECONNREFUSED); goto done; case CHUNK_COOKIE: - cr = msg_getcred(mp, &cpid); + cr = ira->ira_cred; + cpid = ira->ira_cpid; if (sctp_process_cookie(sctp, ch, mp, &iack, - sctph, &recv_adaptation, NULL) == -1) { + sctph, &recv_adaptation, NULL, ira) == -1) { BUMP_MIB(&sctps->sctps_mib, sctpInInvalidCookie); break; @@ -4122,7 +4019,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = 1; break; case CHUNK_INIT: - sctp_send_initack(sctp, sctph, ch, mp); + sctp_send_initack(sctp, sctph, ch, mp, ira); break; case CHUNK_ERROR: { sctp_parm_hdr_t *p; @@ -4165,7 +4062,7 
@@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) switch (ch->sch_id) { case CHUNK_ABORT: /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); sctp_process_abort(sctp, ch, 0); goto done; case CHUNK_SHUTDOWN_COMPLETE: @@ -4175,7 +4072,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) NULL); /* Pass gathered wisdom to IP for keeping */ - sctp_update_ire(sctp); + sctp_update_dce(sctp); sctp_clean_death(sctp, 0); goto done; case CHUNK_SHUTDOWN_ACK: @@ -4215,7 +4112,7 @@ sctp_input_data(sctp_t *sctp, mblk_t *mp, mblk_t *ipsec_mp) trysend = sctp_got_sack(sctp, ch); if (trysend < 0) { sctp_send_abort(sctp, sctph->sh_verf, - 0, NULL, 0, mp, 0, B_FALSE); + 0, NULL, 0, mp, 0, B_FALSE, ira); sctp_assoc_event(sctp, SCTP_COMM_LOST, 0, NULL); sctp_clean_death(sctp, @@ -4287,8 +4184,6 @@ nomorechunks: done: if (dups != NULL) freeb(dups); - if (ipsec_mp != NULL) - freeb(ipsec_mp); freemsg(mp); if (sctp->sctp_err_chunks != NULL) @@ -4297,15 +4192,9 @@ done: if (wake_eager) { /* * sctp points to newly created control block, need to - * release it before exiting. Before releasing it and - * processing the sendq, need to grab a hold on it. - * Otherwise, another thread can close it while processing - * the sendq. + * release it before exiting. */ - SCTP_REFHOLD(sctp); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); - SCTP_REFRELE(sctp); } } @@ -4340,12 +4229,6 @@ sctp_recvd(sctp_t *sctp, int len) sctp->sctp_force_sack = 1; BUMP_MIB(&sctps->sctps_mib, sctpOutWinUpdate); (void) sctp_sack(sctp, NULL); - old = 1; - } else { - old = 0; } WAKE_SCTP(sctp); - if (old > 0) { - sctp_process_sendq(sctp); - } } diff --git a/usr/src/uts/common/inet/sctp/sctp_ioc.c b/usr/src/uts/common/inet/sctp/sctp_ioc.c index 7150c48c4b..5f5c2ee629 100644 --- a/usr/src/uts/common/inet/sctp/sctp_ioc.c +++ b/usr/src/uts/common/inet/sctp/sctp_ioc.c @@ -49,69 +49,7 @@ #include "sctp_impl.h" /* - * We need a stream q for sending packets to IP. 
This q should - * be set in strplumb() time. Once it is set, it will never - * be removed. Since it is done in strplumb() time, there is - * no need to have a lock on the default q. - */ -static void -sctp_def_q_set(queue_t *q, mblk_t *mp) -{ - conn_t *connp = (conn_t *)q->q_ptr; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - mblk_t *mp1; - hrtime_t t; - sctp_stack_t *sctps = connp->conn_netstack-> - netstack_sctp; - - if ((mp1 = mp->b_cont) == NULL) { - iocp->ioc_error = EINVAL; - ip0dbg(("sctp_def_q_set: no file descriptor\n")); - goto done; - } - - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_g_q != NULL) { - mutex_exit(&sctps->sctps_g_q_lock); - ip0dbg(("sctp_def_q_set: already set\n")); - iocp->ioc_error = EALREADY; - goto done; - } - - sctps->sctps_g_q = q; - mutex_exit(&sctps->sctps_g_q_lock); - sctps->sctps_gsctp = (sctp_t *)sctp_create(NULL, NULL, AF_INET6, - SCTP_CAN_BLOCK, NULL, NULL, connp->conn_cred); - mutex_enter(&sctps->sctps_g_q_lock); - if (sctps->sctps_gsctp == NULL) { - sctps->sctps_g_q = NULL; - mutex_exit(&sctps->sctps_g_q_lock); - iocp->ioc_error = ENOMEM; - goto done; - } - mutex_exit(&sctps->sctps_g_q_lock); - ASSERT(sctps->sctps_g_q_ref >= 1); - ASSERT(list_head(&sctps->sctps_g_list) == sctps->sctps_gsctp); - - /* - * As a good citizen of using /dev/urandom, add some entropy - * to the random number pool. - */ - t = gethrtime(); - (void) random_add_entropy((uint8_t *)&t, sizeof (t), 0); -done: - if (mp1 != NULL) { - freemsg(mp1); - mp->b_cont = NULL; - } - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); -} - - -/* - * sctp_wput_ioctl is called by sctp_wput_slow to handle all + * sctp_wput_ioctl is called by sctp_wput to handle all * M_IOCTL messages. 
*/ void @@ -119,7 +57,6 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp) { conn_t *connp = (conn_t *)q->q_ptr; struct iocblk *iocp; - cred_t *cr; if (connp == NULL) { ip0dbg(("sctp_wput_ioctl: null conn\n")); @@ -127,24 +64,7 @@ sctp_wput_ioctl(queue_t *q, mblk_t *mp) } iocp = (struct iocblk *)mp->b_rptr; - /* - * prefer credential from mblk over ioctl; - * see ip_sioctl_copyin_setup - */ - cr = msg_getcred(mp, NULL); - if (cr == NULL) - cr = iocp->ioc_cr; - switch (iocp->ioc_cmd) { - case SCTP_IOC_DEFAULT_Q: - /* Wants to be the default wq. */ - if (cr != NULL && secpolicy_ip_config(cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - goto err_ret; - } - sctp_def_q_set(q, mp); - return; - case ND_SET: /* sctp_nd_getset() -> nd_getset() does the checking. */ case ND_GET: @@ -244,6 +164,9 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) netstack_rele(ns); connp->conn_zoneid = zoneid; + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_rq = q; connp->conn_wq = WR(q); @@ -276,6 +199,12 @@ sctp_str_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ASSERT(connp->conn_cred == NULL); connp->conn_cred = credp; crhold(connp->conn_cred); + connp->conn_cpid = curproc->p_pid; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); /* * Make the conn globally visible to walkers diff --git a/usr/src/uts/common/inet/sctp/sctp_notify.c b/usr/src/uts/common/inet/sctp/sctp_notify.c index 3ede878954..ea46e0bbd2 100644 --- a/usr/src/uts/common/inet/sctp/sctp_notify.c +++ b/usr/src/uts/common/inet/sctp/sctp_notify.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,6 +51,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) sctp_faddr_t *fp; int32_t rwnd = 0; int error; + conn_t *connp = sctp->sctp_connp; if ((mp = allocb(sizeof (*tudi) + sizeof (void *) + sizeof (struct sockaddr_in6), BPRI_HI)) == NULL) { @@ -82,7 +83,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) tudi->SRC_length = sizeof (*sin4); sin4 = (struct sockaddr_in *)(tudi + 1); sin4->sin_family = AF_INET; - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_IPADDR(&fp->faddr, sin4->sin_addr.s_addr); mp->b_wptr = (uchar_t *)(sin4 + 1); } else { @@ -91,7 +92,7 @@ sctp_notify(sctp_t *sctp, mblk_t *emp, size_t len) tudi->SRC_length = sizeof (*sin6); sin6 = (struct sockaddr_in6 *)(tudi + 1); sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; mp->b_wptr = (uchar_t *)(sin6 + 1); } diff --git a/usr/src/uts/common/inet/sctp/sctp_opt_data.c b/usr/src/uts/common/inet/sctp/sctp_opt_data.c index 322e4d461e..ee5eb445af 100644 --- a/usr/src/uts/common/inet/sctp/sctp_opt_data.c +++ b/usr/src/uts/common/inet/sctp/sctp_opt_data.c @@ -43,6 +43,7 @@ #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> +#include <inet/proto_set.h> #include <inet/ipclassifier.h> #include <inet/ipsec_impl.h> @@ -60,68 +61,6 @@ static int sctp_getpeeraddrs(sctp_t *, void *, int *); -/* - * Copy the standard header into its new location, - * lay in the new options and then update the relevant - * fields in both sctp_t and the standard header. - * Returns 0 on success, errno otherwise. 
- */ -static int -sctp_opt_set_header(sctp_t *sctp, const void *ptr, uint_t len) -{ - uint8_t *ip_optp; - sctp_hdr_t *new_sctph; - - if ((len > SCTP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) - return (EINVAL); - - if (len > IP_MAX_OPT_LENGTH - sctp->sctp_v4label_len) - return (EINVAL); - - ip_optp = (uint8_t *)sctp->sctp_ipha + IP_SIMPLE_HDR_LENGTH; - - if (sctp->sctp_v4label_len > 0) { - int padlen; - uint8_t opt; - - /* convert list termination to no-ops as needed */ - padlen = sctp->sctp_v4label_len - ip_optp[IPOPT_OLEN]; - ip_optp += ip_optp[IPOPT_OLEN]; - opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; - while (--padlen >= 0) - *ip_optp++ = opt; - ASSERT(ip_optp == (uint8_t *)sctp->sctp_ipha + - IP_SIMPLE_HDR_LENGTH + sctp->sctp_v4label_len); - } - - /* - * Move the existing SCTP header out where it belongs. - */ - new_sctph = (sctp_hdr_t *)(ip_optp + len); - ovbcopy(sctp->sctp_sctph, new_sctph, sizeof (sctp_hdr_t)); - sctp->sctp_sctph = new_sctph; - - /* - * Insert the new user-supplied IP options. - */ - if (len > 0) - bcopy(ptr, ip_optp, len); - - len += sctp->sctp_v4label_len; - sctp->sctp_ip_hdr_len = len; - sctp->sctp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - sctp->sctp_hdr_len = len + sizeof (sctp_hdr_t); - - if (sctp->sctp_current) { - /* - * Could be setting options before setting up connection. 
- */ - sctp_set_ulp_prop(sctp); - } - return (0); -} - static int sctp_get_status(sctp_t *sctp, void *ptr) { @@ -132,6 +71,7 @@ sctp_get_status(sctp_t *sctp, void *ptr) struct sctp_paddrinfo *sp; mblk_t *meta, *mp; int i; + conn_t *connp = sctp->sctp_connp; sstat->sstat_state = sctp->sctp_state; sstat->sstat_rwnd = sctp->sctp_frwnd; @@ -146,13 +86,13 @@ sctp_get_status(sctp_t *sctp, void *ptr) if (fp->isv4) { sin = (struct sockaddr_in *)&sp->spinfo_address; sin->sin_family = AF_INET; - sin->sin_port = sctp->sctp_fport; + sin->sin_port = connp->conn_fport; IN6_V4MAPPED_TO_INADDR(&fp->faddr, &sin->sin_addr); sp->spinfo_mtu = sctp->sctp_hdr_len; } else { sin6 = (struct sockaddr_in6 *)&sp->spinfo_address; sin6->sin6_family = AF_INET6; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_addr = fp->faddr; sp->spinfo_mtu = sctp->sctp_hdr6_len; } @@ -261,18 +201,16 @@ sctp_get_rtoinfo(sctp_t *sctp, void *ptr) } static int -sctp_set_rtoinfo(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_rtoinfo(sctp_t *sctp, const void *invalp) { const struct sctp_rtoinfo *srto; boolean_t ispriv; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; - if (inlen < sizeof (*srto)) { - return (EINVAL); - } srto = invalp; - ispriv = secpolicy_ip_config(sctp->sctp_credp, B_TRUE) == 0; + ispriv = secpolicy_ip_config(connp->conn_cred, B_TRUE) == 0; /* * Bounds checking. 
Priviledged user can set the RTO initial @@ -334,17 +272,13 @@ sctp_get_assocparams(sctp_t *sctp, void *ptr) } static int -sctp_set_assocparams(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_assocparams(sctp_t *sctp, const void *invalp) { const struct sctp_assocparams *sap = invalp; uint32_t sum = 0; sctp_faddr_t *fp; sctp_stack_t *sctps = sctp->sctp_sctps; - if (inlen < sizeof (*sap)) { - return (EINVAL); - } - if (sap->sasoc_asocmaxrxt) { if (sctp->sctp_faddrs) { /* @@ -403,6 +337,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen) { const struct sctp_initmsg *si = invalp; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_t *connp = sctp->sctp_connp; if (sctp->sctp_state > SCTPS_LISTEN) { return (EINVAL); @@ -430,7 +365,7 @@ sctp_set_initmsg(sctp_t *sctp, const void *invalp, uint_t inlen) return (EINVAL); } if (si->sinit_max_init_timeo != 0 && - (secpolicy_ip_config(sctp->sctp_credp, B_TRUE) != 0 && + (secpolicy_ip_config(connp->conn_cred, B_TRUE) != 0 && (si->sinit_max_init_timeo < sctps->sctps_rto_maxg_low || si->sinit_max_init_timeo > sctps->sctps_rto_maxg_high))) { return (EINVAL); @@ -506,7 +441,7 @@ sctp_get_peer_addr_params(sctp_t *sctp, void *ptr) } static int -sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp) { const struct sctp_paddrparams *spp = invalp; sctp_faddr_t *fp, *fp2; @@ -515,10 +450,6 @@ sctp_set_peer_addr_params(sctp_t *sctp, const void *invalp, uint_t inlen) int64_t now; sctp_stack_t *sctps = sctp->sctp_sctps; - if (inlen < sizeof (*spp)) { - return (EINVAL); - } - retval = sctp_find_peer_fp(sctp, &spp->spp_address, &fp); if (retval != 0) { return (retval); @@ -620,13 +551,10 @@ sctp_get_def_send_params(sctp_t *sctp, void *ptr) } static int -sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_def_send_params(sctp_t *sctp, const void *invalp) { const struct sctp_sndrcvinfo *sinfo = invalp; - if (inlen 
< sizeof (*sinfo)) { - return (EINVAL); - } if (sinfo->sinfo_stream >= sctp->sctp_num_ostr) { return (EINVAL); } @@ -641,16 +569,12 @@ sctp_set_def_send_params(sctp_t *sctp, const void *invalp, uint_t inlen) } static int -sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen) +sctp_set_prim(sctp_t *sctp, const void *invalp) { const struct sctp_setpeerprim *pp = invalp; int retval; sctp_faddr_t *fp; - if (inlen < sizeof (*pp)) { - return (EINVAL); - } - retval = sctp_find_peer_fp(sctp, &pp->sspp_addr, &fp); if (retval) return (retval); @@ -670,6 +594,183 @@ sctp_set_prim(sctp_t *sctp, const void *invalp, uint_t inlen) return (0); } +/* + * Table of all known options handled on a SCTP protocol stack. + * + * Note: This table contains options processed by both SCTP and IP levels + * and is the superset of options that can be performed on a SCTP and IP + * stack. + */ +opdes_t sctp_opt_arr[] = { + +{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct linger), 0 }, + +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 + }, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 + }, +{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), + 0 }, +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), + 
0 }, +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, + +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, + +{ SCTP_ADAPTATION_LAYER, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_setadaptation), 0 }, +{ SCTP_ADD_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_ASSOCINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_assocparams), 0 }, +{ SCTP_AUTOCLOSE, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_DEFAULT_SEND_PARAM, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_sndrcvinfo), 0 }, +{ SCTP_DISABLE_FRAGMENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ SCTP_EVENTS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_event_subscribe), 0 }, +{ SCTP_GET_LADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_GET_NLADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_GET_NPADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_GET_PADDRS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_GET_PEER_ADDR_INFO, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (struct sctp_paddrinfo), 0 }, +{ SCTP_INITMSG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_initmsg), 0 }, +{ SCTP_I_WANT_MAPPED_V4_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ SCTP_MAXSEG, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_NODELAY, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SCTP_PEER_ADDR_PARAMS, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_paddrparams), 0 }, +{ SCTP_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_setpeerprim), 0 }, +{ SCTP_PRSCTP, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ 
SCTP_GET_ASSOC_STATS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (sctp_assoc_stats_t), 0 }, +{ SCTP_REM_ADDR, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, OP_VARLEN, + sizeof (int), 0 }, +{ SCTP_RTOINFO, IPPROTO_SCTP, OA_RW, OA_RW, OP_NP, 0, + sizeof (struct sctp_rtoinfo), 0 }, +{ SCTP_SET_PEER_PRIMARY_ADDR, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_setprim), 0 }, +{ SCTP_STATUS, IPPROTO_SCTP, OA_R, OA_R, OP_NP, 0, + sizeof (struct sctp_status), 0 }, +{ SCTP_UC_SWAP, IPPROTO_SCTP, OA_W, OA_W, OP_NP, 0, + sizeof (struct sctp_uc_swap), 0 }, + +{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), + 40, -1 /* not initialized */ }, +{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), + 40, -1 /* not initialized */ }, + +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, + sizeof (int), -1 /* not initialized */ }, + +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, + sizeof (ipsec_req_t), -1 /* not initialized */ }, + +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 /* no ifindex */ }, + +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, + sizeof (int), 0 }, + +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, + sizeof (int), -1 /* not initialized */ }, + +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 /* no ifindex */ }, + +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, + sizeof (in_addr_t), -1 /* not initialized */ }, + +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, + sizeof (int), 0 }, + +{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_NODEFAULT|OP_VARLEN), + sizeof (struct in6_pktinfo), -1 /* not initialized */ }, +{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (sin6_t), -1 
/* not initialized */ }, +{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + (OP_VARLEN|OP_NODEFAULT), 255*8, + -1 /* not initialized */ }, +{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (int), -1 /* not initialized */ }, +{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, + OP_NODEFAULT, + sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, + +/* Enable receipt of ancillary data */ +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, + +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, + sizeof (ipsec_req_t), -1 /* not initialized */ }, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, +}; + +uint_t sctp_opt_arr_size = 
A_CNT(sctp_opt_arr); + /* Handy on off switch for socket option processing. */ #define ONOFF(x) ((x) == 0 ? 0 : 1) @@ -682,8 +783,12 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int *i1 = (int *)ptr; int retval = 0; int buflen = *optlen; - conn_t *connp = sctp->sctp_connp; - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; + conn_t *connp = sctp->sctp_connp; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; /* In most cases, the return buffer is just an int */ *optlen = sizeof (int32_t); @@ -695,83 +800,30 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) return (EINVAL); } - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)ptr; - - lgr->l_onoff = sctp->sctp_linger ? SO_LINGER : 0; - lgr->l_linger = TICK_TO_MSEC(sctp->sctp_lingertime); - *optlen = sizeof (struct linger); - break; - } - case SO_DEBUG: - *i1 = sctp->sctp_debug ? SO_DEBUG : 0; - break; - case SO_DONTROUTE: - *i1 = connp->conn_dontroute ? SO_DONTROUTE : 0; - break; - case SO_USELOOPBACK: - *i1 = connp->conn_loopback ? SO_USELOOPBACK : 0; - break; - case SO_BROADCAST: - *i1 = connp->conn_broadcast ? SO_BROADCAST : 0; - break; - case SO_REUSEADDR: - *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; - break; - case SO_DGRAM_ERRIND: - *i1 = sctp->sctp_dgram_errind ? 
SO_DGRAM_ERRIND : 0; - break; - case SO_SNDBUF: - *i1 = sctp->sctp_xmit_hiwater; - break; - case SO_RCVBUF: - *i1 = sctp->sctp_rwnd; - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_SCTP; - break; - case SO_DOMAIN: - *i1 = sctp->sctp_family; - break; - default: - retval = ENOPROTOOPT; - break; + /* + * Check that the level and name are supported by SCTP, and that + * the length and credentials are ok. + */ + retval = proto_opt_check(level, name, buflen, NULL, sctp_opt_arr, + sctp_opt_arr_size, B_FALSE, B_TRUE, connp->conn_cred); + if (retval != 0) { + WAKE_SCTP(sctp); + if (retval < 0) { + retval = proto_tlitosyserr(-retval); } - break; + return (retval); + } + switch (level) { case IPPROTO_SCTP: switch (name) { case SCTP_RTOINFO: - if (buflen < sizeof (struct sctp_rtoinfo)) { - retval = EINVAL; - break; - } *optlen = sctp_get_rtoinfo(sctp, ptr); break; case SCTP_ASSOCINFO: - if (buflen < sizeof (struct sctp_assocparams)) { - retval = EINVAL; - break; - } *optlen = sctp_get_assocparams(sctp, ptr); break; case SCTP_INITMSG: - if (buflen < sizeof (struct sctp_initmsg)) { - retval = EINVAL; - break; - } *optlen = sctp_get_initmsg(sctp, ptr); break; case SCTP_NODELAY: @@ -781,34 +833,18 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) *i1 = TICK_TO_SEC(sctp->sctp_autoclose); break; case SCTP_ADAPTATION_LAYER: - if (buflen < sizeof (struct sctp_setadaptation)) { - retval = EINVAL; - break; - } ((struct sctp_setadaptation *)ptr)->ssb_adaptation_ind = sctp->sctp_tx_adaptation_code; break; case SCTP_PEER_ADDR_PARAMS: - if (buflen < sizeof (struct sctp_paddrparams)) { - retval = EINVAL; - break; - } *optlen = sctp_get_peer_addr_params(sctp, ptr); break; case SCTP_DEFAULT_SEND_PARAM: - if (buflen < sizeof (struct 
sctp_sndrcvinfo)) { - retval = EINVAL; - break; - } *optlen = sctp_get_def_send_params(sctp, ptr); break; case SCTP_EVENTS: { struct sctp_event_subscribe *ev; - if (buflen < sizeof (struct sctp_event_subscribe)) { - retval = EINVAL; - break; - } ev = (struct sctp_event_subscribe *)ptr; ev->sctp_data_io_event = ONOFF(sctp->sctp_recvsndrcvinfo); @@ -830,17 +866,9 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) break; } case SCTP_STATUS: - if (buflen < sizeof (struct sctp_status)) { - retval = EINVAL; - break; - } *optlen = sctp_get_status(sctp, ptr); break; case SCTP_GET_PEER_ADDR_INFO: - if (buflen < sizeof (struct sctp_paddrinfo)) { - retval = EINVAL; - break; - } retval = sctp_get_paddrinfo(sctp, ptr, optlen); break; case SCTP_GET_NLADDRS: @@ -850,7 +878,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int addr_cnt; int addr_size; - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) addr_size = sizeof (struct sockaddr_in); else addr_size = sizeof (struct sockaddr_in6); @@ -874,7 +902,7 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) int addr_cnt; int addr_size; - if (sctp->sctp_family == AF_INET) + if (connp->conn_family == AF_INET) addr_size = sizeof (struct sockaddr_in); else addr_size = sizeof (struct sockaddr_in6); @@ -891,11 +919,6 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) case SCTP_GET_ASSOC_STATS: { sctp_assoc_stats_t *sas; - if (buflen < sizeof (sctp_assoc_stats_t)) { - retval = EINVAL; - break; - } - sas = (sctp_assoc_stats_t *)ptr; /* @@ -947,15 +970,15 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) case SCTP_I_WANT_MAPPED_V4_ADDR: case SCTP_MAXSEG: case SCTP_DISABLE_FRAGMENTS: - /* Not yet supported. */ default: + /* Not yet supported. 
*/ retval = ENOPROTOOPT; break; } - break; - + WAKE_SCTP(sctp); + return (retval); case IPPROTO_IP: - if (sctp->sctp_family != AF_INET) { + if (connp->conn_family != AF_INET) { retval = EINVAL; break; } @@ -972,231 +995,52 @@ sctp_get_opt(sctp_t *sctp, int level, int name, void *ptr, socklen_t *optlen) * ip_opt_get_user() adds the final destination * at the start. */ - char *opt_ptr; int opt_len; uchar_t obuf[SCTP_MAX_IP_OPTIONS_LENGTH + IP_ADDR_LEN]; - opt_ptr = (char *)sctp->sctp_ipha + - IP_SIMPLE_HDR_LENGTH; - opt_len = (char *)sctp->sctp_sctph - opt_ptr; - /* Caller ensures enough space */ - if (opt_len > 0) { - /* - * TODO: Do we have to handle getsockopt on an - * initiator as well? - */ - opt_len = ip_opt_get_user(sctp->sctp_ipha, - obuf); - ASSERT(opt_len <= sizeof (obuf)); - } else { - opt_len = 0; - } + opt_len = ip_opt_get_user(connp, obuf); + ASSERT(opt_len <= sizeof (obuf)); + if (buflen < opt_len) { /* Silently truncate */ opt_len = buflen; } *optlen = opt_len; bcopy(obuf, ptr, opt_len); - break; - } - case IP_TOS: - case T_IP_TOS: - *i1 = (int)sctp->sctp_ipha->ipha_type_of_service; - break; - case IP_TTL: - *i1 = (int)sctp->sctp_ipha->ipha_ttl; - break; - case IP_NEXTHOP: - if (connp->conn_nexthop_set) { - *(ipaddr_t *)ptr = connp->conn_nexthop_v4; - *optlen = sizeof (ipaddr_t); - } else { - *optlen = 0; - } - break; - default: - retval = ENOPROTOOPT; - break; - } - break; - case IPPROTO_IPV6: - if (sctp->sctp_family != AF_INET6) { - retval = EINVAL; - break; - } - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int) sctp->sctp_ip6h->ip6_hops; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVPKTINFO) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVHOPLIMIT) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - 
case IPV6_RECVHOPOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVHOPOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVDSTOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVRTHDR) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (sctp->sctp_ipv6_recvancillary & - SCTP_IPV6_RECVRTDSTOPTS) { - *i1 = 1; - } else { - *i1 = 0; - } - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - struct in6_pktinfo *pkti; - - if (buflen < sizeof (struct in6_pktinfo)) { - retval = EINVAL; - break; - } - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - *optlen = sizeof (struct in6_pktinfo); - break; - } - case IPV6_NEXTHOP: { - sin6_t *sin6; - - if (buflen < sizeof (sin6_t)) { - retval = EINVAL; - break; - } - sin6 = (sin6_t *)ptr; - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - break; - *sin6 = sctp_sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - *optlen = sizeof (sin6_t); - break; + WAKE_SCTP(sctp); + return (0); } - case IPV6_HOPOPTS: { - int len; - - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - break; - len = ipp->ipp_hopoptslen - sctp->sctp_v6label_len; - if (len <= 0) - break; - if (buflen < len) { - retval = EINVAL; - break; - } - bcopy((char *)ipp->ipp_hopopts + - sctp->sctp_v6label_len, ptr, len); - if (sctp->sctp_v6label_len > 0) { - char *cptr = ptr; - - /* - * If the label length is greater than zero, - * then we need to hide the label from user. 
- * Make it look as though a normal Hop-By-Hop - * Options Header is present here. - */ - cptr[0] = ((char *)ipp->ipp_hopopts)[0]; - cptr[1] = (len + 7) / 8 - 1; - } - *optlen = len; - break; - } - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - break; - if (buflen < ipp->ipp_rtdstoptslen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - *optlen = ipp->ipp_rtdstoptslen; - break; - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - break; - if (buflen < ipp->ipp_rthdrlen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - *optlen = ipp->ipp_rthdrlen; - break; - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - break; - if (buflen < ipp->ipp_dstoptslen) { - retval = EINVAL; - break; - } - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - *optlen = ipp->ipp_dstoptslen; - break; - case IPV6_V6ONLY: - *i1 = sctp->sctp_connp->conn_ipv6_v6only; - break; default: - retval = ENOPROTOOPT; break; } break; - - default: - retval = ENOPROTOOPT; - break; } + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); WAKE_SCTP(sctp); - return (retval); + if (retval == -1) + return (EINVAL); + *optlen = retval; + return (0); } int sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, socklen_t inlen) { - ip6_pkt_t *ipp = &sctp->sctp_sticky_ipp; int *i1 = (int *)invalp; boolean_t onoff; int retval = 0, addrcnt; conn_t *connp = sctp->sctp_connp; sctp_stack_t *sctps = sctp->sctp_sctps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; /* In all cases, the size of the option must be bigger than int */ if (inlen >= sizeof (int32_t)) { @@ -1211,74 +1055,42 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, return (EINVAL); } + /* + * Check that the 
level and name are supported by SCTP, and that + * the length an credentials are ok. + */ + retval = proto_opt_check(level, name, inlen, NULL, sctp_opt_arr, + sctp_opt_arr_size, B_TRUE, B_FALSE, connp->conn_cred); + if (retval != 0) { + if (retval < 0) { + retval = proto_tlitosyserr(-retval); + } + goto done; + } + + /* Note: both SCTP and TCP interpret l_linger as being in seconds */ switch (level) { case SOL_SOCKET: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } switch (name) { - case SO_LINGER: { - struct linger *lgr; - - if (inlen != sizeof (struct linger)) { - retval = EINVAL; - break; - } - lgr = (struct linger *)invalp; - if (lgr->l_onoff != 0) { - sctp->sctp_linger = 1; - sctp->sctp_lingertime = MSEC_TO_TICK( - lgr->l_linger); - } else { - sctp->sctp_linger = 0; - sctp->sctp_lingertime = 0; - } - break; - } - case SO_DEBUG: - sctp->sctp_debug = onoff; - break; - case SO_KEEPALIVE: - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK and SO_BROADCAST are - * only of interest to IP. 
- */ - connp->conn_dontroute = onoff; - break; - case SO_USELOOPBACK: - connp->conn_loopback = onoff; - break; - case SO_BROADCAST: - connp->conn_broadcast = onoff; - break; - case SO_REUSEADDR: - connp->conn_reuseaddr = onoff; - break; - case SO_DGRAM_ERRIND: - sctp->sctp_dgram_errind = onoff; - break; case SO_SNDBUF: if (*i1 > sctps->sctps_max_buf) { retval = ENOBUFS; - break; + goto done; } if (*i1 < 0) { retval = EINVAL; - break; + goto done; } - sctp->sctp_xmit_hiwater = *i1; - if (sctps->sctps_snd_lowat_fraction != 0) - sctp->sctp_xmit_lowater = - sctp->sctp_xmit_hiwater / + connp->conn_sndbuf = *i1; + if (sctps->sctps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / sctps->sctps_snd_lowat_fraction; - break; + } + goto done; case SO_RCVBUF: if (*i1 > sctps->sctps_max_buf) { retval = ENOBUFS; - break; + goto done; } /* Silently ignore zero */ if (*i1 != 0) { @@ -1294,12 +1106,16 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, *i1 = MAX(*i1, sctps->sctps_recv_hiwat_minmss * sctp->sctp_mss); - sctp->sctp_rwnd = *i1; + /* + * Note that sctp_rwnd is modified by the + * protocol and here we just whack it. + */ + connp->conn_rcvbuf = sctp->sctp_rwnd = *i1; sctp->sctp_irwnd = sctp->sctp_rwnd; sctp->sctp_pd_point = sctp->sctp_rwnd; sopp.sopp_flags = SOCKOPT_RCVHIWAT; - sopp.sopp_rxhiwat = *i1; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; sctp->sctp_ulp_prop(sctp->sctp_ulpd, &sopp); } @@ -1307,60 +1123,29 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, * XXX should we return the rwnd here * and sctp_opt_get ? 
*/ - break; + goto done; case SO_ALLZONES: - if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG, - B_TRUE)) { - retval = EACCES; - break; - } if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - break; + goto done; } - sctp->sctp_allzones = onoff; break; case SO_MAC_EXEMPT: - if (secpolicy_net_mac_aware(sctp->sctp_credp) != 0) { - retval = EACCES; - break; - } - if (sctp->sctp_state >= SCTPS_BOUND) { - retval = EINVAL; - break; - } - connp->conn_mac_mode = onoff ? - CONN_MAC_AWARE : CONN_MAC_DEFAULT; - break; - case SO_MAC_IMPLICIT: - if (secpolicy_net_mac_implicit(sctp->sctp_credp) != 0) { - retval = EACCES; - break; - } if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - break; + goto done; } - connp->conn_mac_mode = onoff ? - CONN_MAC_AWARE : CONN_MAC_IMPLICIT; - break; - default: - retval = ENOPROTOOPT; break; } break; case IPPROTO_SCTP: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } switch (name) { case SCTP_RTOINFO: - retval = sctp_set_rtoinfo(sctp, invalp, inlen); + retval = sctp_set_rtoinfo(sctp, invalp); break; case SCTP_ASSOCINFO: - retval = sctp_set_assocparams(sctp, invalp, inlen); + retval = sctp_set_assocparams(sctp, invalp); break; case SCTP_INITMSG: retval = sctp_set_initmsg(sctp, invalp, inlen); @@ -1378,37 +1163,28 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, sctp_heartbeat_timer(sctp); break; case SCTP_SET_PEER_PRIMARY_ADDR: - retval = sctp_set_peerprim(sctp, invalp, inlen); + retval = sctp_set_peerprim(sctp, invalp); break; case SCTP_PRIMARY_ADDR: - retval = sctp_set_prim(sctp, invalp, inlen); + retval = sctp_set_prim(sctp, invalp); break; case SCTP_ADAPTATION_LAYER: { struct sctp_setadaptation *ssb; - if (inlen < sizeof (struct sctp_setadaptation)) { - retval = EINVAL; - break; - } ssb = (struct sctp_setadaptation *)invalp; sctp->sctp_send_adaptation = 1; sctp->sctp_tx_adaptation_code = ssb->ssb_adaptation_ind; break; } case SCTP_PEER_ADDR_PARAMS: - retval = sctp_set_peer_addr_params(sctp, 
invalp, - inlen); + retval = sctp_set_peer_addr_params(sctp, invalp); break; case SCTP_DEFAULT_SEND_PARAM: - retval = sctp_set_def_send_params(sctp, invalp, inlen); + retval = sctp_set_def_send_params(sctp, invalp); break; case SCTP_EVENTS: { struct sctp_event_subscribe *ev; - if (inlen < sizeof (struct sctp_event_subscribe)) { - retval = EINVAL; - break; - } ev = (struct sctp_event_subscribe *)invalp; sctp->sctp_recvsndrcvinfo = ONOFF(ev->sctp_data_io_event); @@ -1438,15 +1214,15 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, retval = EINVAL; break; } - if (sctp->sctp_family == AF_INET) { + if (connp->conn_family == AF_INET) { addrcnt = inlen / sizeof (struct sockaddr_in); } else { - ASSERT(sctp->sctp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); addrcnt = inlen / sizeof (struct sockaddr_in6); } if (name == SCTP_ADD_ADDR) { retval = sctp_bind_add(sctp, invalp, addrcnt, - B_TRUE, sctp->sctp_lport); + B_TRUE, connp->conn_lport); } else { retval = sctp_bind_del(sctp, invalp, addrcnt, B_TRUE); @@ -1458,10 +1234,6 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, /* * Change handle & upcalls. */ - if (inlen < sizeof (*us)) { - retval = EINVAL; - break; - } us = (struct sctp_uc_swap *)invalp; sctp->sctp_ulpd = us->sus_handle; sctp->sctp_upcalls = us->sus_upcalls; @@ -1474,33 +1246,17 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, case SCTP_MAXSEG: case SCTP_DISABLE_FRAGMENTS: /* Not yet supported. 
*/ - default: retval = ENOPROTOOPT; break; } - break; + goto done; case IPPROTO_IP: - if (sctp->sctp_family != AF_INET) { + if (connp->conn_family != AF_INET) { retval = ENOPROTOOPT; - break; - } - if ((name != IP_OPTIONS) && (inlen < sizeof (int32_t))) { - retval = EINVAL; - break; + goto done; } switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - retval = sctp_opt_set_header(sctp, invalp, inlen); - break; - case IP_TOS: - case T_IP_TOS: - sctp->sctp_ipha->ipha_type_of_service = (uchar_t)*i1; - break; - case IP_TTL: - sctp->sctp_ipha->ipha_ttl = (uchar_t)*i1; - break; case IP_SEC_OPT: /* * We should not allow policy setting after @@ -1508,319 +1264,30 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_LISTEN) { retval = EINVAL; - } else { - retval = ipsec_set_req(sctp->sctp_credp, - sctp->sctp_connp, (ipsec_req_t *)invalp); - } - break; - /* IP level options */ - case IP_UNSPEC_SRC: - connp->conn_unspec_src = onoff; - break; - case IP_NEXTHOP: { - ipaddr_t addr = *i1; - ipif_t *ipif = NULL; - ill_t *ill; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - - if (secpolicy_ip(sctp->sctp_credp, OP_CONFIG, - B_TRUE) == 0) { - ipif = ipif_lookup_onlink_addr(addr, - connp->conn_zoneid, ipst); - if (ipif == NULL) { - retval = EHOSTUNREACH; - break; - } - ill = ipif->ipif_ill; - mutex_enter(&ill->ill_lock); - if ((ill->ill_state_flags & ILL_CONDEMNED) || - (ipif->ipif_state_flags & IPIF_CONDEMNED)) { - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - retval = EHOSTUNREACH; - break; - } - mutex_exit(&ill->ill_lock); - ipif_refrele(ipif); - mutex_enter(&connp->conn_lock); - connp->conn_nexthop_v4 = addr; - connp->conn_nexthop_set = B_TRUE; - mutex_exit(&connp->conn_lock); + goto done; } break; } - default: - retval = ENOPROTOOPT; - break; - } break; - case IPPROTO_IPV6: { - if (sctp->sctp_family != AF_INET6) { - retval = ENOPROTOOPT; - break; + case IPPROTO_IPV6: + if (connp->conn_family != AF_INET6) { + 
retval = EINVAL; + goto done; } switch (name) { - case IPV6_UNICAST_HOPS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - retval = EINVAL; - break; - } - if (*i1 == -1) { - ipp->ipp_unicast_hops = - sctps->sctps_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - } else { - ipp->ipp_unicast_hops = (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - retval = sctp_build_hdrs(sctp); - break; - case IPV6_UNSPEC_SRC: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - connp->conn_unspec_src = onoff; - break; case IPV6_RECVPKTINFO: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVPKTINFO; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVPKTINFO; /* Send it with the next msg */ sctp->sctp_recvifindex = 0; - connp->conn_ip_recvpktinfo = onoff; + break; + case IPV6_RECVTCLASS: + /* Force it to be sent up with the next msg */ + sctp->sctp_recvtclass = 0xffffffffU; break; case IPV6_RECVHOPLIMIT: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVHOPLIMIT; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVHOPLIMIT; + /* Force it to be sent up with the next msg */ sctp->sctp_recvhops = 0xffffffffU; - connp->conn_ipv6_recvhoplimit = onoff; - break; - case IPV6_RECVHOPOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVHOPOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVHOPOPTS; - connp->conn_ipv6_recvhopopts = onoff; - break; - case IPV6_RECVDSTOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVDSTOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVDSTOPTS; - connp->conn_ipv6_recvdstopts = onoff; - break; - case 
IPV6_RECVRTHDR: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVRTHDR; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVRTHDR; - connp->conn_ipv6_recvrthdr = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (inlen < sizeof (int32_t)) { - retval = EINVAL; - break; - } - if (onoff) - sctp->sctp_ipv6_recvancillary |= - SCTP_IPV6_RECVRTDSTOPTS; - else - sctp->sctp_ipv6_recvancillary &= - ~SCTP_IPV6_RECVRTDSTOPTS; - connp->conn_ipv6_recvrtdstopts = onoff; - break; - case IPV6_PKTINFO: - if (inlen != 0 && - inlen != sizeof (struct in6_pktinfo)) { - retval = EINVAL; - break; - } - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX |IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - /* XXX Need to check if the index exists */ - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - retval = sctp_build_hdrs(sctp); - break; - case IPV6_NEXTHOP: { - struct sockaddr_in6 *sin6; - ip_stack_t *ipst = sctps->sctps_netstack->netstack_ip; - - if (inlen != 0 && inlen != sizeof (sin6_t)) { - retval = EINVAL; - break; - } - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - sin6 = (struct sockaddr_in6 *)invalp; - if (sin6->sin6_family != AF_INET6) { - retval = EAFNOSUPPORT; - break; - } - if (IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - retval = EADDRNOTAVAIL; - break; - } - ipp->ipp_nexthop = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - ire_t *ire; - - ire = ire_route_lookup_v6( - &sin6->sin6_addr, NULL, NULL, 0, - NULL, NULL, ALL_ZONES, NULL, - MATCH_IRE_DEFAULT, ipst); - if (ire == NULL) { - retval = 
EHOSTUNREACH; - break; - } - ire_refrele(ire); - ipp->ipp_fields |= IPPF_NEXTHOP; - } - } - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, sctp->sctp_v6label_len); - if (retval != 0) - break; - if (ipp->ipp_hopoptslen == 0) - ipp->ipp_fields &= ~IPPF_HOPOPTS; - else - ipp->ipp_fields |= IPPF_HOPOPTS; - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (retval != 0) - break; - if (ipp->ipp_rtdstoptslen == 0) - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - else - ipp->ipp_fields |= IPPF_RTDSTOPTS; - retval = sctp_build_hdrs(sctp); - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (retval != 0) - break; - if (ipp->ipp_dstoptslen == 0) - ipp->ipp_fields &= ~IPPF_DSTOPTS; - else - ipp->ipp_fields |= IPPF_DSTOPTS; - retval = sctp_build_hdrs(sctp); break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) { - retval = EINVAL; - break; - } - - retval = optcom_pkt_set((uchar_t *)invalp, inlen, - B_TRUE, (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (retval != 0) - break; - if (ipp->ipp_rthdrlen == 0) - ipp->ipp_fields &= ~IPPF_RTHDR; - else - ipp->ipp_fields |= IPPF_RTHDR; - retval 
= sctp_build_hdrs(sctp); - break; - } case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -1828,9 +1295,7 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_LISTEN) { retval = EINVAL; - } else { - retval = ipsec_set_req(sctp->sctp_credp, - sctp->sctp_connp, (ipsec_req_t *)invalp); + goto done; } break; case IPV6_V6ONLY: @@ -1840,21 +1305,44 @@ sctp_set_opt(sctp_t *sctp, int level, int name, const void *invalp, */ if (sctp->sctp_state >= SCTPS_BOUND) { retval = EINVAL; - } else { - sctp->sctp_connp->conn_ipv6_v6only = onoff; + goto done; } break; - default: - retval = ENOPROTOOPT; - break; } break; } - default: - retval = ENOPROTOOPT; - break; - } + retval = conn_opt_set(&coas, level, name, inlen, (uchar_t *)invalp, + B_FALSE, connp->conn_cred); + if (retval != 0) + goto done; + + if (coas.coa_changed & COA_ROUTE_CHANGED) { + sctp_faddr_t *fp; + /* + * We recache the information which might pick a different + * source and redo IPsec as a result. + */ + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) + sctp_get_dest(sctp, fp); + } + if (coas.coa_changed & COA_HEADER_CHANGED) { + retval = sctp_build_hdrs(sctp, KM_NOSLEEP); + if (retval != 0) + goto done; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + sctps->sctps_wroff_xtra; + if (sctp->sctp_current != NULL) { + /* + * Could be setting options before setting up + * connection. 
+ */ + sctp_set_ulp_prop(sctp); + } + } +done: WAKE_SCTP(sctp); return (retval); } @@ -1871,18 +1359,19 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) int addrcnt = 1; sin_t *sin4; sin6_t *sin6; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); RUN_SCTP(sctp); - addr->sa_family = sctp->sctp_family; - switch (sctp->sctp_family) { + addr->sa_family = connp->conn_family; + switch (connp->conn_family) { case AF_INET: sin4 = (sin_t *)addr; if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { sin4->sin_addr.s_addr = INADDR_ANY; - sin4->sin_port = sctp->sctp_lport; + sin4->sin_port = connp->conn_lport; } else { err = sctp_getmyaddrs(sctp, sin4, &addrcnt); if (err != 0) { @@ -1897,7 +1386,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) if ((sctp->sctp_state <= SCTPS_LISTEN) && sctp->sctp_bound_to_all) { bzero(&sin6->sin6_addr, sizeof (sin6->sin6_addr)); - sin6->sin6_port = sctp->sctp_lport; + sin6->sin6_port = connp->conn_lport; } else { err = sctp_getmyaddrs(sctp, sin6, &addrcnt); if (err != 0) { @@ -1906,10 +1395,7 @@ sctp_getsockname(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) } } *addrlen = sizeof (struct sockaddr_in6); - sin6->sin6_flowinfo = sctp->sctp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; + /* Note that flowinfo is only returned for getpeername */ break; } WAKE_SCTP(sctp); @@ -1927,12 +1413,13 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) int err = 0; int addrcnt = 1; sin6_t *sin6; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); RUN_SCTP(sctp); - addr->sa_family = sctp->sctp_family; - switch (sctp->sctp_family) { + addr->sa_family = connp->conn_family; + switch (connp->conn_family) { case AF_INET: err = sctp_getpeeraddrs(sctp, addr, &addrcnt); if (err != 0) { @@ -1949,9 +1436,6 @@ sctp_getpeername(sctp_t *sctp, struct sockaddr *addr, socklen_t *addrlen) break; } *addrlen 
= sizeof (struct sockaddr_in6); - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; break; } WAKE_SCTP(sctp); @@ -1973,13 +1457,14 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) int cnt; sctp_faddr_t *fp = sctp->sctp_faddrs; in6_addr_t addr; + conn_t *connp = sctp->sctp_connp; ASSERT(sctp != NULL); if (sctp->sctp_faddrs == NULL) return (ENOTCONN); - family = sctp->sctp_family; + family = connp->conn_family; max = *addrcnt; /* If we want only one, give the primary */ @@ -1989,15 +1474,26 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) case AF_INET: sin4 = paddrs; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; case AF_INET6: sin6 = paddrs; sin6->sin6_addr = addr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&addr) && + sctp->sctp_primary != NULL && + (sctp->sctp_primary->ixa->ixa_flags & + IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + sctp->sctp_primary->ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } + sin6->__sin6_src_id = 0; break; } return (0); @@ -2010,14 +1506,21 @@ sctp_getpeeraddrs(sctp_t *sctp, void *paddrs, int *addrcnt) ASSERT(IN6_IS_ADDR_V4MAPPED(&addr)); sin4 = (struct sockaddr_in *)paddrs + cnt; IN6_V4MAPPED_TO_INADDR(&addr, &sin4->sin_addr); - sin4->sin_port = sctp->sctp_fport; + sin4->sin_port = connp->conn_fport; sin4->sin_family = AF_INET; break; case AF_INET6: sin6 = (struct sockaddr_in6 *)paddrs + cnt; sin6->sin6_addr = addr; - sin6->sin6_port = sctp->sctp_fport; + sin6->sin6_port = connp->conn_fport; sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = connp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&addr) && + (fp->ixa->ixa_flags & IXAF_SCOPEID_SET)) + sin6->sin6_scope_id = fp->ixa->ixa_scopeid; + else + sin6->sin6_scope_id = 0; 
+ sin6->__sin6_src_id = 0; break; } } diff --git a/usr/src/uts/common/inet/sctp/sctp_output.c b/usr/src/uts/common/inet/sctp/sctp_output.c index c16a1166fa..1a50097260 100644 --- a/usr/src/uts/common/inet/sctp/sctp_output.c +++ b/usr/src/uts/common/inet/sctp/sctp_output.c @@ -38,6 +38,7 @@ #include <inet/common.h> #include <inet/mi.h> #include <inet/ip.h> +#include <inet/ip_ire.h> #include <inet/ip6.h> #include <inet/sctp_ip.h> #include <inet/ipclassifier.h> @@ -140,6 +141,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) sctp_msg_hdr_t *sctp_msg_hdr; uint32_t msg_len = 0; uint32_t timetolive = sctp->sctp_def_timetolive; + conn_t *connp = sctp->sctp_connp; ASSERT(DB_TYPE(mproto) == M_PROTO); @@ -228,7 +230,7 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) RUN_SCTP(sctp); sctp_user_abort(sctp, mp); freemsg(mproto); - goto process_sendq; + goto done2; } if (mp == NULL) goto done; @@ -292,15 +294,14 @@ sctp_sendmsg(sctp_t *sctp, mblk_t *mp, int flags) /* * Notify sockfs if the tx queue is full. */ - if (SCTP_TXQ_LEN(sctp) >= sctp->sctp_xmit_hiwater) { + if (SCTP_TXQ_LEN(sctp) >= connp->conn_sndbuf) { sctp->sctp_txq_full = 1; sctp->sctp_ulp_xmitted(sctp->sctp_ulpd, B_TRUE); } if (sctp->sctp_state == SCTPS_ESTABLISHED) sctp_output(sctp, UINT_MAX); -process_sendq: +done2: WAKE_SCTP(sctp); - sctp_process_sendq(sctp); return (0); unlock_done: WAKE_SCTP(sctp); @@ -569,7 +570,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, int *error) { int hdrlen; - char *hdr; + uchar_t *hdr; int isv4 = fp->isv4; sctp_stack_t *sctps = sctp->sctp_sctps; @@ -584,17 +585,19 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, hdr = sctp->sctp_iphc6; } /* - * A null fp->ire could mean that the address is 'down'. Similarly, + * A reject|blackhole could mean that the address is 'down'. 
Similarly, * it is possible that the address went down, we tried to send an * heartbeat and ended up setting fp->saddr as unspec because we * didn't have any usable source address. In either case - * sctp_get_ire() will try find an IRE, if available, and set + * sctp_get_dest() will try find an IRE, if available, and set * the source address, if needed. If we still don't have any * usable source address, fp->state will be SCTP_FADDRS_UNREACH and * we return EHOSTUNREACH. */ - if (fp->ire == NULL || SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) { - sctp_get_ire(sctp, fp); + ASSERT(fp->ixa->ixa_ire != NULL); + if ((fp->ixa->ixa_ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || + SCTP_IS_ADDR_UNSPEC(fp->isv4, fp->saddr)) { + sctp_get_dest(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { if (error != NULL) *error = EHOSTUNREACH; @@ -603,8 +606,7 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, } /* Copy in IP header. */ if ((mp->b_rptr - mp->b_datap->db_base) < - (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2 || - !IS_P2ALIGNED(DB_BASE(mp), sizeof (ire_t *))) { + (sctps->sctps_wroff_xtra + hdrlen + sacklen) || DB_REF(mp) > 2) { mblk_t *nmp; /* @@ -612,8 +614,8 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, * data was moved into chunks, or during retransmission, * or things like snoop is running. 
*/ - nmp = allocb_cred(sctps->sctps_wroff_xtra + hdrlen + sacklen, - CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); + nmp = allocb(sctps->sctps_wroff_xtra + hdrlen + sacklen, + BPRI_MED); if (nmp == NULL) { if (error != NULL) *error = ENOMEM; @@ -625,7 +627,6 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, mp = nmp; } else { mp->b_rptr -= (hdrlen + sacklen); - mblk_setcred(mp, CONN_CRED(sctp->sctp_connp), sctp->sctp_cpid); } bcopy(hdr, mp->b_rptr, hdrlen); if (sacklen) { @@ -644,26 +645,16 @@ sctp_add_proto_hdr(sctp_t *sctp, sctp_faddr_t *fp, mblk_t *mp, int sacklen, iph->ipha_src = INADDR_ANY; } } else { - ((ip6_t *)(mp->b_rptr))->ip6_dst = fp->faddr; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ip6h->ip6_dst = fp->faddr; if (!IN6_IS_ADDR_UNSPECIFIED(&fp->saddr)) { - ((ip6_t *)(mp->b_rptr))->ip6_src = fp->saddr; + ip6h->ip6_src = fp->saddr; } else if (sctp->sctp_bound_to_all) { - V6_SET_ZERO(((ip6_t *)(mp->b_rptr))->ip6_src); + ip6h->ip6_src = ipv6_all_zeros; } } } - /* - * IP will not free this IRE if it is condemned. SCTP needs to - * free it. 
- */ - if ((fp->ire != NULL) && (fp->ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFRELE_NOTR(fp->ire); - fp->ire = NULL; - } - - /* Stash the conn and ire ptr info for IP */ - SCTP_STASH_IPINFO(mp, fp->ire); - return (mp); } @@ -985,8 +976,9 @@ sctp_fast_rexmit(sctp_t *sctp) iph->ipha_fragment_offset_and_flags = 0; } - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); sctp->sctp_active = fp->lastactive = lbolt64; } @@ -1280,8 +1272,9 @@ sctp_output(sctp_t *sctp, uint_t num_pkt) seglen - xtralen, ntohl(sdc->sdh_tsn), ntohs(sdc->sdh_ssn), (void *)fp, sctp->sctp_frwnd, cansend, sctp->sctp_lastack_rxd)); - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* arm rto timer (if not set) */ if (!fp->timer_running) SCTP_FADDR_TIMER_RESTART(sctp, fp, fp->rto); @@ -1415,8 +1408,7 @@ sctp_make_ftsn_chunk(sctp_t *sctp, sctp_faddr_t *fp, sctp_ftsn_set_t *sets, xtralen = sctp->sctp_hdr_len + sctps->sctps_wroff_xtra; else xtralen = sctp->sctp_hdr6_len + sctps->sctps_wroff_xtra; - ftsn_mp = allocb_cred(xtralen + seglen, CONN_CRED(sctp->sctp_connp), - sctp->sctp_cpid); + ftsn_mp = allocb(xtralen + seglen, BPRI_MED); if (ftsn_mp == NULL) return (NULL); ftsn_mp->b_rptr += xtralen; @@ -1804,8 +1796,9 @@ out: pkt = sctp_rexmit_packet(sctp, &meta, &mp, fp, &pkt_len); if (pkt != NULL) { ASSERT(pkt_len <= fp->sfa_pmss); - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } else { SCTP_KSTAT(sctps, sctp_ss_rexmit_failed); } @@ -2022,8 +2015,9 @@ done_bundle: sctp->sctp_rexmitting = B_TRUE; sctp->sctp_rxt_nxttsn = first_ua_tsn; sctp->sctp_rxt_maxtsn = sctp->sctp_ltsn - 1; - sctp_set_iplen(sctp, head); - sctp_add_sendq(sctp, head); + 
sctp_set_iplen(sctp, head, fp->ixa); + (void) conn_ip_output(head, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* * Restart the oldfp timer with exponential backoff and @@ -2305,8 +2299,9 @@ found_msg: */ iph->ipha_fragment_offset_and_flags = 0; } - sctp_set_iplen(sctp, pkt); - sctp_add_sendq(sctp, pkt); + sctp_set_iplen(sctp, pkt, fp->ixa); + (void) conn_ip_output(pkt, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); /* Check and see if there is more chunk to be retransmitted. */ if (tot_wnd <= pkt_len || tot_wnd - pkt_len < fp->sfa_pmss || diff --git a/usr/src/uts/common/inet/sctp/sctp_param.c b/usr/src/uts/common/inet/sctp/sctp_param.c index 5d5ed19676..26365c5a06 100644 --- a/usr/src/uts/common/inet/sctp/sctp_param.c +++ b/usr/src/uts/common/inet/sctp/sctp_param.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/stream.h> #include <sys/socket.h> #include <sys/ddi.h> @@ -72,11 +70,8 @@ /* * sctp_wroff_xtra is the extra space in front of SCTP/IP header for link * layer header. It has to be a multiple of 4. - * Also there has to be enough space to stash in information passed between - * IP and SCTP. */ -sctpparam_t lcl_sctp_wroff_xtra_param = { sizeof (conn_t *) + sizeof (ire_t *), - 256, 32, "sctp_wroff_xtra" }; +sctpparam_t lcl_sctp_wroff_xtra_param = { 0, 256, 32, "sctp_wroff_xtra" }; /* * All of these are alterable, within the min/max values given, at run time. 
@@ -343,7 +338,7 @@ sctp_nd_init(sctp_stack_t *sctps) bcopy(lcl_sctp_param_arr, pa, sizeof (lcl_sctp_param_arr)); sctps->sctps_params = pa; return (sctp_param_register(&sctps->sctps_g_nd, pa, - A_CNT(lcl_sctp_param_arr), sctps)); + A_CNT(lcl_sctp_param_arr), sctps)); } int diff --git a/usr/src/uts/common/inet/sctp/sctp_shutdown.c b/usr/src/uts/common/inet/sctp/sctp_shutdown.c index b58016eb15..ff835a60c0 100644 --- a/usr/src/uts/common/inet/sctp/sctp_shutdown.c +++ b/usr/src/uts/common/inet/sctp/sctp_shutdown.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,6 +35,7 @@ #include <netinet/in.h> #include <netinet/ip6.h> +#include <inet/ipsec_impl.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip6.h> @@ -129,12 +130,12 @@ sctp_send_shutdown(sctp_t *sctp, int rexmit) /* Link the shutdown chunk in after the IP/SCTP header */ - sctp_set_iplen(sctp, sendmp); - BUMP_LOCAL(sctp->sctp_obchunks); /* Send the shutdown and restart the timer */ - sctp_add_sendq(sctp, sendmp); + sctp_set_iplen(sctp, sendmp, fp->ixa); + (void) conn_ip_output(sendmp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); done: sctp->sctp_state = SCTPS_SHUTDOWN_SENT; @@ -211,11 +212,11 @@ sctp_shutdown_received(sctp_t *sctp, sctp_chunk_hdr_t *sch, boolean_t crwsd, } } - sctp_set_iplen(sctp, samp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, samp); + sctp_set_iplen(sctp, samp, fp->ixa); + (void) conn_ip_output(samp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); dotimer: sctp->sctp_state = SCTPS_SHUTDOWN_ACK_SENT; @@ -232,7 +233,7 @@ sctp_shutdown_complete(sctp_t *sctp) sctp_chunk_hdr_t *scch; sctp_stack_t *sctps = sctp->sctp_sctps; - scmp = sctp_make_mp(sctp, NULL, sizeof (*scch)); + scmp = sctp_make_mp(sctp, sctp->sctp_current, sizeof (*scch)); if (scmp == NULL) { /* XXX use timer approach */ SCTP_KSTAT(sctps, sctp_send_shutdown_comp_failed); 
@@ -246,11 +247,11 @@ sctp_shutdown_complete(sctp_t *sctp) scmp->b_wptr += sizeof (*scch); - sctp_set_iplen(sctp, scmp); - BUMP_LOCAL(sctp->sctp_obchunks); - sctp_add_sendq(sctp, scmp); + sctp_set_iplen(sctp, scmp, sctp->sctp_current->ixa); + (void) conn_ip_output(scmp, sctp->sctp_current->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } /* @@ -259,91 +260,99 @@ sctp_shutdown_complete(sctp_t *sctp) * and instead must draw all necessary info from the incoming packet. */ void -sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len) +sctp_ootb_shutdown_ack(mblk_t *mp, uint_t ip_hdr_len, ip_recv_attr_t *ira, + ip_stack_t *ipst) { boolean_t isv4; - ipha_t *inip4h; - ip6_t *inip6h; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; sctp_hdr_t *insctph; sctp_chunk_hdr_t *scch; int i; uint16_t port; mblk_t *mp1; - sctp_stack_t *sctps = gsctp->sctp_sctps; + netstack_t *ns = ipst->ips_netstack; + sctp_stack_t *sctps = ns->netstack_sctp; + ip_xmit_attr_t ixas; - isv4 = (IPH_HDR_VERSION(inmp->b_rptr) == IPV4_VERSION); + bzero(&ixas, sizeof (ixas)); - /* - * The gsctp should contain the minimal IP header. So the - * incoming mblk should be able to hold the new SCTP packet. - */ - ASSERT(MBLKL(inmp) >= sizeof (*insctph) + sizeof (*scch) + - (isv4 ? gsctp->sctp_ip_hdr_len : gsctp->sctp_ip_hdr6_len)); + isv4 = (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + + ASSERT(MBLKL(mp) >= sizeof (*insctph) + sizeof (*scch) + + (isv4 ? sizeof (ipha_t) : sizeof (ip6_t))); /* * Check to see if we can reuse the incoming mblk. There should - * not be other reference and the db_base of the mblk should be - * properly aligned. Since this packet comes from below, + * not be other reference. Since this packet comes from below, * there should be enough header space to fill in what the lower - * layers want to add. And we will not stash anything there. + * layers want to add. 
*/ - if (!IS_P2ALIGNED(DB_BASE(inmp), sizeof (ire_t *)) || - DB_REF(inmp) != 1) { - mp1 = allocb(MBLKL(inmp) + sctps->sctps_wroff_xtra, BPRI_MED); + if (DB_REF(mp) != 1) { + mp1 = allocb(MBLKL(mp) + sctps->sctps_wroff_xtra, BPRI_MED); if (mp1 == NULL) { - freeb(inmp); + freeb(mp); return; } mp1->b_rptr += sctps->sctps_wroff_xtra; - mp1->b_wptr = mp1->b_rptr + MBLKL(inmp); - bcopy(inmp->b_rptr, mp1->b_rptr, MBLKL(inmp)); - freeb(inmp); - inmp = mp1; + mp1->b_wptr = mp1->b_rptr + MBLKL(mp); + bcopy(mp->b_rptr, mp1->b_rptr, MBLKL(mp)); + freeb(mp); + mp = mp1; } else { - ASSERT(DB_CKSUMFLAGS(inmp) == 0); + DB_CKSUMFLAGS(mp) = 0; } + ixas.ixa_pktlen = ip_hdr_len + sizeof (*insctph) + sizeof (*scch); + ixas.ixa_ip_hdr_length = ip_hdr_len; /* * We follow the logic in tcp_xmit_early_reset() in that we skip - * reversing source route (i.e. relpace all IP options with EOL). + * reversing source route (i.e. replace all IP options with EOL). */ if (isv4) { ipaddr_t v4addr; - inip4h = (ipha_t *)inmp->b_rptr; + ipha = (ipha_t *)mp->b_rptr; for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) - inmp->b_rptr[i] = IPOPT_EOL; + mp->b_rptr[i] = IPOPT_EOL; /* Swap addresses */ - inip4h->ipha_length = htons(ip_hdr_len + sizeof (*insctph) + - sizeof (*scch)); - v4addr = inip4h->ipha_src; - inip4h->ipha_src = inip4h->ipha_dst; - inip4h->ipha_dst = v4addr; - inip4h->ipha_ident = 0; - inip4h->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + ipha->ipha_length = htons(ixas.ixa_pktlen); + v4addr = ipha->ipha_src; + ipha->ipha_src = ipha->ipha_dst; + ipha->ipha_dst = v4addr; + ipha->ipha_ident = 0; + ipha->ipha_ttl = (uchar_t)sctps->sctps_ipv4_ttl; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V4; } else { in6_addr_t v6addr; - inip6h = (ip6_t *)inmp->b_rptr; + ip6h = (ip6_t *)mp->b_rptr; /* Remove any extension headers assuming partial overlay */ if (ip_hdr_len > IPV6_HDR_LEN) { uint8_t *to; - to = inmp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; - ovbcopy(inip6h, to, IPV6_HDR_LEN); - inmp->b_rptr += 
ip_hdr_len - IPV6_HDR_LEN; + to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; + ovbcopy(ip6h, to, IPV6_HDR_LEN); + mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; ip_hdr_len = IPV6_HDR_LEN; - inip6h = (ip6_t *)inmp->b_rptr; - inip6h->ip6_nxt = IPPROTO_SCTP; + ip6h = (ip6_t *)mp->b_rptr; + ip6h->ip6_nxt = IPPROTO_SCTP; + } + ip6h->ip6_plen = htons(ixas.ixa_pktlen - IPV6_HDR_LEN); + v6addr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = v6addr; + ip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; + + ixas.ixa_flags = IXAF_BASIC_SIMPLE_V6; + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixas.ixa_flags |= IXAF_SCOPEID_SET; + ixas.ixa_scopeid = ira->ira_ruifindex; } - inip6h->ip6_plen = htons(ip_hdr_len + sizeof (*insctph) + - sizeof (*scch) - IPV6_HDR_LEN); - v6addr = inip6h->ip6_src; - inip6h->ip6_src = inip6h->ip6_dst; - inip6h->ip6_dst = v6addr; - inip6h->ip6_hops = (uchar_t)sctps->sctps_ipv6_hoplimit; } - insctph = (sctp_hdr_t *)(inmp->b_rptr + ip_hdr_len); + + insctph = (sctp_hdr_t *)(mp->b_rptr + ip_hdr_len); /* Swap ports. Verification tag is reused. */ port = insctph->sh_sport; @@ -359,9 +368,29 @@ sctp_ootb_shutdown_ack(sctp_t *gsctp, mblk_t *inmp, uint_t ip_hdr_len) /* Set the T-bit */ SCTP_SET_TBIT(scch); - BUMP_LOCAL(gsctp->sctp_obchunks); - /* Nothing to stash... */ - SCTP_STASH_IPINFO(inmp, (ire_t *)NULL); + ixas.ixa_protocol = IPPROTO_SCTP; + ixas.ixa_zoneid = ira->ira_zoneid; + ixas.ixa_ipst = ipst; + ixas.ixa_ifindex = 0; + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that was out of the blue. + */ + if (!ipsec_in_to_out(ira, &ixas, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + return; + } + } else { + /* + * This is in clear. The message we are building + * here should go out in clear, independent of our policy. 
+ */ + ixas.ixa_flags |= IXAF_NO_IPSEC; + } - sctp_add_sendq(gsctp, inmp); + (void) ip_output_simple(mp, &ixas); + ixa_cleanup(&ixas); } diff --git a/usr/src/uts/common/inet/sctp/sctp_snmp.c b/usr/src/uts/common/inet/sctp/sctp_snmp.c index f859cd6ba5..f1e7deceae 100644 --- a/usr/src/uts/common/inet/sctp/sctp_snmp.c +++ b/usr/src/uts/common/inet/sctp/sctp_snmp.c @@ -78,9 +78,9 @@ sctp_kstat_update(kstat_t *kp, int rw) * individual set of statistics. */ SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0); - sctp = sctps->sctps_gsctp; sctp_prev = NULL; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned) { @@ -471,8 +471,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps) SET_MIB(sctps->sctps_mib.sctpCurrEstab, 0); idx = 0; - sctp = sctps->sctps_gsctp; mutex_enter(&sctps->sctps_g_lock); + sctp = list_head(&sctps->sctps_g_list); while (sctp != NULL) { mutex_enter(&sctp->sctp_reflock); if (sctp->sctp_condemned) { @@ -541,8 +541,8 @@ sctp_snmp_get_mib2(queue_t *q, mblk_t *mpctl, sctp_stack_t *sctps) sctp->sctp_reassmsgs = 0; sce.sctpAssocId = ntohl(sctp->sctp_lvtag); - sce.sctpAssocLocalPort = ntohs(sctp->sctp_lport); - sce.sctpAssocRemPort = ntohs(sctp->sctp_fport); + sce.sctpAssocLocalPort = ntohs(sctp->sctp_connp->conn_lport); + sce.sctpAssocRemPort = ntohs(sctp->sctp_connp->conn_fport); RUN_SCTP(sctp); if (sctp->sctp_primary != NULL) { @@ -659,11 +659,10 @@ done: needattr = B_TRUE; break; } - if (connp->conn_fully_bound && - connp->conn_effective_cred != NULL) { + if (sctp->sctp_connp->conn_ixa->ixa_tsl != NULL) { ts_label_t *tsl; - tsl = crgetlabel(connp->conn_effective_cred); + tsl = sctp->sctp_connp->conn_ixa->ixa_tsl; mlp.tme_flags |= MIB2_TMEF_IS_LABELED; mlp.tme_doi = label2doi(tsl); mlp.tme_label = *label2bslabel(tsl); diff --git a/usr/src/uts/common/inet/sctp/sctp_stack.h b/usr/src/uts/common/inet/sctp/sctp_stack.h index d467b38a17..e9ad5cf9c7 
100644 --- a/usr/src/uts/common/inet/sctp/sctp_stack.h +++ b/usr/src/uts/common/inet/sctp/sctp_stack.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _INET_SCTP_SCTP_STACK_H #define _INET_SCTP_SCTP_STACK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/netstack.h> #include <sys/taskq.h> @@ -76,17 +74,6 @@ struct sctp_stack { mib2_sctp_t sctps_mib; - /* Protected by sctps_g_q_lock */ - queue_t *sctps_g_q; - uint_t sctps_g_q_ref; /* Number of sctp_t's that use it */ - kmutex_t sctps_g_q_lock; - kcondvar_t sctps_g_q_cv; - kthread_t *sctps_g_q_creator; - struct __ldi_handle *sctps_g_q_lh; - cred_t *sctps_g_q_cr; /* For _inactive close call */ - /* The default sctp_t for responding out of the blue packets. */ - struct sctp_s *sctps_gsctp; - /* Protected by sctps_g_lock */ struct list sctps_g_list; /* SCTP instance data chain */ kmutex_t sctps_g_lock; diff --git a/usr/src/uts/common/inet/sctp/sctp_timer.c b/usr/src/uts/common/inet/sctp/sctp_timer.c index c6fd4a5c71..24b46ad6f0 100644 --- a/usr/src/uts/common/inet/sctp/sctp_timer.c +++ b/usr/src/uts/common/inet/sctp/sctp_timer.c @@ -220,7 +220,6 @@ sctp_timer_fire(sctp_tb_t *sctp_tb) sctp_timer_call(sctp, mp); WAKE_SCTP(sctp); - sctp_process_sendq(sctp); } SCTP_REFRELE(sctp); } @@ -429,7 +428,7 @@ sctp_heartbeat_timer(sctp_t *sctp) for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) { /* * If the peer is unreachable because there is no available - * source address, call sctp_get_ire() to see if it is + * source address, call sctp_get_dest() to see if it is * reachable now. If it is OK, the state will become * unconfirmed. And the following code to handle unconfirmed * address will be executed. If it is still not OK, @@ -438,7 +437,7 @@ sctp_heartbeat_timer(sctp_t *sctp) * is disable, this retry may go on forever. 
*/ if (fp->state == SCTP_FADDRS_UNREACH) { - sctp_get_ire(sctp, fp); + sctp_get_dest(sctp, fp); if (fp->state == SCTP_FADDRS_UNREACH) { if (fp->hb_enabled && ++fp->strikes > fp->max_retr && @@ -642,15 +641,14 @@ rxmit_init: * address list won't be modified (it would have been done * the first time around). */ - mp = sctp_init_mp(sctp); + mp = sctp_init_mp(sctp, fp); if (mp != NULL) { BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans); - sctp_add_sendq(sctp, mp); + (void) conn_ip_output(mp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); } break; - case SCTPS_COOKIE_ECHOED: { - ipha_t *iph; - + case SCTPS_COOKIE_ECHOED: BUMP_LOCAL(sctp->sctp_T1expire); if (sctp->sctp_cookie_mp == NULL) { sctp->sctp_state = SCTPS_COOKIE_WAIT; @@ -659,14 +657,10 @@ rxmit_init: mp = dupmsg(sctp->sctp_cookie_mp); if (mp == NULL) break; - iph = (ipha_t *)mp->b_rptr; - /* Reset the IP ident. */ - if (IPH_HDR_VERSION(iph) == IPV4_VERSION) - iph->ipha_ident = 0; - sctp_add_sendq(sctp, mp); + (void) conn_ip_output(mp, fp->ixa); + BUMP_LOCAL(sctp->sctp_opkts); BUMP_MIB(&sctps->sctps_mib, sctpTimRetrans); break; - } case SCTPS_SHUTDOWN_SENT: BUMP_LOCAL(sctp->sctp_T2expire); sctp_send_shutdown(sctp, 1); diff --git a/usr/src/uts/common/inet/sctp_ip.h b/usr/src/uts/common/inet/sctp_ip.h index 7b20d3fd2b..9e4c2ef7ec 100644 --- a/usr/src/uts/common/inet/sctp_ip.h +++ b/usr/src/uts/common/inet/sctp_ip.h @@ -35,40 +35,24 @@ extern "C" { #define SCTP_COMMON_HDR_LENGTH 12 /* SCTP common header length */ /* SCTP routines for IP to call. 
*/ -extern void ip_fanout_sctp(mblk_t *, ill_t *, ipha_t *, uint32_t, - uint_t, boolean_t, boolean_t, zoneid_t); +extern void ip_fanout_sctp(mblk_t *, ipha_t *, ip6_t *, uint32_t, + ip_recv_attr_t *); extern void sctp_ddi_g_init(void); extern void sctp_ddi_g_destroy(void); extern conn_t *sctp_find_conn(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, sctp_stack_t *); + zoneid_t, iaflags_t, sctp_stack_t *); extern conn_t *sctp_fanout(in6_addr_t *, in6_addr_t *, uint32_t, - zoneid_t, mblk_t *, sctp_stack_t *); + ip_recv_attr_t *, mblk_t *, sctp_stack_t *); -extern void sctp_input(conn_t *, ipha_t *, mblk_t *, mblk_t *, ill_t *, - boolean_t, boolean_t); +extern void sctp_input(conn_t *, ipha_t *, ip6_t *, mblk_t *, ip_recv_attr_t *); extern void sctp_wput(queue_t *, mblk_t *); -extern void sctp_ootb_input(mblk_t *, ill_t *, zoneid_t, boolean_t); +extern void sctp_ootb_input(mblk_t *, ip_recv_attr_t *, ip_stack_t *); extern void sctp_hash_init(sctp_stack_t *); extern void sctp_hash_destroy(sctp_stack_t *); extern uint32_t sctp_cksum(mblk_t *, int); extern mblk_t *sctp_snmp_get_mib2(queue_t *, mblk_t *, sctp_stack_t *); extern void sctp_free(conn_t *); -#define SCTP_STASH_IPINFO(mp, ire) \ -{ \ - unsigned char *stp; \ - stp = DB_BASE((mp)); \ - ASSERT(stp + sizeof (ire_t *) < (mp)->b_rptr); \ - *(ire_t **)stp = (ire); \ -} - -#define SCTP_EXTRACT_IPINFO(mp, ire) \ -{ \ - unsigned char *stp; \ - stp = (mp)->b_datap->db_base; \ - (ire) = *(ire_t **)stp; \ -} - /* * SCTP maintains a list of ILLs/IPIFs, these functions are provided by * SCTP to keep its interface list up to date. @@ -87,16 +71,8 @@ extern void sctp_ill_reindex(ill_t *, uint_t); #define SCTP_IPIF_UPDATE 6 /* IP routines for SCTP to call. */ -extern void ip_fanout_sctp_raw(mblk_t *, ill_t *, ipha_t *, boolean_t, - uint32_t, boolean_t, uint_t, boolean_t, zoneid_t); -extern void sctp_ire_cache_flush(ipif_t *); - -/* - * Private (and possibly temporary) ioctls. 
It is a large number - * to avoid conflict with other ioctls, which are normally smaller - * than 2^16. - */ -#define SCTP_IOC_DEFAULT_Q (('S' << 16) | 1024) +extern void ip_fanout_sctp_raw(mblk_t *, ipha_t *, ip6_t *, uint32_t, + ip_recv_attr_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/sctp_itf.h b/usr/src/uts/common/inet/sctp_itf.h index 9ce69fdaf0..2ae6d3669f 100644 --- a/usr/src/uts/common/inet/sctp_itf.h +++ b/usr/src/uts/common/inet/sctp_itf.h @@ -83,9 +83,9 @@ extern int sctp_bindx(struct sctp_s *conn, const void *addrs, int addrcnt, int flags); extern void sctp_close(struct sctp_s *conn); extern int sctp_connect(struct sctp_s *conn, const struct sockaddr *dst, - socklen_t addrlen); + socklen_t addrlen, cred_t *cr, pid_t pid); extern struct sctp_s *sctp_create(void *newhandle, struct sctp_s *parent, - int family, int flags, struct sock_upcalls_s *su, + int family, int type, int flags, struct sock_upcalls_s *su, sctp_sockbuf_limits_t *sbl, cred_t *cr); extern int sctp_disconnect(struct sctp_s *conn); extern int sctp_get_opt(struct sctp_s *conn, int level, int opt, void *opts, diff --git a/usr/src/uts/common/inet/sockmods/socksctp.c b/usr/src/uts/common/inet/sockmods/socksctp.c index 7da9f92dde..4df7e33501 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.c +++ b/usr/src/uts/common/inet/sockmods/socksctp.c @@ -207,7 +207,7 @@ sosctp_init(struct sonode *so, struct sonode *pso, struct cred *cr, int flags) upcalls = &sosctp_assoc_upcalls; } so->so_proto_handle = (sock_lower_handle_t)sctp_create(so, NULL, - so->so_family, SCTP_CAN_BLOCK, upcalls, &sbl, cr); + so->so_family, so->so_type, SCTP_CAN_BLOCK, upcalls, &sbl, cr); if (so->so_proto_handle == NULL) return (ENOMEM); @@ -350,6 +350,7 @@ sosctp_connect(struct sonode *so, const struct sockaddr *name, socklen_t namelen, int fflag, int flags, struct cred *cr) { int error = 0; + pid_t pid = curproc->p_pid; ASSERT(so->so_type == SOCK_STREAM); @@ -404,7 +405,7 @@ sosctp_connect(struct sonode 
*so, const struct sockaddr *name, mutex_exit(&so->so_lock); error = sctp_connect((struct sctp_s *)so->so_proto_handle, - name, namelen); + name, namelen, cr, pid); mutex_enter(&so->so_lock); if (error == 0) { @@ -662,7 +663,7 @@ done: int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, - struct uio *uiop, int flags, cred_t *cr) + struct uio *uiop, int flags) { ssize_t size; int error; @@ -683,8 +684,7 @@ sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, int wroff, * packets, each mblk will have the extra space before * data to accommodate what SCTP wants to put in there. */ - while ((mp = allocb_cred(size + wroff, cr, - curproc->p_pid)) == NULL) { + while ((mp = allocb(size + wroff, BPRI_MED)) == NULL) { if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) || (flags & MSG_DONTWAIT)) { return (EAGAIN); @@ -887,7 +887,7 @@ sosctp_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, /* Copy in the message. */ if ((error = sosctp_uiomove(mctl, count, ss->ss_wrsize, ss->ss_wroff, - uiop, flags, cr)) != 0) { + uiop, flags)) != 0) { goto error_ret; } error = sctp_sendmsg((struct sctp_s *)so->so_proto_handle, mctl, 0); @@ -1091,7 +1091,7 @@ sosctp_seq_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, /* Copy in the message. 
*/ if ((error = sosctp_uiomove(mctl, count, ssa->ssa_wrsize, - ssa->ssa_wroff, uiop, flags, cr)) != 0) { + ssa->ssa_wroff, uiop, flags)) != 0) { goto lock_rele; } error = sctp_sendmsg((struct sctp_s *)ssa->ssa_conn, mctl, 0); diff --git a/usr/src/uts/common/inet/sockmods/socksctp.h b/usr/src/uts/common/inet/sockmods/socksctp.h index b02622c994..2ac7058821 100644 --- a/usr/src/uts/common/inet/sockmods/socksctp.h +++ b/usr/src/uts/common/inet/sockmods/socksctp.h @@ -116,7 +116,7 @@ extern void sosctp_assoc_isdisconnected(struct sctp_soassoc *ssa, int error); extern int sosctp_waitconnected(struct sonode *so, int fmode); extern int sosctp_uiomove(mblk_t *hdr_mp, ssize_t count, ssize_t blk_size, - int wroff, struct uio *uiop, int flags, cred_t *cr); + int wroff, struct uio *uiop, int flags); /* * Data structure types. diff --git a/usr/src/uts/common/inet/sockmods/socksctpsubr.c b/usr/src/uts/common/inet/sockmods/socksctpsubr.c index 4a4cb08007..a647cbe4f2 100644 --- a/usr/src/uts/common/inet/sockmods/socksctpsubr.c +++ b/usr/src/uts/common/inet/sockmods/socksctpsubr.c @@ -367,6 +367,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, sctp_assoc_t id; int error; struct cmsghdr *cmsg; + pid_t pid = curproc->p_pid; ASSERT(MUTEX_HELD(&so->so_lock)); @@ -407,7 +408,8 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, ssa->ssa_wroff = ss->ss_wroff; ssa->ssa_wrsize = ss->ss_wrsize; ssa->ssa_conn = sctp_create(ssa, (struct sctp_s *)so->so_proto_handle, - so->so_family, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, &sbl, cr); + so->so_family, so->so_type, SCTP_CAN_BLOCK, &sosctp_assoc_upcalls, + &sbl, cr); mutex_enter(&so->so_lock); ss->ss_assocs[id].ssi_assoc = ssa; @@ -435,7 +437,7 @@ sosctp_assoc_createconn(struct sctp_sonode *ss, const struct sockaddr *name, goto ret_err; } - if ((error = sctp_connect(ssa->ssa_conn, name, namelen)) != 0) + if ((error = sctp_connect(ssa->ssa_conn, name, namelen, cr, pid)) != 0) goto 
ret_err; mutex_enter(&so->so_lock); diff --git a/usr/src/uts/common/inet/spdsock.h b/usr/src/uts/common/inet/spdsock.h index 7622e56a45..64c63cdd71 100644 --- a/usr/src/uts/common/inet/spdsock.h +++ b/usr/src/uts/common/inet/spdsock.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -110,7 +110,7 @@ extern uint_t spdsock_max_optsize; extern int spdsock_opt_get(queue_t *, int, int, uchar_t *); extern int spdsock_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); #ifdef __cplusplus } diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index e46293d820..db11ef79ae 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -39,8 +39,8 @@ * parallelization (on a per H/W execution pipeline basis) with at * most one queuing. * - * The modules needing protection typically calls squeue_enter() or - * squeue_enter_chain() routine as soon as a thread enter the module + * The modules needing protection typically calls SQUEUE_ENTER_ONE() or + * SQUEUE_ENTER() macro as soon as a thread enter the module * from either direction. For each packet, the processing function * and argument is stored in the mblk itself. When the packet is ready * to be processed, the squeue retrieves the stored function and calls @@ -406,11 +406,15 @@ squeue_worker_wakeup(squeue_t *sqp) * and drain in the entering thread context. If process_flag is * SQ_FILL, then we just queue the mblk and return (after signaling * the worker thread if no one else is processing the squeue). + * + * The ira argument can be used when the count is one. + * For a chain the caller needs to prepend any needed mblks from + * ip_recv_attr_to_mblk(). 
*/ /* ARGSUSED */ void squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, - int process_flag, uint8_t tag) + ip_recv_attr_t *ira, int process_flag, uint8_t tag) { conn_t *connp; sqproc_t proc; @@ -421,6 +425,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, ASSERT(tail != NULL); ASSERT(cnt > 0); ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); + ASSERT(ira == NULL || cnt == 1); mutex_enter(&sqp->sq_lock); @@ -467,7 +472,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; @@ -475,7 +480,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, - connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } ASSERT(MUTEX_NOT_HELD(&sqp->sq_lock)); mutex_enter(&sqp->sq_lock); @@ -499,6 +504,33 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { + if (ira != NULL) { + mblk_t *attrmp; + + ASSERT(cnt == 1); + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + mutex_exit(&sqp->sq_lock); + ip_drop_input("squeue: " + "ip_recv_attr_to_mblk", + mp, NULL); + /* Caller already set b_prev/b_next */ + mp->b_prev = mp->b_next = NULL; + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + /* Move connp and func to new */ + attrmp->b_queue = mp->b_queue; + mp->b_queue = NULL; + attrmp->b_prev = mp->b_prev; + mp->b_prev = NULL; + + ASSERT(mp == tail); + tail = mp = attrmp; + } + ENQUEUE_CHAIN(sqp, mp, tail, cnt); #ifdef DEBUG mp->b_tag = tag; @@ -564,14 +596,14 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, connp->conn_on_sqp = B_TRUE; 
DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, - connp, SQ_FILL, SQTAG_SQUEUE_CHANGE); + connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } mutex_enter(&sqp->sq_lock); @@ -589,7 +621,31 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif + if (ira != NULL) { + mblk_t *attrmp; + ASSERT(cnt == 1); + attrmp = ip_recv_attr_to_mblk(ira); + if (attrmp == NULL) { + mutex_exit(&sqp->sq_lock); + ip_drop_input("squeue: ip_recv_attr_to_mblk", + mp, NULL); + /* Caller already set b_prev/b_next */ + mp->b_prev = mp->b_next = NULL; + freemsg(mp); + return; + } + ASSERT(attrmp->b_cont == NULL); + attrmp->b_cont = mp; + /* Move connp and func to new */ + attrmp->b_queue = mp->b_queue; + mp->b_queue = NULL; + attrmp->b_prev = mp->b_prev; + mp->b_prev = NULL; + + ASSERT(mp == tail); + tail = mp = attrmp; + } ENQUEUE_CHAIN(sqp, mp, tail, cnt); if (!(sqp->sq_state & SQS_PROC)) { squeue_worker_wakeup(sqp); @@ -653,6 +709,7 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire) hrtime_t now; boolean_t did_wakeup = B_FALSE; boolean_t sq_poll_capable; + ip_recv_attr_t *ira, iras; sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; again: @@ -697,6 +754,31 @@ again: connp = (conn_t *)mp->b_prev; mp->b_prev = NULL; + /* Is there an ip_recv_attr_t to handle? 
*/ + if (ip_recv_attr_is_mblk(mp)) { + mblk_t *attrmp = mp; + + ASSERT(attrmp->b_cont != NULL); + + mp = attrmp->b_cont; + attrmp->b_cont = NULL; + ASSERT(mp->b_queue == NULL); + ASSERT(mp->b_prev == NULL); + + if (!ip_recv_attr_from_mblk(attrmp, &iras)) { + /* The ill or ip_stack_t disappeared on us */ + ip_drop_input("ip_recv_attr_from_mblk", + mp, NULL); + ira_cleanup(&iras, B_TRUE); + CONN_DEC_REF(connp); + continue; + } + ira = &iras; + } else { + ira = NULL; + } + + /* * Handle squeue switching. More details in the * block comment at the top of the file @@ -707,15 +789,17 @@ again: connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); - (*proc)(connp, mp, sqp); + (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); connp->conn_on_sqp = B_FALSE; CONN_DEC_REF(connp); } else { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); } + if (ira != NULL) + ira_cleanup(ira, B_TRUE); } SQUEUE_DBG_CLEAR(sqp); @@ -991,9 +1075,13 @@ poll_again: &tail, &cnt); } mutex_enter(lock); - if (mp != NULL) + if (mp != NULL) { + /* + * The ip_accept function has already added an + * ip_recv_attr_t mblk if that is needed. 
+ */ ENQUEUE_CHAIN(sqp, mp, tail, cnt); - + } ASSERT((sqp->sq_state & (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); @@ -1263,7 +1351,7 @@ squeue_getprivate(squeue_t *sqp, sqprivate_t p) /* ARGSUSED */ void -squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2) +squeue_wakeup_conn(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; squeue_t *sqp = connp->conn_sqp; diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 8442c4f384..321d0756fc 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -36,7 +36,6 @@ extern "C" { #include <netinet/tcp.h> #include <sys/socket.h> #include <sys/socket_proto.h> -#include <sys/multidata.h> #include <sys/md5.h> #include <inet/common.h> #include <inet/ip.h> @@ -47,12 +46,6 @@ extern "C" { #include <inet/tcp_sack.h> #include <inet/kssl/ksslapi.h> -/* - * Private (and possibly temporary) ioctl used by configuration code - * to lock in the "default" stream for detached closes. - */ -#define TCP_IOC_DEFAULT_Q (('T' << 8) + 51) - /* TCP states */ #define TCPS_CLOSED -6 #define TCPS_IDLE -5 /* idle (opened, but not bound) */ @@ -73,7 +66,7 @@ extern "C" { /* * Internal flags used in conjunction with the packet header flags. - * Used in tcp_rput_data to keep track of what needs to be done. + * Used in tcp_input_data to keep track of what needs to be done. 
*/ #define TH_LIMIT_XMIT 0x0400 /* Limited xmit is needed */ #define TH_XMIT_NEEDED 0x0800 /* Window opened - send queued data */ @@ -108,11 +101,12 @@ typedef struct tcphdr_s { uint8_t th_urp[2]; /* Urgent pointer */ } tcph_t; -#define TCP_HDR_LENGTH(tcph) (((tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2)) +#define TCP_HDR_LENGTH(tcph) \ + ((((tcph_t *)tcph)->th_offset_and_rsrvd[0] >>2) &(0xF << 2)) #define TCP_MAX_COMBINED_HEADER_LENGTH (60 + 60) /* Maxed out ip + tcp */ #define TCP_MAX_IP_OPTIONS_LENGTH (60 - IP_SIMPLE_HDR_LENGTH) #define TCP_MAX_HDR_LENGTH 60 -#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcph_t)) +#define TCP_MAX_TCP_OPTIONS_LENGTH (60 - sizeof (tcpha_t)) #define TCP_MIN_HEADER_LENGTH 20 #define TCP_MAXWIN 65535 #define TCP_PORT_LEN sizeof (in_port_t) @@ -122,7 +116,7 @@ typedef struct tcphdr_s { #define TCPIP_HDR_LENGTH(mp, n) \ (n) = IPH_HDR_LENGTH((mp)->b_rptr), \ - (n) += TCP_HDR_LENGTH((tcph_t *)&(mp)->b_rptr[(n)]) + (n) += TCP_HDR_LENGTH((tcpha_t *)&(mp)->b_rptr[(n)]) /* TCP Protocol header (used if the header is known to be 32-bit aligned) */ typedef struct tcphdra_s { @@ -173,9 +167,6 @@ typedef struct tcp_s { uint32_t tcp_rnxt; /* Seq we expect to recv next */ uint32_t tcp_rwnd; - queue_t *tcp_rq; /* Our upstream neighbor (client) */ - queue_t *tcp_wq; /* Our downstream neighbor */ - /* Fields arranged in approximate access order along main paths */ mblk_t *tcp_xmit_head; /* Head of rexmit list */ mblk_t *tcp_xmit_last; /* last valid data seen by tcp_wput */ @@ -207,46 +198,16 @@ typedef struct tcp_s { int64_t tcp_last_recv_time; /* Last time we receive a segment. */ uint32_t tcp_init_cwnd; /* Initial cwnd (start/restart) */ - /* - * Following socket options are set by sockfs outside the squeue - * and we want to separate these bit fields from the other bit fields - * set by TCP to avoid grabbing locks. sockfs ensures that only one - * thread in sockfs can set a socket option at a time on a conn_t. 
- * However TCP may read these options concurrently. The linger option - * needs atomicity since tcp_lingertime also needs to be in sync. - * However TCP uses it only during close, and by then no socket option - * can come down. So we don't need any locks, instead just separating - * the sockfs settable bit fields from the other bit fields is - * sufficient. - */ - uint32_t - tcp_debug : 1, /* SO_DEBUG "socket" option. */ - tcp_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - tcp_broadcast : 1, /* SO_BROADCAST "socket" option. */ - tcp_useloopback : 1, /* SO_USELOOPBACK "socket" option. */ - - tcp_oobinline : 1, /* SO_OOBINLINE "socket" option. */ - tcp_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - tcp_linger : 1, /* SO_LINGER turned on */ - tcp_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - - tcp_junk_to_bit_31 : 24; - /* Following manipulated by TCP under squeue protection */ uint32_t tcp_urp_last_valid : 1, /* Is tcp_urp_last valid? */ - tcp_hard_binding : 1, /* If we've started a full bind */ - tcp_hard_bound : 1, /* If we've done a full bind with IP */ + tcp_hard_binding : 1, /* TCP_DETACHED_NONEAGER */ tcp_fin_acked : 1, /* Has our FIN been acked? */ - tcp_fin_rcvd : 1, /* Have we seen a FIN? */ + tcp_fin_sent : 1, /* Have we sent our FIN yet? */ tcp_ordrel_done : 1, /* Have we sent the ord_rel upstream? 
*/ tcp_detached : 1, /* If we're detached from a stream */ - - tcp_bind_pending : 1, /* Client is waiting for bind ack */ - tcp_unbind_pending : 1, /* Client sent T_UNBIND_REQ */ - tcp_ka_enabled: 1, /* Connection KeepAlive Timer needed */ tcp_zero_win_probe: 1, /* Zero win probing is in progress */ tcp_loopback: 1, /* src and dst are the same machine */ @@ -258,44 +219,40 @@ typedef struct tcp_s { tcp_active_open: 1, /* This is a active open */ tcp_rexmit : 1, /* TCP is retransmitting */ tcp_snd_sack_ok : 1, /* Can use SACK for this connection */ - tcp_empty_flag : 1, /* Empty flag for future use */ - - tcp_recvdstaddr : 1, /* return T_EXTCONN_IND with dst addr */ tcp_hwcksum : 1, /* The NIC is capable of hwcksum */ - tcp_ip_forward_progress : 1, - tcp_anon_priv_bind : 1, + tcp_ip_forward_progress : 1, tcp_ecn_ok : 1, /* Can use ECN for this connection */ tcp_ecn_echo_on : 1, /* Need to do ECN echo */ tcp_ecn_cwr_sent : 1, /* ECN_CWR has been sent */ + tcp_cwr : 1, /* Cwnd has reduced recently */ - tcp_pad_to_bit31 : 4; + tcp_pad_to_bit31 : 11; + /* Following manipulated by TCP under squeue protection */ uint32_t - tcp_mdt : 1, /* Lower layer is capable of MDT */ tcp_snd_ts_ok : 1, tcp_snd_ws_ok : 1, - tcp_exclbind : 1, /* ``exclusive'' binding */ - - tcp_hdr_grown : 1, + tcp_reserved_port : 1, tcp_in_free_list : 1, - tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */ + tcp_snd_zcopy_on : 1, /* xmit zero-copy enabled */ tcp_snd_zcopy_aware : 1, /* client is zero-copy aware */ tcp_xmit_zc_clean : 1, /* the xmit list is free of zc-mblk */ tcp_wait_for_eagers : 1, /* Wait for eagers to disappear */ - tcp_accept_error : 1, /* Error during TLI accept */ + tcp_accept_error : 1, /* Error during TLI accept */ tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */ tcp_cork : 1, /* tcp_cork option */ tcp_tconnind_started : 1, /* conn_ind message is being sent */ + tcp_lso :1, /* Lower layer is capable of LSO */ - tcp_refuse :1, /* Connection needs refusing */ 
tcp_is_wnd_shrnk : 1, /* Window has shrunk */ - tcp_pad_to_bit_31 : 15; - uint32_t tcp_if_mtu; /* Outgoing interface MTU. */ + tcp_pad_to_bit_31 : 18; + + uint32_t tcp_initial_pmtu; /* Initial outgoing Path MTU. */ mblk_t *tcp_reass_head; /* Out of order reassembly list head */ mblk_t *tcp_reass_tail; /* Out of order reassembly list tail */ @@ -340,11 +297,6 @@ typedef struct tcp_s { struct tcp_s *tcp_listener; /* Our listener */ - size_t tcp_xmit_hiwater; /* Send buffer high water mark. */ - size_t tcp_xmit_lowater; /* Send buffer low water mark. */ - size_t tcp_recv_hiwater; /* Recv high water mark */ - size_t tcp_recv_lowater; /* Recv low water mark */ - uint32_t tcp_irs; /* Initial recv seq num */ uint32_t tcp_fss; /* Final/fin send seq num */ uint32_t tcp_urg; /* Urgent data seq num */ @@ -354,8 +306,6 @@ typedef struct tcp_s { clock_t tcp_first_ctimer_threshold; /* 1st threshold while connecting */ clock_t tcp_second_ctimer_threshold; /* 2nd ... while connecting */ - int tcp_lingertime; /* Close linger time (in seconds) */ - uint32_t tcp_urp_last; /* Last urp for which signal sent */ mblk_t *tcp_urp_mp; /* T_EXDATA_IND for urgent byte */ mblk_t *tcp_urp_mark_mp; /* zero-length marked/unmarked msg */ @@ -389,21 +339,14 @@ typedef struct tcp_s { int32_t tcp_client_errno; /* How the client screwed up */ - char *tcp_iphc; /* Buffer holding tcp/ip hdr template */ - int tcp_iphc_len; /* actual allocated buffer size */ - int32_t tcp_hdr_len; /* Byte len of combined TCP/IP hdr */ - ipha_t *tcp_ipha; /* IPv4 header in the buffer */ - ip6_t *tcp_ip6h; /* IPv6 header in the buffer */ - int tcp_ip_hdr_len; /* Byte len of our current IPvx hdr */ - tcph_t *tcp_tcph; /* tcp header within combined hdr */ - int32_t tcp_tcp_hdr_len; /* tcp header len within combined */ - /* Saved peer headers in the case of re-fusion */ - ipha_t tcp_saved_ipha; - ip6_t tcp_saved_ip6h; - tcph_t tcp_saved_tcph; - - uint32_t tcp_sum; /* checksum to compensate for source */ - /* routed packets. 
Host byte order */ + /* + * The header template lives in conn_ht_iphc allocated by tcp_build_hdrs + * We maintain three pointers into conn_ht_iphc. + */ + ipha_t *tcp_ipha; /* IPv4 header in conn_ht_iphc */ + ip6_t *tcp_ip6h; /* IPv6 header in conn_ht_iphc */ + tcpha_t *tcp_tcpha; /* TCP header in conn_ht_iphc */ + uint16_t tcp_last_sent_len; /* Record length for nagle */ uint16_t tcp_dupack_cnt; /* # of consequtive duplicate acks */ @@ -413,75 +356,20 @@ typedef struct tcp_s { t_uscalar_t tcp_acceptor_id; /* ACCEPTOR_id */ int tcp_ipsec_overhead; - /* - * Address family that app wishes returned addrsses to be in. - * Currently taken from address family used in T_BIND_REQ, but - * should really come from family used in original socket() call. - * Value can be AF_INET or AF_INET6. - */ - uint_t tcp_family; - /* - * used for a quick test to determine if any ancillary bits are - * set - */ - uint_t tcp_ipv6_recvancillary; /* Flags */ -#define TCP_IPV6_RECVPKTINFO 0x01 /* IPV6_RECVPKTINFO option */ -#define TCP_IPV6_RECVHOPLIMIT 0x02 /* IPV6_RECVHOPLIMIT option */ -#define TCP_IPV6_RECVHOPOPTS 0x04 /* IPV6_RECVHOPOPTS option */ -#define TCP_IPV6_RECVDSTOPTS 0x08 /* IPV6_RECVDSTOPTS option */ -#define TCP_IPV6_RECVRTHDR 0x10 /* IPV6_RECVRTHDR option */ -#define TCP_IPV6_RECVRTDSTOPTS 0x20 /* IPV6_RECVRTHDRDSTOPTS option */ -#define TCP_IPV6_RECVTCLASS 0x40 /* IPV6_RECVTCLASS option */ -#define TCP_OLD_IPV6_RECVDSTOPTS 0x80 /* old IPV6_RECVDSTOPTS option */ uint_t tcp_recvifindex; /* Last received IPV6_RCVPKTINFO */ uint_t tcp_recvhops; /* Last received IPV6_RECVHOPLIMIT */ uint_t tcp_recvtclass; /* Last received IPV6_RECVTCLASS */ ip6_hbh_t *tcp_hopopts; /* Last received IPV6_RECVHOPOPTS */ ip6_dest_t *tcp_dstopts; /* Last received IPV6_RECVDSTOPTS */ - ip6_dest_t *tcp_rtdstopts; /* Last recvd IPV6_RECVRTHDRDSTOPTS */ + ip6_dest_t *tcp_rthdrdstopts; /* Last recv IPV6_RECVRTHDRDSTOPTS */ ip6_rthdr_t *tcp_rthdr; /* Last received IPV6_RECVRTHDR */ uint_t tcp_hopoptslen; 
uint_t tcp_dstoptslen; - uint_t tcp_rtdstoptslen; + uint_t tcp_rthdrdstoptslen; uint_t tcp_rthdrlen; mblk_t *tcp_timercache; - cred_t *tcp_cred; /* Credentials when this was opened */ - pid_t tcp_cpid; /* Process id when this was opened */ - uint64_t tcp_open_time; /* time when this was opened */ - - - union { - struct { - uchar_t v4_ttl; - /* Dup of tcp_ipha.iph_type_of_service */ - uchar_t v4_tos; /* Dup of tcp_ipha.iph_ttl */ - } v4_hdr_info; - struct { - uint_t v6_vcf; /* Dup of tcp_ip6h.ip6h_vcf */ - uchar_t v6_hops; /* Dup of tcp_ip6h.ip6h_hops */ - } v6_hdr_info; - } tcp_hdr_info; -#define tcp_ttl tcp_hdr_info.v4_hdr_info.v4_ttl -#define tcp_tos tcp_hdr_info.v4_hdr_info.v4_tos -#define tcp_ip6_vcf tcp_hdr_info.v6_hdr_info.v6_vcf -#define tcp_ip6_hops tcp_hdr_info.v6_hdr_info.v6_hops - - ushort_t tcp_ipversion; - uint_t tcp_bound_if; /* IPV6_BOUND_IF */ - -#define tcp_lport tcp_connp->conn_lport -#define tcp_fport tcp_connp->conn_fport -#define tcp_ports tcp_connp->conn_ports - -#define tcp_remote tcp_connp->conn_rem -#define tcp_ip_src tcp_connp->conn_src - -#define tcp_remote_v6 tcp_connp->conn_remv6 -#define tcp_ip_src_v6 tcp_connp->conn_srcv6 -#define tcp_bound_source_v6 tcp_connp->conn_bound_source_v6 -#define tcp_bound_source tcp_connp->conn_bound_source kmutex_t tcp_closelock; kcondvar_t tcp_closecv; @@ -497,36 +385,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; - boolean_t tcp_ire_ill_check_done; - uint_t tcp_maxpsz; - - /* - * used for Multidata Transmit - */ - uint_t tcp_mdt_hdr_head; /* leading header fragment extra space */ - uint_t tcp_mdt_hdr_tail; /* trailing header fragment extra space */ - int tcp_mdt_max_pld; /* maximum payload buffers per Multidata */ + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ uint32_t tcp_ofo_fin_seq; /* Recv out of order FIN seq num */ uint32_t tcp_cwr_snd_max; - uint_t tcp_drop_opt_ack_cnt; /* # tcp 
generated optmgmt */ - ip6_pkt_t tcp_sticky_ipp; /* Sticky options */ -#define tcp_ipp_fields tcp_sticky_ipp.ipp_fields /* valid fields */ -#define tcp_ipp_ifindex tcp_sticky_ipp.ipp_ifindex /* pktinfo ifindex */ -#define tcp_ipp_addr tcp_sticky_ipp.ipp_addr /* pktinfo src/dst addr */ -#define tcp_ipp_hoplimit tcp_sticky_ipp.ipp_hoplimit -#define tcp_ipp_hopoptslen tcp_sticky_ipp.ipp_hopoptslen -#define tcp_ipp_rtdstoptslen tcp_sticky_ipp.ipp_rtdstoptslen -#define tcp_ipp_rthdrlen tcp_sticky_ipp.ipp_rthdrlen -#define tcp_ipp_dstoptslen tcp_sticky_ipp.ipp_dstoptslen -#define tcp_ipp_hopopts tcp_sticky_ipp.ipp_hopopts -#define tcp_ipp_rtdstopts tcp_sticky_ipp.ipp_rtdstopts -#define tcp_ipp_rthdr tcp_sticky_ipp.ipp_rthdr -#define tcp_ipp_dstopts tcp_sticky_ipp.ipp_dstopts -#define tcp_ipp_nexthop tcp_sticky_ipp.ipp_nexthop -#define tcp_ipp_use_min_mtu tcp_sticky_ipp.ipp_use_min_mtu + struct tcp_s *tcp_saved_listener; /* saved value of listener */ uint32_t tcp_in_ack_unsent; /* ACK for unsent data cnt. */ @@ -562,7 +427,6 @@ typedef struct tcp_s { boolean_t tcp_kssl_inhandshake; /* during SSL handshake */ kssl_ent_t tcp_kssl_ent; /* SSL table entry */ kssl_ctx_t tcp_kssl_ctx; /* SSL session */ - uint_t tcp_label_len; /* length of cached label */ /* * tcp_closemp_used is protected by listener's tcp_eager_lock @@ -620,47 +484,17 @@ typedef struct tcp_s { #define TCP_DEBUG_GETPCSTACK(buffer, depth) #endif -/* - * Track a reference count on the tcps in order to know when - * the tcps_g_q can be removed. As long as there is any - * tcp_t, other that the tcps_g_q itself, in the tcp_stack_t we - * need to keep tcps_g_q around so that a closing connection can - * switch to using tcps_g_q as part of it closing. - */ -#define TCPS_REFHOLD(tcps) { \ - atomic_add_32(&(tcps)->tcps_refcnt, 1); \ - ASSERT((tcps)->tcps_refcnt != 0); \ - DTRACE_PROBE1(tcps__refhold, tcp_stack_t, tcps); \ -} - -/* - * Decrement the reference count on the tcp_stack_t. 
- * In architectures e.g sun4u, where atomic_add_32_nv is just - * a cas, we need to maintain the right memory barrier semantics - * as that of mutex_exit i.e all the loads and stores should complete - * before the cas is executed. membar_exit() does that here. - */ -#define TCPS_REFRELE(tcps) { \ - ASSERT((tcps)->tcps_refcnt != 0); \ - membar_exit(); \ - DTRACE_PROBE1(tcps__refrele, tcp_stack_t, tcps); \ - if (atomic_add_32_nv(&(tcps)->tcps_refcnt, -1) == 0 && \ - (tcps)->tcps_g_q != NULL) { \ - /* Only tcps_g_q left */ \ - tcp_g_q_inactive(tcps); \ - } \ -} - extern void tcp_free(tcp_t *tcp); extern void tcp_ddi_g_init(void); extern void tcp_ddi_g_destroy(void); -extern void tcp_g_q_inactive(tcp_stack_t *); -extern void tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, - zoneid_t zoneid, tcp_stack_t *, conn_t *connp); -extern void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); -extern void tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2); -extern void tcp_input(void *arg, mblk_t *mp, void *arg2); -extern void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); +extern void tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *, + ip_stack_t *, conn_t *); +extern void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +extern void tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); +extern void tcp_input_data(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *); extern void *tcp_get_conn(void *arg, tcp_stack_t *); extern void tcp_time_wait_collector(void *arg); extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *); @@ -668,7 +502,6 @@ extern int tcp_snmp_set(queue_t *, int, int, uchar_t *, int len); extern mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); -extern void tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2); /* * The TCP Fanout structure. 
@@ -706,6 +539,15 @@ typedef struct cl_tcp_info_s { } cl_tcp_info_t; /* + * Hook functions to enable cluster networking + * On non-clustered systems these vectors must always be NULL. + */ +extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, void *); +extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, void *); + +/* * Contracted Consolidation Private ioctl for aborting TCP connections. * In order to keep the offsets and size of the structure the same between * a 32-bit application and a 64-bit amd64 kernel, we use a #pragma @@ -729,25 +571,6 @@ typedef struct tcp_ioc_abort_conn_s { #pragma pack() #endif -#if (defined(_KERNEL) || defined(_KMEMUSER)) -extern void tcp_rput_other(tcp_t *tcp, mblk_t *mp); -#endif - -#if (defined(_KERNEL)) -#define TCP_XRE_EVENT_IP_FANOUT_TCP 1 - -/* - * This is a private structure used to pass data to an squeue function during - * tcp's listener reset sending path. - */ -typedef struct tcp_xmit_reset_event { - int tcp_xre_event; - int tcp_xre_iphdrlen; - zoneid_t tcp_xre_zoneid; - tcp_stack_t *tcp_xre_tcps; -} tcp_xmit_reset_event_t; -#endif - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index c9a941eab2..0e1ef43cfb 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -46,8 +46,6 @@ #include <sys/ethernet.h> #include <sys/cpuvar.h> #include <sys/dlpi.h> -#include <sys/multidata.h> -#include <sys/multidata_impl.h> #include <sys/pattr.h> #include <sys/policy.h> #include <sys/priv.h> @@ -87,7 +85,6 @@ #include <inet/tcp_impl.h> #include <inet/udp_impl.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/ipdrop.h> #include <inet/ipclassifier.h> @@ -95,6 +92,7 @@ #include <inet/ip_ftable.h> #include <inet/ip_if.h> #include <inet/ipp_common.h> +#include <inet/ip_rts.h> #include <inet/ip_netinfo.h> #include <sys/squeue_impl.h> #include 
<sys/squeue.h> @@ -111,7 +109,7 @@ * * The entire tcp state is contained in tcp_t and conn_t structure * which are allocated in tandem using ipcl_conn_create() and passing - * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect + * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect * the references on the tcp_t. The tcp_t structure is never compressed * and packets always land on the correct TCP perimeter from the time * eager is created till the time tcp_t dies (as such the old mentat @@ -172,8 +170,8 @@ * * This is a more interesting case because of various races involved in * establishing a eager in its own perimeter. Read the meta comment on - * top of tcp_conn_request(). But briefly, the squeue is picked by - * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU. + * top of tcp_input_listener(). But briefly, the squeue is picked by + * ip_fanout based on the ring or the sender (if loopback). * * Closing a connection: * @@ -198,20 +196,13 @@ * * Special provisions and fast paths: * - * We make special provision for (AF_INET, SOCK_STREAM) sockets which - * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP - * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles - * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY - * check to send packets directly to tcp_rput_data via squeue. Everyone - * else comes through tcp_input() on the read side. - * - * We also make special provisions for sockfs by marking tcp_issocket + * We make special provisions for sockfs by marking tcp_issocket * whenever we have only sockfs on top of TCP. This allows us to skip * putting the tcp in acceptor hash since a sockfs listener can never * become acceptor and also avoid allocating a tcp_t for acceptor STREAM * since eager has already been allocated and the accept now happens * on acceptor STREAM. There is a big blob of comment on top of - * tcp_conn_request explaining the new accept. 
When socket is POP'd, + * tcp_input_listener explaining the new accept. When socket is POP'd, * sockfs sends us an ioctl to mark the fact and we go back to old * behaviour. Once tcp_issocket is unset, its never set for the * life of that connection. @@ -224,13 +215,6 @@ * only exception is tcp_xmit_listeners_reset() which is called * directly from IP and needs to policy check to see if TH_RST * can be sent out. - * - * PFHooks notes : - * - * For mdt case, one meta buffer contains multiple packets. Mblks for every - * packet are assembled and passed to the hooks. When packets are blocked, - * or boundary of any packet is changed, the mdt processing is stopped, and - * packets of the meta buffer are send to the IP path one by one. */ /* @@ -244,7 +228,7 @@ int tcp_squeue_flag; /* * This controls how tiny a write must be before we try to copy it - * into the the mblk on the tail of the transmit queue. Not much + * into the mblk on the tail of the transmit queue. Not much * speedup is observed for values larger than sixteen. Zero will * disable the optimisation. */ @@ -333,16 +317,6 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; tcp_g_stat_t tcp_g_statistics; kstat_t *tcp_g_kstat; -/* - * Call either ip_output or ip_output_v6. This replaces putnext() calls on the - * tcp write side. - */ -#define CALL_IP_WPUT(connp, q, mp) { \ - ASSERT(((q)->q_flag & QREADR) == 0); \ - TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ - connp->conn_send(connp, (mp), (q), IP_WPUT); \ -} - /* Macros for timestamp comparisons */ #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) @@ -354,7 +328,7 @@ kstat_t *tcp_g_kstat; * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); * a per-connection component which grows by 125000 for every new connection; * and an "extra" component that grows by a random amount centered - * approximately on 64000. 
This causes the the ISS generator to cycle every + * approximately on 64000. This causes the ISS generator to cycle every * 4.89 hours if no TCP connections are made, and faster if connections are * made. * @@ -381,8 +355,13 @@ static sin6_t sin6_null; /* Zero address for quick clears */ */ #define TCP_OLD_URP_INTERPRETATION 1 +/* + * Since tcp_listener is not cleared atomically with tcp_detached + * being cleared we need this extra bit to tell a detached connection + * apart from one that is in the process of being accepted. + */ #define TCP_IS_DETACHED_NONEAGER(tcp) \ - (TCP_IS_DETACHED(tcp) && \ + (TCP_IS_DETACHED(tcp) && \ (!(tcp)->tcp_hard_binding)) /* @@ -495,7 +474,6 @@ typedef struct tcp_timer_s { static kmem_cache_t *tcp_timercache; kmem_cache_t *tcp_sack_info_cache; -kmem_cache_t *tcp_iphc_cache; /* * For scalability, we must not run a timer for every TCP connection @@ -592,17 +570,6 @@ typedef struct tcp_opt_s { } tcp_opt_t; /* - * TCP option struct passing information b/w lisenter and eager. - */ -struct tcp_options { - uint_t to_flags; - ssize_t to_boundif; /* IPV6_BOUND_IF */ -}; - -#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ -#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ - -/* * RFC1323-recommended phrasing of TSTAMP option, for easier parsing */ @@ -673,43 +640,53 @@ typedef struct tcpt_s { /* * Functions called directly via squeue having a prototype of edesc_t. 
*/ -void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); -void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); -void tcp_input(void *arg, mblk_t *mp, void *arg2); -void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); -static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); -static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); -static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); -static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); +void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_input_data(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_close_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); /* Prototype for TCP functions */ static void tcp_random_init(void); int tcp_random(void); static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); 
-static int tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager); -static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); +static int tcp_set_destination(tcp_t *tcp); static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, boolean_t user_specified); static void tcp_closei_local(tcp_t *tcp); static void tcp_close_detached(tcp_t *tcp); -static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, - mblk_t *idmp, mblk_t **defermp); +static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, + mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira); static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, - in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); -static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, - in_port_t dstport, uint32_t flowinfo, uint_t srcid, - uint32_t scope_id, cred_t *cr, pid_t pid); + in_port_t dstport, uint_t srcid); +static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, + in_port_t dstport, uint32_t flowinfo, + uint_t srcid, uint32_t scope_id); static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); -static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); static char *tcp_display(tcp_t *tcp, char *, char); static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); @@ -735,34 +712,16 @@ static void tcp_acceptor_hash_remove(tcp_t *tcp); static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); static void tcp_info_req(tcp_t *tcp, mblk_t *mp); static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); -static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); -void tcp_g_q_setup(tcp_stack_t *); -void tcp_g_q_create(tcp_stack_t *); -void tcp_g_q_destroy(tcp_stack_t *); -static int tcp_header_init_ipv4(tcp_t *tcp); -static int 
tcp_header_init_ipv6(tcp_t *tcp); -int tcp_init(tcp_t *tcp, queue_t *q); -static int tcp_init_values(tcp_t *tcp); -static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); -static void tcp_ip_ire_mark_advice(tcp_t *tcp); +static void tcp_init_values(tcp_t *tcp); static void tcp_ip_notify(tcp_t *tcp); -static mblk_t *tcp_ire_mp(mblk_t **mpp); static void tcp_iss_init(tcp_t *tcp); static void tcp_keepalive_killer(void *arg); -static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); -static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); +static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt); +static void tcp_mss_set(tcp_t *tcp, uint32_t size); static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, int *t_errorp, int *sys_errorp); static boolean_t tcp_allow_connopt_set(int level, int name); int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, - mblk_t *mblk); -static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); -static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, - uchar_t *ptr, uint_t len); static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *); @@ -785,9 +744,9 @@ static uint_t tcp_rcv_drain(tcp_t *tcp); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_ss_rexmit(tcp_t *tcp); -static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); -static void tcp_process_options(tcp_t *, tcph_t *); -static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); +static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + 
ip_recv_attr_t *); +static void tcp_process_options(tcp_t *, tcpha_t *); static void tcp_rsrv(queue_t *q); static int tcp_snmp_state(tcp_t *tcp); static void tcp_timer(void *arg); @@ -801,16 +760,10 @@ void tcp_tpi_accept(queue_t *q, mblk_t *mp); static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); -static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, +static int tcp_send(tcp_t *tcp, const int mss, + const int total_hdr_len, const int tcp_hdr_len, const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); -static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, - const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); + int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time); static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk); static void tcp_wsrv(queue_t *q); @@ -818,38 +771,36 @@ static int tcp_xmit_end(tcp_t *tcp); static void tcp_ack_timer(void *arg); static mblk_t *tcp_ack_mp(tcp_t *tcp); static void tcp_xmit_early_reset(char *str, mblk_t *mp, - uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, - zoneid_t zoneid, tcp_stack_t *, conn_t *connp); + uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *, + ip_stack_t *, conn_t *); static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl); -static int setmaxps(queue_t *q, int maxpsz); static void tcp_set_rto(tcp_t *, time_t); -static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, - boolean_t, boolean_t); -static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, - boolean_t ipsec_mctl); +static void tcp_icmp_input(void *, mblk_t *, void *, 
ip_recv_attr_t *); +static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); +static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); static int tcp_build_hdrs(tcp_t *); static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, - uint32_t seg_seq, uint32_t seg_ack, int seg_len, - tcph_t *tcph); -boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); -static mblk_t *tcp_mdt_info_mp(mblk_t *); -static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); -static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, - const boolean_t, const uint32_t, const uint32_t, - const uint32_t, const uint32_t, tcp_stack_t *); -static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, - const uint_t, const uint_t, boolean_t *); -static mblk_t *tcp_lso_info_mp(mblk_t *); -static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); -static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); + uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, + ip_recv_attr_t *ira); +boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp); +static boolean_t tcp_zcopy_check(tcp_t *); +static void tcp_zcopy_notify(tcp_t *); +static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); +static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); +static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only); +static void tcp_update_zcopy(tcp_t *tcp); +static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); +static void tcp_rexmit_after_error(tcp_t *tcp); +static void tcp_send_data(tcp_t *, mblk_t *); extern mblk_t *tcp_timermp_alloc(int); extern void tcp_timermp_free(tcp_t *); static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); static void tcp_stop_lingering(tcp_t *tcp); static void tcp_close_linger_timeout(void *arg); static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); -static void tcp_stack_shutdown(netstackid_t stackid, void 
*arg); static void tcp_stack_fini(netstackid_t stackid, void *arg); static void *tcp_g_kstat_init(tcp_g_stat_t *); static void tcp_g_kstat_fini(kstat_t *); @@ -858,11 +809,10 @@ static void tcp_kstat_fini(netstackid_t, kstat_t *); static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); static void tcp_kstat2_fini(netstackid_t, kstat_t *); static int tcp_kstat_update(kstat_t *kp, int rw); -void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); -static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp); -static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp); +static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); +static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); static int tcp_squeue_switch(int); static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); @@ -872,21 +822,17 @@ static int tcp_tpi_close(queue_t *, int); static int tcp_tpi_close_accept(queue_t *); static void tcp_squeue_add(squeue_t *); -static boolean_t tcp_zcopy_check(tcp_t *); -static void tcp_zcopy_notify(tcp_t *); -static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); -static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); -static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); +static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); -extern void tcp_kssl_input(tcp_t *, mblk_t *); +extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); -void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); -void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); +void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); +void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, sock_upper_handle_t, cred_t *); static int 
tcp_listen(sock_lower_handle_t, int, cred_t *); -static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t); static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *, boolean_t); static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, @@ -922,7 +868,8 @@ static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); */ static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); -static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); +static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, @@ -988,12 +935,6 @@ struct streamtab tcpinfov6 = { sock_downcalls_t sock_tcp_downcalls; -/* - * Have to ensure that tcp_g_q_close is not done by an - * interrupt thread. - */ -static taskq_t *tcp_taskq; - /* Setable only in /etc/system. Move to ndd? 
*/ boolean_t tcp_icmp_source_quench = B_FALSE; @@ -1042,8 +983,8 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { #define PARAM_MAX (~(uint32_t)0) /* Max size IP datagram is 64k - 1 */ -#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) -#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) +#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t))) +#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t))) /* Max of the above */ #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 @@ -1128,29 +1069,10 @@ static tcpparam_t lcl_tcp_param_arr[] = { { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, + { 0, 1, 0, "tcp_dev_flow_ctl"}, }; /* END CSTYLED */ -/* - * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of - * each header fragment in the header buffer. Each parameter value has - * to be a multiple of 4 (32-bit aligned). - */ -static tcpparam_t lcl_tcp_mdt_head_param = - { 32, 256, 32, "tcp_mdt_hdr_head_min" }; -static tcpparam_t lcl_tcp_mdt_tail_param = - { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; -#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val -#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val - -/* - * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out - * the maximum number of payload buffers associated per Multidata. - */ -static tcpparam_t lcl_tcp_mdt_max_pbufs_param = - { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; -#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val - /* Round up the value to the nearest mss. */ #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) @@ -1162,7 +1084,7 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = * point ECT(0) for TCP as described in RFC 2481. 
*/ #define SET_ECT(tcp, iph) \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ + if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ /* We need to clear the code point first. */ \ ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ @@ -1183,23 +1105,12 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = #define IS_VMLOANED_MBLK(mp) \ (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) - -/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ -boolean_t tcp_mdt_chain = B_TRUE; - -/* - * MDT threshold in the form of effective send MSS multiplier; we take - * the MDT path if the amount of unsent data exceeds the threshold value - * (default threshold is 1*SMSS). - */ -uint_t tcp_mdt_smss_threshold = 1; - uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ /* * Forces all connections to obey the value of the tcps_maxpsz_multiplier * tunable settable via NDD. Otherwise, the per-connection behavior is - * determined dynamically during tcp_adapt_ire(), which is the default. + * determined dynamically during tcp_set_destination(), which is the default. 
*/ boolean_t tcp_static_maxpsz = B_FALSE; @@ -1273,84 +1184,73 @@ int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; - void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, sa_family_t addr_family, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; -/* - * The following are defined in ip.c - */ -extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - void *args); -extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - uint8_t *faddrp, void *args); - /* * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) */ -#define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \ +#define CL_INET_CONNECT(connp, is_outgoing, err) { \ (err) = 0; \ if (cl_inet_connect2 != NULL) { \ /* \ * Running in cluster mode - register active connection \ * information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ipha->ipha_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip6h->ip6_src)) { \ + &(connp)->conn_laddr_v6)) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - 
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ } -#define CL_INET_DISCONNECT(connp, tcp) { \ +#define CL_INET_DISCONNECT(connp) { \ if (cl_inet_disconnect != NULL) { \ /* \ * Running in cluster mode - deregister active \ * connection information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ip_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ip_src)), \ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip_src_v6)) { \ + &(connp)->conn_laddr_v6)) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ @@ -1367,11 +1267,6 @@ int cl_tcp_walk_list(netstackid_t stack_id, static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, tcp_stack_t *tcps); -#define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \ - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \ - iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ - ip6_t *, ip6h, int, 0); - static void 
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) { @@ -1540,7 +1435,7 @@ tcp_time_wait_append(tcp_t *tcp) /* ARGSUSED */ void -tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) +tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -1551,11 +1446,11 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) return; } - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); ASSERT(!tcp->tcp_listener); TCP_STAT(tcps, tcp_time_wait_reap); @@ -1579,10 +1474,17 @@ tcp_ipsec_cleanup(tcp_t *tcp) ASSERT(connp->conn_flags & IPCL_TCPCONN); if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, - connp->conn_netstack); + IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } if (connp->conn_policy != NULL) { IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); connp->conn_policy = NULL; @@ -1598,9 +1500,6 @@ void tcp_cleanup(tcp_t *tcp) { mblk_t *mp; - char *tcp_iphc; - int tcp_iphc_len; - int tcp_hdr_grown; tcp_sack_info_t *tcp_sack_info; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; @@ -1611,6 +1510,22 @@ tcp_cleanup(tcp_t *tcp) /* Cleanup that which needs the netstack first */ tcp_ipsec_cleanup(tcp); + ixa_cleanup(connp->conn_ixa); + + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, 
connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; + } + + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); tcp_free(tcp); @@ -1626,8 +1541,6 @@ tcp_cleanup(tcp_t *tcp) } tcp->tcp_kssl_pending = B_FALSE; - conn_delete_ire(connp, NULL); - /* * Since we will bzero the entire structure, we need to * remove it and reinsert it in global hash list. We @@ -1639,46 +1552,36 @@ tcp_cleanup(tcp_t *tcp) */ ipcl_globalhash_remove(connp); - /* - * Now it is safe to decrement the reference counts. - * This might be the last reference on the netstack and TCPS - * in which case it will cause the tcp_g_q_close and - * the freeing of the IP Instance. - */ - connp->conn_netstack = NULL; - netstack_rele(ns); - ASSERT(tcps != NULL); - tcp->tcp_tcps = NULL; - TCPS_REFRELE(tcps); - /* Save some state */ mp = tcp->tcp_timercache; tcp_sack_info = tcp->tcp_sack_info; - tcp_iphc = tcp->tcp_iphc; - tcp_iphc_len = tcp->tcp_iphc_len; - tcp_hdr_grown = tcp->tcp_hdr_grown; tcp_rsrv_mp = tcp->tcp_rsrv_mp; if (connp->conn_cred != NULL) { crfree(connp->conn_cred); connp->conn_cred = NULL; } - if (connp->conn_effective_cred != NULL) { - crfree(connp->conn_effective_cred); - connp->conn_effective_cred = NULL; - } ipcl_conn_cleanup(connp); connp->conn_flags = IPCL_TCPCONN; + + /* + * Now it is safe to decrement the reference counts. + * This might be the last reference on the netstack + * in which case it will cause the freeing of the IP Instance. 
+ */ + connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; + netstack_rele(ns); + ASSERT(tcps != NULL); + tcp->tcp_tcps = NULL; + bzero(tcp, sizeof (tcp_t)); /* restore the state */ tcp->tcp_timercache = mp; tcp->tcp_sack_info = tcp_sack_info; - tcp->tcp_iphc = tcp_iphc; - tcp->tcp_iphc_len = tcp_iphc_len; - tcp->tcp_hdr_grown = tcp_hdr_grown; tcp->tcp_rsrv_mp = tcp_rsrv_mp; tcp->tcp_connp = connp; @@ -1686,7 +1589,7 @@ tcp_cleanup(tcp_t *tcp) ASSERT(connp->conn_tcp == tcp); ASSERT(connp->conn_flags & IPCL_TCPCONN); connp->conn_state_flags = CONN_INCIPIENT; - ASSERT(connp->conn_ulp == IPPROTO_TCP); + ASSERT(connp->conn_proto == IPPROTO_TCP); ASSERT(connp->conn_ref == 1); } @@ -1777,11 +1680,7 @@ tcp_time_wait_collector(void *arg) /* * Set the CONDEMNED flag now itself so that * the refcnt cannot increase due to any - * walker. But we have still not cleaned up - * conn_ire_cache. This is still ok since - * we are going to clean it up in tcp_cleanup - * immediately and any interface unplumb - * thread will wait till the ire is blown away + * walker. 
*/ connp->conn_state_flags |= CONN_CONDEMNED; mutex_exit(lock); @@ -1809,7 +1708,7 @@ tcp_time_wait_collector(void *arg) mutex_exit( &tcp_time_wait->tcp_time_wait_lock); tcp_bind_hash_remove(tcp); - conn_delete_ire(tcp->tcp_connp, NULL); + ixa_cleanup(tcp->tcp_connp->conn_ixa); tcp_ipsec_cleanup(tcp); CONN_DEC_REF(tcp->tcp_connp); } @@ -1839,7 +1738,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } } else { @@ -1867,7 +1766,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } mutex_enter(&tcp_time_wait->tcp_time_wait_lock); @@ -1886,24 +1785,23 @@ tcp_time_wait_collector(void *arg) /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES - * on the acceptor STREAM and processed in tcp_wput_accept(). - * Read the block comment on top of tcp_conn_request(). + * on the acceptor STREAM and processed in tcp_accept_common(). + * Read the block comment on top of tcp_input_listener(). */ static void tcp_tli_accept(tcp_t *listener, mblk_t *mp) { - tcp_t *acceptor; - tcp_t *eager; - tcp_t *tcp; + tcp_t *acceptor; + tcp_t *eager; + tcp_t *tcp; struct T_conn_res *tcr; t_uscalar_t acceptor_id; t_scalar_t seqnum; - mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ - struct tcp_options *tcpopt; - mblk_t *ok_mp; - mblk_t *mp1; + mblk_t *discon_mp = NULL; + mblk_t *ok_mp; + mblk_t *mp1; tcp_stack_t *tcps = listener->tcp_tcps; - int error; + conn_t *econnp; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { tcp_err_ack(listener, mp, TPROTO, 0); @@ -1922,8 +1820,8 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * fanout hash lock is held. 
* This prevents any thread from entering the acceptor queue from * below (since it has not been hard bound yet i.e. any inbound - * packets will arrive on the listener or default tcp queue and - * go through tcp_lookup). + * packets will arrive on the listener conn_t and + * go through the classifier). * The CONN_INC_REF will prevent the acceptor from closing. * * XXX It is still possible for a tli application to send down data @@ -1974,7 +1872,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) } else { acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); if (acceptor == NULL) { - if (listener->tcp_debug) { + if (listener->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_accept: did not find acceptor 0x%x\n", @@ -2013,7 +1911,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * Rendezvous with an eager connection request packet hanging off * 'tcp' that has the 'seqnum' tag. We tagged the detached open * tcp structure when the connection packet arrived in - * tcp_conn_request(). + * tcp_input_listener(). */ seqnum = tcr->SEQ_number; eager = listener; @@ -2047,37 +1945,26 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ ASSERT(eager->tcp_connp->conn_ref >= 1); - /* Pre allocate the stroptions mblk also */ - opt_mp = allocb(MAX(sizeof (struct tcp_options), - sizeof (struct T_conn_res)), BPRI_HI); - if (opt_mp == NULL) { + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); + if (discon_mp == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } - DB_TYPE(opt_mp) = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } + econnp = eager->tcp_connp; - /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ + /* Hold a copy of mp, in case reallocb fails */ if ((mp1 = copymsg(mp)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } @@ -2093,7 +1980,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) { int extra; - extra = (eager->tcp_family == AF_INET) ? + extra = (econnp->conn_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t); /* @@ -2104,7 +1991,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); /* Original mp has been freed by now, so use mp1 */ tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); return; @@ -2114,38 +2001,32 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) switch (extra) { case sizeof (sin_t): { - sin_t *sin = (sin_t *)ok_mp->b_wptr; + sin_t *sin = (sin_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = - eager->tcp_ipha->ipha_src; - break; - } + ok_mp->b_wptr += extra; + sin->sin_family = AF_INET; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; + break; + } case sizeof (sin6_t): { - sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; + sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { - sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - 
ASSERT(eager->tcp_ip6h != NULL); - sin6->sin6_flowinfo = - eager->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = - eager->tcp_ip6h->ip6_src; - } + ok_mp->b_wptr += extra; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + sin6->sin6_flowinfo = econnp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; - break; } + sin6->__sin6_src_id = 0; + break; + } default: break; } @@ -2158,15 +2039,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * the tcp_accept_swap is done since it would be dangerous to * let the application start using the new fd prior to the swap. */ - error = tcp_accept_swap(listener, acceptor, eager); - if (error != 0) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - freemsg(ok_mp); - /* Original mp has been freed by now, so use mp1 */ - tcp_err_ack(listener, mp1, TSYSERR, error); - return; - } + tcp_accept_swap(listener, acceptor, eager); /* * tcp_accept_swap unlinks eager from listener but does not drop @@ -2244,7 +2117,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) /* We no longer need mp1, since all options processing has passed */ freemsg(mp1); - putnext(listener->tcp_rq, ok_mp); + putnext(listener->tcp_connp->conn_rq, ok_mp); mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -2305,7 +2178,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) listener->tcp_eager_last_q = tcp; tcp->tcp_eager_next_q = NULL; mutex_exit(&listener->tcp_eager_lock); - putnext(tcp->tcp_rq, conn_ind); + putnext(tcp->tcp_connp->conn_rq, conn_ind); } else { mutex_exit(&listener->tcp_eager_lock); } @@ -2318,26 +2191,20 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ finish: ASSERT(acceptor->tcp_detached); - ASSERT(tcps->tcps_g_q != 
NULL); + acceptor->tcp_connp->conn_rq = NULL; ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); - acceptor->tcp_rq = tcps->tcps_g_q; - acceptor->tcp_wq = WR(tcps->tcps_g_q); + acceptor->tcp_connp->conn_wq = NULL; (void) tcp_clean_death(acceptor, 0, 2); CONN_DEC_REF(acceptor->tcp_connp); /* - * In case we already received a FIN we have to make tcp_rput send - * the ordrel_ind. This will also send up a window update if the window - * has opened up. - * - * In the normal case of a successful connection acceptance - * we give the O_T_BIND_REQ to the read side put procedure as an - * indication that this was just accepted. This tells tcp_rput to - * pass up any data queued in tcp_rcv_list. + * We pass discon_mp to tcp_accept_finish to get on the right squeue. * - * In the fringe case where options sent with T_CONN_RES failed and - * we required, we would be indicating a T_DISCON_IND to blow - * away this connection. + * It will update the setting for sockfs/stream head and also take + * care of any data that arrived before accept() wad called. + * In case we already received a FIN then tcp_accept_finish will send up + * the ordrel. It will also send up a window update if the window + * has opened up. */ /* @@ -2346,7 +2213,7 @@ finish: * and is well know but nothing can be done short of major rewrite * to fix it. Now it is possible to take care of it by assigning TLI/XTI * eager same squeue as listener (we can distinguish non socket - * listeners at the time of handling a SYN in tcp_conn_request) + * listeners at the time of handling a SYN in tcp_input_listener) * and do most of the work that tcp_accept_finish does here itself * and then get behind the acceptor squeue to access the acceptor * queue. 
@@ -2354,52 +2221,38 @@ finish: /* * We already have a ref on tcp so no need to do one before squeue_enter */ - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, + tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, + SQTAG_TCP_ACCEPT_FINISH); } /* * Swap information between the eager and acceptor for a TLI/XTI client. * The sockfs accept is done on the acceptor stream and control goes - * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not + * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not * called. In either case, both the eager and listener are in their own * perimeter (squeue) and the code has to deal with potential race. * - * See the block comment on top of tcp_accept() and tcp_wput_accept(). + * See the block comment on top of tcp_accept() and tcp_tli_accept(). */ -static int +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) { conn_t *econnp, *aconnp; - cred_t *effective_cred = NULL; - ASSERT(eager->tcp_rq == listener->tcp_rq); + ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); ASSERT(eager->tcp_detached && !acceptor->tcp_detached); - ASSERT(!eager->tcp_hard_bound); ASSERT(!TCP_IS_SOCKET(acceptor)); ASSERT(!TCP_IS_SOCKET(eager)); ASSERT(!TCP_IS_SOCKET(listener)); - econnp = eager->tcp_connp; - aconnp = acceptor->tcp_connp; - /* * Trusted Extensions may need to use a security label that is * different from the acceptor's label on MLP and MAC-Exempt * sockets. If this is the case, the required security label - * already exists in econnp->conn_effective_cred. Use this label - * to generate a new effective cred for the acceptor. - * - * We allow for potential application level retry attempts by - * checking for transient errors before modifying eager. + * already exists in econnp->conn_ixa->ixa_tsl. 
Since we make the + * acceptor stream refer to econnp we atomatically get that label. */ - if (is_system_labeled() && - aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) { - effective_cred = copycred_from_tslabel(aconnp->conn_cred, - crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP); - if (effective_cred == NULL) - return (ENOMEM); - } acceptor->tcp_detached = B_TRUE; /* @@ -2416,18 +2269,20 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) ASSERT(eager->tcp_eager_next_q0 == NULL && eager->tcp_eager_prev_q0 == NULL); mutex_exit(&listener->tcp_eager_lock); - eager->tcp_rq = acceptor->tcp_rq; - eager->tcp_wq = acceptor->tcp_wq; - eager->tcp_rq->q_ptr = econnp; - eager->tcp_wq->q_ptr = econnp; + econnp = eager->tcp_connp; + aconnp = acceptor->tcp_connp; + econnp->conn_rq = aconnp->conn_rq; + econnp->conn_wq = aconnp->conn_wq; + econnp->conn_rq->q_ptr = econnp; + econnp->conn_wq->q_ptr = econnp; /* * In the TLI/XTI loopback case, we are inside the listener's squeue, * which might be a different squeue from our peer TCP instance. * For TCP Fusion, the peer expects that whenever tcp_detached is * clear, our TCP queues point to the acceptor's queues. Thus, use - * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq + * membar_producer() to ensure that the assignments of conn_rq/conn_wq * above reach global visibility prior to the clearing of tcp_detached. 
*/ membar_producer(); @@ -2439,419 +2294,187 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) econnp->conn_minor_arena = aconnp->conn_minor_arena; ASSERT(econnp->conn_minor_arena != NULL); - if (eager->tcp_cred != NULL) - crfree(eager->tcp_cred); - eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = effective_cred; + if (econnp->conn_cred != NULL) + crfree(econnp->conn_cred); + econnp->conn_cred = aconnp->conn_cred; aconnp->conn_cred = NULL; - ASSERT(aconnp->conn_effective_cred == NULL); - + econnp->conn_cpid = aconnp->conn_cpid; ASSERT(econnp->conn_netstack == aconnp->conn_netstack); ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); econnp->conn_zoneid = aconnp->conn_zoneid; econnp->conn_allzones = aconnp->conn_allzones; + econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; + econnp->conn_mac_mode = aconnp->conn_mac_mode; + econnp->conn_zone_is_global = aconnp->conn_zone_is_global; aconnp->conn_mac_mode = CONN_MAC_DEFAULT; /* Do the IPC initialization */ CONN_INC_REF(econnp); - econnp->conn_multicast_loop = aconnp->conn_multicast_loop; - econnp->conn_af_isv6 = aconnp->conn_af_isv6; - econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; + econnp->conn_family = aconnp->conn_family; + econnp->conn_ipversion = aconnp->conn_ipversion; /* Done with old IPC. Drop its ref on its connp */ CONN_DEC_REF(aconnp); - return (0); } /* * Adapt to the information, such as rtt and rtt_sd, provided from the - * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. + * DCE and IRE maintained by IP. * * Checks for multicast and broadcast destination address. - * Returns zero on failure; non-zero if ok. + * Returns zero if ok; an errno on failure. * * Note that the MSS calculation here is based on the info given in - * the IRE. We do not do any calculation based on TCP options. 
They - * will be handled in tcp_rput_other() and tcp_rput_data() when TCP - * knows which options to use. + * the DCE and IRE. We do not do any calculation based on TCP options. They + * will be handled in tcp_input_data() when TCP knows which options to use. * * Note on how TCP gets its parameters for a connection. * * When a tcp_t structure is allocated, it gets all the default parameters. - * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, + * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, * spipe, rpipe, ... from the route metrics. Route metric overrides the * default. * - * An incoming SYN with a multicast or broadcast destination address, is dropped - * in 1 of 2 places. - * - * 1. If the packet was received over the wire it is dropped in - * ip_rput_process_broadcast() - * - * 2. If the packet was received through internal IP loopback, i.e. the packet - * was generated and received on the same machine, it is dropped in - * ip_wput_local() + * An incoming SYN with a multicast or broadcast destination address is dropped + * in ip_fanout_v4/v6. * * An incoming SYN with a multicast or broadcast source address is always - * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to + * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in + * conn_connect. + * The same logic in tcp_set_destination also serves to * reject an attempt to connect to a broadcast or multicast (destination) * address. 
*/ static int -tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) +tcp_set_destination(tcp_t *tcp) { - ire_t *ire; - ire_t *sire = NULL; - iulp_t *ire_uinfo = NULL; uint32_t mss_max; uint32_t mss; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); conn_t *connp = tcp->tcp_connp; - boolean_t ire_cacheable = B_FALSE; - zoneid_t zoneid = connp->conn_zoneid; - int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_SECATTR; - ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); - ill_t *ill = NULL; - boolean_t incoming = (ire_mp == NULL); tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(connp->conn_ire_cache == NULL); - - if (tcp->tcp_ipversion == IPV4_VERSION) { + iulp_t uinfo; + int error; + uint32_t flags; - if (CLASSD(tcp->tcp_connp->conn_rem)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return (0); - } - /* - * If IP_NEXTHOP is set, then look for an IRE_CACHE - * for the destination with the nexthop as gateway. - * ire_ctable_lookup() is used because this particular - * ire, if it exists, will be marked private. - * If that is not available, use the interface ire - * for the nexthop. - * - * TSol: tcp_update_label will detect label mismatches based - * only on the destination's label, but that would not - * detect label mismatches based on the security attributes - * of routes or next hop gateway. Hence we need to pass the - * label to ire_ftable_lookup below in order to locate the - * right prefix (and/or) ire cache. Similarly we also need - * pass the label to the ire_cache_lookup below to locate - * the right ire that also matches on the label. 
- */ - if (tcp->tcp_connp->conn_nexthop_set) { - ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, - tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, - tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, - ipst); - if (ire == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_nexthop_v4, - 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, - tsl, match_flags, ipst); - if (ire == NULL) - return (0); - } else { - ire_uinfo = &ire->ire_uinfo; - } - } else { - ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, - zoneid, tsl, ipst); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + flags = IPDF_LSO | IPDF_ZCOPY; + /* + * Make sure we have a dce for the destination to avoid dce_ident + * contention for connected sockets. + */ + flags |= IPDF_UNIQUE_DCE; - } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_rem, - 0, 0, 0, NULL, &sire, zoneid, 0, - tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT), ipst); - if (ire == NULL) - return (0); - ire_uinfo = (sire != NULL) ? - &sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *) - ire_mp->b_rptr)->ire_uinfo; - } - } - } - ASSERT(ire != NULL); + if (!tcps->tcps_ignore_path_mtu) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; - if ((ire->ire_src_addr == INADDR_ANY) || - (ire->ire_type & IRE_BROADCAST)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 
- */ - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return (0); - } - - if (tcp->tcp_ipha->ipha_src == INADDR_ANY) { - ipaddr_t src_addr; + /* Use conn_lock to satify ASSERT; tcp is already serialized */ + mutex_enter(&connp->conn_lock); + error = conn_connect(connp, &uinfo, flags); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - /* - * ip_bind_connected() has stored the correct source - * address in conn_src. - */ - src_addr = tcp->tcp_connp->conn_src; - tcp->tcp_ipha->ipha_src = src_addr; - /* - * Copy of the src addr. in tcp_t is needed - * for the lookup funcs. - */ - IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6); - } - /* - * Set the fragment bit so that IP will tell us if the MTU - * should change. IP tells us the latest setting of - * ip_path_mtu_discovery through ire_frag_flag. - */ - if (ipst->ips_ip_path_mtu_discovery) { - tcp->tcp_ipha->ipha_fragment_offset_and_flags = - htons(IPH_DF); - } - /* - * If ire_uinfo is NULL, this is the IRE_INTERFACE case - * for IP_NEXTHOP. No cache ire has been found for the - * destination and we are working with the nexthop's - * interface ire. Since we need to forward all packets - * to the nexthop first, we "blindly" set tcp_localnet - * to false, eventhough the destination may also be - * onlink. - */ - if (ire_uinfo == NULL) - tcp->tcp_localnet = 0; - else - tcp->tcp_localnet = (ire->ire_gateway_addr == 0); - } else { - /* - * For incoming connection ire_mp = NULL - * For outgoing connection ire_mp != NULL - * Technically we should check conn_incoming_ill - * when ire_mp is NULL and conn_outgoing_ill when - * ire_mp is non-NULL. But this is performance - * critical path and for IPV*_BOUND_IF, outgoing - * and incoming ill are always set to the same value. 
- */ - ill_t *dst_ill = NULL; - ipif_t *dst_ipif = NULL; + error = tcp_build_hdrs(tcp); + if (error != 0) + return (error); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); + tcp->tcp_localnet = uinfo.iulp_localnet; - if (connp->conn_outgoing_ill != NULL) { - /* Outgoing or incoming path */ - int err; + if (uinfo.iulp_rtt != 0) { + clock_t rto; - dst_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { - ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); - return (0); - } - match_flags |= MATCH_IRE_ILL; - dst_ipif = dst_ill->ill_ipif; - } - ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, - 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); + tcp->tcp_rtt_sa = uinfo.iulp_rtt; + tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; + rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcps->tcps_rexmit_interval_extra + + (tcp->tcp_rtt_sa >> 5); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + if (rto > tcps->tcps_rexmit_interval_max) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_max; + } else if (rto < tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup_v6( - &tcp->tcp_connp->conn_remv6, - 0, 0, 0, dst_ipif, &sire, zoneid, - 0, tsl, match_flags, ipst); - if (ire == NULL) { - if (dst_ill != NULL) - ill_refrele(dst_ill); - return (0); - } - ire_uinfo = (sire != NULL) ? 
&sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *)ire_mp->b_rptr)->ire_uinfo; - } - } - if (dst_ill != NULL) - ill_refrele(dst_ill); - - ASSERT(ire != NULL); - ASSERT(ire_uinfo != NULL); - - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). - */ - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return (0); + tcp->tcp_rto = rto; } - - if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - in6_addr_t src_addr; - - /* - * ip_bind_connected_v6() has stored the correct source - * address per IPv6 addr. selection policy in - * conn_src_v6. - */ - src_addr = tcp->tcp_connp->conn_srcv6; - - tcp->tcp_ip6h->ip6_src = src_addr; - /* - * Copy of the src addr. in tcp_t is needed - * for the lookup funcs. - */ - tcp->tcp_ip_src_v6 = src_addr; - ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, - &connp->conn_srcv6)); + } + if (uinfo.iulp_ssthresh != 0) + tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; + else + tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; + if (uinfo.iulp_spipe > 0) { + connp->conn_sndbuf = MIN(uinfo.iulp_spipe, + tcps->tcps_max_buf); + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / + tcps->tcps_snd_lowat_fraction; } - tcp->tcp_localnet = - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); + (void) tcp_maxpsz_set(tcp, B_TRUE); } - /* - * This allows applications to fail quickly when connections are made - * to dead hosts. Hosts can be labeled dead by adding a reject route - * with both the RTF_REJECT and RTF_PRIVATE flags set. + * Note that up till now, acceptor always inherits receive + * window from the listener. But if there is a metrics + * associated with a host, we should use that instead of + * inheriting it from listener. 
Thus we need to pass this + * info back to the caller. */ - if ((ire->ire_flags & RTF_REJECT) && - (ire->ire_flags & RTF_PRIVATE)) - goto error; + if (uinfo.iulp_rpipe > 0) { + tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe, + tcps->tcps_max_buf); + } + + if (uinfo.iulp_rtomax > 0) { + tcp->tcp_second_timer_threshold = + uinfo.iulp_rtomax; + } /* - * Make use of the cached rtt and rtt_sd values to calculate the - * initial RTO. Note that they are already initialized in - * tcp_init_values(). - * If ire_uinfo is NULL, i.e., we do not have a cache ire for - * IP_NEXTHOP, but instead are using the interface ire for the - * nexthop, then we do not use the ire_uinfo from that ire to - * do any initializations. + * Use the metric option settings, iulp_tstamp_ok and + * iulp_wscale_ok, only for active open. What this means + * is that if the other side uses timestamp or window + * scale option, TCP will also use those options. That + * is for passive open. If the application sets a + * large window, window scale is enabled regardless of + * the value in iulp_wscale_ok. This is the behavior + * since 2.6. So we keep it. + * The only case left in passive open processing is the + * check for SACK. + * For ECN, it should probably be like SACK. But the + * current value is binary, so we treat it like the other + * cases. The metric only controls active open.For passive + * open, the ndd param, tcp_ecn_permitted, controls the + * behavior. 
*/ - if (ire_uinfo != NULL) { - if (ire_uinfo->iulp_rtt != 0) { - clock_t rto; - - tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; - tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5); - - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } - } - if (ire_uinfo->iulp_ssthresh != 0) - tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; - else - tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; - if (ire_uinfo->iulp_spipe > 0) { - tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, - tcps->tcps_max_buf); - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcps->tcps_snd_lowat_fraction; - (void) tcp_maxpsz_set(tcp, B_TRUE); - } + if (!tcp_detached) { /* - * Note that up till now, acceptor always inherits receive - * window from the listener. But if there is a metrics - * associated with a host, we should use that instead of - * inheriting it from listener. Thus we need to pass this - * info back to the caller. + * The if check means that the following can only + * be turned on by the metrics only IRE, but not off. */ - if (ire_uinfo->iulp_rpipe > 0) { - tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, - tcps->tcps_max_buf); - } - - if (ire_uinfo->iulp_rtomax > 0) { - tcp->tcp_second_timer_threshold = - ire_uinfo->iulp_rtomax; - } - + if (uinfo.iulp_tstamp_ok) + tcp->tcp_snd_ts_ok = B_TRUE; + if (uinfo.iulp_wscale_ok) + tcp->tcp_snd_ws_ok = B_TRUE; + if (uinfo.iulp_sack == 2) + tcp->tcp_snd_sack_ok = B_TRUE; + if (uinfo.iulp_ecn_ok) + tcp->tcp_ecn_ok = B_TRUE; + } else { /* - * Use the metric option settings, iulp_tstamp_ok and - * iulp_wscale_ok, only for active open. 
What this means - * is that if the other side uses timestamp or window - * scale option, TCP will also use those options. That - * is for passive open. If the application sets a - * large window, window scale is enabled regardless of - * the value in iulp_wscale_ok. This is the behavior - * since 2.6. So we keep it. - * The only case left in passive open processing is the - * check for SACK. - * For ECN, it should probably be like SACK. But the - * current value is binary, so we treat it like the other - * cases. The metric only controls active open.For passive - * open, the ndd param, tcp_ecn_permitted, controls the - * behavior. + * Passive open. + * + * As above, the if check means that SACK can only be + * turned on by the metric only IRE. */ - if (!tcp_detached) { - /* - * The if check means that the following can only - * be turned on by the metrics only IRE, but not off. - */ - if (ire_uinfo->iulp_tstamp_ok) - tcp->tcp_snd_ts_ok = B_TRUE; - if (ire_uinfo->iulp_wscale_ok) - tcp->tcp_snd_ws_ok = B_TRUE; - if (ire_uinfo->iulp_sack == 2) - tcp->tcp_snd_sack_ok = B_TRUE; - if (ire_uinfo->iulp_ecn_ok) - tcp->tcp_ecn_ok = B_TRUE; - } else { - /* - * Passive open. - * - * As above, the if check means that SACK can only be - * turned on by the metric only IRE. - */ - if (ire_uinfo->iulp_sack > 0) { - tcp->tcp_snd_sack_ok = B_TRUE; - } + if (uinfo.iulp_sack > 0) { + tcp->tcp_snd_sack_ok = B_TRUE; } } - /* - * XXX: Note that currently, ire_max_frag can be as small as 68 + * XXX Note that currently, iulp_mtu can be as small as 68 * because of PMTUd. So tcp_mss may go to negative if combined * length of all those options exceeds 28 bytes. But because * of the tcp_mss_min check below, we may not have a problem if @@ -2864,31 +2487,15 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) * We do not deal with that now. All those problems related to * PMTUd will be fixed later. 
*/ - ASSERT(ire->ire_max_frag != 0); - mss = tcp->tcp_if_mtu = ire->ire_max_frag; - if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { - if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { - mss = MIN(mss, IPV6_MIN_MTU); - } - } + ASSERT(uinfo.iulp_mtu != 0); + mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; /* Sanity check for MSS value. */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; - if (tcp->tcp_ipversion == IPV6_VERSION && - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - /* - * After receiving an ICMPv6 "packet too big" message with a - * MTU < 1280, and for multirouted IPv6 packets, the IP layer - * will insert a 8-byte fragment header in every packet; we - * reduce the MSS by that amount here. - */ - mss -= sizeof (ip6_frag_t); - } - if (tcp->tcp_ipsec_overhead == 0) tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); @@ -2903,71 +2510,28 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_mss = mss; /* + * Update the tcp connection with LSO capability. + */ + tcp_update_lso(tcp, connp->conn_ixa); + + /* * Initialize the ISS here now that we have the full connection ID. * The RFC 1948 method of initial sequence number generation requires * knowledge of the full connection ID before setting the ISS. */ - tcp_iss_init(tcp); - if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) - tcp->tcp_loopback = B_TRUE; - - if (sire != NULL) - IRE_REFRELE(sire); - - /* - * If we got an IRE_CACHE and an ILL, go through their properties; - * otherwise, this is deferred until later when we have an IRE_CACHE. - */ - if (tcp->tcp_loopback || - (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { - /* - * For incoming, see if this tcp may be MDT-capable. For - * outgoing, this process has been taken care of through - * tcp_rput_other. 
- */ - tcp_ire_ill_check(tcp, ire, ill, incoming); - tcp->tcp_ire_ill_check_done = B_TRUE; - } + tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); - mutex_enter(&connp->conn_lock); /* * Make sure that conn is not marked incipient * for incoming connections. A blind * removal of incipient flag is cheaper than * check and removal. */ + mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; - - /* - * Must not cache forwarding table routes - * or recache an IRE after the conn_t has - * had conn_ire_cache cleared and is flagged - * unusable, (see the CONN_CACHE_IRE() macro). - */ - if (ire_cacheable && CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = ire; - IRE_UNTRACE_REF(ire); - rw_exit(&ire->ire_bucket->irb_lock); - mutex_exit(&connp->conn_lock); - return (1); - } - rw_exit(&ire->ire_bucket->irb_lock); - } mutex_exit(&connp->conn_lock); - - if (ire->ire_mp == NULL) - ire_refrele(ire); - return (1); - -error: - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); return (0); } @@ -3001,7 +2565,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); @@ -3010,7 +2574,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) return; } /* Make sure the largest address fits */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); if (mp1 == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; @@ -3024,7 +2588,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if 
(tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -3033,7 +2597,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) len = sizeof (sin_t); mp->b_wptr = (uchar_t *)&sin[1]; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -3055,7 +2619,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad address length, %d", tbr->ADDR_length); @@ -3080,16 +2644,16 @@ done: /* * Update port information as sockfs/tpi needs it for checking */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = (sin_t *)sa; - sin->sin_port = tcp->tcp_lport; + sin->sin_port = connp->conn_lport; } else { sin6 = (sin6_t *)sa; - sin6->sin6_port = tcp->tcp_lport; + sin6->sin6_port = connp->conn_lport; } mp->b_datap->db_type = M_PCPROTO; tbr->PRIM_type = T_BIND_ACK; - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); } } @@ -3139,7 +2703,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * Set loopmax appropriately so that one does not look * forever in the case all of the anonymous ports are in use. */ - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { /* * loopmax = * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 @@ -3175,7 +2739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, mutex_enter(&tbf->tf_lock); for (ltcp = tbf->tf_tcp; ltcp != NULL; ltcp = ltcp->tcp_bind_hash) { - if (lport == ltcp->tcp_lport) + if (lport == ltcp->tcp_connp->conn_lport) break; } @@ -3191,7 +2755,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. 
*/ - if (!IPCL_BIND_ZONE_MATCH(ltcp->tcp_connp, connp)) + if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) continue; /* @@ -3227,7 +2791,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * added. * * if (ltcp->tcp_state == TCPS_LISTEN || - * !reuseaddr || !ltcp->tcp_reuseaddr) { + * !reuseaddr || !lconnp->conn_reuseaddr) { * ... * } * @@ -3243,17 +2807,18 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ not_socket = !(TCP_IS_SOCKET(ltcp) && TCP_IS_SOCKET(tcp)); - exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; + exclbind = lconnp->conn_exclbind || + connp->conn_exclbind; if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || (connp->conn_mac_mode != CONN_MAC_DEFAULT) || (exclbind && (not_socket || ltcp->tcp_state <= TCPS_ESTABLISHED))) { if (V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) || + lconnp->conn_bound_addr_v6) || V6_OR_V4_INADDR_ANY(*laddr) || IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) { + &lconnp->conn_bound_addr_v6)) { break; } continue; @@ -3266,7 +2831,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * specific port. We use the same autoassigned port * number space for IPv4 and IPv6 sockets. 
*/ - if (tcp->tcp_ipversion != ltcp->tcp_ipversion && + if (connp->conn_ipversion != lconnp->conn_ipversion && bind_to_req_port_only) continue; @@ -3281,9 +2846,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (quick_connect && (ltcp->tcp_state > TCPS_LISTEN) && - ((tcp->tcp_fport != ltcp->tcp_fport) || - !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, - <cp->tcp_remote_v6))) + ((connp->conn_fport != lconnp->conn_fport) || + !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &lconnp->conn_faddr_v6))) continue; if (!reuseaddr) { @@ -3299,9 +2864,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) && + lconnp->conn_bound_addr_v6) && !IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) + &lconnp->conn_bound_addr_v6)) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -3327,7 +2892,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * SO_REUSEADDR setting, so we break. */ if (IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6) && + &lconnp->conn_bound_addr_v6) && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; @@ -3343,11 +2908,10 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * number. */ tcp->tcp_state = TCPS_BOUND; - tcp->tcp_lport = htons(port); - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_lport = htons(port); ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( - tcp->tcp_lport)] == tbf); + connp->conn_lport)] == tbf); tcp_bind_hash_insert(tbf, tcp, 1); mutex_exit(&tbf->tf_lock); @@ -3364,12 +2928,12 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * is updated. After the update, it may or may not * be in the valid range. 
*/ - if (!tcp->tcp_anon_priv_bind) + if (!connp->conn_anon_priv_bind) tcps->tcps_next_port_to_try = port + 1; return (port); } - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = tcp_get_next_priv_port(tcp); } else { if (count == 0 && user_specified) { @@ -3402,12 +2966,13 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * tcp_clean_death / tcp_close_detached must not be called more than once * on a tcp. Thus every function that potentially calls tcp_clean_death * must check for the tcp state before calling tcp_clean_death. - * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, + * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, * tcp_timer_handler, all check for the tcp state. */ /* ARGSUSED */ void -tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) +tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { tcp_t *tcp = ((conn_t *)arg)->conn_tcp; @@ -3449,11 +3014,11 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } ASSERT(tcp != NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); if (TCP_IS_DETACHED(tcp)) { if (tcp->tcp_hard_binding) { @@ -3483,7 +3048,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) TCP_STAT(tcps, tcp_clean_death_nondetached); - q = tcp->tcp_rq; + q = connp->conn_rq; /* Trash all inbound data */ if (!IPCL_IS_NONSTR(connp)) { @@ -3506,7 +3071,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) */ (void) putnextctl1(q, M_FLUSH, FLUSHR); } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_clean_death: discon err 
%d", err); } @@ -3519,7 +3084,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) if (mp != NULL) { putnext(q, mp); } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_clean_death, sending M_ERROR"); @@ -3552,6 +3117,7 @@ tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -3568,15 +3134,14 @@ tcp_stop_lingering(tcp_t *tcp) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is cleared. */ tcp_timers_stop(tcp); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); @@ -3595,16 +3160,14 @@ tcp_stop_lingering(tcp_t *tcp) } } else { tcp_closei_local(tcp); - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(connp); } finish: /* Signal closing thread that it can complete close */ mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); @@ -3636,9 +3199,9 @@ tcp_close_common(conn_t *connp, int flags) ASSERT(connp->conn_ref >= 2); /* - * Mark the conn as closing. ill_pending_mp_add will not + * Mark the conn as closing. ipsq_pending_mp_add will not * add any mp to the pending mp list, after this conn has - * started closing. Same for sq_pending_mp_add + * started closing. 
*/ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CLOSING; @@ -3664,7 +3227,7 @@ tcp_close_common(conn_t *connp, int flags) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, - tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); + NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); while (!tcp->tcp_closed) { @@ -3684,13 +3247,13 @@ tcp_close_common(conn_t *connp, int flags) * thread is higher priority than the squeue worker * thread and is bound to the same cpu. */ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0) { + if (connp->conn_linger && connp->conn_lingertime > 0) { mutex_exit(&tcp->tcp_closelock); /* Entering squeue, bump ref count. */ CONN_INC_REF(connp); bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, - tcp_linger_interrupted, connp, + tcp_linger_interrupted, connp, NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); } @@ -3703,8 +3266,8 @@ tcp_close_common(conn_t *connp, int flags) /* * In the case of listener streams that have eagers in the q or q0 - * we wait for the eagers to drop their reference to us. tcp_rq and - * tcp_wq of the eagers point to our queues. By waiting for the + * we wait for the eagers to drop their reference to us. conn_rq and + * conn_wq of the eagers point to our queues. By waiting for the * refcnt to drop to 1, we are sure that the eagers have cleaned * up their queue pointers and also dropped their references to us. */ @@ -3716,13 +3279,12 @@ tcp_close_common(conn_t *connp, int flags) mutex_exit(&connp->conn_lock); } /* - * ioctl cleanup. The mp is queued in the - * ill_pending_mp or in the sq_pending_mp. + * ioctl cleanup. The mp is queued in the ipx_pending_mp. 
*/ if (conn_ioctl_cleanup_reqd) conn_ioctl_cleanup(connp); - tcp->tcp_cpid = -1; + connp->conn_cpid = NOPID; } static int @@ -3799,7 +3361,7 @@ tcp_tpi_close_accept(queue_t *q) /* ARGSUSED */ static void -tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) +tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -3828,7 +3390,7 @@ tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) /* ARGSUSED */ static void -tcp_close_output(void *arg, mblk_t *mp, void *arg2) +tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { char *msg; conn_t *connp = (conn_t *)arg; @@ -3847,10 +3409,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } mutex_exit(&tcp->tcp_eager_lock); - connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - - connp->conn_lso_ok = B_FALSE; tcp->tcp_lso = B_FALSE; msg = NULL; @@ -3879,12 +3437,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If SO_LINGER has set a zero linger time, abort the * connection with a reset. */ - if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { + if (connp->conn_linger && connp->conn_lingertime == 0) { msg = "tcp_close, zero lingertime"; break; } - ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); /* * Abort connection if there is unread data queued. */ @@ -3893,9 +3450,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) break; } /* - * tcp_hard_bound is now cleared thus all packets go through - * tcp_lookup. This fact is used by tcp_detach below. - * * We have done a qwait() above which could have possibly * drained more messages in turn causing transition to a * different state. Check whether we have to do the rest @@ -3915,7 +3469,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If lingering on close then wait until the fin is acked, * the SO_LINGER time passes, or a reset is sent/received. 
*/ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && + if (connp->conn_linger && connp->conn_lingertime > 0 && !(tcp->tcp_fin_acked) && tcp->tcp_state >= TCPS_ESTABLISHED) { if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { @@ -3926,7 +3480,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_linger_tid = TCP_TIMER(tcp, tcp_close_linger_timeout, - tcp->tcp_lingertime * hz); + connp->conn_lingertime * hz); /* tcp_close_linger_timeout will finish close */ if (tcp->tcp_linger_tid == 0) @@ -3944,8 +3498,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* - * Make sure that no other thread will access the tcp_rq of - * this instance (through lookups etc.) as tcp_rq will go + * Make sure that no other thread will access the conn_rq of + * this instance (through lookups etc.) as conn_rq will go * away shortly. */ tcp_acceptor_hash_remove(tcp); @@ -3962,8 +3516,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is set to NULL. */ tcp_timers_stop(tcp); @@ -4004,18 +3558,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) ASSERT(connp->conn_ref >= 2); finish: - /* - * Although packets are always processed on the correct - * tcp's perimeter and access is serialized via squeue's, - * IP still needs a queue when sending packets in time_wait - * state so use WR(tcps_g_q) till ip_output() can be - * changed to deal with just connp. For read side, we - * could have set tcp_rq to NULL but there are some cases - * in tcp_rput_data() from early days of this code which - * do a putnext without checking if tcp is closed. Those - * need to be identified before both tcp_rq and tcp_wq - * can be set to NULL and tcps_g_q can disappear forever. 
- */ mutex_enter(&tcp->tcp_closelock); /* * Don't change the queues in the case of a listener that has @@ -4024,13 +3566,8 @@ finish: */ if (!tcp->tcp_wait_for_eagers) { tcp->tcp_detached = B_TRUE; - /* - * When default queue is closing we set tcps_g_q to NULL - * after the close is done. - */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; } /* Signal tcp_close() to finish closing. */ @@ -4112,8 +3649,7 @@ tcp_timers_stop(tcp_t *tcp) static void tcp_closei_local(tcp_t *tcp) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; if (!TCP_IS_SOCKET(tcp)) @@ -4138,7 +3674,7 @@ tcp_closei_local(tcp_t *tcp) * this point, eager will be closed but we * leave it in listeners eager list so that * if listener decides to close without doing - * accept, we can clean this up. In tcp_wput_accept + * accept, we can clean this up. In tcp_tli_accept * we take care of the case of accept on closed * eager. 
*/ @@ -4150,9 +3686,9 @@ tcp_closei_local(tcp_t *tcp) * listener queue, after we have released our * reference on the listener */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(tcp->tcp_detached); + connp->conn_rq = NULL; + connp->conn_wq = NULL; CONN_DEC_REF(listener->tcp_connp); } else { mutex_exit(&listener->tcp_eager_lock); @@ -4185,20 +3721,16 @@ tcp_closei_local(tcp_t *tcp) */ if (tcp->tcp_state == TCPS_TIME_WAIT) (void) tcp_time_wait_remove(tcp, NULL); - CL_INET_DISCONNECT(connp, tcp); + CL_INET_DISCONNECT(connp); ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); /* - * Delete the cached ire in conn_ire_cache and also mark - * the conn as CONDEMNED + * Mark the conn as CONDEMNED */ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CONDEMNED; - ire = connp->conn_ire_cache; - connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); - if (ire != NULL) - IRE_REFRELE_NOTR(ire); /* Need to cleanup any pending ioctls */ ASSERT(tcp->tcp_time_wait_next == NULL); @@ -4227,14 +3759,14 @@ tcp_closei_local(tcp_t *tcp) void tcp_free(tcp_t *tcp) { - mblk_t *mp; - ip6_pkt_t *ipp; + mblk_t *mp; + conn_t *connp = tcp->tcp_connp; ASSERT(tcp != NULL); ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); - tcp->tcp_rq = NULL; - tcp->tcp_wq = NULL; + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp_close_mpp(&tcp->tcp_xmit_head); tcp_close_mpp(&tcp->tcp_reass_head); @@ -4281,12 +3813,12 @@ tcp_free(tcp_t *tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr 
= NULL; @@ -4294,18 +3826,6 @@ tcp_free(tcp_t *tcp) } ASSERT(tcp->tcp_rthdrlen == 0); - ipp = &tcp->tcp_sticky_ipp; - if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | - IPPF_RTHDR)) - ip6_pkt_free(ipp); - - /* - * Free memory associated with the tcp/ip header template. - */ - - if (tcp->tcp_iphc != NULL) - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - /* * Following is really a blowing away a union. * It happens to have exactly two members of identical size @@ -4317,17 +3837,19 @@ tcp_free(tcp_t *tcp) /* * Put a connection confirmation message upstream built from the - * address information within 'iph' and 'tcph'. Report our success or failure. + * address/flowid information with the conn and iph. Report our success or + * failure. */ static boolean_t -tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, - mblk_t **defermp) +tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, + mblk_t **defermp, ip_recv_attr_t *ira) { sin_t sin; sin6_t sin6; mblk_t *mp; char *optp = NULL; int optlen = 0; + conn_t *connp = tcp->tcp_connp; if (defermp != NULL) *defermp = NULL; @@ -4352,20 +3874,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, } if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { - ipha_t *ipha = (ipha_t *)iphdr; /* packet is IPv4 */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; mp = mi_tpi_conn_con(NULL, (char *)&sin, (int)sizeof (sin_t), optp, optlen); } else { sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; mp = mi_tpi_conn_con(NULL, (char *)&sin6, (int)sizeof (sin6_t), optp, optlen); @@ 
-4375,10 +3896,10 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, ip6_t *ip6h = (ip6_t *)iphdr; ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; mp = mi_tpi_conn_con(NULL, (char *)&sin6, @@ -4393,16 +3914,16 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, if (defermp == NULL) { conn_t *connp = tcp->tcp_connp; if (IPCL_IS_NONSTR(connp)) { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, tcp->tcp_connid, cr, - cpid); + (connp->conn_upper_handle, tcp->tcp_connid, + ira->ira_cred, ira->ira_cpid); freemsg(mp); } else { - putnext(tcp->tcp_rq, mp); + if (ira->ira_cred != NULL) { + /* So that getpeerucred works for TPI sockfs */ + mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); + } + putnext(connp->conn_rq, mp); } } else { *defermp = mp; @@ -4456,7 +3977,7 @@ tcp_drop_q0(tcp_t *tcp) */ MAKE_UNDROPPABLE(eager); - if (tcp->tcp_debug) { + if (tcp->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, @@ -4469,18 +3990,19 @@ tcp_drop_q0(tcp_t *tcp) /* Put a reference on the conn as we are enqueueing it in the sqeue */ CONN_INC_REF(eager->tcp_connp); - /* Mark the IRE created for this SYN request temporary */ - tcp_ip_ire_mark_advice(eager); SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, + tcp_clean_death_wrapper, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_DROP_Q0); return (B_TRUE); } -int +/* + * Handle a SYN on an AF_INET6 socket; can 
be either IPv4 or IPv6 + */ +static mblk_t * tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp) + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; @@ -4488,36 +4010,30 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, ipha_t *ipha; ip6_t *ip6h; sin6_t sin6; - in6_addr_t v6dst; - int err; - int ifindex = 0; + uint_t ifindex = ira->ira_ruifindex; tcp_stack_t *tcps = tcp->tcp_tcps; - if (ipvers == IPV4_VERSION) { + if (ira->ira_flags & IRAF_IS_IPV4) { ipha = (ipha_t *)mp->b_rptr; - connp->conn_send = ip_output; - connp->conn_recv = tcp_input; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; - sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, - lconnp->conn_zoneid, tcps->tcps_netstack); - if (tcp->tcp_recvdstaddr) { + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); + + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &sin6d.sin6_addr); - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; + sin6d.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; sin6d.sin6_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), @@ -4534,24 +4050,18 @@ 
tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { ip6h = (ip6_t *)mp->b_rptr; - connp->conn_send = ip_output_v6; - connp->conn_recv = tcp_input; - - connp->conn_bound_source_v6 = ip6h->ip6_dst; - connp->conn_srcv6 = ip6h->ip6_dst; - connp->conn_remv6 = ip6h->ip6_src; - - /* db_cksumstuff is set at ip_fanout_tcp_v6 */ - ifindex = (int)DB_CKSUMSTUFF(mp); - DB_CKSUMSTUFF(mp) = 0; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_laddr_v6 = ip6h->ip6_dst; + connp->conn_faddr_v6 = ip6h->ip6_src; + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - lconnp->conn_zoneid, tcps->tcps_netstack); + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { /* Pass up the scope_id of remote addr */ @@ -4559,13 +4069,16 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { sin6.sin6_scope_id = 0; } - if (tcp->tcp_recvdstaddr) { + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - sin6.sin6_addr = ip6h->ip6_dst; - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; - sin6d.sin6_family = AF_INET; + sin6.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; + sin6d.sin6_family = AF_INET6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) + sin6d.sin6_scope_id = ifindex; + tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), (char *)&tcp, (t_scalar_t)sizeof (intptr_t), @@ -4579,194 +4092,40 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } } - if (tpi_mp == NULL) - return (ENOMEM); - - connp->conn_fport = *(uint16_t *)tcph->th_lport; - 
connp->conn_lport = *(uint16_t *)tcph->th_fport; - connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); - connp->conn_fully_bound = B_FALSE; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; - tcp->tcp_mss = tcps->tcps_mss_def_ipv6; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - if (ipvers == IPV4_VERSION) { - if ((err = tcp_header_init_ipv4(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - ASSERT(tcp->tcp_ipha != NULL); - } else { - /* ifindex must be already set */ - ASSERT(ifindex != 0); - - if (ltcp->tcp_bound_if != 0) - tcp->tcp_bound_if = ltcp->tcp_bound_if; - else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - tcp->tcp_bound_if = ifindex; - - tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; - tcp->tcp_recvifindex = 0; - tcp->tcp_recvhops = 0xffffffffU; - ASSERT(tcp->tcp_ip6h != NULL); - } - - tcp->tcp_lport = ltcp->tcp_lport; - - if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { - if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { - /* - * Listener had options of some sort; eager inherits. - * Free up the eager template and allocate one - * of the right size. 
- */ - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, - KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - freemsg(tpi_mp); - return (ENOMEM); - } - tcp->tcp_iphc_len = ltcp->tcp_iphc_len; - tcp->tcp_hdr_grown = B_TRUE; - } - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; - tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; - - /* - * Copy the IP+TCP header template from listener to eager - */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - if (tcp->tcp_ipversion == IPV6_VERSION) { - if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == - IPPROTO_RAW) { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc + - sizeof (ip6i_t)); - } else { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc); - } - tcp->tcp_ipha = NULL; - } else { - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - } - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - } else { - /* - * only valid case when ipversion of listener and - * eager differ is when listener is IPv6 and - * eager is IPv4. - * Eager header template has been initialized to the - * maximum v4 header sizes, which includes space for - * TCP and IP options. 
- */ - ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && - (tcp->tcp_ipversion == IPV4_VERSION)); - ASSERT(tcp->tcp_iphc_len >= - TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - /* copy IP header fields individually */ - tcp->tcp_ipha->ipha_ttl = - ltcp->tcp_ip6h->ip6_hops; - bcopy(ltcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_lport, sizeof (ushort_t)); - } - - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, - sizeof (in_port_t)); - - if (ltcp->tcp_lport == 0) { - tcp->tcp_lport = *(in_port_t *)tcph->th_fport; - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, - sizeof (in_port_t)); - } - - if (tcp->tcp_ipversion == IPV4_VERSION) { - ASSERT(ipha != NULL); - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - } else { - ASSERT(ip6h != NULL); - tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; - tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; - } - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. 
- */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); + return (tpi_mp); } - -int -tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp) +/* Handle a SYN on an AF_INET socket */ +mblk_t * +tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; sin_t sin; mblk_t *tpi_mp = NULL; - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + ipha_t *ipha; + + ASSERT(ira->ira_flags & IRAF_IS_IPV4); + ipha = (ipha_t *)mp->b_rptr; + + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; - if (ltcp->tcp_recvdstaddr) { + if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { sin_t sind; sind = sin_null; - sind.sin_addr.s_addr = ipha->ipha_dst; - sind.sin_port = *(uint16_t *)tcph->th_fport; + sind.sin_addr.s_addr = connp->conn_laddr_v4; + sind.sin_port = connp->conn_lport; sind.sin_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sind, sizeof (sin_t), (char *)&tcp, @@ -4779,214 +4138,8 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, (t_scalar_t)ltcp->tcp_conn_req_seqnum); } - if (tpi_mp == NULL) { - return (ENOMEM); - } - - connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); - connp->conn_send = ip_output; - connp->conn_recv 
= tcp_input; - connp->conn_fully_bound = B_FALSE; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); - connp->conn_fport = *(uint16_t *)tcph->th_lport; - connp->conn_lport = *(uint16_t *)tcph->th_fport; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - /* - * Let's make sure that eager tcp template has enough space to - * copy IPv4 listener's tcp template. Since the conn_t structure is - * preserved and tcp_iphc_len is also preserved, an eager conn_t may - * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or - * more (in case of re-allocation of conn_t with tcp-IPv6 template with - * extension headers or with ip6i_t struct). Note that bcopy() below - * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ - * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
- */ - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); - - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ttl = ltcp->tcp_ttl; - tcp->tcp_tos = ltcp->tcp_tos; - - /* Copy the IP+TCP header template from listener to eager */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - - /* Initialize the IP addresses and Ports */ - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. - */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); -} - -/* - * sets up conn for ipsec. - * if the first mblk is M_CTL it is consumed and mpp is updated. - * in case of error mpp is freed. 
- */ -conn_t * -tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) -{ - conn_t *connp = tcp->tcp_connp; - conn_t *econnp; - squeue_t *new_sqp; - mblk_t *first_mp = *mpp; - mblk_t *mp = *mpp; - boolean_t mctl_present = B_FALSE; - uint_t ipvers; - - econnp = tcp_get_conn(sqp, tcp->tcp_tcps); - if (econnp == NULL) { - freemsg(first_mp); - return (NULL); - } - if (DB_TYPE(mp) == M_CTL) { - if (mp->b_cont == NULL || - mp->b_cont->b_datap->db_type != M_DATA) { - freemsg(first_mp); - return (NULL); - } - mp = mp->b_cont; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { - freemsg(first_mp); - return (NULL); - } - - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - mctl_present = B_TRUE; - } else { - ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); - mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - } - - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - - ASSERT(OK_32PTR(mp->b_rptr)); - ipvers = IPH_HDR_VERSION(mp->b_rptr); - if (ipvers == IPV4_VERSION) { - uint16_t *up; - uint32_t ports; - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; - up = (uint16_t *)((uchar_t *)ipha + - IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, - ipha->ipha_dst, ipha->ipha_src, ports); - } else { - uint16_t *up; - uint32_t ports; - uint16_t ip_hdr_len; - uint8_t *nexthdrp; - ip6_t *ip6h; - tcph_t *tcph; - - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_TCP) { - ip_hdr_len = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, - &nexthdrp) || *nexthdrp != IPPROTO_TCP) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - up = (uint16_t *)tcph->th_lport; - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, - ip6h->ip6_dst, ip6h->ip6_src, ports); - } - - /* - * The caller already ensured that there is a sqp present. 
- */ - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - - if (connp->conn_policy != NULL) { - ipsec_in_t *ii; - ii = (ipsec_in_t *)(first_mp->b_rptr); - ASSERT(ii->ipsec_in_policy == NULL); - IPPH_REFHOLD(connp->conn_policy); - ii->ipsec_in_policy = connp->conn_policy; - - first_mp->b_datap->db_type = IPSEC_POLICY_SET; - if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - } - - if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - - /* - * If we know we have some policy, pass the "IPSEC" - * options size TCP uses this adjust the MSS. - */ - econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); - if (mctl_present) { - freeb(first_mp); - *mpp = mp; - } - - return (econnp); + return (tpi_mp); } /* @@ -5002,10 +4155,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * connection sitting in the freelist. Obviously, this buys us * performance. * - * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request - * has multiple disadvantages - tying up the squeue during alloc, and the - * fact that IPSec policy initialization has to happen here which - * requires us sending a M_CTL and checking for it i.e. real ugliness. + * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener + * has multiple disadvantages - tying up the squeue during alloc. 
* But allocating the conn/tcp in IP land is also not the best since * we can't check the 'q' and 'q0' which are protected by squeue and * blindly allocate memory which might have to be freed here if we are @@ -5050,9 +4201,15 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) ns = tcps->tcps_netstack; netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); ipcl_globalhash_insert(connp); + + connp->conn_ixa->ixa_notify_cookie = tcp; + ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); + connp->conn_recv = tcp_input_data; + ASSERT(connp->conn_recvicmp == tcp_icmp_input); + ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); @@ -5075,62 +4232,20 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); - return ((void *)connp); -} + connp->conn_recv = tcp_input_data; + connp->conn_recvicmp = tcp_icmp_input; + connp->conn_verifyicmp = tcp_verifyicmp; -/* - * Update the cached label for the given tcp_t. This should be called once per - * connection, and before any packets are sent or tcp_process_options is - * invoked. Returns B_FALSE if the correct label could not be constructed. 
- */ -static boolean_t -tcp_update_label(tcp_t *tcp, const cred_t *cr) -{ - conn_t *connp = tcp->tcp_connp; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - uchar_t optbuf[IP_MAX_OPT_LENGTH]; - int added; - - if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - - added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { - tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; - added = tsol_prepend_option(optbuf, tcp->tcp_ipha, - tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *) - ((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - } - } else { - uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; - - if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - if (tsol_update_sticky(&tcp->tcp_sticky_ipp, - &tcp->tcp_label_len, optbuf) != 0) - return (B_FALSE); - if (tcp_build_hdrs(tcp) != 0) - return (B_FALSE); - } - - connp->conn_ulp_labeled = 1; + /* + * Register tcp_notify to listen to capability changes detected by IP. + * This upcall is made in the context of the call to conn_ip_output + * thus it is inside the squeue. + */ + connp->conn_ixa->ixa_notify = tcp_notify; + connp->conn_ixa->ixa_notify_cookie = tcp; - return (B_TRUE); + return ((void *)connp); } /* BEGIN CSTYLED */ @@ -5140,7 +4255,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * ======================= * * The eager is now established in its own perimeter as soon as SYN is - * received in tcp_conn_request(). When sockfs receives conn_ind, it + * received in tcp_input_listener(). When sockfs receives conn_ind, it * completes the accept processing on the acceptor STREAM. 
The sending * of conn_ind part is common for both sockfs listener and a TLI/XTI * listener but a TLI/XTI listener completes the accept processing @@ -5149,29 +4264,28 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Common control flow for 3 way handshake: * ---------------------------------------- * - * incoming SYN (listener perimeter) -> tcp_rput_data() - * -> tcp_conn_request() + * incoming SYN (listener perimeter) -> tcp_input_listener() * - * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() + * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() * * Sockfs ACCEPT Path: * ------------------- * - * open acceptor stream (tcp_open allocates tcp_wput_accept() + * open acceptor stream (tcp_open allocates tcp_tli_accept() * as STREAM entry point) * - * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() + * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() * - * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager + * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager * association (we are not behind eager's squeue but sockfs is protecting us * and no one knows about this stream yet. The STREAMS entry point q->q_info * is changed to point at tcp_wput(). * - * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to + * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to * listener (done on listener's perimeter). * - * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish + * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish * accept. * * TLI/XTI client ACCEPT path: @@ -5179,8 +4293,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * * soaccept() sends T_CONN_RES on the listener STREAM. * - * tcp_accept() -> tcp_accept_swap() complete the processing and send - * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 
+ * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send + * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). * * Locks: * ====== @@ -5191,7 +4305,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Referencing: * ============ * - * 1) We start out in tcp_conn_request by eager placing a ref on + * 1) We start out in tcp_input_listener by eager placing a ref on * listener and listener adding eager to listeners->tcp_eager_next_q0. * * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before @@ -5249,51 +4363,71 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) /* * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. - * tcp_rput_data will not see any SYN packets. + * tcp_input_data will not see any packets for listeners since the listener + * has conn_recv set to tcp_input_listener. */ /* ARGSUSED */ void -tcp_conn_request(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; tcp_t *eager; - uint_t ipvers; - ipha_t *ipha; - ip6_t *ip6h; int err; conn_t *econnp = NULL; squeue_t *new_sqp; mblk_t *mp1; uint_t ip_hdr_len; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - cred_t *credp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; + tcp_stack_t *tcps = listener->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + uint_t flags; + mblk_t *tpi_mp; + uint_t ifindex = ira->ira_ruifindex; - if (tcp->tcp_state != TCPS_LISTEN) + ip_hdr_len = ira->ira_ip_hdr_length; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + flags = (unsigned int)tcpha->tha_flags & 0xFF; + + if (!(flags & TH_SYN)) { + if ((flags & TH_RST) || (flags & TH_URG)) { + freemsg(mp); + return; + } + if (flags & TH_ACK) { + /* Note this executes in listener's squeue */ + tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); + return; 
+ } + + freemsg(mp); + return; + } + + if (listener->tcp_state != TCPS_LISTEN) goto error2; - ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); + ASSERT(IPCL_IS_BOUND(lconnp)); - mutex_enter(&tcp->tcp_eager_lock); - if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { - mutex_exit(&tcp->tcp_eager_lock); + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { + mutex_exit(&listener->tcp_eager_lock); TCP_STAT(tcps, tcp_listendrop); BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_conn_request: listen backlog (max=%d) " + "tcp_input_listener: listen backlog (max=%d) " "overflow (%d pending) on %s", - tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); + listener->tcp_conn_req_max, + listener->tcp_conn_req_cnt_q, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } - if (tcp->tcp_conn_req_cnt_q0 >= - tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { + if (listener->tcp_conn_req_cnt_q0 >= + listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { /* * Q0 is full. Drop a pending half-open req from the queue * to make room for the new SYN req. Also mark the time we @@ -5303,83 +4437,127 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * be to set the "tcp_syn_defense" flag now. 
*/ TCP_STAT(tcps, tcp_listendropq0); - tcp->tcp_last_rcv_lbolt = lbolt64; - if (!tcp_drop_q0(tcp)) { - mutex_exit(&tcp->tcp_eager_lock); + listener->tcp_last_rcv_lbolt = lbolt64; + if (!tcp_drop_q0(listener)) { + mutex_exit(&listener->tcp_eager_lock); BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, - "tcp_conn_request: listen half-open queue " - "(max=%d) full (%d pending) on %s", + "tcp_input_listener: listen half-open " + "queue (max=%d) full (%d pending) on %s", tcps->tcps_conn_req_max_q0, - tcp->tcp_conn_req_cnt_q0, - tcp_display(tcp, NULL, + listener->tcp_conn_req_cnt_q0, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); /* - * IP adds STRUIO_EAGER and ensures that the received packet is - * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 - * link local address. If IPSec is enabled, db_struioflag has - * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); - * otherwise an error case if neither of them is set. + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - econnp = (conn_t *)tcp_get_conn(arg2, tcps); - if (econnp == NULL) - goto error2; - ASSERT(econnp->conn_netstack == connp->conn_netstack); - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { - /* - * mp is updated in tcp_get_ipsec_conn(). - */ - econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); - if (econnp == NULL) { - /* - * mp freed by tcp_get_ipsec_conn. 
- */ - return; - } - ASSERT(econnp->conn_netstack == connp->conn_netstack); - } else { + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; + + econnp = (conn_t *)tcp_get_conn(arg2, tcps); + if (econnp == NULL) goto error2; - } - ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(econnp->conn_netstack == lconnp->conn_netstack); + econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; + econnp->conn_ixa->ixa_sqp = new_sqp; + + econnp->conn_fport = tcpha->tha_lport; + econnp->conn_lport = tcpha->tha_fport; + + err = conn_inherit_parent(lconnp, econnp); + if (err != 0) + goto error3; - ipvers = IPH_HDR_VERSION(mp->b_rptr); - ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); ASSERT(OK_32PTR(mp->b_rptr)); - if (ipvers == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - ip_hdr_len = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } else { - ip6h = (ip6_t *)mp->b_rptr; - ip_hdr_len = ip_hdr_length_v6(mp, ip6h); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || + IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); - if (tcp->tcp_family == AF_INET) { - ASSERT(ipvers == IPV4_VERSION); - err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); + if (lconnp->conn_family == AF_INET) { + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); } else { - err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); + tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); } - if (err) + if (tpi_mp == NULL) goto error3; eager = econnp->conn_tcp; + eager->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(eager->tcp_connid); + + tcp_init_values(eager); + + ASSERT((econnp->conn_ixa->ixa_flags & + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); + + if (!tcps->tcps_dev_flow_ctl) + econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + + /* 
Prepare for diffing against previous packets */ + eager->tcp_recvifindex = 0; + eager->tcp_recvhops = 0xffffffffU; + + if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || + IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { + econnp->conn_incoming_ifindex = ifindex; + econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + econnp->conn_ixa->ixa_scopeid = ifindex; + } + } + + if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == + (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && + tcps->tcps_rev_src_routes) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ip_pkt_t *ipp = &econnp->conn_xmit_ipp; + + /* Source routing option copyover (reverse it) */ + err = ip_find_hdr_v4(ipha, ipp, B_TRUE); + if (err != 0) { + freemsg(tpi_mp); + goto error3; + } + ip_pkt_source_route_reverse_v4(ipp); + } + + ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!eager->tcp_tconnind_started); + /* + * If the SYN came with a credential, it's a loopback packet or a + * labeled packet; attach the credential to the TPI message. 
+ */ + if (ira->ira_cred != NULL) + mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); + + eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; + + /* Inherit the listener's SSL protection state */ + if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { + kssl_hold_ent(eager->tcp_kssl_ent); + eager->tcp_kssl_pending = B_TRUE; + } + + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + econnp->conn_flags |= IPCL_NONSTR; + } + ASSERT(eager->tcp_ordrel_mp == NULL); if (!IPCL_IS_NONSTR(econnp)) { @@ -5392,127 +4570,103 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) goto error3; } - /* Inherit various TCP parameters from the listener */ - eager->tcp_naglim = tcp->tcp_naglim; - eager->tcp_first_timer_threshold = tcp->tcp_first_timer_threshold; - eager->tcp_second_timer_threshold = tcp->tcp_second_timer_threshold; - - eager->tcp_first_ctimer_threshold = tcp->tcp_first_ctimer_threshold; - eager->tcp_second_ctimer_threshold = tcp->tcp_second_ctimer_threshold; - /* - * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. - * If it does not, the eager's receive window will be set to the - * listener's receive window later in this function. + * Now that the IP addresses and ports are setup in econnp we + * can do the IPsec policy work. */ - eager->tcp_rwnd = 0; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (lconnp->conn_policy != NULL) { + /* + * Inherit the policy from the listener; use + * actions from ira + */ + if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { + CONN_DEC_REF(econnp); + freemsg(mp); + goto error3; + } + } + } - /* - * Inherit listener's tcp_init_cwnd. Need to do this before - * calling tcp_process_options() where tcp_mss_set() is called - * to set the initial cwnd. 
- */ - eager->tcp_init_cwnd = tcp->tcp_init_cwnd; + /* Inherit various TCP parameters from the listener */ + eager->tcp_naglim = listener->tcp_naglim; + eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; + eager->tcp_second_timer_threshold = + listener->tcp_second_timer_threshold; + eager->tcp_first_ctimer_threshold = + listener->tcp_first_ctimer_threshold; + eager->tcp_second_ctimer_threshold = + listener->tcp_second_ctimer_threshold; /* - * Zones: tcp_adapt_ire() and tcp_send_data() both need the - * zone id before the accept is completed in tcp_wput_accept(). + * tcp_set_destination() may set tcp_rwnd according to the route + * metrics. If it does not, the eager's receive window will be set + * to the listener's receive window later in this function. */ - econnp->conn_zoneid = connp->conn_zoneid; - econnp->conn_allzones = connp->conn_allzones; - - /* Copy nexthop information from listener to eager */ - if (connp->conn_nexthop_set) { - econnp->conn_nexthop_set = connp->conn_nexthop_set; - econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; - } + eager->tcp_rwnd = 0; /* - * TSOL: tsol_input_proc() needs the eager's cred before the - * eager is accepted + * Inherit listener's tcp_init_cwnd. Need to do this before + * calling tcp_process_options() which set the initial cwnd. */ - econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; - crhold(credp); + eager->tcp_init_cwnd = listener->tcp_init_cwnd; - ASSERT(econnp->conn_effective_cred == NULL); if (is_system_labeled()) { - cred_t *cr; - ts_label_t *tsl; - - /* - * If this is an MLP connection or a MAC-Exempt connection - * with an unlabeled node, packets are to be - * exchanged using the security label of the received - * SYN packet instead of the server application's label. 
- */ - if ((cr = msg_getcred(mp, NULL)) != NULL && - (tsl = crgetlabel(cr)) != NULL && - (connp->conn_mlp_type != mlptSingle || - (connp->conn_mac_mode != CONN_MAC_AWARE && - (tsl->tsl_flags & TSLF_UNLABELED)))) { - if ((econnp->conn_effective_cred = - copycred_from_tslabel(econnp->conn_cred, - tsl, KM_NOSLEEP)) != NULL) { - DTRACE_PROBE2( - syn_accept_peerlabel, - conn_t *, econnp, cred_t *, - econnp->conn_effective_cred); - } else { - DTRACE_PROBE3( - tx__ip__log__error__set__eagercred__tcp, - char *, - "SYN mp(1) label on eager connp(2) failed", - mblk_t *, mp, conn_t *, econnp); - goto error3; - } + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + if ((lconnp->conn_mlp_type != mlptSingle || + lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * SYN packet instead of the server application's label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. + */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + DTRACE_PROBE2(mlp_syn_accept, conn_t *, + econnp, ts_label_t *, ixa->ixa_tsl) } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); DTRACE_PROBE2(syn_accept, conn_t *, - econnp, cred_t *, econnp->conn_cred) + econnp, ts_label_t *, ixa->ixa_tsl) } - /* - * Verify the destination is allowed to receive packets - * at the security label of the SYN-ACK we are generating. - * tsol_check_dest() may create a new effective cred for - * this connection with a modified label or label flags. 
+ * conn_connect() called from tcp_set_destination will verify + * the destination is allowed to receive packets at the + * security label of the SYN-ACK we are generating. As part of + * that, tsol_check_dest() may create a new effective label for + * this connection. + * Finally conn_connect() will call conn_update_label. + * All that remains for TCP to do is to call + * conn_build_hdr_template which is done as part of + * tcp_set_destination. */ - if (IN6_IS_ADDR_V4MAPPED(&econnp->conn_remv6)) { - uint32_t dst; - IN6_V4MAPPED_TO_IPADDR(&econnp->conn_remv6, dst); - err = tsol_check_dest(CONN_CRED(econnp), &dst, - IPV4_VERSION, B_FALSE, &cr); - } else { - err = tsol_check_dest(CONN_CRED(econnp), - &econnp->conn_remv6, IPV6_VERSION, - B_FALSE, &cr); - } - if (err != 0) - goto error3; - if (cr != NULL) { - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = cr; - } - - /* - * Generate the security label to be used in the text of - * this connection's outgoing packets. - */ - if (!tcp_update_label(eager, CONN_CRED(econnp))) { - DTRACE_PROBE3( - tx__ip__log__error__connrequest__tcp, - char *, "eager connp(1) label on SYN mp(2) failed", - conn_t *, econnp, mblk_t *, mp); - goto error3; - } } + /* + * Since we will clear tcp_listener before we clear tcp_detached + * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress + * so we can tell a TCP_DETACHED_NONEAGER apart. + */ eager->tcp_hard_binding = B_TRUE; tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ - TCP_BIND_HASH(eager->tcp_lport)], eager, 0); + TCP_BIND_HASH(econnp->conn_lport)], eager, 0); - CL_INET_CONNECT(connp, eager, B_FALSE, err); + CL_INET_CONNECT(econnp, B_FALSE, err); if (err != 0) { tcp_bind_hash_remove(eager); goto error3; @@ -5528,32 +4682,27 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) SOCK_CONNID_BUMP(eager->tcp_connid); /* - * There should be no ire in the mp as we are being called after - * receiving the SYN. 
- */ - ASSERT(tcp_ire_mp(&mp) == NULL); - - /* - * Adapt our mss, ttl, ... according to information provided in IRE. + * Adapt our mss, ttl, ... based on the remote address. */ - if (tcp_adapt_ire(eager, NULL) == 0) { + if (tcp_set_destination(eager) != 0) { + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); /* Undo the bind_hash_insert */ tcp_bind_hash_remove(eager); goto error3; } /* Process all TCP options. */ - tcp_process_options(eager, tcph); + tcp_process_options(eager, tcpha); /* Is the other end ECN capable? */ if (tcps->tcps_ecn_permitted >= 1 && - (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { + (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { eager->tcp_ecn_ok = B_TRUE; } /* - * listeners tcp_recv_hiwater should be the default window size or a + * The listener's conn_rcvbuf should be the default window size or a * window size changed via SO_RCVBUF option. First round up the * eager's tcp_rwnd to the nearest MSS. Then find out the window * scale option value if needed. Call tcp_rwnd_set() to finish the @@ -5563,7 +4712,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * we should not inherit receive window size from listener. */ eager->tcp_rwnd = MSS_ROUNDUP( - (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater: + (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : eager->tcp_rwnd), eager->tcp_mss); if (eager->tcp_snd_ws_ok) tcp_set_ws_value(eager); @@ -5575,77 +4724,46 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) */ (void) tcp_rwnd_set(eager, eager->tcp_rwnd); - /* - * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ - * via soaccept()->soinheritoptions() which essentially applies - * all the listener options to the new STREAM. The options that we - * need to take care of are: - * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, - * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, - * SO_SNDBUF, SO_RCVBUF. - * - * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 
- * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When - * tcp_maxpsz_set() gets called later from - * tcp_accept_finish(), the option takes effect. - * - */ - /* Set the TCP options */ - eager->tcp_recv_lowater = tcp->tcp_recv_lowater; - eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; - eager->tcp_dgram_errind = tcp->tcp_dgram_errind; - eager->tcp_oobinline = tcp->tcp_oobinline; - eager->tcp_reuseaddr = tcp->tcp_reuseaddr; - eager->tcp_broadcast = tcp->tcp_broadcast; - eager->tcp_useloopback = tcp->tcp_useloopback; - eager->tcp_dontroute = tcp->tcp_dontroute; - eager->tcp_debug = tcp->tcp_debug; - eager->tcp_linger = tcp->tcp_linger; - eager->tcp_lingertime = tcp->tcp_lingertime; - if (tcp->tcp_ka_enabled) - eager->tcp_ka_enabled = 1; - - ASSERT(eager->tcp_recv_hiwater != 0 && - eager->tcp_recv_hiwater == eager->tcp_rwnd); - - /* Set the IP options */ - econnp->conn_broadcast = connp->conn_broadcast; - econnp->conn_loopback = connp->conn_loopback; - econnp->conn_dontroute = connp->conn_dontroute; - econnp->conn_reuseaddr = connp->conn_reuseaddr; + ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && + eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); + + ASSERT(econnp->conn_rcvbuf != 0 && + econnp->conn_rcvbuf == eager->tcp_rwnd); /* Put a ref on the listener for the eager. 
*/ - CONN_INC_REF(connp); - mutex_enter(&tcp->tcp_eager_lock); - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; - eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; - tcp->tcp_eager_next_q0 = eager; - eager->tcp_eager_prev_q0 = tcp; + CONN_INC_REF(lconnp); + mutex_enter(&listener->tcp_eager_lock); + listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; + eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; + listener->tcp_eager_next_q0 = eager; + eager->tcp_eager_prev_q0 = listener; /* Set tcp_listener before adding it to tcp_conn_fanout */ - eager->tcp_listener = tcp; - eager->tcp_saved_listener = tcp; + eager->tcp_listener = listener; + eager->tcp_saved_listener = listener; /* * Tag this detached tcp vector for later retrieval * by our listener client in tcp_accept(). */ - eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; - tcp->tcp_conn_req_cnt_q0++; - if (++tcp->tcp_conn_req_seqnum == -1) { + eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; + listener->tcp_conn_req_cnt_q0++; + if (++listener->tcp_conn_req_seqnum == -1) { /* * -1 is "special" and defined in TPI as something * that should never be used in T_CONN_IND */ - ++tcp->tcp_conn_req_seqnum; + ++listener->tcp_conn_req_seqnum; } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); - if (tcp->tcp_syn_defense) { + if (listener->tcp_syn_defense) { /* Don't drop the SYN that comes from a good IP source */ - ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); - if (addr_cache != NULL && eager->tcp_remote == - addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { + ipaddr_t *addr_cache; + + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL && econnp->conn_faddr_v4 == + addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { eager->tcp_dontdrop = B_TRUE; } } @@ -5655,14 +4773,14 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * as we do that, we expose the eager to the classifier and * should not touch any field outside 
the eager's perimeter. * So do all the work necessary before inserting the eager - * in its own perimeter. Be optimistic that ipcl_conn_insert() + * in its own perimeter. Be optimistic that conn_connect() * will succeed but undo everything if it fails. */ - seg_seq = ABE32_TO_U32(tcph->th_seq); + seg_seq = ntohl(tcpha->tha_seq); eager->tcp_irs = seg_seq; eager->tcp_rack = seg_seq; eager->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); + eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); eager->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, @@ -5677,24 +4795,10 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) } /* - * Note that in theory this should use the current pid - * so that getpeerucred on the client returns the actual listener - * that does accept. But accept() hasn't been called yet. We could use - * the pid of the process that did bind/listen on the server. - * However, with common usage like inetd() the bind/listen can be done - * by a different process than the accept(). - * Hence we do the simple thing of using the open pid here. - * Note that db_credp is set later in tcp_send_data(). - */ - mblk_setcred(mp1, credp, tcp->tcp_cpid); - eager->tcp_cpid = tcp->tcp_cpid; - eager->tcp_open_time = lbolt64; - - /* * We need to start the rto timer. In normal case, we start * the timer after sending the packet on the wire (or at * least believing that packet was sent by waiting for - * CALL_IP_WPUT() to return). Since this is the first packet + * conn_ip_output() to return). Since this is the first packet * being sent on the wire for the eager, our initial tcp_rto * is at least tcp_rexmit_interval_min which is a fairly * large value to allow the algorithm to adjust slowly to large @@ -5716,7 +4820,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * ensure against an eager close race. 
*/ - CONN_INC_REF(eager->tcp_connp); + CONN_INC_REF(econnp); TCP_TIMER_RESTART(eager, eager->tcp_rto); @@ -5724,22 +4828,16 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * Insert the eager in its own perimeter now. We are ready to deal * with any packets on eager. */ - if (eager->tcp_ipversion == IPV4_VERSION) { - if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { - goto error; - } - } else { - if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { - goto error; - } - } - - /* mark conn as fully-bound */ - econnp->conn_fully_bound = B_TRUE; + if (ipcl_conn_insert(econnp) != 0) + goto error; - /* Send the SYN-ACK */ - tcp_send_data(eager, eager->tcp_wq, mp1); - CONN_DEC_REF(eager->tcp_connp); + /* + * Send the SYN-ACK. Can't use tcp_send_data since we can't update + * pmtu etc; we are not on the eager's squeue + */ + ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); + (void) conn_ip_output(mp1, econnp->conn_ixa); + CONN_DEC_REF(econnp); freemsg(mp); return; @@ -5749,7 +4847,7 @@ error: TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); mp1 = &eager->tcp_closemp; SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, - econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2); + econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); /* * If a connection already exists, send the mp to that connections so @@ -5757,7 +4855,7 @@ error: */ ipst = tcps->tcps_netstack->netstack_ip; - if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) { + if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { if (!IPCL_IS_CONNECTED(econnp)) { /* * Something bad happened. 
ipcl_conn_insert() @@ -5772,8 +4870,8 @@ error: CONN_DEC_REF(econnp); freemsg(mp); } else { - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, - tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1); + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, + econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); } } else { /* Nobody wants this packet */ @@ -5803,18 +4901,21 @@ error2: * very first time and there is no attempt to rebind them. */ void -tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; squeue_t *new_sqp; uint32_t conn_flags; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - } else { - goto done; - } + /* + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. + */ + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; if (connp->conn_fanout == NULL) goto done; @@ -5849,6 +4950,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) if (connp->conn_sqp != new_sqp) { while (connp->conn_sqp != new_sqp) (void) casptr(&connp->conn_sqp, sqp, new_sqp); + /* No special MT issues for outbound ixa_sqp hint */ + connp->conn_ixa->ixa_sqp = new_sqp; } do { @@ -5860,49 +4963,47 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) mutex_exit(&connp->conn_fanout->connf_lock); mutex_exit(&connp->conn_lock); + + /* + * Assume we have picked a good squeue for the listener. Make + * subsequent SYNs not try to change the squeue. 
+ */ + connp->conn_recv = tcp_input_listener; } done: if (connp->conn_sqp != sqp) { CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); + ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); } else { - tcp_conn_request(connp, mp, sqp); + tcp_input_listener(connp, mp, sqp, ira); } } /* * Successful connect request processing begins when our client passes - * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes - * our T_OK_ACK reply message upstream. The control flow looks like this: - * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP - * upstream <- tcp_rput() <- IP + * a T_CONN_REQ message into tcp_wput(), which performs function calls into + * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). + * * After various error checks are completed, tcp_tpi_connect() lays - * the target address and port into the composite header template, - * preallocates the T_OK_ACK reply message, construct a full 12 byte bind - * request followed by an IRE request, and passes the three mblk message - * down to IP looking like this: - * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client - * Processing continues in tcp_rput() when we receive the following message: - * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client - * After consuming the first two mblks, tcp_rput() calls tcp_timer(), - * to fire off the connection request, and then passes the T_OK_ACK mblk - * upstream that we filled in below. There are, of course, numerous - * error conditions along the way which truncate the processing described - * above. + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we prepare to send the SYN packet, and then + * send up the T_OK_ACK reply message. 
*/ static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) { sin_t *sin; - queue_t *q = tcp->tcp_wq; struct T_conn_req *tcr; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; pid_t cpid; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* * All Solaris components should pass a db_credp @@ -5944,7 +5045,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. */ switch (tcr->DEST_length) { default: @@ -6022,7 +5123,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) break; } - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { tcp_err_ack(tcp, mp, TSYSERR, error); return; @@ -6111,7 +5212,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) /* return error ack and blow away saved option results if any */ connect_failed: if (mp != NULL) - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); else { tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); @@ -6121,20 +5222,19 @@ connect_failed: /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. 
*/ static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, - uint_t srcid, cred_t *cr, pid_t pid) + uint_t srcid) { - tcph_t *tcph; - mblk_t *mp; - ipaddr_t dstaddr = *dstaddrp; - int32_t oldstate; - uint16_t lport; - int error = 0; + ipaddr_t dstaddr = *dstaddrp; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + int error; - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); + ASSERT(connp->conn_ipversion == IPV4_VERSION); /* Check for attempt to connect to INADDR_ANY */ if (dstaddr == INADDR_ANY) { @@ -6157,74 +5257,21 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { - ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, - tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, - tcp->tcp_ipha->ipha_src); + if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (dstaddr == tcp->tcp_ipha->ipha_src && - dstport == tcp->tcp_lport) { - error = -TBADADDR; - goto failed; - } + IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); + connp->conn_fport = dstport; /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * tsol_check_dest() may create a new effective cred for this - * connection with a modified label or label flags. 
- */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - &dstaddr, IPV4_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ipha->ipha_dst = dstaddr; - IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); - - /* - * Massage a source route if any putting the first hop - * in iph_dst. Compute a starting value for the checksum which - * takes into account that the original iph_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. - */ - tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack); - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; - - oldstate = tcp->tcp_state; - /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. 
*/ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6233,203 +5280,93 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } - } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp == NULL) { - tcp->tcp_state = oldstate; - error = ENOMEM; - goto failed; + if (lport == 0) + return (-TNOADDR); } - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. - * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ipha->ipha_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. 
*/ - tcp->tcp_connp->conn_recv = tcp_input; + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp, - IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport, - tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } else { - in6_addr_t v6src; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp, - IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); + tcp->tcp_state = TCPS_SYN_SENT; - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v4(connp)); } /* * Handle connect to IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. */ static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, - uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid) + uint32_t flowinfo, uint_t srcid, uint32_t scope_id) { - tcph_t *tcph; - mblk_t *mp; - ip6_rthdr_t *rth; - int32_t oldstate; - uint16_t lport; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - int error = 0; - conn_t *connp = tcp->tcp_connp; + int error; - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); /* * If we're here, it means that the destination address is a native - * IPv6 address. 
Return an error if tcp_ipversion is not IPv6. A + * IPv6 address. Return an error if conn_ipversion is not IPv6. A * reason why it might not be IPv6 is if the socket was bound to an * IPv4-mapped IPv6 address. */ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) return (-TBADADDR); - } /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. */ - if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { + if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) *dstaddrp = ipv6_loopback; - } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, - connp->conn_zoneid, tcps->tcps_netstack); - tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; - } - - /* - * Take care of the scope_id now and add ip6i_t - * if ip6i_t is not already allocated through TCP - * sticky options. At this point tcp_ip6h does not - * have dst info, thus use dstaddrp. - */ - if (scope_id != 0 && - IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - ip6i_t *ip6i; - - ipp->ipp_ifindex = scope_id; - ip6i = (ip6i_t *)tcp->tcp_iphc; - - if ((ipp->ipp_fields & IPPF_HAS_IP6I) && - ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { - /* Already allocated */ - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = ipp->ipp_ifindex; - ipp->ipp_fields |= IPPF_SCOPE_ID; - } else { - int reterr; - - ipp->ipp_fields |= IPPF_SCOPE_ID; - if (ipp->ipp_fields & IPPF_HAS_IP6I) - ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - goto failed; - ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); - } - } - - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. 
In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && - (dstport == tcp->tcp_lport)) { - error = -TBADADDR; - goto failed; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * check_dest may create a new effective cred for this - * connection with a modified label or label flags. + * Take care of the scope_id now. */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - dstaddrp, IPV6_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ip6h->ip6_dst = *dstaddrp; - tcp->tcp_remote_v6 = *dstaddrp; - tcp->tcp_ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - /* - * Massage a routing header (if present) putting the first hop - * in ip6_dst. Compute a starting value for the checksum which - * takes into account that the original ip6_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. 
- */ - rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth, - tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); + if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scope_id; } else { - tcp->tcp_sum = 0; + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; + connp->conn_flowinfo = flowinfo; + connp->conn_faddr_v6 = *dstaddrp; + connp->conn_fport = dstport; - oldstate = tcp->tcp_state; /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. */ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6438,128 +5375,55 @@ tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } + if (lport == 0) + return (-TNOADDR); } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp != NULL) { - in6_addr_t v6src; - - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. 
- * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). - */ - tcp->tcp_connp->conn_recv = tcp_input; + /* + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ip6h->ip6_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. + */ + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP, - &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); - } - /* Error case */ - tcp->tcp_state = oldstate; - error = ENOMEM; + tcp->tcp_state = TCPS_SYN_SENT; -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v6(connp)); } /* - * We need a stream q for detached closing tcp connections - * to use. Our client hereby indicates that this q is the - * one to use. + * Disconnect + * Note that unlike other functions this returns a positive tli error + * when it fails; it never returns an errno. 
*/ -static void -tcp_def_q_set(tcp_t *tcp, mblk_t *mp) -{ - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - queue_t *q = tcp->tcp_wq; - tcp_stack_t *tcps = tcp->tcp_tcps; - -#ifdef NS_DEBUG - (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - mp->b_datap->db_type = M_IOCACK; - iocp->ioc_count = 0; - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = EALREADY; - } else { - int error = 0; - conn_t *connp = tcp->tcp_connp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - tcps->tcps_g_q = tcp->tcp_rq; - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = 0; - iocp->ioc_rval = 0; - /* - * We are passing tcp_sticky_ipp as NULL - * as it is not useful for tcp_default queue - * - * Set conn_recv just in case. - */ - tcp->tcp_connp->conn_recv = tcp_conn_request; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = IPPROTO_TCP; - - if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head != - NULL || (connp->conn_mac_mode != CONN_MAC_DEFAULT)) { - error = -TBADADDR; - } else { - connp->conn_srcv6 = ipv6_all_zeros; - ipcl_proto_insert_v6(connp, IPPROTO_TCP); - } - - (void) tcp_post_ip_bind(tcp, NULL, error, NULL, 0); - } - qreply(q, mp); -} - static int tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) { tcp_t *ltcp = NULL; - conn_t *connp; + conn_t *lconnp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Right now, upper modules pass down a T_DISCON_REQ to TCP, * when the stream is in BOUND state. Do not send a reset, * since the destination IP address is not valid, and it can * be the initialized value of all zeros (broadcast address). - * - * XXX There won't be any pending bind request to IP. 
*/ - if (tcp->tcp_state <= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_disconnect: bad state, %d", tcp->tcp_state); } @@ -6595,19 +5459,23 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) * If it used to be a listener, check to make sure no one else * has taken the port before switching back to LISTEN state. */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - connp = ipcl_lookup_listener_v4(tcp->tcp_lport, - tcp->tcp_ipha->ipha_src, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + if (connp->conn_ipversion == IPV4_VERSION) { + lconnp = ipcl_lookup_listener_v4(connp->conn_lport, + connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } else { - /* Allow tcp_bound_if listeners? */ - connp = ipcl_lookup_listener_v6(tcp->tcp_lport, - &tcp->tcp_ip6h->ip6_src, 0, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + uint_t ifindex = 0; + + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) + ifindex = connp->conn_ixa->ixa_scopeid; + + /* Allow conn_bound_if listeners? 
*/ + lconnp = ipcl_lookup_listener_v6(connp->conn_lport, + &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp), + ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } if (tcp->tcp_conn_req_max && ltcp == NULL) { tcp->tcp_state = TCPS_LISTEN; @@ -6616,7 +5484,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) tcp->tcp_state = TCPS_BOUND; } if (ltcp != NULL) - CONN_DEC_REF(ltcp->tcp_connp); + CONN_DEC_REF(lconnp); if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); } else if (old_state == TCPS_ESTABLISHED || @@ -6648,7 +5516,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) /* * Our client hereby directs us to reject the connection request - * that tcp_conn_request() marked with 'seqnum'. Rejection consists + * that tcp_input_listener() marked with 'seqnum'. Rejection consists * of sending the appropriate RST, not an ICMP error. */ static void @@ -6656,6 +5524,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) { t_scalar_t seqnum; int error; + conn_t *connp = tcp->tcp_connp; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { @@ -6669,11 +5538,11 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) else { if (tcp->tcp_state >= TCPS_ESTABLISHED) { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); } mp = mi_tpi_ok_ack_alloc(mp); - if (mp) - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -6695,6 +5564,7 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) in6_addr_t local, remote; char local_addrbuf[INET6_ADDRSTRLEN]; char remote_addrbuf[INET6_ADDRSTRLEN]; + conn_t *connp; if (sup_buf != NULL) buf = sup_buf; @@ -6703,6 +5573,8 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) if (tcp == NULL) return ("NULL_TCP"); + + connp = tcp->tcp_connp; switch (tcp->tcp_state) { case TCPS_CLOSED: cp = 
"TCP_CLOSED"; @@ -6750,32 +5622,32 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) } switch (format) { case DISP_ADDR_AND_PORT: - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { /* * Note that we use the remote address in the tcp_b * structure. This means that it will print out * the real destination address, not the next hop's * address if source routing is used. */ - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); + IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); + IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); } else { - local = tcp->tcp_ip_src_v6; - remote = tcp->tcp_remote_v6; + local = connp->conn_laddr_v6; + remote = connp->conn_faddr_v6; } (void) inet_ntop(AF_INET6, &local, local_addrbuf, sizeof (local_addrbuf)); (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, sizeof (remote_addrbuf)); (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", - local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, - ntohs(tcp->tcp_fport), cp); + local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, + ntohs(connp->conn_fport), cp); break; case DISP_PORT_ONLY: default: (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); break; } @@ -6788,26 +5660,24 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) * eager to disappear either by means of tcp_eager_blowoff() or * tcp_eager_cleanup() being called. tcp_eager_kill() can also be * called (via squeue) if the eager cannot be inserted in the - * fanout table in tcp_conn_request(). + * fanout table in tcp_input_listener(). 
*/ /* ARGSUSED */ void -tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) +tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *econnp = (conn_t *)arg; tcp_t *eager = econnp->conn_tcp; tcp_t *listener = eager->tcp_listener; - tcp_stack_t *tcps = eager->tcp_tcps; /* * We could be called because listener is closing. Since - * the eager is using listener's queue's, its not safe. - * Better use the default queue just to send the TH_RST - * out. + * the eager was using listener's queue's, we avoid + * using the listeners queues from now on. */ - ASSERT(tcps->tcps_g_q != NULL); - eager->tcp_rq = tcps->tcps_g_q; - eager->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(eager->tcp_detached); + econnp->conn_rq = NULL; + econnp->conn_wq = NULL; /* * An eager's conn_fanout will be NULL if it's a duplicate @@ -6828,7 +5698,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) * The eager has sent a conn_ind up to the * listener but listener decides to close * instead. We need to drop the extra ref - * placed on eager in tcp_rput_data() before + * placed on eager in tcp_input_data() before * sending the conn_ind to listener. 
*/ CONN_DEC_REF(econnp); @@ -6873,7 +5743,7 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); + eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); return (B_TRUE); } @@ -6901,7 +5771,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); } eager = eager->tcp_eager_next_q; @@ -6917,7 +5787,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, SQ_FILL, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP_Q0); } eager = eager->tcp_eager_next_q0; @@ -7008,7 +5878,7 @@ static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) { if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Shorthand to generate and send TPI error acks to our client */ @@ -7024,7 +5894,7 @@ tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, teackp->ERROR_prim = primitive; teackp->TLI_error = t_error; teackp->UNIX_error = sys_error; - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } } @@ -7194,8 +6064,9 @@ static void tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_family == AF_INET6) + if (connp->conn_family == AF_INET6) *tia = tcp_g_t_info_ack_v6; else *tia = tcp_g_t_info_ack; @@ -7203,7 +6074,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) tia->OPT_size = tcp_max_optsize; if (tcp->tcp_mss == 0) 
{ /* Not yet set - tcp_open does not set mss */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tia->TIDU_size = tcps->tcps_mss_def_ipv4; else tia->TIDU_size = tcps->tcps_mss_def_ipv6; @@ -7258,7 +6129,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp) tcap = (struct T_capability_ack *)mp->b_rptr; tcp_do_capability_ack(tcp, tcap, cap_bits1); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* @@ -7276,16 +6147,18 @@ tcp_info_req(tcp_t *tcp, mblk_t *mp) return; } tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Respond to the TPI addr request */ static void tcp_addr_req(tcp_t *tcp, mblk_t *mp) { - sin_t *sin; + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; + conn_t *connp = tcp->tcp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -7295,10 +6168,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) return; } - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp_addr_req_ipv6(tcp, ackmp); - return; - } taa = (struct T_addr_ack *)ackmp->b_rptr; bzero(taa, sizeof (struct T_addr_ack)); @@ -7307,110 +6176,38 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
*/ if (tcp->tcp_state >= TCPS_BOUND) { /* - * Fill in local address + * Fill in local address first */ - taa->LOCADDR_length = sizeof (sin_t); taa->LOCADDR_offset = sizeof (*taa); - - sin = (sin_t *)&taa[1]; - - /* Fill zeroes and then intialize non-zero fields */ - *sin = sin_null; - - sin->sin_family = AF_INET; - - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = tcp->tcp_remote; - sin->sin_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); -} - -/* Assumes that tcp_addr_req gets enough space and alignment */ -static void -tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) -{ - sin6_t *sin6; - struct T_addr_ack *taa; - - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - ASSERT(OK_32PTR(ackmp->b_rptr)); - ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + - 2 * sizeof (sin6_t)); - - taa = (struct T_addr_ack *)ackmp->b_rptr; - - bzero(taa, sizeof (struct T_addr_ack)); - ackmp->b_wptr = (uchar_t *)&taa[1]; - - taa->PRIM_type = T_ADDR_ACK; - ackmp->b_datap->db_type = M_PCPROTO; - - /* - * Note: Following code assumes 32 bit alignment of basic - * data structures like sin6_t and struct T_addr_ack. 
- */ - if (tcp->tcp_state >= TCPS_BOUND) { + if (tcp->tcp_state >= TCPS_SYN_RCVD) { /* - * Fill in local address + * Fill in Remote address */ - taa->LOCADDR_length = sizeof (sin6_t); - taa->LOCADDR_offset = sizeof (*taa); - - sin6 = (sin6_t *)&taa[1]; - *sin6 = sin6_null; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - sin6->sin6_port = tcp->tcp_lport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin6_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_flowinfo = - tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = tcp->tcp_remote_v6; - sin6->sin6_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); + ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); + putnext(tcp->tcp_connp->conn_rq, ackmp); } /* @@ -7420,19 +6217,19 @@ tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) static void tcp_reinit(tcp_t *tcp) { - mblk_t *mp; - int err; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; TCP_STAT(tcps, tcp_reinit_calls); /* tcp_reinit should never be called for detached tcp_t's */ ASSERT(tcp->tcp_listener == NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family 
== AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* Cancel outstanding timers */ tcp_timers_stop(tcp); @@ -7453,7 +6250,7 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -7494,7 +6291,7 @@ tcp_reinit(tcp_t *tcp) */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); - CL_INET_DISCONNECT(tcp->tcp_connp, tcp); + CL_INET_DISCONNECT(connp); /* * The connection can't be on the tcp_time_wait_head list @@ -7522,14 +6319,12 @@ tcp_reinit(tcp_t *tcp) * Reset/preserve other values */ tcp_reinit_values(tcp); - ipcl_hash_remove(tcp->tcp_connp); - conn_delete_ire(tcp->tcp_connp, NULL); + ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); tcp_ipsec_cleanup(tcp); - if (tcp->tcp_connp->conn_effective_cred != NULL) { - crfree(tcp->tcp_connp->conn_effective_cred); - tcp->tcp_connp->conn_effective_cred = NULL; - } + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; if (tcp->tcp_conn_req_max != 0) { /* @@ -7553,44 +6348,31 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; tcp->tcp_eager_next_drop_q0 = tcp; tcp->tcp_eager_prev_drop_q0 = tcp; - tcp->tcp_connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, - &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, tcp->tcp_lport); - } + /* + * Initially set conn_recv to tcp_input_listener_unbound to try + * to pick a good squeue for the listener when the first SYN + * arrives. 
tcp_input_listener_unbound sets it to + * tcp_input_listener on that first SYN. + */ + connp->conn_recv = tcp_input_listener_unbound; + + connp->conn_proto = IPPROTO_TCP; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + + (void) ipcl_bind_insert(connp); } else { tcp->tcp_state = TCPS_BOUND; } /* * Initialize to default values - * Can't fail since enough header template space already allocated - * at open(). - */ - err = tcp_init_values(tcp); - ASSERT(err == 0); - /* Restore state in tcp_tcph */ - bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; - else - tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; - /* - * Copy of the src addr. in tcp_t is needed in tcp_t - * since the lookup funcs can only lookup on tcp_t */ - tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; + tcp_init_values(tcp); ASSERT(tcp->tcp_ptpbhn != NULL); - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ? 
tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; } @@ -7606,6 +6388,7 @@ tcp_reinit_values(tcp) tcp_t *tcp; { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; #ifndef lint #define DONTCARE(x) @@ -7626,8 +6409,8 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_time_wait_prev == NULL); ASSERT(tcp->tcp_time_wait_expire == 0); PRESERVE(tcp->tcp_state); - PRESERVE(tcp->tcp_rq); - PRESERVE(tcp->tcp_wq); + PRESERVE(connp->conn_rq); + PRESERVE(connp->conn_wq); ASSERT(tcp->tcp_xmit_head == NULL); ASSERT(tcp->tcp_xmit_last == NULL); @@ -7638,26 +6421,32 @@ tcp_reinit_values(tcp) tcp->tcp_snxt = 0; /* Displayed in mib */ tcp->tcp_suna = 0; /* Displayed in mib */ tcp->tcp_swnd = 0; - DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ + DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ ASSERT(tcp->tcp_ibsegs == 0); ASSERT(tcp->tcp_obsegs == 0); - if (tcp->tcp_iphc != NULL) { - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; } + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); + DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_ipha); DONTCARE(tcp->tcp_ip6h); - DONTCARE(tcp->tcp_ip_hdr_len); - DONTCARE(tcp->tcp_tcph); - DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ + DONTCARE(tcp->tcp_tcpha); tcp->tcp_valid_bits = 0; - DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ tcp->tcp_last_rcv_lbolt = 0; 
@@ -7666,38 +6455,19 @@ tcp_reinit_values(tcp) tcp->tcp_urp_last_valid = 0; tcp->tcp_hard_binding = 0; - tcp->tcp_hard_bound = 0; - PRESERVE(tcp->tcp_cred); - PRESERVE(tcp->tcp_cpid); - PRESERVE(tcp->tcp_open_time); - PRESERVE(tcp->tcp_exclbind); tcp->tcp_fin_acked = 0; tcp->tcp_fin_rcvd = 0; tcp->tcp_fin_sent = 0; tcp->tcp_ordrel_done = 0; - tcp->tcp_debug = 0; - tcp->tcp_dontroute = 0; - tcp->tcp_broadcast = 0; - - tcp->tcp_useloopback = 0; - tcp->tcp_reuseaddr = 0; - tcp->tcp_oobinline = 0; - tcp->tcp_dgram_errind = 0; - tcp->tcp_detached = 0; - tcp->tcp_bind_pending = 0; - tcp->tcp_unbind_pending = 0; tcp->tcp_snd_ws_ok = B_FALSE; tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_linger = 0; - tcp->tcp_ka_enabled = 0; tcp->tcp_zero_win_probe = 0; tcp->tcp_loopback = 0; - tcp->tcp_refuse = 0; tcp->tcp_localnet = 0; tcp->tcp_syn_defense = 0; tcp->tcp_set_timer = 0; @@ -7707,19 +6477,12 @@ tcp_reinit_values(tcp) tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_snd_sack_ok = B_FALSE; - PRESERVE(tcp->tcp_recvdstaddr); tcp->tcp_hwcksum = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ - - tcp->tcp_mdt = B_FALSE; - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ tcp->tcp_conn_def_q0 = 0; tcp->tcp_ip_forward_progress = B_FALSE; - tcp->tcp_anon_priv_bind = 0; tcp->tcp_ecn_ok = B_FALSE; tcp->tcp_cwr = B_FALSE; @@ -7740,7 +6503,7 @@ tcp_reinit_values(tcp) tcp->tcp_ts_recent = 0; tcp->tcp_rnxt = 0; /* Displayed in mib */ DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ - tcp->tcp_if_mtu = 0; + tcp->tcp_initial_pmtu = 0; ASSERT(tcp->tcp_reass_head == NULL); ASSERT(tcp->tcp_reass_tail == NULL); @@ -7752,7 +6515,7 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_rcv_last_tail == NULL); ASSERT(tcp->tcp_rcv_cnt == 0); - DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ + DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 
DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ tcp->tcp_csuna = 0; @@ -7773,8 +6536,6 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_listener == NULL); - DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ @@ -7785,14 +6546,11 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_conn_req_max); PRESERVE(tcp->tcp_conn_req_seqnum); - DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ - tcp->tcp_lingertime = 0; - DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ ASSERT(tcp->tcp_urp_mp == NULL); ASSERT(tcp->tcp_urp_mark_mp == NULL); @@ -7811,16 +6569,16 @@ tcp_reinit_values(tcp) tcp->tcp_client_errno = 0; - DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ + DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ - tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ + connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ - PRESERVE(tcp->tcp_bound_source_v6); + PRESERVE(connp->conn_bound_addr_v6); tcp->tcp_last_sent_len = 0; tcp->tcp_dupack_cnt = 0; - tcp->tcp_fport = 0; /* Displayed in MIB */ - PRESERVE(tcp->tcp_lport); + connp->conn_fport = 0; /* Displayed in MIB */ + PRESERVE(connp->conn_lport); PRESERVE(tcp->tcp_acceptor_lockp); @@ -7828,16 +6586,18 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_acceptor_id); DONTCARE(tcp->tcp_ipsec_overhead); - PRESERVE(tcp->tcp_family); - if (tcp->tcp_family == AF_INET6) { + PRESERVE(connp->conn_family); + /* Remove any remnants of mapped address binding */ + if (connp->conn_family == AF_INET6) { + connp->conn_ipversion = IPV6_VERSION; 
tcp->tcp_mss = tcps->tcps_mss_def_ipv6; } else { + connp->conn_ipversion = IPV4_VERSION; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; } - PRESERVE(tcp->tcp_ipversion); /* Init in tcp_init_values */ - tcp->tcp_bound_if = 0; - tcp->tcp_ipv6_recvancillary = 0; + connp->conn_bound_if = 0; + connp->conn_recv_ancillary.crb_all = 0; tcp->tcp_recvifindex = 0; tcp->tcp_recvhops = 0; tcp->tcp_closed = 0; @@ -7854,19 +6614,18 @@ tcp_reinit_values(tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr = NULL; tcp->tcp_rthdrlen = 0; } ASSERT(tcp->tcp_rthdrlen == 0); - PRESERVE(tcp->tcp_drop_opt_ack_cnt); /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7902,35 +6661,17 @@ tcp_reinit_values(tcp) #undef PRESERVE } -/* - * Allocate necessary resources and initialize state vector. - * Guaranteed not to fail so that when an error is returned, - * the caller doesn't need to do any additional cleanup. 
- */ -int -tcp_init(tcp_t *tcp, queue_t *q) -{ - int err; - - tcp->tcp_rq = q; - tcp->tcp_wq = WR(q); - tcp->tcp_state = TCPS_IDLE; - if ((err = tcp_init_values(tcp)) != 0) - tcp_timers_stop(tcp); - return (err); -} - -static int +static void tcp_init_values(tcp_t *tcp) { - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO @@ -7953,7 +6694,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; tcp->tcp_snd_burst = TCP_CWND_INFINITE; - tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier; + tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; @@ -7966,10 +6707,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_naglim = tcps->tcps_naglim_def; - /* NOTE: ISS is now set in tcp_adapt_ire(). */ - - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + /* NOTE: ISS is now set in tcp_set_destination(). 
*/ /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7977,280 +6715,84 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_fused_sigurg = B_FALSE; tcp->tcp_loopback_peer = NULL; - /* Initialize the header template */ - if (tcp->tcp_family == AF_INET) { - err = tcp_header_init_ipv4(tcp); - } else { - err = tcp_header_init_ipv6(tcp); - } - if (err) - return (err); + /* We rebuild the header template on the next connect/conn_request */ + + connp->conn_mlp_type = mlptSingle; /* * Init the window scale to the max so tcp_rwnd_set() won't pare - * down tcp_rwnd. tcp_adapt_ire() will set the right value later. + * down tcp_rwnd. tcp_set_destination() will set the right value later. */ tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; - tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat; - tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat; - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; + tcp->tcp_rwnd = connp->conn_rcvbuf; tcp->tcp_cork = B_FALSE; /* - * Init the tcp_debug option. This value determines whether TCP + * Init the tcp_debug option if it wasn't already set. This value + * determines whether TCP * calls strlog() to print out debug messages. Doing this * initialization here means that this value is not inherited thru * tcp_reinit(). */ - tcp->tcp_debug = tcps->tcps_dbg; + if (!connp->conn_debug) + connp->conn_debug = tcps->tcps_dbg; tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; - - return (0); -} - -/* - * Initialize the IPv4 header. Loses any record of any IP options. - */ -static int -tcp_header_init_ipv4(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. 
- */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. - */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - mutex_exit(&connp->conn_lock); - - tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = sizeof (ipha_t); - tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); - tcp->tcp_ipha->ipha_version_and_hdr_length - = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; - tcp->tcp_ipha->ipha_ident = 0; - - tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_tos = 0; - tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; - tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; - - tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single pseudo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route once IP_OPTIONS is set. 
- */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); -} - -/* - * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. - */ -static int -tcp_header_init_ipv6(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - * Ensure that there is enough space to "downgrade" the tcp_t - * to an IPv4 tcp_t. This requires having space for a full load - * of IPv4 options, as well as a full load of TCP options - * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space - * than a v6 header and a TCP header with a full load of TCP options - * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). - * We want to avoid reallocation in the "downgraded" case when - * processing outbound IPv4 options. - */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. 
- */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - tcp->tcp_ipha = NULL; - mutex_exit(&connp->conn_lock); - - /* Initialize the header template */ - - tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); - tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; - tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit; - - tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single psuedo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route when IPV6_RTHDR is set. - */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); } /* At minimum we need 8 bytes in the TCP header for the lookup */ #define ICMP_MIN_TCP_HDR 8 /* - * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages + * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages * passed up by IP. The message is always received on the correct tcp_t. * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ -void -tcp_icmp_error(tcp_t *tcp, mblk_t *mp) +/* ARGSUSED2 */ +static void +tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - tcph_t *tcph; - boolean_t ipsec_mctl = B_FALSE; - boolean_t secure; - mblk_t *first_mp = mp; - int32_t new_mss; - uint32_t ratio; - size_t mp_size = MBLKL(mp); - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Assume IP provides aligned packets - otherwise toss */ - if (!OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } - - /* - * Since ICMP errors are normal data marked with M_CTL when sent - * to TCP or UDP, we have to look for a IPSEC_IN value to identify - * packets starting with an ipsec_info_t, see ipsec_info.h. - */ - if ((mp_size == sizeof (ipsec_info_t)) && - (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { - ASSERT(mp->b_cont != NULL); - mp = mp->b_cont; - /* IP should have done this */ - ASSERT(OK_32PTR(mp->b_rptr)); - mp_size = MBLKL(mp); - ipsec_mctl = B_TRUE; - } + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + tcpha_t *tcpha; + uint32_t seg_seq; + tcp_t *tcp = connp->conn_tcp; - /* - * Verify that we have a complete outer IP header. If not, drop it. - */ - if (mp_size < sizeof (ipha_t)) { -noticmpv4: - freemsg(first_mp); - return; - } + /* Assume IP provides aligned packets */ + ASSERT(OK_32PTR(mp->b_rptr)); + ASSERT((MBLKL(mp) >= sizeof (ipha_t))); - ipha = (ipha_t *)mp->b_rptr; /* * Verify IP version. Anything other than IPv4 or IPv6 packet is sent * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
*/ - switch (IPH_HDR_VERSION(ipha)) { - case IPV6_VERSION: - tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); + if (!(ira->ira_flags & IRAF_IS_IPV4)) { + tcp_icmp_error_ipv6(tcp, mp, ira); return; - case IPV4_VERSION: - break; - default: - goto noticmpv4; } /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); + iph_hdr_length = ira->ira_ip_hdr_length; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; /* - * If we don't have the correct outer IP header length or if the ULP - * is not IPPROTO_ICMP or if we don't have a complete inner IP header - * send it upstream. + * If we don't have the correct outer IP header length + * or if we don't have a complete inner IP header + * drop it. */ if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_ICMP || (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { - goto noticmpv4; +noticmpv4: + freemsg(mp); + return; } ipha = (ipha_t *)&icmph[1]; /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)((char *)ipha + iph_hdr_length); + tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); /* * If we don't have the correct inner IP header length or if the ULP * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR @@ -8258,166 +6800,20 @@ noticmpv4: */ if (iph_hdr_length < sizeof (ipha_t) || ipha->ipha_protocol != IPPROTO_TCP || - (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { - goto noticmpv4; - } - - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. - */ - if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { - if (!tcp_check_policy(tcp, first_mp, - ipha, NULL, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. 
- */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - - seg_seq = ABE32_TO_U32(tcph->th_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * The ICMP message is bogus, just drop it. But if this is - * an ICMP too big message, IP has already changed - * the ire_max_frag to the bogus value. We need to change - * it back. - */ - if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && - icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; - int flag; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - flag = tcp->tcp_ipha-> - ipha_fragment_offset_and_flags; - } else { - flag = 0; - } - mutex_enter(&connp->conn_lock); - if ((ire = connp->conn_ire_cache) != NULL) { - mutex_enter(&ire->ire_lock); - mutex_exit(&connp->conn_lock); - ire->ire_max_frag = tcp->tcp_if_mtu; - ire->ire_frag_flag |= flag; - mutex_exit(&ire->ire_lock); - } else { - mutex_exit(&connp->conn_lock); - } - } + (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { goto noticmpv4; } + seg_seq = ntohl(tcpha->tha_seq); switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). 
Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu || - tcp->tcp_ipha->ipha_fragment_offset_and_flags == 0) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. Note that new_mss may be negative - * if tcp_ipsec_overhead is large and the - * icmph_du_mtu is the minimum value, which is 68. - */ - new_mss = ntohs(icmph->icmph_du_mtu) - - tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; - - DTRACE_PROBE2(tcp__pmtu__change, tcp_t *, tcp, int, - new_mss); - - /* - * Only update the MSS if the new one is - * smaller than the previous one. This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - /* - * Note that we are using the template header's DF - * bit in the fast path sending. So we need to compare - * the new mss with both tcps_mss_min and ip_pmtu_min. - * And stop doing IPv4 PMTUd if new_mss is less than - * MAX(tcps_mss_min, ip_pmtu_min). - */ - if (new_mss < tcps->tcps_mss_min || - new_mss < ipst->ips_ip_pmtu_min) { - tcp->tcp_ipha->ipha_fragment_offset_and_flags = - 0; - } - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. 
- */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -8451,7 +6847,6 @@ noticmpv4: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 7); } @@ -8483,67 +6878,191 @@ noticmpv4: break; } } - freemsg(first_mp); + freemsg(mp); } /* - * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 - * error messages passed up by IP. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. + * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might + * change. But it can refer to fields like tcp_suna and tcp_snxt. + * + * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP + * error messages received by IP. The message is always received on the correct + * tcp_t. + */ +/* ARGSUSED */ +static boolean_t +tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + tcpha_t *tcpha = (tcpha_t *)arg2; + uint32_t seq = ntohl(tcpha->tha_seq); + tcp_t *tcp = connp->conn_tcp; + + /* + * TCP sequence number contained in payload of the ICMP error message + * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, + * the message is either a stale ICMP error, or an attack from the + * network. Fail the verification. 
+ */ + if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) + return (B_FALSE); + + /* For "too big" we also check the ignore flag */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(icmph != NULL); + if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && + icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } else { + ASSERT(icmp6 != NULL); + if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Update the TCP connection according to change of PMTU. + * + * Path MTU might have changed by either increase or decrease, so need to + * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny + * or negative MSS, since tcp_mss_set() will do it. */ static void -tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) +tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { - icmp6_t *icmp6; - ip6_t *ip6h; - uint16_t iph_hdr_length; - tcpha_t *tcpha; - uint8_t *nexthdrp; - uint32_t new_mss; - uint32_t ratio; - boolean_t secure; - mblk_t *first_mp = mp; - size_t mp_size; - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; + uint32_t pmtu; + int32_t mss; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + iaflags_t ixaflags; + + if (tcp->tcp_tcps->tcps_ignore_path_mtu) + return; + + if (tcp->tcp_state < TCPS_ESTABLISHED) + return; /* - * The caller has determined if this is an IPSEC_IN packet and - * set ipsec_mctl appropriately (see tcp_icmp_error). + * Always call ip_get_pmtu() to make sure that IP has updated + * ixa_flags properly. */ - if (ipsec_mctl) - mp = mp->b_cont; + pmtu = ip_get_pmtu(ixa); + ixaflags = ixa->ixa_flags; - mp_size = MBLKL(mp); + /* + * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and + * IPsec overhead if applied. Make sure to use the most recent + * IPsec information. 
+ */ + mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp); /* - * Verify that we have a complete IP header. If not, send it upstream. + * Nothing to change, so just return. */ - if (mp_size < sizeof (ip6_t)) { -noticmpv6: - freemsg(first_mp); + if (mss == tcp->tcp_mss) return; - } /* - * Verify this is an ICMPV6 packet, else send it upstream. + * Currently, for ICMP errors, only PMTU decrease is handled. */ - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { - iph_hdr_length = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, - &nexthdrp) || - *nexthdrp != IPPROTO_ICMPV6) { - goto noticmpv6; + if (mss > tcp->tcp_mss && decrease_only) + return; + + DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); + + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * Adjust MSS and all relevant variables. + */ + tcp_mss_set(tcp, mss); + + /* + * If the PMTU is below the min size maintained by IP, then ip_get_pmtu + * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP + * has a (potentially different) min size we do the same. Make sure to + * clear IXAF_DONTFRAG, which is used by IP to decide whether to + * fragment the packet. + * + * LSO over IPv6 can not be fragmented. So need to disable LSO + * when IPv6 fragmentation is needed. + */ + if (mss < tcp->tcp_tcps->tcps_mss_min) + ixaflags |= IXAF_PMTU_TOO_SMALL; + + if (ixaflags & IXAF_PMTU_TOO_SMALL) + ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + + if ((connp->conn_ipversion == IPV4_VERSION) && + !(ixaflags & IXAF_PMTU_IPV4_DF)) { + tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } + ixa->ixa_flags = ixaflags; +} + +/* + * Do slow start retransmission after ICMP errors of PMTU changes. + */ +static void +tcp_rexmit_after_error(tcp_t *tcp) +{ + /* + * All sent data has been acknowledged or no data left to send, just + * to return. 
+ */ + if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || + (tcp->tcp_xmit_head == NULL)) + return; + + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) + tcp->tcp_rexmit_max = tcp->tcp_fss; + else + tcp->tcp_rexmit_max = tcp->tcp_snxt; + + tcp->tcp_rexmit_nxt = tcp->tcp_suna; + tcp->tcp_rexmit = B_TRUE; + tcp->tcp_dupack_cnt = 0; + tcp->tcp_snd_burst = TCP_CWND_SS; + tcp_ss_rexmit(tcp); +} + +/* + * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 + * error messages passed up by IP. + * Assumes that IP has pulled up all the extension headers as well + * as the ICMPv6 header. + */ +static void +tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) +{ + icmp6_t *icmp6; + ip6_t *ip6h; + uint16_t iph_hdr_length = ira->ira_ip_hdr_length; + tcpha_t *tcpha; + uint8_t *nexthdrp; + uint32_t seg_seq; + + /* + * Verify that we have a complete IP header. + */ + ASSERT((MBLKL(mp) >= sizeof (ip6_t))); + icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; ip6h = (ip6_t *)&icmp6[1]; /* * Verify if we have a complete ICMP and inner IP header. */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto noticmpv6; + if ((uchar_t *)&ip6h[1] > mp->b_wptr) { +noticmpv6: + freemsg(mp); + return; + } if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) goto noticmpv6; @@ -8558,130 +7077,15 @@ noticmpv6: goto noticmpv6; } - /* - * ICMP errors come on the right queue or come on - * listener/global queue for detached connections and - * get switched to the right queue. If it comes on the - * right queue, policy check has already been done by IP - * and thus free the first_mp without verifying the policy. - * If it has come for a non-hard bound connection, we need - * to verify policy as IP may not have done it. - */ - if (!tcp->tcp_hard_bound) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. 
- */ - if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { - if (!tcp_check_policy(tcp, first_mp, - NULL, ip6h, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. - */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - seg_seq = ntohl(tcpha->tha_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * If the ICMP message is bogus, should we kill the - * connection, or should we just drop the bogus ICMP - * message? It would probably make more sense to just - * drop the message so that if this one managed to get - * in, the real connection should not suffer. - */ - goto noticmpv6; - } - switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. - */ - new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - - tcp->tcp_ipsec_overhead; - - /* - * Only update the MSS if the new one is - * smaller than the previous one. 
This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. - */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; - case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOPORT: @@ -8692,7 +7096,6 @@ noticmpv6: ECONNREFUSED, 8); } break; - case ICMP6_DST_UNREACH_ADMIN: case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_BEYONDSCOPE: @@ -8708,7 +7111,6 @@ noticmpv6: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 9); } @@ -8720,7 +7122,6 @@ noticmpv6: break; } break; - case ICMP6_PARAM_PROB: /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && @@ -8739,83 +7140,42 @@ noticmpv6: default: break; } - freemsg(first_mp); + freemsg(mp); } /* * Notify IP that we are having trouble with this connection. IP should - * blow the IRE away and start over. + * make note so it can potentially use a different IRE. 
*/ static void tcp_ip_notify(tcp_t *tcp) { - struct iocblk *iocp; - ipid_t *ipid; - mblk_t *mp; - - /* IPv6 has NUD thus notification to delete the IRE is not needed */ - if (tcp->tcp_ipversion == IPV6_VERSION) - return; - - mp = mkiocb(IP_IOCTL); - if (mp == NULL) - return; - - iocp = (struct iocblk *)mp->b_rptr; - iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); - - mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); - if (!mp->b_cont) { - freeb(mp); - return; - } + conn_t *connp = tcp->tcp_connp; + ire_t *ire; - ipid = (ipid_t *)mp->b_cont->b_rptr; - mp->b_cont->b_wptr += iocp->ioc_count; - bzero(ipid, sizeof (*ipid)); - ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; - ipid->ipid_ire_type = IRE_CACHE; - ipid->ipid_addr_offset = sizeof (ipid_t); - ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); /* * Note: in the case of source routing we want to blow away the * route to the first source route hop. */ - bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], - sizeof (tcp->tcp_ipha->ipha_dst)); - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* Unlink and return any mblk that looks like it contains an ire */ -static mblk_t * -tcp_ire_mp(mblk_t **mpp) -{ - mblk_t *mp = *mpp; - mblk_t *prev_mp = NULL; - - for (;;) { - switch (DB_TYPE(mp)) { - case IRE_DB_TYPE: - case IRE_DB_REQ_TYPE: - if (mp == *mpp) { - *mpp = mp->b_cont; - } else { - prev_mp->b_cont = mp->b_cont; - } - mp->b_cont = NULL; - return (mp); - default: - break; + ire = connp->conn_ixa->ixa_ire; + if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + if (ire->ire_ipversion == IPV4_VERSION) { + /* + * As per RFC 1122, we send an RTM_LOSING to inform + * routing protocols. 
+ */ + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + connp->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); } - prev_mp = mp; - mp = mp->b_cont; - if (mp == NULL) - break; + (void) ire_no_good(ire); } - return (mp); } +#pragma inline(tcp_send_data) + /* * Timer callback routine for keepalive probe. We do a fake resend of * last ACKed byte. Then set a timer using RTO. When the timer expires, @@ -8890,7 +7250,7 @@ tcp_keepalive_killer(void *arg) * timer back. */ if (mp != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveProbe); if (tcp->tcp_ka_last_intrvl != 0) { @@ -8930,17 +7290,17 @@ tcp_keepalive_killer(void *arg) int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) { - queue_t *q = tcp->tcp_rq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_rq; int32_t mss = tcp->tcp_mss; int maxpsz; - conn_t *connp = tcp->tcp_connp; if (TCP_IS_DETACHED(tcp)) return (mss); if (tcp->tcp_fused) { maxpsz = tcp_fuse_maxpsz(tcp); mss = INFPSZ; - } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) { + } else if (tcp->tcp_maxpsz_multiplier == 0) { /* * Set the sd_qn_maxpsz according to the socket send buffer * size, and sd_maxblk to INFPSZ (-1). This will essentially @@ -8948,7 +7308,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * kernel-allocated buffers without breaking it up into smaller * chunks. We round up the buffer size to the nearest SMSS. */ - maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); + maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss); if (tcp->tcp_kssl_ctx == NULL) mss = INFPSZ; else @@ -8960,21 +7320,17 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * head to break down larger than SMSS writes into SMSS- * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 
*/ - /* XXX tune this with ndd tcp_maxpsz_multiplier */ - maxpsz = tcp->tcp_maxpsz * mss; - if (maxpsz > tcp->tcp_xmit_hiwater/2) { - maxpsz = tcp->tcp_xmit_hiwater/2; + maxpsz = tcp->tcp_maxpsz_multiplier * mss; + if (maxpsz > connp->conn_sndbuf / 2) { + maxpsz = connp->conn_sndbuf / 2; /* Round up to nearest mss */ maxpsz = MSS_ROUNDUP(maxpsz, mss); } } (void) proto_set_maxpsz(q, connp, maxpsz); - if (!(IPCL_IS_NONSTR(connp))) { - /* XXX do it in set_maxpsz()? */ - tcp->tcp_wq->q_maxpsz = maxpsz; - } - + if (!(IPCL_IS_NONSTR(connp))) + connp->conn_wq->q_maxpsz = maxpsz; if (set_maxblk) (void) proto_set_tx_maxblk(q, connp, mss); return (mss); @@ -8985,18 +7341,18 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * tcpopt struct and return a bitmask saying which options were found. */ static int -tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) +tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) { uchar_t *endp; int len; uint32_t mss; - uchar_t *up = (uchar_t *)tcph; + uchar_t *up = (uchar_t *)tcpha; int found = 0; int32_t sack_len; tcp_seq sack_begin, sack_end; tcp_t *tcp; - endp = up + TCP_HDR_LENGTH(tcph); + endp = up + TCP_HDR_LENGTH(tcpha); up += TCP_MIN_HEADER_LENGTH; while (up < endp) { len = endp - up; @@ -9135,28 +7491,20 @@ tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) } /* - * Set the mss associated with a particular tcp based on its current value, - * and a new one passed in. Observe minimums and maximums, and reset - * other state variables that we want to view as multiples of mss. - * - * This function is called mainly because values like tcp_mss, tcp_cwnd, - * highwater marks etc. need to be initialized or adjusted. - * 1) From tcp_process_options() when the other side's SYN/SYN-ACK - * packet arrives. - * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or - * ICMP6_PACKET_TOO_BIG arrives. - * 3) From tcp_paws_check() if the other side stops sending the timestamp, - * to increase the MSS to use the extra bytes available. 
+ * Set the MSS associated with a particular tcp based on its current value, + * and a new one passed in. Observe minimums and maximums, and reset other + * state variables that we want to view as multiples of MSS. * - * Callers except tcp_paws_check() ensure that they only reduce mss. + * The value of MSS could be either increased or descreased. */ static void -tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) +tcp_mss_set(tcp_t *tcp, uint32_t mss) { uint32_t mss_max; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -9176,34 +7524,22 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) * TCP should be able to buffer at least 4 MSS data for obvious * performance reason. */ - if ((mss << 2) > tcp->tcp_xmit_hiwater) - tcp->tcp_xmit_hiwater = mss << 2; + if ((mss << 2) > connp->conn_sndbuf) + connp->conn_sndbuf = mss << 2; /* - * Set the xmit_lowater to at least twice of MSS. + * Set the send lowater to at least twice of MSS. */ - if ((mss << 1) > tcp->tcp_xmit_lowater) - tcp->tcp_xmit_lowater = mss << 1; + if ((mss << 1) > connp->conn_sndlowat) + connp->conn_sndlowat = mss << 1; + + /* + * Update tcp_cwnd according to the new value of MSS. Keep the + * previous ratio to preserve the transmit rate. + */ + tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; + tcp->tcp_cwnd_cnt = 0; - if (do_ss) { - /* - * Either the tcp_cwnd is as yet uninitialized, or mss is - * changing due to a reduction in MTU, presumably as a - * result of a new path component, reset cwnd to its - * "initial" value, as a multiple of the new mss. - */ - SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial); - } else { - /* - * Called by tcp_paws_check(), the mss increased - * marginally to allow use of space previously taken - * by the timestamp option. 
It would be inappropriate - * to apply slow start or tcp_init_cwnd values to - * tcp_cwnd, simply adjust to a multiple of the new mss. - */ - tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; - tcp->tcp_cwnd_cnt = 0; - } tcp->tcp_mss = mss; (void) tcp_maxpsz_set(tcp, B_TRUE); } @@ -9223,12 +7559,11 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) } static conn_t * -tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, - boolean_t issocket, int *errorp) +tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, + int *errorp) { tcp_t *tcp = NULL; conn_t *connp; - int err; zoneid_t zoneid; tcp_stack_t *tcps; squeue_t *sqp; @@ -9265,15 +7600,6 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, else zoneid = crgetzoneid(credp); } - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (tcps->tcps_g_q == NULL && - tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - tcp_g_q_setup(tcps); - } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); connp = (conn_t *)tcp_get_conn(sqp, tcps); @@ -9286,41 +7612,50 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, *errorp = ENOSR; return (NULL); } + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); + connp->conn_sqp = sqp; connp->conn_initial_sqp = connp->conn_sqp; + connp->conn_ixa->ixa_sqp = connp->conn_sqp; tcp = connp->conn_tcp; + /* + * Besides asking IP to set the checksum for us, have conn_ip_output + * to do the following checks when necessary: + * + * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid + * IXAF_VERIFY_PMTU: verify PMTU changes + * IXAF_VERIFY_LSO: verify LSO capability changes + */ + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO; + + if (!tcps->tcps_dev_flow_ctl) + connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + if (isv6) { - connp->conn_flags |= IPCL_TCP6; - connp->conn_send = 
ip_output_v6; - connp->conn_af_isv6 = B_TRUE; - connp->conn_pkt_isv6 = B_TRUE; - connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_family = AF_INET6; + connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_family = AF_INET6; tcp->tcp_mss = tcps->tcps_mss_def_ipv6; + connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit; } else { - connp->conn_flags |= IPCL_TCP4; - connp->conn_send = ip_output; - connp->conn_af_isv6 = B_FALSE; - connp->conn_pkt_isv6 = B_FALSE; - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_family = AF_INET; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; + connp->conn_default_ttl = tcps->tcps_ipv4_ttl; } + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + crhold(credp); + connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; - /* - * TCP keeps a copy of cred for cache locality reasons but - * we put a reference only once. If connp->conn_cred - * becomes invalid, tcp_cred should also be set to NULL. 
- */ - tcp->tcp_cred = connp->conn_cred = credp; - crhold(connp->conn_cred); - tcp->tcp_cpid = curproc->p_pid; - tcp->tcp_open_time = lbolt64; connp->conn_zoneid = zoneid; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); ASSERT(connp->conn_netstack == tcps->tcps_netstack); ASSERT(tcp->tcp_tcps == tcps); @@ -9331,38 +7666,22 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_dev = NULL; + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); + if (issocket) { - connp->conn_flags |= IPCL_SOCKET; tcp->tcp_issocket = 1; } - /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - - if (q == NULL) { - /* - * Create a helper stream for non-STREAMS socket. - */ - err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); - if (err != 0) { - ip1dbg(("tcp_create_common: create of IP helper stream " - "failed\n")); - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - q = connp->conn_rq; - } + connp->conn_rcvbuf = tcps->tcps_recv_hiwat; + connp->conn_sndbuf = tcps->tcps_xmit_hiwat; + connp->conn_sndlowat = tcps->tcps_xmit_lowat; + connp->conn_so_type = SOCK_STREAM; + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; SOCK_CONNID_INIT(tcp->tcp_connid); - err = tcp_init(tcp, q); - if (err != 0) { - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - + tcp->tcp_state = TCPS_IDLE; + tcp_init_values(tcp); return (connp); } @@ -9415,7 +7734,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, q->q_qinfo = &tcp_acceptor_rinit; /* * the conn_dev and minor_arena will be subsequently used by - * tcp_wput_accept() and tcp_tpi_close_accept() to figure out + * tcp_tli_accept() and tcp_tpi_close_accept() to figure out * the 
minor device number for this connection from the q_ptr. */ RD(q)->q_ptr = (void *)conn_dev; @@ -9426,7 +7745,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } issocket = flag & SO_SOCKSTR; - connp = tcp_create_common(q, credp, isv6, issocket, &err); + connp = tcp_create_common(credp, isv6, issocket, &err); if (connp == NULL) { inet_minor_free(minor_arena, conn_dev); @@ -9434,6 +7753,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (err); } + connp->conn_rq = q; + connp->conn_wq = WR(q); q->q_ptr = WR(q)->q_ptr = connp; connp->conn_dev = conn_dev; @@ -9500,7 +7821,7 @@ tcp_allow_connopt_set(int level, int name) } /* - * this routine gets default values of certain options whose default + * This routine gets default values of certain options whose default * values are maintained by protocol specific code */ /* ARGSUSED */ @@ -9553,321 +7874,102 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) return (sizeof (int)); } +/* + * TCP routine to get the values of options. + */ static int tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { int *i1 = (int *)ptr; tcp_t *tcp = connp->conn_tcp; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; + conn_opt_arg_t coas; + int retval; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)ptr; - - lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; - lgr->l_linger = tcp->tcp_lingertime; - } - return (sizeof (struct linger)); - case SO_DEBUG: - *i1 = tcp->tcp_debug ? SO_DEBUG : 0; - break; - case SO_KEEPALIVE: - *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; - break; - case SO_DONTROUTE: - *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; - break; - case SO_USELOOPBACK: - *i1 = tcp->tcp_useloopback ? 
SO_USELOOPBACK : 0; - break; - case SO_BROADCAST: - *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; - break; - case SO_REUSEADDR: - *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; - break; - case SO_OOBINLINE: - *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; - break; - case SO_DGRAM_ERRIND: - *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; - break; - case SO_TYPE: - *i1 = SOCK_STREAM; - break; - case SO_SNDBUF: - *i1 = tcp->tcp_xmit_hiwater; - break; - case SO_RCVBUF: - *i1 = tcp->tcp_recv_hiwater; - break; case SO_SND_COPYAVOID: *i1 = tcp->tcp_snd_zcopy_on ? SO_SND_COPYAVOID : 0; - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones ? 1 : 0; - break; - case SO_ANON_MLP: - *i1 = connp->conn_anon_mlp; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_EXCLBIND: - *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_TCP; - break; - case SO_DOMAIN: - *i1 = tcp->tcp_family; - break; + return (sizeof (int)); case SO_ACCEPTCONN: *i1 = (tcp->tcp_state == TCPS_LISTEN); - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_TCP: switch (name) { case TCP_NODELAY: *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; - break; + return (sizeof (int)); case TCP_MAXSEG: *i1 = tcp->tcp_mss; - break; + return (sizeof (int)); case TCP_NOTIFY_THRESHOLD: *i1 = (int)tcp->tcp_first_timer_threshold; - break; + return (sizeof (int)); case TCP_ABORT_THRESHOLD: *i1 = tcp->tcp_second_timer_threshold; - break; + return (sizeof (int)); case TCP_CONN_NOTIFY_THRESHOLD: *i1 = tcp->tcp_first_ctimer_threshold; - break; + return (sizeof (int)); case TCP_CONN_ABORT_THRESHOLD: *i1 = tcp->tcp_second_ctimer_threshold; - break; - case TCP_RECVDSTADDR: - *i1 = tcp->tcp_recvdstaddr; - break; - case TCP_ANONPRIVBIND: - *i1 = tcp->tcp_anon_priv_bind; - break; - case TCP_EXCLBIND: - *i1 = tcp->tcp_exclbind ? 
TCP_EXCLBIND : 0; - break; + return (sizeof (int)); case TCP_INIT_CWND: *i1 = tcp->tcp_init_cwnd; - break; + return (sizeof (int)); case TCP_KEEPALIVE_THRESHOLD: *i1 = tcp->tcp_ka_interval; - break; + return (sizeof (int)); case TCP_KEEPALIVE_ABORT_THRESHOLD: *i1 = tcp->tcp_ka_abort_thres; - break; + return (sizeof (int)); case TCP_CORK: *i1 = tcp->tcp_cork; - break; - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) + if (connp->conn_family != AF_INET) return (-1); switch (name) { case IP_OPTIONS: - case T_IP_OPTIONS: { - /* - * This is compatible with BSD in that in only return - * the reverse source route with the final destination - * as the last entry. The first 4 bytes of the option - * will contain the final destination. - */ - int opt_len; - - opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; - opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; - ASSERT(opt_len >= 0); + case T_IP_OPTIONS: /* Caller ensures enough space */ - if (opt_len > 0) { - /* - * TODO: Do we have to handle getsockopt on an - * initiator as well? - */ - return (ip_opt_get_user(tcp->tcp_ipha, ptr)); - } - return (0); - } - case IP_TOS: - case T_IP_TOS: - *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; - break; - case IP_TTL: - *i1 = (int)tcp->tcp_ipha->ipha_ttl; - break; - case IP_NEXTHOP: - /* Handled at IP level */ - return (-EINVAL); + return (ip_opt_get_user(connp, ptr)); default: - return (-1); + break; } break; + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { return (-1); } switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; - break; /* goto sizeof (int) option return */ - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = tcp->tcp_bound_if; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVHOPLIMIT) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case _OLD_IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_OLD_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVRTDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); - } - case IPV6_TCLASS: - if (ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; /* goto sizeof (int) option return */ - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) - return (0); - bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, - ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); - if (tcp->tcp_label_len > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - tcp->tcp_label_len + 7) / 8 - 1; - } - return (ipp->ipp_hopoptslen - tcp->tcp_label_len); - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); - case IPV6_SRC_PREFERENCES: - return (ip6_get_src_preferences(connp, - (uint32_t *)ptr)); - case IPV6_PATHMTU: { - struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; - + case IPV6_PATHMTU: if (tcp->tcp_state < TCPS_ESTABLISHED) return (-1); - - return (ip_fill_mtuinfo(&connp->conn_remv6, - 
connp->conn_fport, mtuinfo, - connp->conn_netstack)); - } - default: - return (-1); + break; } break; - default: - return (-1); } - return (sizeof (int)); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* @@ -9896,7 +7998,6 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) { @@ -9909,30 +8010,28 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = squeue_synch_enter(sqp, connp, NULL); if (error == ENOMEM) { + kmem_free(optvalp_buf, max_optbuf_len); return (ENOMEM); } len = tcp_opt_get(connp, level, option_name, optvalp_buf); squeue_synch_exit(sqp, connp); - if (len < 0) { - /* - * Pass on to IP - */ + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, - optvalp, optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } /* @@ -9943,7 +8042,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, int tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { tcp_t *tcp = connp->conn_tcp; int *i1 = (int *)invalp; @@ -9951,6 
+8050,13 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, boolean_t checkonly; int reterr; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -10016,37 +8122,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)invalp; - - if (!checkonly) { - if (lgr->l_onoff) { - tcp->tcp_linger = 1; - tcp->tcp_lingertime = lgr->l_linger; - } else { - tcp->tcp_linger = 0; - tcp->tcp_lingertime = 0; - } - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } else { - if (!lgr->l_onoff) { - ((struct linger *) - outvalp)->l_onoff = 0; - ((struct linger *) - outvalp)->l_linger = 0; - } else { - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } - } - *outlenp = sizeof (struct linger); - return (0); - } - case SO_DEBUG: - if (!checkonly) - tcp->tcp_debug = onoff; - break; case SO_KEEPALIVE: if (checkonly) { /* check only case */ @@ -10054,65 +8129,25 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } if (!onoff) { - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { if (tcp->tcp_ka_tid != 0) { (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_tid = 0; } - tcp->tcp_ka_enabled = 0; + connp->conn_keepalive = 0; } break; } - if (!tcp->tcp_ka_enabled) { + if (!connp->conn_keepalive) { /* Crank up the keepalive timer */ tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); - tcp->tcp_ka_enabled = 1; - } - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are - * only of interest to IP. We track them here only so - * that we can report their current value. 
- */ - if (!checkonly) { - tcp->tcp_dontroute = onoff; - tcp->tcp_connp->conn_dontroute = onoff; + connp->conn_keepalive = 1; } break; - case SO_USELOOPBACK: - if (!checkonly) { - tcp->tcp_useloopback = onoff; - tcp->tcp_connp->conn_loopback = onoff; - } - break; - case SO_BROADCAST: - if (!checkonly) { - tcp->tcp_broadcast = onoff; - tcp->tcp_connp->conn_broadcast = onoff; - } - break; - case SO_REUSEADDR: - if (!checkonly) { - tcp->tcp_reuseaddr = onoff; - tcp->tcp_connp->conn_reuseaddr = onoff; - } - break; - case SO_OOBINLINE: - if (!checkonly) { - tcp->tcp_oobinline = onoff; - if (IPCL_IS_NONSTR(tcp->tcp_connp)) - proto_set_rx_oob_opt(connp, onoff); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - tcp->tcp_dgram_errind = onoff; - break; case SO_SNDBUF: { if (*i1 > tcps->tcps_max_buf) { *outlenp = 0; @@ -10121,11 +8156,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (checkonly) break; - tcp->tcp_xmit_hiwater = *i1; - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = - tcp->tcp_xmit_hiwater / + connp->conn_sndbuf = *i1; + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / tcps->tcps_snd_lowat_fraction; + } (void) tcp_maxpsz_set(tcp, B_TRUE); /* * If we are flow-controlled, recheck the condition. @@ -10135,11 +8170,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { + TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); - break; + *outlenp = inlen; + return (0); } case SO_RCVBUF: if (*i1 > tcps->tcps_max_buf) { @@ -10155,43 +8191,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * XXX should we return the rwnd here * and tcp_opt_get ? 
*/ - break; + *outlenp = inlen; + return (0); case SO_SND_COPYAVOID: if (!checkonly) { - /* we only allow enable at most once for now */ if (tcp->tcp_loopback || (tcp->tcp_kssl_ctx != NULL) || - (!tcp->tcp_snd_zcopy_aware && - (onoff != 1 || !tcp_zcopy_check(tcp)))) { + (onoff != 1) || !tcp_zcopy_check(tcp)) { *outlenp = 0; return (EOPNOTSUPP); } tcp->tcp_snd_zcopy_aware = 1; } - break; - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. - */ + *outlenp = inlen; return (0); - case SO_ALLZONES: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_ANON_MLP: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_MAC_EXEMPT: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; - default: - *outlenp = 0; - return (EINVAL); } break; case IPPROTO_TCP: @@ -10217,25 +8230,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_second_ctimer_threshold = *i1; break; case TCP_RECVDSTADDR: - if (tcp->tcp_state > TCPS_LISTEN) - return (EOPNOTSUPP); - if (!checkonly) - tcp->tcp_recvdstaddr = onoff; - break; - case TCP_ANONPRIVBIND: - if ((reterr = secpolicy_net_privaddr(cr, 0, - IPPROTO_TCP)) != 0) { + if (tcp->tcp_state > TCPS_LISTEN) { *outlenp = 0; - return (reterr); - } - if (!checkonly) { - tcp->tcp_anon_priv_bind = onoff; + return (EOPNOTSUPP); } + /* Setting done in conn_opt_set */ break; - case TCP_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; /* goto sizeof (int) option return */ case TCP_INIT_CWND: { uint32_t init_cwnd = *((uint32_t *)invalp); @@ -10278,7 +8278,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * keepalive timer. 
*/ if (tcp->tcp_ka_tid != 0) { - ASSERT(tcp->tcp_ka_enabled); + ASSERT(connp->conn_keepalive); (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_last_intrvl = 0; @@ -10318,49 +8318,15 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; default: - *outlenp = 0; - return (EINVAL); + break; } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) { + if (connp->conn_family != AF_INET) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - reterr = tcp_opt_set_header(tcp, checkonly, - invalp, inlen); - if (reterr) { - *outlenp = 0; - return (reterr); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - tcp->tcp_ipha->ipha_type_of_service = - (uchar_t)*i1; - tcp->tcp_tos = (uchar_t)*i1; - } - break; - case IP_TTL: - if (!checkonly) { - tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; - tcp->tcp_ttl = (uchar_t)*i1; - } - break; - case IP_BOUND_IF: - case IP_NEXTHOP: - /* Handled at the IP level */ - return (-EINVAL); case IP_SEC_OPT: /* * We should not allow policy setting after @@ -10368,166 +8334,42 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); } - default: - *outlenp = 0; - return (EINVAL); + break; } break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } - /* - * Only sticky options; no ancillary data - */ - ipp = &tcp->tcp_sticky_ipp; switch (name) { - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)tcps->tcps_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = tcp->tcp_ip6h->ip6_hops; - } else { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - } - break; - case IPV6_BOUND_IF: - if (!checkonly) { - tcp->tcp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ case IPV6_RECVPKTINFO: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVPKTINFO; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVPKTINFO; /* Force it to be sent up with the next msg */ tcp->tcp_recvifindex = 0; - PASS_OPT_TO_IP(connp); } break; case IPV6_RECVTCLASS: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVTCLASS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVTCLASS; - PASS_OPT_TO_IP(connp); + /* Force it to be sent up with the next msg */ + tcp->tcp_recvtclass = 0xffffffffU; } break; case IPV6_RECVHOPLIMIT: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPLIMIT; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPLIMIT; /* Force it to be sent up with the next msg */ tcp->tcp_recvhops = 0xffffffffU; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPOPTS; - else 
- tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVDSTOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_OLD_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_OLD_IPV6_RECVDSTOPTS; - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTHDR; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTHDR; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTDSTOPTS; - PASS_OPT_TO_IP(connp); } break; case IPV6_PKTINFO: - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - } else { + /* This is an extra check for TCP */ + if (inlen == sizeof (struct in6_pktinfo)) { struct in6_pktinfo *pkti; pkti = (struct in6_pktinfo *)invalp; @@ -10539,219 +8381,8 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) return (EINVAL); - /* - * IP will validate the source address and - * interface index. 
- */ - if (IPCL_IS_NONSTR(tcp->tcp_connp)) { - reterr = ip_set_options(tcp->tcp_connp, - level, name, invalp, inlen, cr); - } else { - reterr = ip6_set_pktinfo(cr, - tcp->tcp_connp, pkti); - } - if (reterr != 0) - return (reterr); - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_TCLASS: - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) { - ipp->ipp_tclass = 0; - *i1 = 0; - } else { - ipp->ipp_tclass = *i1; - } - ipp->ipp_fields |= IPPF_TCLASS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. - */ - if (inlen != 0 && inlen != sizeof (sin6_t)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) - return (EAFNOSUPPORT); - if (IN6_IS_ADDR_V4MAPPED( - &sin6->sin6_addr)) - return (EADDRNOTAVAIL); - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - PASS_OPT_TO_IP(connp); - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, tcp->tcp_label_len); - if (reterr != 0) - return (reterr); - if (ipp->ipp_hopoptslen == 0) - ipp->ipp_fields &= ~IPPF_HOPOPTS; - else - ipp->ipp_fields |= IPPF_HOPOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rtdstoptslen == 0) - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - else - ipp->ipp_fields |= IPPF_RTDSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_dstoptslen == 0) - ipp->ipp_fields &= ~IPPF_DSTOPTS; - else - ipp->ipp_fields |= IPPF_DSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rthdrlen == 0) - ipp->ipp_fields &= ~IPPF_RTHDR; - else - ipp->ipp_fields |= IPPF_RTHDR; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_V6ONLY: - if (!checkonly) { - tcp->tcp_connp->conn_ipv6_v6only = onoff; } break; - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -10759,30 +8390,18 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); - } - case IPV6_SRC_PREFERENCES: - if (inlen != sizeof (uint32_t)) - return (EINVAL); - reterr = ip6_set_src_preferences(tcp->tcp_connp, - *(uint32_t *)invalp); - if (reterr != 0) { - *outlenp = 0; - return (reterr); } break; - default: - *outlenp = 0; - return (EINVAL); } break; - } /* end IPPROTO_IPV6 */ - default: + } + reterr = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (reterr != 0) { *outlenp = 0; - return (EINVAL); + return (reterr); } + /* * Common case of OK return with outval same as inval */ @@ -10791,6 +8410,45 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, (void) bcopy(invalp, outvalp, inlen); } *outlenp = inlen; + + if (coas.coa_changed & COA_HEADER_CHANGED) { + reterr = tcp_build_hdrs(tcp); + if (reterr != 0) + return (reterr); + } + if (coas.coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t nexthop; + + /* + * If we are connected we re-cache the information. 
+ * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + (void) ip_attr_connect(connp, connp->conn_ixa, + &connp->conn_laddr_v6, &connp->conn_faddr_v6, + &nexthop, connp->conn_fport, NULL, NULL, + IPDF_VERIFY_DST); + } + } + if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); + } + if (coas.coa_changed & COA_OOBINLINE_CHANGED) { + if (IPCL_IS_NONSTR(connp)) + proto_set_rx_oob_opt(connp, onoff); + } return (0); } @@ -10798,12 +8456,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { conn_t *connp = Q_TO_CONN(q); return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr, mblk)); + outlenp, outvalp, thisdg_attrs, cr)); } int @@ -10843,7 +8501,6 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -10856,292 +8513,75 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, 
(uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, - NULL, cr, NULL); + NULL, cr); squeue_synch_exit(sqp, connp); - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } + ASSERT(error >= 0); + return (error); } /* - * Update tcp_sticky_hdrs based on tcp_sticky_ipp. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension + * Build/update the tcp header template (in conn_ht_iphc) based on + * conn_xmit_ipp. The headers include ip6_t, any extension * headers, and the maximum size tcp header (to avoid reallocation * on the fly for additional tcp options). + * + * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. * Returns failure if can't allocate memory. */ static int tcp_build_hdrs(tcp_t *tcp) { - char *hdrs; - uint_t hdrs_len; - ip6i_t *ip6i; - char buf[TCP_MAX_HDR_LENGTH]; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - in6_addr_t src, dst; tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; + tcpha_t *tcpha; + uint32_t cksum; + int error; - /* - * save the existing tcp header and source/dest IP addresses - */ - bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); - src = tcp->tcp_ip6h->ip6_src; - dst = tcp->tcp_ip6h->ip6_dst; - hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; - ASSERT(hdrs_len != 0); - if (hdrs_len > tcp->tcp_iphc_len) { - /* Need to reallocate */ - hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); - if (tcp->tcp_iphc != NULL) { - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc_len = 0; - } - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc = hdrs; - tcp->tcp_iphc_len = hdrs_len; - tcp->tcp_hdr_grown = B_TRUE; - } - ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, - hdrs_len - TCP_MAX_HDR_LENGTH, ipp, 
IPPROTO_TCP); + /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */ + mutex_enter(&connp->conn_lock); + error = conn_build_hdr_template(connp, TCP_MIN_HEADER_LENGTH, + TCP_MAX_TCP_OPTIONS_LENGTH, &connp->conn_laddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; - } else { - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - } /* - * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. - * - * tcp->tcp_tcp_hdr_len doesn't change here. + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum for later use. */ - tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); - tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; + tcpha = (tcpha_t *)connp->conn_ht_ulp; + tcp->tcp_tcpha = tcpha; - bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); - - tcp->tcp_ip6h->ip6_src = src; - tcp->tcp_ip6h->ip6_dst = dst; + tcpha->tha_lport = connp->conn_lport; + tcpha->tha_fport = connp->conn_fport; + tcpha->tha_sum = 0; + tcpha->tha_offset_and_reserved = (5 << 4); /* - * If the hop limit was not set by ip_build_hdrs_v6(), set it to - * the default value for TCP. - */ - if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit; - - /* - * If we're setting extension headers after a connection - * has been established, and if we have a routing header - * among the extension headers, call ip_massage_options_v6 to - * manipulate the routing header/ip6_dst set the checksum - * difference in the tcp header template. - * (This happens in tcp_connect_ipv6 if the routing header - * is set prior to the connect.) - * Set the tcp_sum to zero first in case we've cleared a - * routing header or don't have one at all. 
+ * IP wants our header length in the checksum field to + * allow it to perform a single pseudo-header+checksum + * calculation on behalf of TCP. + * Include the adjustment for a source route once IP_OPTIONS is set. */ - tcp->tcp_sum = 0; - if ((tcp->tcp_state >= TCPS_SYN_SENT) && - (tcp->tcp_ipp_fields & IPPF_RTHDR)) { - ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, - (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, - rth, tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - } - } - - /* Try to get everything in a single mblk */ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - hdrs_len + tcps->tcps_wroff_xtra); - return (0); -} - -/* - * Transfer any source route option from ipha to buf/dst in reversed form. - */ -static int -tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) -{ - ipoptp_t opts; - uchar_t *opt; - uint8_t optval; - uint8_t optlen; - uint32_t len = 0; - - for (optval = ipoptp_first(&opts, ipha); - optval != IPOPT_EOL; - optval = ipoptp_next(&opts)) { - opt = opts.ipoptp_cur; - optlen = opts.ipoptp_len; - switch (optval) { - int off1, off2; - case IPOPT_SSRR: - case IPOPT_LSRR: - - /* Reverse source route */ - /* - * First entry should be the next to last one in the - * current source route (the last entry is our - * address.) - * The last entry should be the final destination. - */ - buf[IPOPT_OPTVAL] = (uint8_t)optval; - buf[IPOPT_OLEN] = (uint8_t)optlen; - off1 = IPOPT_MINOFF_SR - 1; - off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; - if (off2 < 0) { - /* No entries in source route */ - break; - } - bcopy(opt + off2, dst, IP_ADDR_LEN); - /* - * Note: use src since ipha has not had its src - * and dst reversed (it is in the state it was - * received. 
- */ - bcopy(&ipha->ipha_src, buf + off2, - IP_ADDR_LEN); - off2 -= IP_ADDR_LEN; - - while (off2 > 0) { - bcopy(opt + off2, buf + off1, - IP_ADDR_LEN); - off1 += IP_ADDR_LEN; - off2 -= IP_ADDR_LEN; - } - buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; - buf += optlen; - len += optlen; - break; - } - } -done: - /* Pad the resulting options */ - while (len & 0x3) { - *buf++ = IPOPT_EOL; - len++; - } - return (len); -} - - -/* - * Extract and revert a source route from ipha (if any) - * and then update the relevant fields in both tcp_t and the standard header. - */ -static void -tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) -{ - char buf[TCP_MAX_HDR_LENGTH]; - uint_t tcph_len; - int len; - - ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); - len = IPH_HDR_LENGTH(ipha); - if (len == IP_SIMPLE_HDR_LENGTH) - /* Nothing to do */ - return; - if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || - (len & 0x3)) - return; - - tcph_len = tcp->tcp_tcp_hdr_len; - bcopy(tcp->tcp_tcph, buf, tcph_len); - tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff); - len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + - IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); - len += IP_SIMPLE_HDR_LENGTH; - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); - tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); - bcopy(buf, tcp->tcp_tcph, tcph_len); - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - len += tcph_len; - tcp->tcp_hdr_len = len; -} - -/* - * Copy the standard header into its new location, - * lay in the new options and then update the relevant - * fields in both tcp_t and the standard header. 
- */ -static int -tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) -{ - uint_t tcph_len; - uint8_t *ip_optp; - tcph_t *new_tcph; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) - return (EINVAL); - - if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) - return (EINVAL); - - if (checkonly) { - /* - * do not really set, just pretend to - T_CHECK - */ - return (0); - } + cksum = sizeof (tcpha_t) + connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + tcpha->tha_sum = htons(cksum); - ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; - if (tcp->tcp_label_len > 0) { - int padlen; - uint8_t opt; + if (connp->conn_ipversion == IPV4_VERSION) + tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc; + else + tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc; - /* convert list termination to no-ops */ - padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; - ip_optp += ip_optp[IPOPT_OLEN]; - opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; - while (--padlen >= 0) - *ip_optp++ = opt; - } - tcph_len = tcp->tcp_tcp_hdr_len; - new_tcph = (tcph_t *)(ip_optp + len); - ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); - tcp->tcp_tcph = new_tcph; - bcopy(ptr, ip_optp, len); - - len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; - - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - tcp->tcp_hdr_len = len + tcph_len; - if (!TCP_IS_DETACHED(tcp)) { - /* Always allocate room for all options. 
*/ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); + if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra > + connp->conn_wroff) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } return (0); } @@ -11184,36 +8624,6 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) nd_free(ndp); return (B_FALSE); } - tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_head_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_tail_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_max_pbufs_param)) { - nd_free(ndp); - return (B_FALSE); - } if (!nd_load(ndp, "tcp_extra_priv_ports", tcp_extra_priv_ports_get, NULL, NULL)) { nd_free(ndp); @@ -11248,7 +8658,7 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) return (B_TRUE); } -/* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ +/* ndd set routine for tcp_wroff_xtra. 
*/ /* ARGSUSED */ static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -11307,6 +8717,7 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) uint32_t u1; tcp_stack_t *tcps = tcp->tcp_tcps; + /* Walk through all the new pieces. */ do { ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= @@ -11433,9 +8844,10 @@ tcp_rwnd_reopen(tcp_t *tcp) { uint_t ret = 0; uint_t thwin; + conn_t *connp = tcp->tcp_connp; /* Learn the latest rwnd information that we sent to the other side. */ - thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) << tcp->tcp_rcv_ws; /* This is peer's calculated send window (our receive window). */ thwin -= tcp->tcp_rnxt - tcp->tcp_rack; @@ -11444,7 +8856,7 @@ tcp_rwnd_reopen(tcp_t *tcp) * SWS avoidance. This means that we need to check the increase of * of receive window is at least 1 MSS. */ - if (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss) { + if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { /* * If the window that the other side knows is less than max * deferred acks segments, send an update immediately. @@ -11453,7 +8865,7 @@ tcp_rwnd_reopen(tcp_t *tcp) BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); ret = TH_ACK_NEEDED; } - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; } return (ret); } @@ -11469,7 +8881,7 @@ tcp_rcv_drain(tcp_t *tcp) #ifdef DEBUG uint_t cnt = 0; #endif - queue_t *q = tcp->tcp_rq; + queue_t *q = tcp->tcp_connp->conn_rq; /* Can't drain on an eager connection */ if (tcp->tcp_listener != NULL) @@ -11511,7 +8923,7 @@ tcp_rcv_drain(tcp_t *tcp) if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, NULL); continue; } putnext(q, mp); @@ -11538,11 +8950,22 @@ tcp_rcv_drain(tcp_t *tcp) * Other messages are added as new (b_next) elements. 
*/ void -tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) +tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) { ASSERT(seg_len == msgdsize(mp)); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); + if (is_system_labeled()) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. + */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + if (tcp->tcp_rcv_list == NULL) { ASSERT(tcp->tcp_rcv_last_head == NULL); tcp->tcp_rcv_list = mp; @@ -11562,176 +8985,6 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) tcp->tcp_rwnd -= seg_len; } -/* - * DEFAULT TCP ENTRY POINT via squeue on READ side. - * - * This is the default entry function into TCP on the read side. TCP is - * always entered via squeue i.e. using squeue's for mutual exclusion. - * When classifier does a lookup to find the tcp, it also puts a reference - * on the conn structure associated so the tcp is guaranteed to exist - * when we come here. We still need to check the state because it might - * as well has been closed. The squeue processing function i.e. squeue_enter, - * is responsible for doing the CONN_DEC_REF. - * - * Apart from the default entry point, IP also sends packets directly to - * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming - * connections. - */ -boolean_t tcp_outbound_squeue_switch = B_FALSE; -void -tcp_input(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = (tcp_t *)connp->conn_tcp; - - /* arg2 is the sqp */ - ASSERT(arg2 != NULL); - ASSERT(mp != NULL); - - /* - * Don't accept any input on a closed tcp as this TCP logically does - * not exist on the system. Don't proceed further with this TCP. - * For eg. this packet could trigger another close of this tcp - * which would be disastrous for tcp_refcnt. 
tcp_close_detached / - * tcp_clean_death / tcp_closei_local must be called at most once - * on a TCP. In this case we need to refeed the packet into the - * classifier and figure out where the packet should go. Need to - * preserve the recv_ill somehow. Until we figure that out, for - * now just drop the packet if we can't classify the packet. - */ - if (tcp->tcp_state == TCPS_CLOSED || - tcp->tcp_state == TCPS_BOUND) { - conn_t *new_connp; - ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip; - - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); - if (new_connp != NULL) { - tcp_reinput(new_connp, mp, arg2); - return; - } - /* We failed to classify. For now just drop the packet */ - freemsg(mp); - return; - } - - if (DB_TYPE(mp) != M_DATA) { - tcp_rput_common(tcp, mp); - return; - } - - if (mp->b_datap->db_struioflag & STRUIO_CONNECT) { - squeue_t *final_sqp; - - mp->b_datap->db_struioflag &= ~STRUIO_CONNECT; - final_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - if (tcp->tcp_state == TCPS_SYN_SENT && - connp->conn_final_sqp == NULL && - tcp_outbound_squeue_switch) { - ASSERT(connp->conn_initial_sqp == connp->conn_sqp); - connp->conn_final_sqp = final_sqp; - if (connp->conn_final_sqp != connp->conn_sqp) { - CONN_INC_REF(connp); - SQUEUE_SWITCH(connp, connp->conn_final_sqp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_rput_data, connp, ip_squeue_flag, - SQTAG_CONNECT_FINISH); - return; - } - } - } - tcp_rput_data(connp, mp, arg2); -} - -/* - * The read side put procedure. - * The packets passed up by ip are assume to be aligned according to - * OK_32PTR and the IP+TCP headers fitting in the first mblk. - */ -static void -tcp_rput_common(tcp_t *tcp, mblk_t *mp) -{ - /* - * tcp_rput_data() does not expect M_CTL except for the case - * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO - * type. 
Need to make sure that any other M_CTLs don't make - * it to tcp_rput_data since it is not expecting any and doesn't - * check for it. - */ - if (DB_TYPE(mp) == M_CTL) { - switch (*(uint32_t *)(mp->b_rptr)) { - case TCP_IOC_ABORT_CONN: - /* - * Handle connection abort request. - */ - tcp_ioctl_abort_handler(tcp, mp); - return; - case IPSEC_IN: - /* - * Only secure icmp arrive in TCP and they - * don't go through data path. - */ - tcp_icmp_error(tcp, mp); - return; - case IN_PKTINFO: - /* - * Handle IPV6_RECVPKTINFO socket option on AF_INET6 - * sockets that are receiving IPv4 traffic. tcp - */ - ASSERT(tcp->tcp_family == AF_INET6); - ASSERT(tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVPKTINFO); - tcp_rput_data(tcp->tcp_connp, mp, - tcp->tcp_connp->conn_sqp); - return; - case MDT_IOC_INFO_UPDATE: - /* - * Handle Multidata information update; the - * following routine will free the message. - */ - if (tcp->tcp_connp->conn_mdt_ok) { - tcp_mdt_update(tcp, - &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, - B_FALSE); - } - freemsg(mp); - return; - case LSO_IOC_INFO_UPDATE: - /* - * Handle LSO information update; the following - * routine will free the message. - */ - if (tcp->tcp_connp->conn_lso_ok) { - tcp_lso_update(tcp, - &((ip_lso_info_t *)mp->b_rptr)->lso_capab); - } - freemsg(mp); - return; - default: - /* - * tcp_icmp_err() will process the M_CTL packets. - * Non-ICMP packets, if any, will be discarded in - * tcp_icmp_err(). We will process the ICMP packet - * even if we are TCP_IS_DETACHED_NONEAGER as the - * incoming ICMP packet may result in changing - * the tcp_mss, which we would need if we have - * packets to retransmit. - */ - tcp_icmp_error(tcp, mp); - return; - } - } - - /* No point processing the message if tcp is already closed */ - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - freemsg(mp); - return; - } - - tcp_rput_other(tcp, mp); -} - - /* The minimum of smoothed mean deviation in RTO calculation. 
*/ #define TCP_SD_MIN 400 @@ -11885,12 +9138,12 @@ tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) * segments. A segment is eligible if sack_cnt for that segment is greater * than or equal tcp_dupack_fast_retransmit. After it has retransmitted * all eligible segments, it checks to see if TCP can send some new segments - * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). + * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). * * Parameters: * tcp_t *tcp: the tcp structure of the connection. * uint_t *flags: in return, appropriate value will be set for - * tcp_rput_data(). + * tcp_input_data(). */ static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) @@ -11988,7 +9241,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) tcp->tcp_pipe += seg_len; tcp->tcp_sack_snxt = begin + seg_len; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); /* * Update the send timestamp to avoid false retransmission. @@ -12012,96 +9265,8 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) } /* - * This function handles policy checking at TCP level for non-hard_bound/ - * detached connections. - */ -static boolean_t -tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, - boolean_t secure, boolean_t mctl_present) -{ - ipsec_latch_t *ipl = NULL; - ipsec_action_t *act = NULL; - mblk_t *data_mp; - ipsec_in_t *ii; - const char *reason; - kstat_named_t *counter; - tcp_stack_t *tcps = tcp->tcp_tcps; - ipsec_stack_t *ipss; - ip_stack_t *ipst; - - ASSERT(mctl_present || !secure); - - ASSERT((ipha == NULL && ip6h != NULL) || - (ip6h == NULL && ipha != NULL)); - - /* - * We don't necessarily have an ipsec_in_act action to verify - * policy because of assymetrical policy where we have only - * outbound policy and no inbound policy (possible with global - * policy). 
- */ - if (!secure) { - if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || - act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) - return (B_TRUE); - ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_clear), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * We have a secure packet. - */ - if (act == NULL) { - ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_secure), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * XXX This whole routine is currently incorrect. ipl should - * be set to the latch pointer, but is currently not set, so - * we initialize it to NULL to avoid picking up random garbage. - */ - if (ipl == NULL) - return (B_TRUE); - - data_mp = first_mp->b_cont; - - ii = (ipsec_in_t *)first_mp->b_rptr; - - ipst = tcps->tcps_netstack->netstack_ip; - - if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, - &counter, tcp->tcp_connp)) { - BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - return (B_TRUE); - } - (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, - "tcp inbound policy mismatch: %s, packet dropped\n", - reason); - BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, - &tcps->tcps_dropper); - return (B_FALSE); -} - -/* - * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start - * retransmission after a timeout. + * tcp_ss_rexmit() is called to do slow start retransmission after a timeout + * or ICMP errors. * * To limit the number of duplicate segments, we limit the number of segment * to be sent in one time to tcp_snd_burst, the burst variable. 
@@ -12150,7 +9315,7 @@ tcp_ss_rexmit(tcp_t *tcp) if (xmit_mp == NULL) return; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); snxt += cnt; win -= cnt; @@ -12184,7 +9349,7 @@ tcp_ss_rexmit(tcp_t *tcp) /* * Process all TCP option in SYN segment. Note that this function should - * be called after tcp_adapt_ire() is called so that the necessary info + * be called after tcp_set_destination() is called so that the necessary info * from IRE is already set in the tcp structure. * * This function sets up the correct tcp_mss value according to the @@ -12194,16 +9359,17 @@ tcp_ss_rexmit(tcp_t *tcp) * should do the appropriate change. */ void -tcp_process_options(tcp_t *tcp, tcph_t *tcph) +tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) { int options; tcp_opt_t tcpopt; uint32_t mss_max; char *tmp_tcph; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcpopt.tcp = NULL; - options = tcp_parse_options(tcph, &tcpopt); + options = tcp_parse_options(tcpha, &tcpopt); /* * Process MSS option. Note that MSS option value does not account @@ -12212,12 +9378,12 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * IPv6. */ if (!(options & TCP_OPT_MSS_PRESENT)) { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; else tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; } else { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -12240,23 +9406,23 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) /* Process Timestamp option. 
*/ if ((options & TCP_OPT_TSTAMP_PRESENT) && (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { - tmp_tcph = (char *)tcp->tcp_tcph; + tmp_tcph = (char *)tcp->tcp_tcpha; tcp->tcp_snd_ts_ok = B_TRUE; tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; tcp->tcp_last_rcv_lbolt = lbolt64; ASSERT(OK_32PTR(tmp_tcph)); - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); /* Fill in our template header with basic timestamp option. */ - tmp_tcph += tcp->tcp_tcp_hdr_len; + tmp_tcph += connp->conn_ht_ulp_len; tmp_tcph[0] = TCPOPT_NOP; tmp_tcph[1] = TCPOPT_NOP; tmp_tcph[2] = TCPOPT_TSTAMP; tmp_tcph[3] = TCPOPT_TSTAMP_LEN; - tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); + connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); } else { tcp->tcp_snd_ts_ok = B_FALSE; } @@ -12266,12 +9432,11 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * then allocate the SACK info structure. Note the following ways * when tcp_snd_sack_ok is set to true. * - * For active connection: in tcp_adapt_ire() called in - * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted - * is checked. + * For active connection: in tcp_set_destination() called in + * tcp_connect(). * - * For passive connection: in tcp_adapt_ire() called in - * tcp_accept_comm(). + * For passive connection: in tcp_set_destination() called in + * tcp_input_listener(). * * That's the reason why the extra TCP_IS_DETACHED() check is there. * That check makes sure that if we did not send a SACK OK option, @@ -12320,7 +9485,8 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * Now we know the exact TCP/IP header length, subtract * that from tcp_mss to get our side's MSS. 
*/ - tcp->tcp_mss -= tcp->tcp_hdr_len; + tcp->tcp_mss -= connp->conn_ht_iphc_len; + /* * Here we assume that the other side's header size will be equal to * our header size. We calculate the real MSS accordingly. Need to @@ -12328,22 +9494,29 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) */ - tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - - ((tcp->tcp_ipversion == IPV4_VERSION ? + tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + + tcp->tcp_ipsec_overhead - + ((connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); /* * Set MSS to the smaller one of both ends of the connection. * We should not have called tcp_mss_set() before, but our * side of the MSS should have been set to a proper value - * by tcp_adapt_ire(). tcp_mss_set() will also set up the + * by tcp_set_destination(). tcp_mss_set() will also set up the * STREAM head parameters properly. * * If we have a larger-than-16-bit window but the other side * didn't want to do window scale, tcp_rwnd_set() will take * care of that. */ - tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss), B_TRUE); + tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); + + /* + * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been + * updated properly. + */ + SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); } /* @@ -12410,7 +9583,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) tcp_t *tail; /* - * The eager already has an extra ref put in tcp_rput_data + * The eager already has an extra ref put in tcp_input_data * so that it stays till accept comes back even though it * might get into TCPS_CLOSED as a result of a TH_RST etc. */ @@ -12496,8 +9669,8 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) * remote host. This proves the IP addr is good. * Cache it! 
*/ - addr_cache[IP_ADDR_CACHE_HASH( - tcp->tcp_remote)] = tcp->tcp_remote; + addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = + tcp->tcp_connp->conn_faddr_v4; } mutex_exit(&listener->tcp_eager_lock); if (need_send_conn_ind) @@ -12513,17 +9686,16 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) { if (IPCL_IS_NONSTR(lconnp)) { cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); + pid_t cpid = NOPID; ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); ASSERT(econnp->conn_tcp->tcp_saved_listener == lconnp->conn_tcp); + cr = msg_getcred(mp, &cpid); + /* Keep the message around in case of a fallback to TPI */ econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; - /* * Notify the ULP about the newconn. It is guaranteed that no * tcp_accept() call will be made for the eager if the @@ -12545,177 +9717,83 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) econnp->conn_tcp->tcp_conn_req_seqnum); } } else { - putnext(lconnp->conn_tcp->tcp_rq, mp); + putnext(lconnp->conn_rq, mp); } } -mblk_t * -tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, - uint_t *ifindexp, ip6_pkt_t *ippp) +/* + * Handle a packet that has been reclassified by TCP. + * This function drops the ref on connp that the caller had. 
+ */ +static void +tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - ip_pktinfo_t *pinfo; - ip6_t *ip6h; - uchar_t *rptr; - mblk_t *first_mp = mp; - boolean_t mctl_present = B_FALSE; - uint_t ifindex = 0; - ip6_pkt_t ipp; - uint_t ipvers; - uint_t ip_hdr_len; - tcp_stack_t *tcps = tcp->tcp_tcps; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - rptr = mp->b_rptr; - ASSERT(OK_32PTR(rptr)); - ASSERT(tcp != NULL); - ipp.ipp_fields = 0; + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + freemsg(mp); + CONN_DEC_REF(connp); + return; + } - switch DB_TYPE(mp) { - case M_CTL: - mp = mp->b_cont; - if (mp == NULL) { - freemsg(first_mp); - return (NULL); + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip6_t *ip6h; + ipha_t *ipha; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + ip6h = NULL; + } else { + ipha = NULL; + ip6h = (ip6_t *)mp->b_rptr; } - if (DB_TYPE(mp) != M_DATA) { - freemsg(first_mp); - return (NULL); + mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + CONN_DEC_REF(connp); + return; } - mctl_present = B_TRUE; - break; - case M_DATA: - break; - default: - cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); - freemsg(mp); - return (NULL); } - ipvers = IPH_HDR_VERSION(rptr); - if (ipvers == IPV4_VERSION) { - if (tcp == NULL) { - ip_hdr_len = IPH_HDR_LENGTH(rptr); - goto done; - } - - ipp.ipp_fields |= IPPF_HOPLIMIT; - ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; + if (IPCL_IS_TCP(connp)) { /* - * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary - * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
+ * do not drain, certain use cases can blow + * the stack */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && - mctl_present) { - pinfo = (ip_pktinfo_t *)first_mp->b_rptr; - if ((MBLKL(first_mp) == sizeof (ip_pktinfo_t)) && - (pinfo->ip_pkt_ulp_type == IN_PKTINFO) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; - ifindex = pinfo->ip_pkt_ifindex; - } - freeb(first_mp); - mctl_present = B_FALSE; - } - ip_hdr_len = IPH_HDR_LENGTH(rptr); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, + SQ_NODRAIN, SQTAG_IP_TCP_INPUT); } else { - ip6h = (ip6_t *)rptr; - - ASSERT(ipvers == IPV6_VERSION); - ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; - ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; - ipp.ipp_hoplimit = ip6h->ip6_hops; - - if (ip6h->ip6_nxt != IPPROTO_TCP) { - uint8_t nexthdrp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i = (ip6i_t *)ip6h; - if ((uchar_t *)&ip6i[1] > mp->b_wptr) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mblk_t *mp1; - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - if (MBLKL(mp) < IPV6_HDR_LEN + - sizeof (tcph_t)) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - ip6h = (ip6_t *)rptr; - } - - /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. 
- */ - ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); - /* Verify if this is a TCP packet */ - if (nexthdrp != IPPROTO_TCP) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - } else { - ip_hdr_len = IPV6_HDR_LEN; - } + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, + ira); + CONN_DEC_REF(connp); } -done: - if (ipversp != NULL) - *ipversp = ipvers; - if (ip_hdr_lenp != NULL) - *ip_hdr_lenp = ip_hdr_len; - if (ippp != NULL) - *ippp = ipp; - if (ifindexp != NULL) - *ifindexp = ifindex; - if (mctl_present) { - freeb(first_mp); - } - return (mp); } +boolean_t tcp_outbound_squeue_switch = B_FALSE; + /* * Handle M_DATA messages from IP. Its called directly from IP via - * squeue for AF_INET type sockets fast path. No M_CTL are expected - * in this path. - * - * For everything else (including AF_INET6 sockets with 'tcp_ipversion' - * v4 and v6), we are called through tcp_input() and a M_CTL can - * be present for options but tcp_find_pktinfo() deals with it. We - * only expect M_DATA packets after tcp_find_pktinfo() is done. + * squeue for received IP packets. * * The first argument is always the connp/tcp to which the mp belongs. * There are no exceptions to this rule. The caller has already put - * a reference on this connp/tcp and once tcp_rput_data() returns, + * a reference on this connp/tcp and once tcp_input_data() returns, * the squeue will do the refrele. * - * The TH_SYN for the listener directly go to tcp_conn_request via - * squeue. + * The TH_SYN for the listener directly go to tcp_input_listener via + * squeue. ICMP errors go directly to tcp_icmp_input(). 
* * sqp: NULL = recursive, sqp != NULL means called from squeue */ void -tcp_rput_data(void *arg, mblk_t *mp, void *arg2) +tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -12729,11 +9807,10 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) int seg_len; uint_t ip_hdr_len; uint32_t seg_seq; - tcph_t *tcph; + tcpha_t *tcpha; int urp; tcp_opt_t tcpopt; - uint_t ipvers; - ip6_pkt_t ipp; + ip_pkt_t ipp; boolean_t ofo_seg = B_FALSE; /* Out of order segment */ uint32_t cwnd; uint32_t add; @@ -12756,33 +9833,43 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) rptr = mp->b_rptr; ASSERT(OK_32PTR(rptr)); - /* - * An AF_INET socket is not capable of receiving any pktinfo. Do inline - * processing here. For rest call tcp_find_pktinfo to fill up the - * necessary information. - */ - if (IPCL_IS_TCP4(connp)) { - ipvers = IPV4_VERSION; - ip_hdr_len = IPH_HDR_LENGTH(rptr); - } else { - mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, - NULL, &ipp); - if (mp == NULL) { - TCP_STAT(tcps, tcp_rput_v6_error); - return; + ip_hdr_len = ira->ira_ip_hdr_length; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + ipp.ipp_fields = 0; + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_TCP); + + /* Could have caused a pullup? 
*/ + iphdr = mp->b_rptr; + rptr = mp->b_rptr; } - iphdr = mp->b_rptr; - rptr = mp->b_rptr; } ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(mp->b_next == NULL); - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = ABE32_TO_U32(tcph->th_seq); - seg_ack = ABE32_TO_U32(tcph->th_ack); + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); seg_len = (int)(mp->b_wptr - rptr) - - (ip_hdr_len + TCP_HDR_LENGTH(tcph)); + (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { do { ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= @@ -12794,7 +9881,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, - seg_len, tcph); + seg_len, tcpha, ira); return; } @@ -12809,7 +9896,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_last_recv_time = lbolt; } - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); @@ -12840,7 +9927,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Update pointers into message */ iphdr = rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { /* * Since we can't handle any data with this urgent @@ -12849,13 +9936,29 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * the urgent mark and generate the M_PCSIG, * which we can do. 
*/ - mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); + mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); seg_len = 0; } } switch (tcp->tcp_state) { case TCPS_SYN_SENT: + if (connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch && sqp != NULL) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + DTRACE_PROBE1(conn__final__sqp__switch, + conn_t *, connp); + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_input_data, connp, ira, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); + } if (flags & TH_ACK) { /* * Note that our stack cannot send data before a @@ -12887,13 +9990,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Process all TCP options. */ - tcp_process_options(tcp, tcph); + tcp_process_options(tcp, tcpha); /* * The following changes our rwnd to be a multiple of the * MIN(peer MSS, our MSS) for performance reason. */ - (void) tcp_rwnd_set(tcp, - MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss)); + (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, + tcp->tcp_mss)); /* Is the other end ECN capable? */ if (tcp->tcp_ecn_ok) { @@ -12910,21 +10013,17 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_irs = seg_seq; tcp->tcp_rack = seg_seq; tcp->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (!TCP_IS_DETACHED(tcp)) { /* Allocate room for SACK options if needed. */ - if (tcp->tcp_snd_sack_ok) { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - TCPOPT_MAX_SACK_LEN + - (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra)); - } else { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - (tcp->tcp_loopback ? 
0 : - tcps->tcps_wroff_xtra)); - } + connp->conn_wroff = connp->conn_ht_iphc_len; + if (tcp->tcp_snd_sack_ok) + connp->conn_wroff += TCPOPT_MAX_SACK_LEN; + if (!tcp->tcp_loopback) + connp->conn_wroff += tcps->tcps_wroff_xtra; + + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } if (flags & TH_ACK) { /* @@ -12944,15 +10043,14 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * sending up connection confirmation */ tcp->tcp_state = TCPS_ESTABLISHED; - if (!tcp_conn_con(tcp, iphdr, tcph, mp, - tcp->tcp_loopback ? &mp1 : NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, + tcp->tcp_loopback ? &mp1 : NULL, ira)) { tcp->tcp_state = TCPS_SYN_SENT; freemsg(mp); return; } /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* One for the SYN */ tcp->tcp_suna = tcp->tcp_iss + 1; @@ -12983,7 +10081,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_swl1 = seg_seq; tcp->tcp_swl2 = seg_ack; - new_swnd = BE16_TO_U16(tcph->th_win); + new_swnd = ntohs(tcpha->tha_win); tcp->tcp_swnd = new_swnd; if (new_swnd > tcp->tcp_max_swnd) tcp->tcp_max_swnd = new_swnd; @@ -13022,22 +10120,25 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_ack_tid); tcp->tcp_ack_tid = 0; } - tcp_send_data(tcp, tcp->tcp_wq, ack_mp); + tcp_send_data(tcp, ack_mp); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); if (!IPCL_IS_NONSTR(connp)) { /* Send up T_CONN_CON */ - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, + ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls-> su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, + ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } @@ -13054,15 +10155,16 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) TCP_STAT(tcps, 
tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; if (!IPCL_IS_NONSTR(connp)) { - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } } @@ -13089,13 +10191,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (mp1) { - /* - * See comment in tcp_conn_request() for why we use - * the open() time pid here. - */ - DB_CPID(mp1) = tcp->tcp_cpid; - tcp_send_data(tcp, tcp->tcp_wq, mp1); + if (mp1 != NULL) { + tcp_send_data(tcp, mp1); TCP_TIMER_RESTART(tcp, tcp->tcp_rto); } freemsg(mp); @@ -13146,9 +10243,20 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) conn_t *new_connp; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); + /* + * Don't accept any input on a closed tcp as this TCP logically + * does not exist on the system. Don't proceed further with + * this TCP. For instance, this packet could trigger another + * close of this tcp which would be disastrous for tcp_refcnt. + * tcp_close_detached / tcp_clean_death / tcp_closei_local must + * be called at most once on a TCP. In this case we need to + * refeed the packet into the classifier and figure out where + * the packet should go. + */ + new_connp = ipcl_classify(mp, ira, ipst); if (new_connp != NULL) { - tcp_reinput(new_connp, mp, connp->conn_sqp); + /* Drops ref on new_connp */ + tcp_reinput(new_connp, mp, ira, ipst); return; } /* We failed to classify. 
For now just drop the packet */ @@ -13194,7 +10302,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_kssl_ctx = NULL; tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); flags |= TH_ACK_NEEDED; goto ack_check; } @@ -13205,13 +10313,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) return; } - mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); - urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); + urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { /* * This segment is not acceptable. * Drop it and send back an ACK. @@ -13227,7 +10335,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * SACK info in already updated in tcp_parse_options. Ignore * all other TCP options... */ - (void) tcp_parse_options(tcph, &tcpopt); + (void) tcp_parse_options(tcpha, &tcpopt); } try_again:; mss = tcp->tcp_mss; @@ -13289,7 +10397,7 @@ try_again:; * Adjust seg_len to the original value for tracing. */ seg_len -= gap; - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: unacceptable, gap %d, rgap %d, " "flags 0x%x, seg_seq %u, seg_ack %u, " @@ -13436,7 +10544,7 @@ try_again:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13505,7 +10613,7 @@ ok:; * same segment. In this case, we once again turn * on ECN_ECHO. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { @@ -13705,7 +10813,7 @@ ok:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, M_PCSIG, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13739,7 +10847,7 @@ ok:; } else if (tcp->tcp_urp_mark_mp != NULL) { /* * An allocation failure prevented the previous - * tcp_rput_data from sending up the allocated + * tcp_input_data from sending up the allocated * MSG*MARKNEXT message - send it up this time * around. */ @@ -13775,14 +10883,14 @@ ok:; */ (void) adjmsg(mp, urp - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp - seg_len); /* Feed this piece back in. */ tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13811,13 +10919,13 @@ ok:; */ (void) adjmsg(mp, urp + 1 - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp + 1 - seg_len); tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13831,7 +10939,7 @@ ok:; return; } } - tcp_rput_data(connp, mp, NULL); + tcp_input_data(connp, mp, NULL, ira); return; } /* @@ -13960,7 +11068,7 @@ process_ack: } bytes_acked = (int)(seg_ack - tcp->tcp_suna); - if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) + if (bytes_acked > 0) tcp->tcp_ip_forward_progress = B_TRUE; if (tcp->tcp_state == TCPS_SYN_RCVD) { if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && @@ -13983,7 +11091,7 @@ process_ack: /* * The listener also exists because of the refhold - * done in tcp_conn_request. 
Its possible that it + * done in tcp_input_listener. Its possible that it * might have closed. We will check that once we * get inside listeners context. */ @@ -14005,12 +11113,12 @@ process_ack: } else if (!tcp->tcp_loopback) { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_FILL, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } else { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_PROCESS, + listener->tcp_connp, NULL, SQ_PROCESS, SQTAG_TCP_CONN_IND); } } @@ -14026,7 +11134,7 @@ process_ack: */ tcp->tcp_state = TCPS_ESTABLISHED; if (tcp->tcp_active_open) { - if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { freemsg(mp); tcp->tcp_state = TCPS_SYN_RCVD; return; @@ -14044,8 +11152,7 @@ process_ack: tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ bytes_acked--; /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* * If SYN was retransmitted, need to reset all @@ -14083,7 +11190,7 @@ process_ack: /* Fuse when both sides are in ESTABLISHED state */ if (tcp->tcp_loopback && do_tcp_fusion) - tcp_fuse(tcp, iphdr, tcph); + tcp_fuse(tcp, iphdr, tcpha); } /* This code follows 4.4BSD-Lite2 mostly. */ @@ -14388,7 +11495,7 @@ process_ack: if (mp != NULL) { BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } return; } @@ -14487,7 +11594,6 @@ process_ack: } } else { tcp->tcp_rexmit = B_FALSE; - tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_rexmit_nxt = tcp->tcp_snxt; tcp->tcp_snd_burst = tcp->tcp_localnet ? 
TCP_CWND_INFINITE : TCP_CWND_NORMAL; @@ -14662,8 +11768,7 @@ fin_acked: tcp->tcp_xmit_tail = NULL; if (tcp->tcp_fin_sent) { /* FIN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION && - !tcp->tcp_fin_acked) + if (!tcp->tcp_fin_acked) tcp->tcp_ip_forward_progress = B_TRUE; tcp->tcp_fin_acked = B_TRUE; if (tcp->tcp_linger_tid != 0 && @@ -14781,7 +11886,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14805,8 +11910,8 @@ est: if (!tcp->tcp_fin_rcvd) { tcp->tcp_fin_rcvd = B_TRUE; tcp->tcp_rnxt++; - tcph = tcp->tcp_tcph; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha = tcp->tcp_tcpha; + tcpha->tha_ack = htonl(tcp->tcp_rnxt); /* * Generate the ordrel_ind at the end unless we @@ -14815,7 +11920,7 @@ est: * after tcp_accept is done. */ if (tcp->tcp_listener == NULL && - !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) + !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) flags |= TH_ORDREL_NEEDED; switch (tcp->tcp_state) { case TCPS_SYN_RCVD: @@ -14836,7 +11941,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14872,7 +11977,7 @@ est: freeb(mp1); } update_ack: - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; tcp->tcp_rack_cnt++; { uint32_t cur_max; @@ -14915,7 +12020,7 @@ update_ack: } } tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (mp == NULL) goto xmit_check; @@ -14942,12 +12047,13 @@ update_ack: /* * Check for ancillary data changes compared to last segment. 
*/ - if (tcp->tcp_ipv6_recvancillary != 0) { - mp = tcp_rput_add_ancillary(tcp, mp, &ipp); - ASSERT(mp != NULL); + if (connp->conn_recv_ancillary.crb_all != 0) { + mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); + if (mp == NULL) + return; } - if (tcp->tcp_listener || tcp->tcp_hard_binding) { + if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { /* * Side queue inbound data until the accept happens. * tcp_accept/tcp_rput drains this when the accept happens. @@ -14961,9 +12067,9 @@ update_ack: if (tcp->tcp_kssl_pending) { DTRACE_PROBE1(kssl_mblk__ksslinput_pending, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } } else if (IPCL_IS_NONSTR(connp)) { /* @@ -15015,19 +12121,22 @@ update_ack: (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_data1, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { /* Does this need SSL processing first? */ DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else if ((flags & (TH_PUSH|TH_FIN)) || - tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) { + tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { if (tcp->tcp_rcv_list != NULL) { /* * Enqueue the new segment first and then @@ -15042,11 +12151,15 @@ update_ack: * canputnext() as tcp_rcv_drain() needs to * call canputnext(). 
*/ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, + ira->ira_cred); flags |= tcp_rcv_drain(tcp); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else { @@ -15054,7 +12167,7 @@ update_ack: * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. */ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } /* * Make sure the timer is running if we have data waiting @@ -15103,7 +12216,7 @@ xmit_check: BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, snd_size); - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); } } if (flags & TH_NEED_SACK_REXMIT) { @@ -15155,7 +12268,10 @@ ack_check: ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; - putnext(tcp->tcp_rq, mp1); + if (is_system_labeled()) + tcp_setcred_data(mp1, ira); + + putnext(connp->conn_rq, mp1); #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", @@ -15172,7 +12288,7 @@ ack_check: mp1 = tcp_ack_mp(tcp); if (mp1 != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } @@ -15200,6 +12316,7 @@ ack_check: * after tcp_accept is done. 
*/ ASSERT(tcp->tcp_listener == NULL); + ASSERT(!tcp->tcp_detached); if (IPCL_IS_NONSTR(connp)) { ASSERT(tcp->tcp_ordrel_mp == NULL); @@ -15220,7 +12337,7 @@ ack_check: mp1 = tcp->tcp_ordrel_mp; tcp->tcp_ordrel_mp = NULL; tcp->tcp_ordrel_done = B_TRUE; - putnext(tcp->tcp_rq, mp1); + putnext(connp->conn_rq, mp1); } done: ASSERT(!(flags & TH_MARKNEXT_NEEDED)); @@ -15251,21 +12368,22 @@ tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) * segment passes the PAWS test, else returns B_FALSE. */ boolean_t -tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) +tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) { uint8_t flags; int options; uint8_t *up; + conn_t *connp = tcp->tcp_connp; - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; /* * If timestamp option is aligned nicely, get values inline, * otherwise call general routine to parse. Only do that * if timestamp is the only option. */ - if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + + if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + TCPOPT_REAL_TS_LEN && - OK_32PTR((up = ((uint8_t *)tcph) + + OK_32PTR((up = ((uint8_t *)tcpha) + TCP_MIN_HEADER_LENGTH)) && *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); @@ -15278,7 +12396,7 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) } else { tcpoptp->tcp = NULL; } - options = tcp_parse_options(tcph, tcpoptp); + options = tcp_parse_options(tcpha, tcpoptp); } if (options & TCP_OPT_TSTAMP_PRESENT) { @@ -15311,16 +12429,15 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) */ tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); + connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); /* - * Adjust the tcp_mss 
accordingly. We also need to - * adjust tcp_cwnd here in accordance with the new mss. - * But we avoid doing a slow start here so as to not - * to lose on the transfer rate built up so far. + * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid + * doing a slow start here so as to not to lose on the + * transfer rate built up so far. */ - tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN, B_FALSE); + tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); if (tcp->tcp_snd_sack_ok) { ASSERT(tcp->tcp_sack_info != NULL); tcp->tcp_max_sack_blk = 4; @@ -15338,38 +12455,37 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) * when memory allocation fails we can just wait for the next data segment. */ static mblk_t * -tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) +tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + ip_recv_attr_t *ira) { struct T_optdata_ind *todi; int optlen; uchar_t *optptr; struct T_opthdr *toh; - uint_t addflag; /* Which pieces to add */ + crb_t addflag; /* Which pieces to add */ mblk_t *mp1; + conn_t *connp = tcp->tcp_connp; optlen = 0; - addflag = 0; + addflag.crb_all = 0; /* If app asked for pktinfo and the index has changed ... */ - if ((ipp->ipp_fields & IPPF_IFINDEX) && - ipp->ipp_ifindex != tcp->tcp_recvifindex && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != tcp->tcp_recvifindex) { optlen += sizeof (struct T_opthdr) + sizeof (struct in6_pktinfo); - addflag |= TCP_IPV6_RECVPKTINFO; + addflag.crb_ip_recvpktinfo = 1; } /* If app asked for hoplimit and it has changed ... 
*/ - if ((ipp->ipp_fields & IPPF_HOPLIMIT) && - ipp->ipp_hoplimit != tcp->tcp_recvhops && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != tcp->tcp_recvhops) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVHOPLIMIT; + addflag.crb_ipv6_recvhoplimit = 1; } /* If app asked for tclass and it has changed ... */ - if ((ipp->ipp_fields & IPPF_TCLASS) && - ipp->ipp_tclass != tcp->tcp_recvtclass && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != tcp->tcp_recvtclass) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVTCLASS; + addflag.crb_ipv6_recvtclass = 1; } /* * If app asked for hopbyhop headers and it has changed ... @@ -15377,51 +12493,51 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * a connected socket at all, (2) we're connected to at most one peer, * (3) if anything changes, then it must be some other extra option. */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; - addflag |= TCP_IPV6_RECVHOPOPTS; + optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + addflag.crb_ipv6_recvhopopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) return (mp); } /* If app asked for dst headers before routing headers ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && - ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { optlen += sizeof (struct T_opthdr) + - ipp->ipp_rtdstoptslen; - addflag |= TCP_IPV6_RECVRTDSTOPTS; - if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) + ipp->ipp_rthdrdstoptslen; + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) return (mp); } /* If app asked for routing headers and it has changed ... */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; - addflag |= TCP_IPV6_RECVRTHDR; + addflag.crb_ipv6_recvrthdr = 1; if (!ip_allocbuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) return (mp); } /* If app asked for dest headers and it has changed ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & - (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && + if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; - addflag |= TCP_IPV6_RECVDSTOPTS; + addflag.crb_ipv6_recvdstopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) @@ -15454,9 +12570,11 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. */ - if (addflag & TCP_IPV6_RECVPKTINFO) { + if (addflag.crb_ip_recvpktinfo) { struct in6_pktinfo *pkti; + uint_t ifindex; + ifindex = ira->ira_ruifindex; toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_PKTINFO; @@ -15464,19 +12582,15 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) toh->status = 0; optptr += sizeof (*toh); pkti = (struct in6_pktinfo *)optptr; - if (tcp->tcp_ipversion == IPV6_VERSION) - pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; - else - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp->ipp_ifindex; + pkti->ipi6_addr = connp->conn_laddr_v6; + pkti->ipi6_ifindex = ifindex; optptr += sizeof (*pkti); ASSERT(OK_32PTR(optptr)); /* Save as "last" value */ - tcp->tcp_recvifindex = ipp->ipp_ifindex; + tcp->tcp_recvifindex = ifindex; } /* If app asked for hoplimit and it has changed ... 
*/ - if (addflag & TCP_IPV6_RECVHOPLIMIT) { + if (addflag.crb_ipv6_recvhoplimit) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPLIMIT; @@ -15490,7 +12604,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) tcp->tcp_recvhops = ipp->ipp_hoplimit; } /* If app asked for tclass and it has changed ... */ - if (addflag & TCP_IPV6_RECVTCLASS) { + if (addflag.crb_ipv6_recvtclass) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_TCLASS; @@ -15503,40 +12617,38 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) /* Save as "last" value */ tcp->tcp_recvtclass = ipp->ipp_tclass; } - if (addflag & TCP_IPV6_RECVHOPOPTS) { + if (addflag.crb_ipv6_recvhopopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPOPTS; - toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; + toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, - ipp->ipp_hopoptslen - tcp->tcp_label_len); - optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; + bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); + optptr += ipp->ipp_hopoptslen; ASSERT(OK_32PTR(optptr)); /* Save as last value */ ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen); } - if (addflag & TCP_IPV6_RECVRTDSTOPTS) { + if (addflag.crb_ipv6_recvrthdrdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDRDSTOPTS; - toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; + toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); - optptr += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; 
ASSERT(OK_32PTR(optptr)); /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); + ip_savebuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); } - if (addflag & TCP_IPV6_RECVRTHDR) { + if (addflag.crb_ipv6_recvrthdr) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDR; @@ -15551,7 +12663,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen); } - if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { + if (addflag.crb_ipv6_recvdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_DSTOPTS; @@ -15570,99 +12682,13 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) return (mp); } -/* - * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA - * messages. 
- */ -void -tcp_rput_other(tcp_t *tcp, mblk_t *mp) -{ - uchar_t *rptr = mp->b_rptr; - queue_t *q = tcp->tcp_rq; - struct T_error_ack *tea; - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) - break; - tea = (struct T_error_ack *)rptr; - ASSERT(tea->PRIM_type != T_BIND_ACK); - ASSERT(tea->ERROR_prim != O_T_BIND_REQ && - tea->ERROR_prim != T_BIND_REQ); - switch (tea->PRIM_type) { - case T_ERROR_ACK: - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_TRACE|SL_ERROR, - "tcp_rput_other: case T_ERROR_ACK, " - "ERROR_prim == %d", - tea->ERROR_prim); - } - switch (tea->ERROR_prim) { - case T_SVR4_OPTMGMT_REQ: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - printf("T_SVR4_OPTMGMT_REQ failed " - "%d/%d - dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - } - if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && - tcp->tcp_drop_opt_ack_cnt > 0) { - printf("T_SVR4_OPTMGMT_REQ failed %d/%d " - "- dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - case T_OPTMGMT_ACK: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - default: - ASSERT(tea->ERROR_prim != T_UNBIND_REQ); - break; - } - break; - case M_FLUSH: - if (*rptr & FLUSHR) - flushq(q, FLUSHDATA); - break; - default: - /* M_CTL will be directly sent to tcp_icmp_error() */ - ASSERT(DB_TYPE(mp) != M_CTL); - break; - } - /* - * Make sure we set this bit before sending the ACK for - * bind. Otherwise accept could possibly run and free - * this tcp struct. 
- */ - ASSERT(q != NULL); - putnext(q, mp); -} - /* ARGSUSED */ static void -tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) +tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(!IPCL_IS_NONSTR(connp)); @@ -15683,7 +12709,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) if (canputnext(q)) { /* Not flow-controlled, open rwnd */ - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above @@ -15712,16 +12738,10 @@ tcp_rsrv(queue_t *q) conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; /* No code does a putq on the read side */ ASSERT(q->q_first == NULL); - /* Nothing to do for the default queue */ - if (q == tcps->tcps_g_q) { - return; - } - /* * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already * been run. So just return. @@ -15736,7 +12756,7 @@ tcp_rsrv(queue_t *q) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, - SQ_PROCESS, SQTAG_TCP_RSRV); + NULL, SQ_PROCESS, SQTAG_TCP_RSRV); } /* @@ -15746,8 +12766,8 @@ tcp_rsrv(queue_t *q) * * This function is called in 2 cases: * - * 1) Before data transfer begins, in tcp_accept_comm() for accepting a - * connection (passive open) and in tcp_rput_data() for active connect. + * 1) Before data transfer begins, in tcp_input_listener() for accepting a + * connection (passive open) and in tcp_input_data() for active connect. * This is called after tcp_mss_set() when the desired MSS value is known. * This makes sure that our window size is a mutiple of the other side's * MSS. 
@@ -15766,6 +12786,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) uint32_t max_transmittable_rwnd; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Insist on a receive window that is at least @@ -15782,7 +12803,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) ASSERT(peer_tcp != NULL); sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); if (!tcp_detached) { - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, + (void) proto_set_rx_hiwat(connp->conn_rq, connp, sth_hiwat); tcp_set_recv_threshold(tcp, sth_hiwat >> 3); } @@ -15797,11 +12818,10 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) return (sth_hiwat); } - if (tcp_detached) { + if (tcp_detached) old_max_rwnd = tcp->tcp_rwnd; - } else { - old_max_rwnd = tcp->tcp_recv_hiwater; - } + else + old_max_rwnd = connp->conn_rcvbuf; /* @@ -15854,9 +12874,14 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * connection.) */ tcp->tcp_rwnd += rwnd - old_max_rwnd; - tcp->tcp_recv_hiwater = rwnd; + connp->conn_rcvbuf = rwnd; + + /* Are we already connected? 
*/ + if (tcp->tcp_tcpha != NULL) { + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + } - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) tcp->tcp_cwnd_max = rwnd; @@ -15865,7 +12890,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) tcp_set_recv_threshold(tcp, rwnd >> 3); - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, rwnd); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd); return (rwnd); } @@ -15944,7 +12969,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp_t *tcp; boolean_t needattr; @@ -15992,11 +13017,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) needattr = B_TRUE; break; } - if (connp->conn_fully_bound && - connp->conn_effective_cred != NULL) { + if (connp->conn_ixa->ixa_tsl != NULL) { ts_label_t *tsl; - tsl = crgetlabel(connp->conn_effective_cred); + tsl = connp->conn_ixa->ixa_tsl; mlp.tme_flags |= MIB2_TMEF_IS_LABELED; mlp.tme_doi = label2doi(tsl); mlp.tme_label = *label2bslabel(tsl); @@ -16004,12 +13028,17 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) } /* Create a message to report on IPv6 entries */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; - tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; - tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); - tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); - tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; + if (connp->conn_ipversion == IPV6_VERSION) { + tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; + tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; + tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); + tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { + tce6.tcp6ConnIfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + tce6.tcp6ConnIfIndex = connp->conn_bound_if; + } /* 
Don't want just anybody seeing these... */ if (ispriv) { tce6.tcp6ConnEntryInfo.ce_snxt = @@ -16041,9 +13070,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; tce6.tcp6ConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce6.tcp6ConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce6.tcp6ConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp6_conn_ctl->b_cont, &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); @@ -16059,21 +13088,21 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) * but don't have IPV6_V6ONLY set. * (i.e. anything an IPv4 peer could connect to) */ - if (tcp->tcp_ipversion == IPV4_VERSION || + if (connp->conn_ipversion == IPV4_VERSION || (tcp->tcp_state <= TCPS_LISTEN && - !tcp->tcp_connp->conn_ipv6_v6only && - IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { - if (tcp->tcp_ipversion == IPV6_VERSION) { + !connp->conn_ipv6_v6only && + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { + if (connp->conn_ipversion == IPV6_VERSION) { tce.tcpConnRemAddress = INADDR_ANY; tce.tcpConnLocalAddress = INADDR_ANY; } else { tce.tcpConnRemAddress = - tcp->tcp_remote; + connp->conn_faddr_v4; tce.tcpConnLocalAddress = - tcp->tcp_ip_src; + connp->conn_laddr_v4; } - tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); - tce.tcpConnRemPort = ntohs(tcp->tcp_fport); + tce.tcpConnLocalPort = ntohs(connp->conn_lport); + tce.tcpConnRemPort = ntohs(connp->conn_fport); /* Don't want just anybody seeing these... */ if (ispriv) { tce.tcpConnEntryInfo.ce_snxt = @@ -16107,9 +13136,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tcp->tcp_state; tce.tcpConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce.tcpConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? 
+ MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce.tcpConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp_conn_ctl->b_cont, &mp_conn_tail, (char *)&tce, sizeof (tce)); @@ -16273,7 +13303,6 @@ tcp_timer(void *arg) tcp_t *listener = tcp->tcp_listener; if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { - ASSERT(tcp->tcp_rq == listener->tcp_rq); /* it's our first timeout */ tcp->tcp_syn_rcvd_timeout = 1; mutex_enter(&listener->tcp_eager_lock); @@ -16295,7 +13324,7 @@ tcp_timer(void *arg) cmn_err(CE_WARN, "High TCP connect timeout " "rate! System (port %d) may be under a " "SYN flood attack!", - BE16_TO_U16(listener->tcp_tcph->th_lport)); + ntohs(listener->tcp_connp->conn_lport)); listener->tcp_ip_addr_cache = kmem_zalloc( IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), @@ -16363,7 +13392,7 @@ tcp_timer(void *arg) * backoff. */ if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } @@ -16415,6 +13444,13 @@ tcp_timer(void *arg) * 3. But 1 and 3 are exclusive. */ if (tcp->tcp_unsent != 0) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + if (tcp->tcp_cwnd == 0) { /* * Set tcp_cwnd to 1 MSS so that a @@ -16477,7 +13513,7 @@ tcp_timer(void *arg) (void) tcp_clean_death(tcp, 0, 24); return; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_timer: strange state (%d) %s", tcp->tcp_state, tcp_display(tcp, NULL, @@ -16485,8 +13521,16 @@ tcp_timer(void *arg) } return; } + if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { /* + * Should not hold the zero-copy messages for too long. 
+ */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + /* * For zero window probe, we need to send indefinitely, * unless we have not heard from the other side for some * time... @@ -16529,11 +13573,13 @@ tcp_timer(void *arg) tcp->tcp_ms_we_have_waited = second_threshold; } } else if (ms > first_threshold) { - if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && - tcp->tcp_xmit_head != NULL) { - tcp->tcp_xmit_head = - tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); - } + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + /* * We have been retransmitting for too long... The RTT * we calculated is probably incorrect. Reinitialize it. @@ -16618,20 +13664,11 @@ tcp_timer(void *arg) if (mp == NULL) { return; } - /* - * Attach credentials to retransmitted initial SYNs. - * In theory we should use the credentials from the connect() - * call to ensure that getpeerucred() on the peer will be correct. - * But we assume that SYN's are not dropped for loopback connections. 
- */ - if (tcp->tcp_state == TCPS_SYN_SENT) { - mblk_setcred(mp, CONN_CRED(tcp->tcp_connp), tcp->tcp_cpid); - } tcp->tcp_csuna = tcp->tcp_snxt; BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } @@ -16639,7 +13676,6 @@ static int tcp_do_unbind(conn_t *connp) { tcp_t *tcp = connp->conn_tcp; - int error = 0; switch (tcp->tcp_state) { case TCPS_BOUND: @@ -16659,41 +13695,36 @@ tcp_do_unbind(conn_t *connp) } mutex_exit(&tcp->tcp_eager_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - tcp->tcp_ipha->ipha_src = 0; - } else { - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - } - V6_SET_ZERO(tcp->tcp_ip_src_v6); - bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; tcp_bind_hash_remove(tcp); tcp->tcp_state = TCPS_IDLE; - tcp->tcp_mdt = B_FALSE; - connp = tcp->tcp_connp; - connp->conn_mdt_ok = B_FALSE; - ipcl_hash_remove(connp); + ip_unbind(connp); bzero(&connp->conn_ports, sizeof (connp->conn_ports)); - return (error); + return (0); } /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. 
*/ static void tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) { - int error = tcp_do_unbind(tcp->tcp_connp); + conn_t *connp = tcp->tcp_connp; + int error; + error = tcp_do_unbind(connp); if (error > 0) { tcp_err_ack(tcp, mp, TSYSERR, error); } else if (error < 0) { tcp_err_ack(tcp, mp, -error, 0); } else { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); mp = mi_tpi_ok_ack_alloc(mp); - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -16764,7 +13795,7 @@ retry: } } if (is_system_labeled() && - (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, + (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, IPPROTO_TCP, B_TRUE)) != 0) { port = i; goto retry; @@ -16796,7 +13827,7 @@ retry: restart = B_TRUE; } if (is_system_labeled() && - (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), + (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { next_priv_port = nextport; goto retry; @@ -16820,11 +13851,10 @@ struct { */ /* ARGSUSED */ static void -tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) +tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; ASSERT(DB_TYPE(mp) != M_IOCTL); /* @@ -16851,7 +13881,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) tcp_wput_flush(tcp, mp); break; default: - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(connp->conn_wq, mp); break; } } @@ -16862,7 +13892,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ void -tcp_output(void *arg, mblk_t *mp, void *arg2) +tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; int hdrlen; @@ -16870,7 +13900,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mblk_t *mp1; uchar_t *rptr; uint32_t snxt; - tcph_t *tcph; + tcpha_t *tcpha; struct datab *db; 
uint32_t suna; uint32_t mss; @@ -16882,7 +13912,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; uint32_t msize; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + ip_xmit_attr_t *ixa; /* * Try and ASSERT the minimum possible references on the @@ -16903,25 +13933,18 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_squeue_bytes -= msize; mutex_exit(&tcp->tcp_non_sq_lock); - /* Check to see if this connection wants to be re-fused. */ - if (tcp->tcp_refuse) { - if (tcp->tcp_ipversion == IPV4_VERSION && - !ipst->ips_ip4_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ipha, - &tcp->tcp_saved_tcph); - } else if (tcp->tcp_ipversion == IPV6_VERSION && - !ipst->ips_ip6_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ip6h, - &tcp->tcp_saved_tcph); - } - } /* Bypass tcp protocol for fused tcp loopback */ if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) return; mss = tcp->tcp_mss; - if (tcp->tcp_xmit_zc_clean) - mp = tcp_zcopy_backoff(tcp, mp, 0); + /* + * If ZEROCOPY has turned off, try not to send any zero-copy message + * down. Do backoff, now. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) + mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); len = (int)(mp->b_wptr - mp->b_rptr); @@ -16977,8 +14000,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) * start again to get back the connection's "self-clock" as * described in VJ's paper. * - * Refer to the comment in tcp_mss_set() for the calculation - * of tcp_cwnd after idle. + * Reinitialize tcp_cwnd after idle. 
*/ if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { @@ -16999,7 +14021,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -17046,43 +14068,43 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ - tcph = tcp->tcp_tcph; - tcph->th_flags[0] = (TH_ACK|TH_PUSH); + tcpha = tcp->tcp_tcpha; + tcpha->tha_flags = (TH_ACK|TH_PUSH); - sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; + sum = len + connp->conn_ht_ulp_len + connp->conn_sum; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); + tcpha->tha_sum = htons(sum); - U32_TO_ABE32(snxt, tcph->th_seq); + tcpha->tha_seq = htonl(snxt); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); BUMP_LOCAL(tcp->tcp_obsegs); /* Update the latest receive window size in TCP header. 
*/ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_last_sent_len = (ushort_t)len; - plen = len + tcp->tcp_hdr_len; + plen = len + connp->conn_ht_iphc_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + ixa = connp->conn_ixa; + ixa->ixa_pktlen = plen; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(plen); } else { - tcp->tcp_ip6h->ip6_plen = htons(plen - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); } /* see if we need to allocate a mblk for the headers */ - hdrlen = tcp->tcp_hdr_len; + hdrlen = connp->conn_ht_iphc_len; rptr = mp1->b_rptr - hdrlen; db = mp1->b_datap; if ((db->db_ref != 2) || rptr < db->db_base || (!OK_32PTR(rptr))) { /* NOTE: we assume allocb returns an OK_32PTR */ - mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp) { freemsg(mp1); goto no_memory; @@ -17090,7 +14112,6 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_cont = mp1; mp1 = mp; /* Leave room for Link Level header */ - /* hdrlen = tcp->tcp_hdr_len; */ rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; mp1->b_wptr = &rptr[hdrlen]; } @@ -17099,16 +14120,16 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) /* Fill in the timestamp option. 
*/ if (tcp->tcp_snd_ts_ok) { U32_TO_BE32((uint32_t)lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* copy header into outgoing packet */ dst = (ipaddr_t *)rptr; - src = (ipaddr_t *)tcp->tcp_iphc; + src = (ipaddr_t *)connp->conn_ht_iphc; dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; @@ -17135,21 +14156,22 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_ecn_ok) { SET_ECT(tcp, rptr); - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcph->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); return; /* @@ -17166,29 +14188,27 @@ slow: tcp_wput_data(tcp, NULL, B_FALSE); } +/* + * This runs at the tail end of accept processing on the squeue of the + * new connection. 
+ */ /* ARGSUSED */ void -tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) +tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; - struct tcp_options *tcpopt; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; - /* socket options */ - uint_t sopp_flags; - ssize_t sopp_rxhiwat; - ssize_t sopp_maxblk; - ushort_t sopp_wroff; - ushort_t sopp_tail; - ushort_t sopp_copyopt; + struct sock_proto_props sopp; - tcpopt = (struct tcp_options *)mp->b_rptr; + /* We should just receive a single mblk that fits a T_discon_ind */ + ASSERT(mp->b_cont == NULL); /* * Drop the eager's ref on the listener, that was placed when - * this eager began life in tcp_conn_request. + * this eager began life in tcp_input_listener. */ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); if (IPCL_IS_NONSTR(connp)) { @@ -17227,15 +14247,12 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * memory allocation failure problems. We know * that the size of the incoming mblk i.e. * stroptions is greater than sizeof - * T_discon_ind. So the reallocb below can't - * fail. + * T_discon_ind. 
*/ - freemsg(mp->b_cont); - mp->b_cont = NULL; ASSERT(DB_REF(mp) == 1); - mp = reallocb(mp, sizeof (struct T_discon_ind), - B_FALSE); - ASSERT(mp != NULL); + ASSERT(MBLKSIZE(mp) >= + sizeof (struct T_discon_ind)); + DB_TYPE(mp) = M_PROTO; ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; @@ -17251,41 +14268,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); putnext(q, mp); - return; } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; return; } - if (tcpopt->to_flags & TCPOPT_BOUNDIF) { - int boundif = tcpopt->to_boundif; - uint_t len = sizeof (int); - - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len, - (uchar_t *)&boundif, NULL, tcp->tcp_cred, NULL); - } - if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) { - uint_t on = 1; - uint_t len = sizeof (uint_t); - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len, - (uchar_t *)&on, NULL, tcp->tcp_cred, NULL); - } - /* - * Set max window size (tcp_recv_hiwater) of the acceptor. + * Set max window size (conn_rcvbuf) of the acceptor. */ if (tcp->tcp_rcv_list == NULL) { /* * Recv queue is empty, tcp_rwnd should not have changed. * That means it should be equal to the listener's tcp_rwnd. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; + connp->conn_rcvbuf = tcp->tcp_rwnd; } else { #ifdef DEBUG mblk_t *tmp; @@ -17300,19 +14297,19 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); #endif /* There is some data, add them back to get the max. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; + connp->conn_rcvbuf = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; } /* * This is the first time we run on the correct * queue after tcp_accept. So fix all the q parameters * here. 
*/ - sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; - sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; + sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - sopp_rxhiwat = tcp->tcp_fused ? - tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : - tcp->tcp_recv_hiwater; + sopp.sopp_rxhiwat = tcp->tcp_fused ? + tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : + connp->conn_rcvbuf; /* * Determine what write offset value to use depending on SACK and @@ -17328,18 +14325,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * since it would reduce the amount of work done by kmem. * Non-fused tcp loopback case is handled separately below. */ - sopp_wroff = 0; + sopp.sopp_wroff = 0; /* * Update the peer's transmit parameters according to * our recently calculated high water mark value. */ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { - sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + + sopp.sopp_wroff = connp->conn_ht_iphc_allocated + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } else { - sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra); + sopp.sopp_wroff = connp->conn_ht_iphc_len + + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } /* @@ -17354,30 +14351,22 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * costs. 
*/ if (tcp->tcp_kssl_ctx != NULL) { - sopp_wroff += SSL3_WROFFSET; + sopp.sopp_wroff += SSL3_WROFFSET; - sopp_flags |= SOCKOPT_TAIL; - sopp_tail = SSL3_MAX_TAIL_LEN; + sopp.sopp_flags |= SOCKOPT_TAIL; + sopp.sopp_tail = SSL3_MAX_TAIL_LEN; - sopp_flags |= SOCKOPT_ZCOPY; - sopp_copyopt = ZCVMUNSAFE; + sopp.sopp_flags |= SOCKOPT_ZCOPY; + sopp.sopp_zcopyflag = ZCVMUNSAFE; - sopp_maxblk = SSL3_MAX_RECORD_LEN; + sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN; } /* Send the options up */ if (IPCL_IS_NONSTR(connp)) { - struct sock_proto_props sopp; - - sopp.sopp_flags = sopp_flags; - sopp.sopp_wroff = sopp_wroff; - sopp.sopp_maxblk = sopp_maxblk; - sopp.sopp_rxhiwat = sopp_rxhiwat; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); - ASSERT(sopp_flags & SOCKOPT_ZCOPY); - sopp.sopp_tail = sopp_tail; - sopp.sopp_zcopyflag = sopp_copyopt; + ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY); } if (tcp->tcp_loopback) { sopp.sopp_flags |= SOCKOPT_LOOPBACK; @@ -17385,34 +14374,40 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } (*connp->conn_upcalls->su_set_proto_props) (connp->conn_upper_handle, &sopp); + freemsg(mp); } else { + /* + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. 
We know + * that the size of the incoming mblk is at least + * stroptions + */ struct stroptions *stropt; - mblk_t *stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI); - if (stropt_mp == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - DB_TYPE(stropt_mp) = M_SETOPTS; - stropt = (struct stroptions *)stropt_mp->b_rptr; - stropt_mp->b_wptr += sizeof (struct stroptions); + + ASSERT(DB_REF(mp) == 1); + ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); + + DB_TYPE(mp) = M_SETOPTS; + stropt = (struct stroptions *)mp->b_rptr; + mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); + stropt = (struct stroptions *)mp->b_rptr; stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_hiwat = sopp_rxhiwat; - stropt->so_wroff = sopp_wroff; - stropt->so_maxblk = sopp_maxblk; + stropt->so_hiwat = sopp.sopp_rxhiwat; + stropt->so_wroff = sopp.sopp_wroff; + stropt->so_maxblk = sopp.sopp_maxblk; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); stropt->so_flags |= SO_TAIL | SO_COPYOPT; - stropt->so_tail = sopp_tail; - stropt->so_copyopt = sopp_copyopt; + stropt->so_tail = sopp.sopp_tail; + stropt->so_copyopt = sopp.sopp_zcopyflag; } /* Send the options up */ - putnext(q, stropt_mp); + putnext(q, mp); } - freemsg(mp); /* * Pass up any data and/or a fin that has been received. 
* @@ -17432,7 +14427,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) (connp->conn_upper_handle, NULL, 0, 0, &error, &push) >= 0) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17463,7 +14458,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* We drain directly in case of fused tcp loopback */ if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17508,12 +14503,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) putnext(q, mp); } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); @@ -17535,14 +14527,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* * The function called through squeue to get behind listener's perimeter to - * send a deffered conn_ind. + * send a deferred conn_ind. */ /* ARGSUSED */ void -tcp_send_pending(void *arg, mblk_t *mp, void *arg2) +tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *listener = connp->conn_tcp; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; struct T_conn_ind *conn_ind; tcp_t *tcp; @@ -17560,29 +14552,34 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) return; } - tcp_ulp_newconn(connp, tcp->tcp_connp, mp); + tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); } -/* ARGSUSED */ +/* + * Common to TPI and sockfs accept code. 
+ */ +/* ARGSUSED2 */ static int tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) { tcp_t *listener, *eager; - mblk_t *opt_mp; - struct tcp_options *tcpopt; + mblk_t *discon_mp; listener = lconnp->conn_tcp; ASSERT(listener->tcp_state == TCPS_LISTEN); eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - ASSERT(eager->tcp_rq != NULL); + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); - opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI); - if (opt_mp == NULL) { + if (discon_mp == NULL) { return (-TPROTO); } - bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options)); eager->tcp_issocket = B_TRUE; econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; @@ -17607,24 +14604,6 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) */ ASSERT(econnp->conn_ref >= 3); - opt_mp->b_datap->db_type = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } - mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -17686,7 +14665,7 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) /* Need to get inside the listener perimeter */ CONN_INC_REF(listener->tcp_connp); SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, - tcp_send_pending, listener->tcp_connp, SQ_FILL, + tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); } no_more_eagers: @@ -17700,8 +14679,8 @@ no_more_eagers: * before sending the conn_ind in tcp_send_conn_ind. * The ref will be dropped in tcp_accept_finish(). */ - SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish, - econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); + SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, + econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); return (0); } @@ -17712,7 +14691,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, { conn_t *lconnp, *econnp; tcp_t *listener, *eager; - tcp_stack_t *tcps; lconnp = (conn_t *)lproto_handle; listener = lconnp->conn_tcp; @@ -17720,7 +14698,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp = (conn_t *)eproto_handle; eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - tcps = eager->tcp_tcps; /* * It is OK to manipulate these fields outside the eager's squeue @@ -17732,19 +14709,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp->conn_upper_handle = sock_handle; econnp->conn_upcalls = lconnp->conn_upcalls; ASSERT(IPCL_IS_NONSTR(econnp)); - /* - * Create helper stream if it is a non-TPI TCP connection. 
- */ - if (ip_create_helper_stream(econnp, tcps->tcps_ldi_ident)) { - ip1dbg(("tcp_accept: create of IP helper stream" - " failed\n")); - return (EPROTO); - } - eager->tcp_rq = econnp->conn_rq; - eager->tcp_wq = econnp->conn_wq; - - ASSERT(eager->tcp_rq != NULL); - return (tcp_accept_common(lconnp, econnp, cr)); } @@ -17752,7 +14716,7 @@ tcp_accept(sock_lower_handle_t lproto_handle, /* * This is the STREAMS entry point for T_CONN_RES coming down on * Acceptor STREAM when sockfs listener does accept processing. - * Read the block comment on top of tcp_conn_request(). + * Read the block comment on top of tcp_input_listener(). */ void tcp_tpi_accept(queue_t *q, mblk_t *mp) @@ -17815,8 +14779,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) econnp = eager->tcp_connp; econnp->conn_dev = (dev_t)RD(q)->q_ptr; econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); - eager->tcp_rq = rq; - eager->tcp_wq = q; + econnp->conn_rq = rq; + econnp->conn_wq = q; rq->q_ptr = econnp; rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ q->q_ptr = econnp; @@ -17836,7 +14800,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) * should already be enough space in the mp that came * down from soaccept(). 
*/ - if (eager->tcp_family == AF_INET) { + if (econnp->conn_family == AF_INET) { sin_t *sin; ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= @@ -17844,8 +14808,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin = (sin_t *)mp->b_wptr; mp->b_wptr += sizeof (sin_t); sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; } else { sin6_t *sin6; @@ -17854,20 +14818,23 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin6 = (sin6_t *)mp->b_wptr; mp->b_wptr += sizeof (sin6_t); sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + if (econnp->conn_ipversion == IPV4_VERSION) { sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); } else { ASSERT(eager->tcp_ip6h != NULL); sin6->sin6_flowinfo = eager->tcp_ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = eager->tcp_ip6h->ip6_src; } - sin6->sin6_scope_id = 0; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } sin6->__sin6_src_id = 0; } @@ -17881,97 +14848,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) } } -static int -tcp_do_getsockname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - if (tcp->tcp_state >= TCPS_BOUND) { - sin->sin_port = tcp->tcp_lport; - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - } - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < 
sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (tcp->tcp_state >= TCPS_BOUND) { - sin6->sin6_port = tcp->tcp_lport; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - } - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - -static int -tcp_do_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - if (tcp->tcp_state < TCPS_SYN_RCVD) - return (ENOTCONN); - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = tcp->tcp_fport; - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, - sin->sin_addr.s_addr); - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = tcp->tcp_fport; - sin6->sin6_addr = tcp->tcp_remote_v6; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV6_VERSION) { - sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - /* * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
*/ @@ -17980,7 +14856,8 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) { void *data; mblk_t *datamp = mp->b_cont; - tcp_t *tcp = Q_TO_TCP(q); + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp = connp->conn_tcp; cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { @@ -17993,10 +14870,14 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = tcp_do_getpeername(tcp, data, &cmdp->cb_len); + if (tcp->tcp_state < TCPS_SYN_RCVD) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); break; case TI_GETMYNAME: - cmdp->cb_error = tcp_do_getsockname(tcp, data, &cmdp->cb_len); + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); break; default: cmdp->cb_error = EINVAL; @@ -18029,14 +14910,14 @@ tcp_wput(queue_t *q, mblk_t *mp) mutex_enter(&tcp->tcp_non_sq_lock); tcp->tcp_squeue_bytes += size; - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - tcp_squeue_flag, SQTAG_TCP_OUTPUT); + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); return; case M_CMD: @@ -18053,7 +14934,7 @@ tcp_wput(queue_t *q, mblk_t *mp) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); @@ -18093,7 +14974,7 @@ tcp_wput(queue_t *q, mblk_t *mp) /* * Most ioctls can be processed right away without going via * squeues - process them right here. Those that do require - * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) + * squeue (currently _SIOCSOCKFALLBACK) * are processed by tcp_wput_ioctl(). 
*/ iocp = (struct iocblk *)mp->b_rptr; @@ -18111,26 +14992,13 @@ tcp_wput(queue_t *q, mblk_t *mp) case ND_SET: /* nd_getset does the necessary checks */ case ND_GET: - if (!nd_getset(q, tcps->tcps_g_nd, mp)) { - CALL_IP_WPUT(connp, q, mp); - return; - } - qreply(q, mp); - return; - case TCP_IOC_DEFAULT_Q: - /* - * Wants to be the default wq. Check the credentials - * first, the rest is executed via squeue. - */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; + if (nd_getset(q, tcps->tcps_g_nd, mp)) { qreply(q, mp); return; } - output_proc = tcp_wput_ioctl; - break; + ip_wput_nondata(q, mp); + return; + default: output_proc = tcp_wput_ioctl; break; @@ -18143,7 +15011,7 @@ tcp_wput(queue_t *q, mblk_t *mp) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, - tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); + NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); } /* @@ -18188,52 +15056,32 @@ tcp_wput_fallback(queue_t *wq, mblk_t *mp) freemsg(mp); } +/* + * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. + */ static boolean_t tcp_zcopy_check(tcp_t *tcp) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; boolean_t zc_enabled = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; if (do_tcpzcopy == 2) zc_enabled = B_TRUE; - else if (tcp->tcp_ipversion == IPV4_VERSION && - IPCL_IS_CONNECTED(connp) && - (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && - connp->conn_dontroute == 0 && - !connp->conn_nexthop_set && - connp->conn_outgoing_ill == NULL && - do_tcpzcopy == 1) { - /* - * the checks above closely resemble the fast path checks - * in tcp_send_data(). 
- */ - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - if (ire->ire_stq != NULL) { - ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; - - zc_enabled = ill && (ill->ill_capabilities & - ILL_CAPAB_ZEROCOPY) && - (ill->ill_zerocopy_capab-> - ill_zerocopy_flags != 0); - } - IRE_REFRELE(ire); - } - mutex_exit(&connp->conn_lock); - } + else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) + zc_enabled = B_TRUE; + tcp->tcp_snd_zcopy_on = zc_enabled; if (!TCP_IS_DETACHED(tcp)) { if (zc_enabled) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMSAFE); TCP_STAT(tcps, tcp_zcopy_on); } else { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMUNSAFE); TCP_STAT(tcps, tcp_zcopy_off); } @@ -18241,99 +15089,84 @@ tcp_zcopy_check(tcp_t *tcp) return (zc_enabled); } -static mblk_t * -tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (do_tcpzcopy == 2) - return (bp); - else if (tcp->tcp_snd_zcopy_on) { - tcp->tcp_snd_zcopy_on = B_FALSE; - if (!TCP_IS_DETACHED(tcp)) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp, - ZCVMUNSAFE); - TCP_STAT(tcps, tcp_zcopy_disable); - } - } - return (tcp_zcopy_backoff(tcp, bp, 0)); -} - /* - * Backoff from a zero-copy mblk by copying data to a new mblk and freeing - * the original desballoca'ed segmapped mblk. + * Backoff from a zero-copy message by copying data to a new allocated + * message and freeing the original desballoca'ed segmapped message. + * + * This function is called by following two callers: + * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free + * the origial desballoca'ed message and notify sockfs. 
This is in re- + * transmit state. + * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need + * to be copied to new message. */ static mblk_t * -tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) +tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) { - mblk_t *head, *tail, *nbp; + mblk_t *nbp; + mblk_t *head = NULL; + mblk_t *tail = NULL; tcp_stack_t *tcps = tcp->tcp_tcps; - if (IS_VMLOANED_MBLK(bp)) { - TCP_STAT(tcps, tcp_zcopy_backoff); - if ((head = copyb(bp)) == NULL) { - /* fail to backoff; leave it for the next backoff */ - tcp->tcp_xmit_zc_clean = B_FALSE; - return (bp); - } - if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { - if (fix_xmitlist) - tcp_zcopy_notify(tcp); - else - head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; - } - nbp = bp->b_cont; - if (fix_xmitlist) { - head->b_prev = bp->b_prev; - head->b_next = bp->b_next; - if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = head; - } - bp->b_next = NULL; - bp->b_prev = NULL; - freeb(bp); - } else { - head = bp; - nbp = bp->b_cont; - } - tail = head; - while (nbp) { - if (IS_VMLOANED_MBLK(nbp)) { + ASSERT(bp != NULL); + while (bp != NULL) { + if (IS_VMLOANED_MBLK(bp)) { TCP_STAT(tcps, tcp_zcopy_backoff); - if ((tail->b_cont = copyb(nbp)) == NULL) { + if ((nbp = copyb(bp)) == NULL) { tcp->tcp_xmit_zc_clean = B_FALSE; - tail->b_cont = nbp; - return (head); + if (tail != NULL) + tail->b_cont = bp; + return ((head == NULL) ? bp : head); } - tail = tail->b_cont; - if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { + + if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { if (fix_xmitlist) tcp_zcopy_notify(tcp); else - tail->b_datap->db_struioflag |= + nbp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; } - bp = nbp; - nbp = nbp->b_cont; + nbp->b_cont = bp->b_cont; + + /* + * Copy saved information and adjust tcp_xmit_tail + * if needed. 
+ */ if (fix_xmitlist) { - tail->b_prev = bp->b_prev; - tail->b_next = bp->b_next; + nbp->b_prev = bp->b_prev; + nbp->b_next = bp->b_next; + if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = tail; + tcp->tcp_xmit_tail = nbp; } - bp->b_next = NULL; + + /* Free the original message. */ bp->b_prev = NULL; + bp->b_next = NULL; freeb(bp); + + bp = nbp; + } + + if (head == NULL) { + head = bp; + } + if (tail == NULL) { + tail = bp; } else { - tail->b_cont = nbp; - tail = nbp; - nbp = nbp->b_cont; + tail->b_cont = bp; + tail = bp; } + + /* Move forward. */ + bp = bp->b_cont; } + if (fix_xmitlist) { tcp->tcp_xmit_last = tail; tcp->tcp_xmit_zc_clean = B_TRUE; } + return (head); } @@ -18341,7 +15174,7 @@ static void tcp_zcopy_notify(tcp_t *tcp) { struct stdata *stp; - conn_t *connp; + conn_t *connp; if (tcp->tcp_detached) return; @@ -18351,323 +15184,149 @@ tcp_zcopy_notify(tcp_t *tcp) (connp->conn_upper_handle); return; } - stp = STREAM(tcp->tcp_rq); + stp = STREAM(connp->conn_rq); mutex_enter(&stp->sd_lock); stp->sd_flag |= STZCNOTIFY; cv_broadcast(&stp->sd_zcopy_wait); mutex_exit(&stp->sd_lock); } -static boolean_t -tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) +/* + * Update the TCP connection according to change of LSO capability. 
+ */ +static void +tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - - if ((ire != NULL) && - (((dst != NULL) && (ire->ire_addr == *dst)) || ((dst == NULL) && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &tcp->tcp_ip6h->ip6_dst))) && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - } else { - boolean_t cached = B_FALSE; - ts_label_t *tsl; - - /* force a recheck later on */ - tcp->tcp_ire_ill_check_done = B_FALSE; - - TCP_DBGSTAT(tcps, tcp_ire_null1); - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - - if (ire != NULL) - IRE_REFRELE_NOTR(ire); - - tsl = crgetlabel(CONN_CRED(connp)); - ire = (dst ? - ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) : - ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, - connp->conn_zoneid, tsl, ipst)); + /* + * We check against IPv4 header length to preserve the old behavior + * of only enabling LSO when there are no IP options. + * But this restriction might not be necessary at all. Before removing + * it, need to verify how LSO is handled for source routing case, with + * which IP does software checksum. + * + * For IPv6, whenever any extension header is needed, LSO is supressed. + */ + if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? + IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) + return; - if (ire == NULL) { - TCP_STAT(tcps, tcp_ire_null); - return (B_FALSE); - } + /* + * Either the LSO capability newly became usable, or it has changed. 
+ */ + if (ixa->ixa_flags & IXAF_LSO_CAPAB) { + ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; - IRE_REFHOLD_NOTR(ire); + ASSERT(lsoc->ill_lso_max > 0); + tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - TCP_CHECK_IREINFO(tcp, ire); - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); /* - * We can continue to use the ire but since it was - * not cached, we should drop the extra reference. + * If LSO to be enabled, notify the STREAM header with larger + * data block. */ - if (!cached) - IRE_REFRELE_NOTR(ire); + if (!tcp->tcp_lso) + tcp->tcp_maxpsz_multiplier = 0; + + tcp->tcp_lso = B_TRUE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); + } else { /* LSO capability is not usable any more. */ + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); /* - * Rampart note: no need to select a new label here, since - * labels are not allowed to change during the life of a TCP - * connection. + * If LSO to be disabled, notify the STREAM header with smaller + * data block. And need to restore fragsize to PMTU. */ + if (tcp->tcp_lso) { + tcp->tcp_maxpsz_multiplier = + tcp->tcp_tcps->tcps_maxpsz_multiplier; + ixa->ixa_fragsize = ixa->ixa_pmtu; + tcp->tcp_lso = B_FALSE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); + } } - *irep = ire; - - return (B_TRUE); + (void) tcp_maxpsz_set(tcp, B_TRUE); } /* - * Called from tcp_send() or tcp_send_data() to find workable IRE. - * - * 0 = success; - * 1 = failed to find ire and ill. + * Update the TCP connection according to change of ZEROCOPY capability. 
*/ -static boolean_t -tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) +static void +tcp_update_zcopy(tcp_t *tcp) { - ipha_t *ipha; - ipaddr_t dst; - ire_t *ire; - ill_t *ill; - mblk_t *ire_fp_mp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - if (mp != NULL) - ipha = (ipha_t *)mp->b_rptr; - else - ipha = tcp->tcp_ipha; - dst = ipha->ipha_dst; - - if (!tcp_send_find_ire(tcp, &dst, &ire)) - return (B_FALSE); - - if ((ire->ire_flags & RTF_MULTIRT) || - (ire->ire_stq == NULL) || - (ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || - ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) || - MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) { - TCP_STAT(tcps, tcp_ip_ire_send); - IRE_REFRELE(ire); - return (B_FALSE); + if (tcp->tcp_snd_zcopy_on) { + tcp->tcp_snd_zcopy_on = B_FALSE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMUNSAFE); + TCP_STAT(tcps, tcp_zcopy_off); + } + } else { + tcp->tcp_snd_zcopy_on = B_TRUE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMSAFE); + TCP_STAT(tcps, tcp_zcopy_on); + } } +} - ill = ire_to_ill(ire); - ASSERT(ill != NULL); +/* + * Notify function registered with ip_xmit_attr_t. It's called in the squeue + * so it's safe to update the TCP connection. 
+ */ +/* ARGSUSED1 */ +static void +tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + tcp_t *tcp = (tcp_t *)arg; + conn_t *connp = tcp->tcp_connp; - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; + switch (ntype) { + case IXAN_LSO: + tcp_update_lso(tcp, connp->conn_ixa); + break; + case IXAN_PMTU: + tcp_update_pmtu(tcp, B_FALSE); + break; + case IXAN_ZCOPY: + tcp_update_zcopy(tcp); + break; + default: + break; } - - *irep = ire; - *illp = ill; - - return (B_TRUE); } static void -tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) +tcp_send_data(tcp_t *tcp, mblk_t *mp) { - ipha_t *ipha; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - ire_t *ire; - uint16_t *up; - ill_t *ill; conn_t *connp = tcp->tcp_connp; - uint32_t hcksum_txflags = 0; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - cred_t *cr; - pid_t cpid; - - ASSERT(DB_TYPE(mp) == M_DATA); /* - * Here we need to handle the overloading of the cred_t for - * both getpeerucred and TX. - * If this is a SYN then the caller already set db_credp so - * that getpeerucred will work. But if TX is in use we might have - * a conn_effective_cred which is different, and we need to use that - * cred to make TX use the correct label and label dependent route. + * Check here to avoid sending zero-copy message down to IP when + * ZEROCOPY capability has turned off. We only need to deal with + * the race condition between sockfs and the notification here. + * Since we have tried to backoff the tcp_xmit_head when turning + * zero-copy off and new messages in tcp_output(), we simply drop + * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean + * is not true. 
*/ - if (is_system_labeled()) { - cr = msg_getcred(mp, &cpid); - if (cr == NULL || connp->conn_effective_cred != NULL) - mblk_setcred(mp, CONN_CRED(connp), cpid); - } - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - ASSERT(q != NULL); - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - /* - * Drop off fast path for IPv6 and also if options are present or - * we need to resolve a TS label. - */ - if (tcp->tcp_ipversion != IPV4_VERSION || - !IPCL_IS_CONNECTED(connp) || - !CONN_IS_LSO_MD_FASTPATH(connp) || - (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || - !connp->conn_ulp_labeled || - ipha->ipha_ident == IP_HDR_INCLUDED || - ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_disable(tcp, mp); - TCP_STAT(tcps, tcp_ip_send); - CALL_IP_WPUT(connp, q, mp); - return; - } - - if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_backoff(tcp, mp, 0); - CALL_IP_WPUT(connp, q, mp); + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && + !tcp->tcp_xmit_zc_clean) { + ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); + freemsg(mp); return; } - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - /* - * Check to see if we need to re-enable LSO/MDT for this connection - * because it was previously disabled due to changes in the ill; - * note that by doing it here, this re-enabling only applies when - * the packet is not dispatched through CALL_IP_WPUT(). - * - * That means for IPv4, it is worth re-enabling LSO/MDT for the fastpath - * case, since that's how we ended up here. 
For IPv6, we do the - * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. - */ - if (connp->conn_lso_ok && !tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Restore LSO for this connection, so that next time around - * it is eligible to go through tcp_lsosend() path again. - */ - TCP_STAT(tcps, tcp_lso_enabled); - tcp->tcp_lso = B_TRUE; - ip1dbg(("tcp_send_data: reenabling LSO for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } else if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { - /* - * Restore MDT for this connection, so that next time around - * it is eligible to go through tcp_multisend() path again. - */ - TCP_STAT(tcps, tcp_mdt_conn_resumed1); - tcp->tcp_mdt = B_TRUE; - ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } - - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - /* - * we shouldn't need to reset ipha as the mp containing - * ipha should never be a zero-copy mp. - */ - } - - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* pseudo-header checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* Software checksum? 
*/ - if (DB_CKSUMFLAGS(mp) == 0) { - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); - } - - /* Calculate IP header checksum if hardware isn't capable */ - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. 
- */ - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, connp, mp, 0, NULL); - } - - IRE_REFRELE(ire); + ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); + (void) conn_ip_output(mp, connp->conn_ixa); } /* @@ -18731,15 +15390,13 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) int tcpstate; int usable = 0; mblk_t *xmit_tail; - queue_t *q = tcp->tcp_wq; int32_t mss; int32_t num_sack_blk = 0; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; - int mdt_thres; int rc; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *connp = tcp->tcp_connp; tcpstate = tcp->tcp_state; if (mp == NULL) { @@ -18771,7 +15428,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); #else - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_wput_data: data after ordrel, %s\n", @@ -18781,12 +15438,12 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) #endif /* DEBUG */ } if (tcp->tcp_snd_zcopy_aware && - (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) + (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) tcp_zcopy_notify(tcp); freemsg(mp); mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -18886,12 +15543,12 @@ data_null: opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; mss = tcp->tcp_mss - opt_len; - tcp_hdr_len = tcp->tcp_hdr_len + opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; + total_hdr_len = connp->conn_ht_iphc_len + opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; } else { mss = tcp->tcp_mss; - tcp_hdr_len = 
tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && @@ -18913,7 +15570,7 @@ data_null: * In the special case when cwnd is zero, which can only * happen if the connection is ECN capable, return now. * New segments is sent using tcp_timer(). The timer - * is set in tcp_rput_data(). + * is set in tcp_input_data(). */ if (tcp->tcp_cwnd == 0) { /* @@ -19023,66 +15680,12 @@ data_null: } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - /* - * Determine if it's worthwhile to attempt LSO or MDT, based on: - * - * 1. Simple TCP/IP{v4,v6} (no options). - * 2. IPSEC/IPQoS processing is not needed for the TCP connection. - * 3. If the TCP connection is in ESTABLISHED state. - * 4. The TCP is not detached. - * - * If any of the above conditions have changed during the - * connection, stop using LSO/MDT and restore the stream head - * parameters accordingly. 
- */ - ipst = tcps->tcps_netstack->netstack_ip; - - if ((tcp->tcp_lso || tcp->tcp_mdt) && - ((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || - tcp->tcp_state != TCPS_ESTABLISHED || - TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) || - CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || - IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { - if (tcp->tcp_lso) { - tcp->tcp_connp->conn_lso_ok = B_FALSE; - tcp->tcp_lso = B_FALSE; - } else { - tcp->tcp_connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - } - - /* Anything other than detached is considered pathological */ - if (!TCP_IS_DETACHED(tcp)) { - if (tcp->tcp_lso) - TCP_STAT(tcps, tcp_lso_disabled); - else - TCP_STAT(tcps, tcp_mdt_conn_halted1); - (void) tcp_maxpsz_set(tcp, B_TRUE); - } - } - - /* Use MDT if sendable amount is greater than the threshold */ - if (tcp->tcp_mdt && - (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && - (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && - MBLKL(xmit_tail->b_cont) > mdt_thres)) && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID)) { - ASSERT(tcp->tcp_connp->conn_mdt_ok); - rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, mdt_thres); - } else { - rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, INT_MAX); - } + /* Send the packet. 
*/ + rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, + num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, + local_time); /* Pretend that all we were trying to send really got sent */ if (rc < 0 && tail_unsent < 0) { @@ -19131,39 +15734,41 @@ done:; tcp->tcp_unsent += len; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { - if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } - } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { - tcp_setqfull(tcp); + } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { + if (!(tcp->tcp_detached)) + tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); } /* - * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the - * outgoing TCP header with the template header, as well as other - * options such as time-stamp, ECN and/or SACK. + * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header + * with the template header, as well as other options such as time-stamp, + * ECN and/or SACK. 
*/ static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) { - tcph_t *tcp_tmpl, *tcp_h; + tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; int hdrlen; + conn_t *connp = tcp->tcp_connp; ASSERT(OK_32PTR(rptr)); /* Template header */ - tcp_tmpl = tcp->tcp_tcph; + tcp_tmpl = tcp->tcp_tcpha; /* Header of outgoing packet */ - tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); /* dst and src are opaque 32-bit fields, used for copying */ dst = (uint32_t *)rptr; - src = (uint32_t *)tcp->tcp_iphc; - hdrlen = tcp->tcp_hdr_len; + src = (uint32_t *)connp->conn_ht_iphc; + hdrlen = connp->conn_ht_iphc_len; /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { @@ -19172,7 +15777,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* @@ -19208,16 +15813,16 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) SET_ECT(tcp, rptr); if (tcp->tcp_ecn_echo_on) - tcp_h->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcp_h->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = rptr + tcp->tcp_hdr_len; + uchar_t *wptr = rptr + connp->conn_ht_iphc_len; sack_blk_t *tmp; int32_t i; @@ -19235,1536 +15840,62 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcp_h->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } } /* - * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach - * the destination address and SAP attribute, and if necessary, the 
- * hardware checksum offload attribute to a Multidata message. - */ -static int -tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, - const uint32_t start, const uint32_t stuff, const uint32_t end, - const uint32_t flags, tcp_stack_t *tcps) -{ - /* Add global destination address & SAP attribute */ - if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global physical " - "destination address+SAP\n")); - - if (dlmp != NULL) - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - /* Add global hwcksum attribute */ - if (hwcksum && - !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " - "checksum attribute\n")); - - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - return (0); -} - -/* - * Smaller and private version of pdescinfo_t used specifically for TCP, - * which allows for only two payload spans per packet. - */ -typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; - -/* - * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit - * scheme, and returns one the following: + * tcp_send() is called by tcp_wput_data() and returns one of the following: * * -1 = failed allocation. * 0 = success; burst count reached, or usable send window is too small, * and that we'd rather wait until later before sending again. 
*/ static int -tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; - multidata_t *mmd; - uint_t obsegs, obbytes, hdr_frag_sz; - uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; - int num_burst_seg, max_pld; - pdesc_t *pkt; - tcp_pdescinfo_t tcp_pkt_info; - pdescinfo_t *pkt_info; - int pbuf_idx, pbuf_idx_nxt; - int seg_len, len, spill, af; - boolean_t add_buffer, zcopy, clusterwide; - boolean_t rconfirm = B_FALSE; - boolean_t done = B_FALSE; - uint32_t cksum; - uint32_t hwcksum_flags; - ire_t *ire = NULL; - ill_t *ill; - ipha_t *ipha; - ip6_t *ip6h; - ipaddr_t src, dst; - ill_zerocopy_capab_t *zc_cap = NULL; - uint16_t *up; - int err; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - int usable_mmd, tail_unsent_mmd; - uint_t snxt_mmd, obsegs_mmd, obbytes_mmd; - mblk_t *xmit_tail_mmd; - netstackid_t stack_id; - -#ifdef _BIG_ENDIAN -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) -#else -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) -#endif - -#define PREP_NEW_MULTIDATA() { \ - mmd = NULL; \ - md_mp = md_hbuf = NULL; \ - cur_hdr_off = 0; \ - max_pld = tcp->tcp_mdt_max_pld; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - add_buffer = B_TRUE; \ - zcopy = B_FALSE; \ -} - -#define PREP_NEW_PBUF() { \ - md_pbuf = md_pbuf_nxt = NULL; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - cur_pld_off = 0; \ - first_snxt = *snxt; \ - ASSERT(*tail_unsent > 0); \ - base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ -} - - ASSERT(mdt_thres >= mss); - ASSERT(*usable > 0 && *usable > mdt_thres); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(!TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID); - 
ASSERT((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); - - connp = tcp->tcp_connp; - ASSERT(connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(connp)); - ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); - - stack_id = connp->conn_netstack->netstack_stackid; - - usable_mmd = tail_unsent_mmd = 0; - snxt_mmd = obsegs_mmd = obbytes_mmd = 0; - xmit_tail_mmd = NULL; - /* - * Note that tcp will only declare at most 2 payload spans per - * packet, which is much lower than the maximum allowable number - * of packet spans per Multidata. For this reason, we use the - * privately declared and smaller descriptor info structure, in - * order to save some stack space. - */ - pkt_info = (pdescinfo_t *)&tcp_pkt_info; - - af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; - if (af == AF_INET) { - dst = tcp->tcp_ipha->ipha_dst; - src = tcp->tcp_ipha->ipha_src; - ASSERT(!CLASSD(dst)); - } - ASSERT(af == AF_INET || - !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); - - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - md_mp_head = NULL; - PREP_NEW_MULTIDATA(); - - /* - * Before we go on further, make sure there is an IRE that we can - * use, and that the ILL supports MDT. Otherwise, there's no point - * in proceeding any further, and we should just hand everything - * off to the legacy path. - */ - if (!tcp_send_find_ire(tcp, (af == AF_INET) ? &dst : NULL, &ire)) - goto legacy_send_no_md; - - ASSERT(ire != NULL); - ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); - ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); - ASSERT(af == AF_INET || ire->ire_nce != NULL); - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - /* - * If we do support loopback for MDT (which requires modifications - * to the receiving paths), the following assertions should go away, - * and we would be sending the Multidata to loopback conn later on. 
- */ - ASSERT(!IRE_IS_LOCAL(ire)); - ASSERT(ire->ire_stq != NULL); - - ill = ire_to_ill(ire); - ASSERT(ill != NULL); - ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); - - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; - } - - /* - * If the underlying interface conditions have changed, or if the - * new interface does not support MDT, go back to legacy path. - */ - if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { - /* don't go through this path anymore for this connection */ - TCP_STAT(tcps, tcp_mdt_conn_halted2); - tcp->tcp_mdt = B_FALSE; - ip1dbg(("tcp_multisend: disabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) - zc_cap = ill->ill_zerocopy_capab; - - /* - * Check if we can take tcp fast-path. Note that "incomplete" - * ire's (where the link-layer for next hop is not resolved - * or where the fast-path header in nce_fp_mp is not available - * yet) are sent down the legacy (slow) path. - * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA - */ - if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* go to legacy path if interface doesn't support zerocopy */ - if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && - (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* does the interface support hardware checksum offload? 
*/ - hwcksum_flags = 0; - if (ILL_HCKSUM_CAPABLE(ill) && - (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | - HCKSUM_IPHDRCKSUM)) && dohwcksum) { - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_IPHDRCKSUM) - hwcksum_flags = HCK_IPV4_HDRCKSUM; - - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) - hwcksum_flags |= HCK_FULLCKSUM; - else if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_INET_PARTIAL) - hwcksum_flags |= HCK_PARTIALCKSUM; - } - - /* - * Each header fragment consists of the leading extra space, - * followed by the TCP/IP header, and the trailing extra space. - * We make sure that each header fragment begins on a 32-bit - * aligned memory address (tcp_mdt_hdr_head is already 32-bit - * aligned in tcp_mdt_update). - */ - hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + - tcp->tcp_mdt_hdr_tail), 4); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - - /* - * Here we create one or more Multidata messages, each made up of - * one header buffer and up to N payload buffers. This entire - * operation is done within two loops: - * - * The outer loop mostly deals with creating the Multidata message, - * as well as the header buffer that gets added to it. It also - * links the Multidata messages together such that all of them can - * be sent down to the lower layer in a single putnext call; this - * linking behavior depends on the tcp_mdt_chain tunable. - * - * The inner loop takes an existing Multidata message, and adds - * one or more (up to tcp_mdt_max_pld) payload buffers to it. 
It - * packetizes those buffers by filling up the corresponding header - * buffer fragments with the proper IP and TCP headers, and by - * describing the layout of each packet in the packet descriptors - * that get added to the Multidata. - */ - do { - /* - * If usable send window is too small, or data blocks in - * transmit list are smaller than our threshold (i.e. app - * performs large writes followed by small ones), we hand - * off the control over to the legacy path. Note that we'll - * get back the control once it encounters a large block. - */ - if (*usable < mss || (*tail_unsent <= mdt_thres && - (*xmit_tail)->b_cont != NULL && - MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, md_mp_head, - obsegs, obbytes, &rconfirm); - } - /* - * Pass control over to tcp_send(), but tell it to - * return to us once a large-size transmission is - * possible. - */ - TCP_STAT(tcps, tcp_mdt_legacy_small); - if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, snxt, - tail_unsent, xmit_tail, local_time, - mdt_thres)) <= 0) { - /* burst count reached, or alloc failed */ - IRE_REFRELE(ire); - return (err); - } - - /* tcp_send() may have sent everything, so check */ - if (*usable <= 0) { - IRE_REFRELE(ire); - return (0); - } - - TCP_STAT(tcps, tcp_mdt_legacy_ret); - /* - * We may have delivered the Multidata, so make sure - * to re-initialize before the next round. - */ - md_mp_head = NULL; - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - PREP_NEW_MULTIDATA(); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - } - /* - * Record current values for parameters we may need to pass - * to tcp_send() or tcp_multisend_data(). 
We checkpoint at - * each iteration of the outer loop (each multidata message - * creation). If we have a failure in the inner loop, we send - * any complete multidata messages we have before reverting - * to using the traditional non-md path. - */ - snxt_mmd = *snxt; - usable_mmd = *usable; - xmit_tail_mmd = *xmit_tail; - tail_unsent_mmd = *tail_unsent; - obsegs_mmd = obsegs; - obbytes_mmd = obbytes; - - /* - * max_pld limits the number of mblks in tcp's transmit - * queue that can be added to a Multidata message. Once - * this counter reaches zero, no more additional mblks - * can be added to it. What happens afterwards depends - * on whether or not we are set to chain the Multidata - * messages. If we are to link them together, reset - * max_pld to its original value (tcp_mdt_max_pld) and - * prepare to create a new Multidata message which will - * get linked to md_mp_head. Else, leave it alone and - * let the inner loop break on its own. - */ - if (tcp_mdt_chain && max_pld == 0) - PREP_NEW_MULTIDATA(); - - /* adding a payload buffer; re-initialize values */ - if (add_buffer) - PREP_NEW_PBUF(); - - /* - * If we don't have a Multidata, either because we just - * (re)entered this outer loop, or after we branched off - * to tcp_send above, setup the Multidata and header - * buffer to be used. - */ - if (md_mp == NULL) { - int md_hbuflen; - uint32_t start, stuff; - - /* - * Calculate Multidata header buffer size large enough - * to hold all of the headers that can possibly be - * sent at this moment. We'd rather over-estimate - * the size than running out of space; this is okay - * since this buffer is small anyway. - */ - md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; - - /* - * Start and stuff offset for partial hardware - * checksum offload; these are currently for IPv4. - * For full checksum offload, they are set to zero. 
- */ - if ((hwcksum_flags & HCK_PARTIALCKSUM)) { - if (af == AF_INET) { - start = IP_SIMPLE_HDR_LENGTH; - stuff = IP_SIMPLE_HDR_LENGTH + - TCP_CHECKSUM_OFFSET; - } else { - start = IPV6_HDR_LEN; - stuff = IPV6_HDR_LEN + - TCP_CHECKSUM_OFFSET; - } - } else { - start = stuff = 0; - } - - /* - * Create the header buffer, Multidata, as well as - * any necessary attributes (destination address, - * SAP and hardware checksum offload) that should - * be associated with the Multidata message. - */ - ASSERT(cur_hdr_off == 0); - if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || - ((md_hbuf->b_wptr += md_hbuflen), - (mmd = mmd_alloc(md_hbuf, &md_mp, - KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, - /* fastpath mblk */ - ire->ire_nce->nce_res_mp, - /* hardware checksum enabled */ - (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), - /* hardware checksum offsets */ - start, stuff, 0, - /* hardware checksum flag */ - hwcksum_flags, tcps) != 0)) { -legacy_send: - /* - * We arrive here from a failure within the - * inner (packetizer) loop or we fail one of - * the conditionals above. We restore the - * previously checkpointed values for: - * xmit_tail - * usable - * tail_unsent - * snxt - * obbytes - * obsegs - * We should then be able to dispatch any - * complete multidata before reverting to the - * traditional path with consistent parameters - * (the inner loop updates these as it - * iterates). - */ - *xmit_tail = xmit_tail_mmd; - *usable = usable_mmd; - *tail_unsent = tail_unsent_mmd; - *snxt = snxt_mmd; - obbytes = obbytes_mmd; - obsegs = obsegs_mmd; - if (md_mp != NULL) { - /* Unlink message from the chain */ - if (md_mp_head != NULL) { - err = (intptr_t)rmvb(md_mp_head, - md_mp); - /* - * We can't assert that rmvb - * did not return -1, since we - * may get here before linkb - * happens. We do, however, - * check if we just removed the - * only element in the list. 
- */ - if (err == 0) - md_mp_head = NULL; - } - /* md_hbuf gets freed automatically */ - TCP_STAT(tcps, tcp_mdt_discarded); - freeb(md_mp); - } else { - /* Either allocb or mmd_alloc failed */ - TCP_STAT(tcps, tcp_mdt_allocfail); - if (md_hbuf != NULL) - freeb(md_hbuf); - } - - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, - md_mp_head, obsegs, obbytes, - &rconfirm); - } -legacy_send_no_md: - if (ire != NULL) - IRE_REFRELE(ire); - /* - * Too bad; let the legacy path handle this. - * We specify INT_MAX for the threshold, since - * we gave up with the Multidata processings - * and let the old path have it all. - */ - TCP_STAT(tcps, tcp_mdt_legacy_all); - return (tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, - snxt, tail_unsent, xmit_tail, local_time, - INT_MAX)); - } - - /* link to any existing ones, if applicable */ - TCP_STAT(tcps, tcp_mdt_allocd); - if (md_mp_head == NULL) { - md_mp_head = md_mp; - } else if (tcp_mdt_chain) { - TCP_STAT(tcps, tcp_mdt_linked); - linkb(md_mp_head, md_mp); - } - } - - ASSERT(md_mp_head != NULL); - ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); - ASSERT(md_mp != NULL && mmd != NULL); - ASSERT(md_hbuf != NULL); - - /* - * Packetize the transmittable portion of the data block; - * each data block is essentially added to the Multidata - * as a payload buffer. We also deal with adding more - * than one payload buffers, which happens when the remaining - * packetized portion of the current payload buffer is less - * than MSS, while the next data block in transmit queue - * has enough data to make up for one. This "spillover" - * case essentially creates a split-packet, where portions - * of the packet's payload fragments may span across two - * virtually discontiguous address blocks. 
- */ - seg_len = mss; - do { - len = seg_len; - - /* one must remain NULL for DTRACE_IP_FASTPATH */ - ipha = NULL; - ip6h = NULL; - - ASSERT(len > 0); - ASSERT(max_pld >= 0); - ASSERT(!add_buffer || cur_pld_off == 0); - - /* - * First time around for this payload buffer; note - * in the case of a spillover, the following has - * been done prior to adding the split-packet - * descriptor to Multidata, and we don't want to - * repeat the process. - */ - if (add_buffer) { - ASSERT(mmd != NULL); - ASSERT(md_pbuf == NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); - - /* - * Have we reached the limit? We'd get to - * this case when we're not chaining the - * Multidata messages together, and since - * we're done, terminate this loop. - */ - if (max_pld == 0) - break; /* done */ - - if ((md_pbuf = dupb(*xmit_tail)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - md_pbuf->b_rptr += base_pld_off; - - /* - * Add a payload buffer to the Multidata; this - * operation must not fail, or otherwise our - * logic in this routine is broken. There - * is no memory allocation done by the - * routine, so any returned failure simply - * tells us that we've done something wrong. - * - * A failure tells us that either we're adding - * the same payload buffer more than once, or - * we're trying to add more buffers than - * allowed (max_pld calculation is wrong). - * None of the above cases should happen, and - * we panic because either there's horrible - * heap corruption, and/or programming mistake. 
- */ - pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); - if (pbuf_idx < 0) { - cmn_err(CE_PANIC, "tcp_multisend: " - "payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf, pbuf_idx); - } - - ASSERT(max_pld > 0); - --max_pld; - add_buffer = B_FALSE; - } - - ASSERT(md_mp_head != NULL); - ASSERT(md_pbuf != NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx != -1); - ASSERT(pbuf_idx_nxt == -1); - ASSERT(*usable > 0); - - /* - * We spillover to the next payload buffer only - * if all of the following is true: - * - * 1. There is not enough data on the current - * payload buffer to make up `len', - * 2. We are allowed to send `len', - * 3. The next payload buffer length is large - * enough to accomodate `spill'. - */ - if ((spill = len - *tail_unsent) > 0 && - *usable >= len && - MBLKL((*xmit_tail)->b_cont) >= spill && - max_pld > 0) { - md_pbuf_nxt = dupb((*xmit_tail)->b_cont); - if (md_pbuf_nxt == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf_nxt); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - /* - * See comments above on the first call to - * mmd_addpldbuf for explanation on the panic. - */ - pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); - if (pbuf_idx_nxt < 0) { - panic("tcp_multisend: " - "next payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf_nxt, pbuf_idx_nxt); - } - - ASSERT(max_pld > 0); - --max_pld; - } else if (spill > 0) { - /* - * If there's a spillover, but the following - * xmit_tail couldn't give us enough octets - * to reach "len", then stop the current - * Multidata creation and let the legacy - * tcp_send() path take over. 
We don't want - * to send the tiny segment as part of this - * Multidata for performance reasons; instead, - * we let the legacy path deal with grouping - * it with the subsequent small mblks. - */ - if (*usable >= len && - MBLKL((*xmit_tail)->b_cont) < spill) { - max_pld = 0; - break; /* done */ - } - - /* - * We can't spillover, and we are near - * the end of the current payload buffer, - * so send what's left. - */ - ASSERT(*tail_unsent > 0); - len = *tail_unsent; - } - - /* tail_unsent is negated if there is a spillover */ - *tail_unsent -= len; - *usable -= len; - ASSERT(*usable >= 0); - - if (*usable < mss) - seg_len = *usable; - /* - * Sender SWS avoidance; see comments in tcp_send(); - * everything else is the same, except that we only - * do this here if there is no more data to be sent - * following the current xmit_tail. We don't check - * for 1-byte urgent data because we shouldn't get - * here if TCP_URG_VALID is set. - */ - if (*usable > 0 && *usable < mss && - ((md_pbuf_nxt == NULL && - (*xmit_tail)->b_cont == NULL) || - (md_pbuf_nxt != NULL && - (*xmit_tail)->b_cont->b_cont == NULL)) && - seg_len < (tcp->tcp_max_swnd >> 1) && - (tcp->tcp_unsent - - ((*snxt + len) - tcp->tcp_snxt)) > seg_len && - !tcp->tcp_zero_win_probe) { - if ((*snxt + len) == tcp->tcp_snxt && - (*snxt + len) == tcp->tcp_suna) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - done = B_TRUE; - } - - /* - * Prime pump for IP's checksumming on our behalf; - * include the adjustment for a source route if any. - * Do this only for software/partial hardware checksum - * offload, as this field gets zeroed out later for - * the full hardware checksum offload case. 
- */ - if (!(hwcksum_flags & HCK_FULLCKSUM)) { - cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - cksum = (cksum >> 16) + (cksum & 0xFFFF); - U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); - } - - U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); - *snxt += len; - - tcp->tcp_tcph->th_flags[0] = TH_ACK; - /* - * We set the PUSH bit only if TCP has no more buffered - * data to be transmitted (or if sender SWS avoidance - * takes place), as opposed to setting it for every - * last packet in the burst. - */ - if (done || - (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) - tcp->tcp_tcph->th_flags[0] |= TH_PUSH; - - /* - * Set FIN bit if this is our last segment; snxt - * already includes its length, and it will not - * be adjusted after this point. - */ - if (tcp->tcp_valid_bits == TCP_FSS_VALID && - *snxt == tcp->tcp_fss) { - if (!tcp->tcp_fin_acked) { - tcp->tcp_tcph->th_flags[0] |= TH_FIN; - BUMP_MIB(&tcps->tcps_mib, - tcpOutControl); - } - if (!tcp->tcp_fin_sent) { - tcp->tcp_fin_sent = B_TRUE; - /* - * tcp state must be ESTABLISHED - * in order for us to get here in - * the first place. - */ - tcp->tcp_state = TCPS_FIN_WAIT_1; - - /* - * Upon returning from this routine, - * tcp_wput_data() will set tcp_snxt - * to be equal to snxt + tcp_fin_sent. - * This is essentially the same as - * setting it to tcp_fss + 1. 
- */ - } - } - - tcp->tcp_last_sent_len = (ushort_t)len; - - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_length = htons(len); - else - tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); - - pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); - - /* setup header fragment */ - PDESC_HDR_ADD(pkt_info, - md_hbuf->b_rptr + cur_hdr_off, /* base */ - tcp->tcp_mdt_hdr_head, /* head room */ - tcp_hdr_len, /* len */ - tcp->tcp_mdt_hdr_tail); /* tail room */ - - ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == - hdr_frag_sz); - ASSERT(MBLKIN(md_hbuf, - (pkt_info->hdr_base - md_hbuf->b_rptr), - PDESC_HDRSIZE(pkt_info))); - - /* setup first payload fragment */ - PDESC_PLD_INIT(pkt_info); - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr + cur_pld_off, /* start */ - tcp->tcp_last_sent_len); /* len */ - - /* create a split-packet in case of a spillover */ - if (md_pbuf_nxt != NULL) { - ASSERT(spill > 0); - ASSERT(pbuf_idx_nxt > pbuf_idx); - ASSERT(!add_buffer); - - md_pbuf = md_pbuf_nxt; - md_pbuf_nxt = NULL; - pbuf_idx = pbuf_idx_nxt; - pbuf_idx_nxt = -1; - cur_pld_off = spill; - - /* trim out first payload fragment */ - PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); - - /* setup second payload fragment */ - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr, /* start */ - spill); /* len */ - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT - * estimation. We can only record one - * timestamp per mblk so we do it when - * we reach the end of the payload - * buffer. Also we only take a new - * timestamp sample when the previous - * timed data from the same mblk has - * been ack'ed. 
- */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)first_snxt; - } - - first_snxt = *snxt - spill; - - /* - * Advance xmit_tail; usable could be 0 by - * the time we got here, but we made sure - * above that we would only spillover to - * the next data block if usable includes - * the spilled-over amount prior to the - * subtraction. Therefore, we are sure - * that xmit_tail->b_cont can't be NULL. - */ - ASSERT((*xmit_tail)->b_cont != NULL); - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail) - spill; - } else { - cur_pld_off += tcp->tcp_last_sent_len; - } - - /* - * Fill in the header using the template header, and - * add options such as time-stamp, ECN and/or SACK, - * as needed. - */ - tcp_fill_header(tcp, pkt_info->hdr_rptr, - (clock_t)local_time, num_sack_blk); - - /* take care of some IP header businesses */ - if (af == AF_INET) { - ipha = (ipha_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ipha)); - ASSERT(PDESC_HDRL(pkt_info) >= - IP_SIMPLE_HDR_LENGTH); - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - /* - * Assign ident value for current packet; see - * related comments in ip_wput_ire() about the - * contract private interface with clustering - * group. 
- */ - clusterwide = B_FALSE; - if (cl_inet_ipident != NULL) { - ASSERT(cl_inet_isclusterwide != NULL); - if ((*cl_inet_isclusterwide)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, NULL)) { - ipha->ipha_ident = - (*cl_inet_ipident)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, - (uint8_t *)(uintptr_t)dst, - NULL); - clusterwide = B_TRUE; - } - } - - if (!clusterwide) { - ipha->ipha_ident = (uint16_t) - atomic_add_32_nv( - &ire->ire_ident, 1); - } -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | - (ipha->ipha_ident >> 8); -#endif - } else { - ip6h = (ip6_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ip6h)); - ASSERT(IPVER(ip6h) == IPV6_VERSION); - ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); - ASSERT(PDESC_HDRL(pkt_info) >= - (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + - TCP_CHECKSUM_SIZE)); - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - - if (tcp->tcp_ip_forward_progress) { - rconfirm = B_TRUE; - tcp->tcp_ip_forward_progress = B_FALSE; - } - } - - /* at least one payload span, and at most two */ - ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); - - /* add the packet descriptor to Multidata */ - if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates - * that we have passed in invalid pkt_info - * or parameters to mmd_addpdesc, which must - * not happen. - * - * EINVAL is a result of failure on boundary - * checks against the pkt_info contents. It - * should not happen, and we panic because - * either there's horrible heap corruption, - * and/or programming mistake. 
- */ - if (err != ENOMEM) { - cmn_err(CE_PANIC, "tcp_multisend: " - "pdesc logic error detected for " - "tcp %p mmd %p pinfo %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)pkt_info, err); - } - TCP_STAT(tcps, tcp_mdt_addpdescfail); - goto legacy_send; /* out_of_mem */ - } - ASSERT(pkt != NULL); - - /* calculate IP header and TCP checksums */ - if (af == AF_INET) { - /* calculate pseudo-header checksum */ - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - - /* offset for TCP header checksum */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - } else { - up = (uint16_t *)&ip6h->ip6_src; - - /* calculate pseudo-header checksum */ - cksum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - cksum = (cksum & 0xffff) + (cksum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); - } - - if (hwcksum_flags & HCK_FULLCKSUM) { - /* clear checksum field for hardware */ - *up = 0; - } else if (hwcksum_flags & HCK_PARTIALCKSUM) { - uint32_t sum; - - /* pseudo-header checksumming */ - sum = *up + cksum + IP_TCP_CSUM_COMP; - sum = (sum & 0xFFFF) + (sum >> 16); - *up = (sum & 0xFFFF) + (sum >> 16); - } else { - /* software checksumming */ - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - tcp->tcp_hdr_len + tcp->tcp_last_sent_len); - *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, - cksum + IP_TCP_CSUM_COMP); - if (*up == 0) - *up = 0xFFFF; - } - - /* IPv4 header checksum */ - if (af == AF_INET) { - if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { - ipha->ipha_hdr_checksum = 0; - } else { - IP_HDR_CKSUM(ipha, cksum, - ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - } - - if (af == AF_INET && - HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) || - af == AF_INET6 && - HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) { - mblk_t *mp, *mp1; - uchar_t *hdr_rptr, *hdr_wptr; - uchar_t 
*pld_rptr, *pld_wptr; - - /* - * We reconstruct a pseudo packet for the hooks - * framework using mmd_transform_link(). - * If it is a split packet we pullup the - * payload. FW_HOOKS expects a pkt comprising - * of two mblks: a header and the payload. - */ - if ((mp = mmd_transform_link(pkt)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; - } - - if (pkt_info->pld_cnt > 1) { - /* split payload, more than one pld */ - if ((mp1 = msgpullup(mp->b_cont, -1)) == - NULL) { - freemsg(mp); - TCP_STAT(tcps, - tcp_mdt_allocfail); - goto legacy_send; - } - freemsg(mp->b_cont); - mp->b_cont = mp1; - } else { - mp1 = mp->b_cont; - } - ASSERT(mp1 != NULL && mp1->b_cont == NULL); - - /* - * Remember the message offsets. This is so we - * can detect changes when we return from the - * FW_HOOKS callbacks. - */ - hdr_rptr = mp->b_rptr; - hdr_wptr = mp->b_wptr; - pld_rptr = mp->b_cont->b_rptr; - pld_wptr = mp->b_cont->b_wptr; - - if (af == AF_INET) { - DTRACE_PROBE4( - ip4__physical__out__start, - ill_t *, NULL, - ill_t *, ill, - ipha_t *, ipha, - mblk_t *, mp); - FW_HOOKS( - ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip4__physical__out__end, - mblk_t *, mp); - } else { - DTRACE_PROBE4( - ip6__physical__out_start, - ill_t *, NULL, - ill_t *, ill, - ip6_t *, ip6h, - mblk_t *, mp); - FW_HOOKS6( - ipst->ips_ip6_physical_out_event, - ipst->ips_ipv6firewall_physical_out, - NULL, ill, ip6h, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip6__physical__out__end, - mblk_t *, mp); - } - - if (mp == NULL || - (mp1 = mp->b_cont) == NULL || - mp->b_rptr != hdr_rptr || - mp->b_wptr != hdr_wptr || - mp1->b_rptr != pld_rptr || - mp1->b_wptr != pld_wptr || - mp1->b_cont != NULL) { - /* - * We abandon multidata processing and - * return to the normal path, either - * when a packet is blocked, or when - * the boundaries of header buffer or - * payload buffer have been changed by - * FW_HOOKS[6]. 
- */ - if (mp != NULL) - freemsg(mp); - goto legacy_send; - } - /* Finished with the pseudo packet */ - freemsg(mp); - } - DTRACE_IP_FASTPATH(md_hbuf, pkt_info->hdr_rptr, - ill, ipha, ip6h); - /* advance header offset */ - cur_hdr_off += hdr_frag_sz; - - obbytes += tcp->tcp_last_sent_len; - ++obsegs; - } while (!done && *usable > 0 && --num_burst_seg > 0 && - *tail_unsent > 0); - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT estimation. We can only - * record one timestamp per mblk so we do it when we - * reach the end of the payload buffer. Also we only - * take a new timestamp sample when the previous timed - * data from the same mblk has been ack'ed. - */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; - } - - ASSERT(*tail_unsent >= 0); - if (*tail_unsent > 0) { - /* - * We got here because we broke out of the above - * loop due to of one of the following cases: - * - * 1. len < adjusted MSS (i.e. small), - * 2. Sender SWS avoidance, - * 3. max_pld is zero. - * - * We are done for this Multidata, so trim our - * last payload buffer (if any) accordingly. - */ - if (md_pbuf != NULL) - md_pbuf->b_wptr -= *tail_unsent; - } else if (*usable > 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - add_buffer = B_TRUE; - } - } while (!done && *usable > 0 && num_burst_seg > 0 && - (tcp_mdt_chain || max_pld > 0)); - - if (md_mp_head != NULL) { - /* send everything down */ - tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, - &rconfirm); - } - -#undef PREP_NEW_MULTIDATA -#undef PREP_NEW_PBUF -#undef IPVER - - IRE_REFRELE(ire); - return (0); -} - -/* - * A wrapper function for sending one or more Multidata messages down to - * the module below ip; this routine does not release the reference of the - * IRE (caller does that). This routine is analogous to tcp_send_data(). 
- */ -static void -tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, - const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) +tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, + const int tcp_hdr_len, const int num_sack_blk, int *usable, + uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) { - uint64_t delta; - nce_t *nce; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL && ill != NULL); - ASSERT(ire->ire_stq != NULL); - ASSERT(md_mp_head != NULL); - ASSERT(rconfirm != NULL); - - /* adjust MIBs and IRE timestamp */ - DTRACE_PROBE2(tcp__trace__send, mblk_t *, md_mp_head, tcp_t *, tcp); - tcp->tcp_obsegs += obsegs; - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes); - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs); - - if (tcp->tcp_ipversion == IPV4_VERSION) { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs); - } else { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs); - } - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, obbytes); - - ire->ire_ob_pkt_count += obsegs; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); - ire->ire_last_used_time = lbolt; - - if ((tcp->tcp_ipversion == IPV4_VERSION && - ipst->ips_ip4_observe.he_interested) || - (tcp->tcp_ipversion == IPV6_VERSION && - ipst->ips_ip6_observe.he_interested)) { - multidata_t *dlmdp = mmd_getmultidata(md_mp_head); - pdesc_t *dl_pkt; - pdescinfo_t pinfo; - mblk_t *nmp; - zoneid_t szone = tcp->tcp_connp->conn_zoneid; - - for (dl_pkt = mmd_getfirstpdesc(dlmdp, &pinfo); - (dl_pkt != NULL); - dl_pkt = mmd_getnextpdesc(dl_pkt, &pinfo)) { - if ((nmp = mmd_transform_link(dl_pkt)) == NULL) - continue; - ipobs_hook(nmp, IPOBS_HOOK_OUTBOUND, szone, - 
ALL_ZONES, ill, ipst); - freemsg(nmp); - } - } - - /* send it down */ - putnext(ire->ire_stq, md_mp_head); - - /* we're done for TCP/IPv4 */ - if (tcp->tcp_ipversion == IPV4_VERSION) - return; - - nce = ire->ire_nce; - - ASSERT(nce != NULL); - ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); - ASSERT(nce->nce_state != ND_INCOMPLETE); - - /* reachability confirmation? */ - if (*rconfirm) { - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state != ND_REACHABLE) { - mutex_enter(&nce->nce_lock); - nce->nce_state = ND_REACHABLE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - (void) untimeout(nce->nce_timeout_id); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to REACHABLE\n", - AF_INET6, &ire->ire_addr_v6); - } - } - /* reset transport reachability confirmation */ - *rconfirm = B_FALSE; - } - - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - ip1dbg(("tcp_multisend_data: delta = %" PRId64 - " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); - - if (delta > (uint64_t)ill->ill_reachable_time) { - mutex_enter(&nce->nce_lock); - switch (nce->nce_state) { - case ND_REACHABLE: - case ND_STALE: - /* - * ND_REACHABLE is identical to ND_STALE in this - * specific case. If reachable time has expired for - * this neighbor (delta is greater than reachable - * time), conceptually, the neighbor cache is no - * longer in REACHABLE state, but already in STALE - * state. So the correct transition here is to - * ND_DELAY. 
- */ - nce->nce_state = ND_DELAY; - mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, - ipst->ips_delay_first_probe_time); - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to DELAY\n", - AF_INET6, &ire->ire_addr_v6); - } - break; - case ND_DELAY: - case ND_PROBE: - mutex_exit(&nce->nce_lock); - /* Timers have already started */ - break; - case ND_UNREACHABLE: - /* - * ndp timer has detected that this nce is - * unreachable and initiated deleting this nce - * and all its associated IREs. This is a race - * where we found the ire before it was deleted - * and have just sent out a packet using this - * unreachable nce. - */ - mutex_exit(&nce->nce_lock); - break; - default: - ASSERT(0); - } - } -} - -/* - * Derived from tcp_send_data(). - */ -static void -tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, - int num_lso_seg) -{ - ipha_t *ipha; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint32_t hcksum_txflags = 0; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - uint16_t *up; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - ASSERT(tcp->tcp_connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)); - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, - num_lso_seg); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - } - - if (ILL_HCKSUM_CAPABLE(ill) 
&& dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* - * Since the TCP checksum should be recalculated by h/w, we can just - * zero the checksum field for HCK_FULLCKSUM, or calculate partial - * pseudo-header checksum for HCK_PARTIALCKSUM. - * The partial pseudo-header excludes TCP length, that was calculated - * in tcp_send(), so to zero *up before further processing. - */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - *up = 0; - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* - * Append LSO flags and mss to the mp. - */ - lso_info_set(mp, mss, HW_LSO); - - ipha->ipha_fragment_offset_and_flags |= - (uint32_t)htons(ire->ire_frag_flag); - - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, - ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - if (ire_fp_mp_len != 0) - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - if 
(ire_fp_mp_len != 0) - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL); - } -} - -/* - * tcp_send() is called by tcp_wput_data() for non-Multidata transmission - * scheme, and returns one of the following: - * - * -1 = failed allocation. - * 0 = success; burst count reached, or usable send window is too small, - * and that we'd rather wait until later before sending again. - * 1 = success; we are called from tcp_multisend(), and both usable send - * window and tail_unsent are greater than the MDT threshold, and thus - * Multidata Transmit should be used instead. - */ -static int -tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - int num_burst_seg = tcp->tcp_snd_burst; - ire_t *ire = NULL; - ill_t *ill = NULL; - mblk_t *ire_fp_mp = NULL; - uint_t ire_fp_mp_len = 0; + int num_burst_seg = tcp->tcp_snd_burst; int num_lso_seg = 1; uint_t lso_usable; boolean_t do_lso_send = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* - * Check LSO capability before any further work. And the similar check - * need to be done in for(;;) loop. - * LSO will be deployed when therer is more than one mss of available - * data and a burst transmission is allowed. + * Check LSO possibility. The value of tcp->tcp_lso indicates whether + * the underlying connection is LSO capable. Will check whether having + * enough available data to initiate LSO transmission in the for(){} + * loops. */ - if (tcp->tcp_lso && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID) && - num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { - /* - * Try to find usable IRE/ILL and do basic check to the ILL. 
- * Double check LSO usability before going further, since the - * underlying interface could have been changed. In case of any - * change of LSO capability, set tcp_ire_ill_check_done to - * B_FALSE to force to check the ILL with the next send. - */ - if (tcp_send_find_ire_ill(tcp, NULL, &ire, &ill) && - tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Enable LSO with this transmission. - * Since IRE has been hold in tcp_send_find_ire_ill(), - * IRE_REFRELE(ire) should be called before return. - */ + if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) do_lso_send = B_TRUE; - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - /* Round up to multiple of 4 */ - ire_fp_mp_len = ((ire_fp_mp_len + 3) / 4) * 4; - } else { - tcp->tcp_lso = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - do_lso_send = B_FALSE; - ill = NULL; - } - } for (;;) { struct datab *db; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t sum; mblk_t *mp, *mp1; uchar_t *rptr; int len; /* - * If we're called by tcp_multisend(), and the amount of - * sendable data as well as the size of current xmit_tail - * is beyond the MDT threshold, return to the caller and - * let the large data transmit be done using MDT. + * Burst count reached, return successfully. */ - if (*usable > 0 && *usable > mdt_thres && - (*tail_unsent > mdt_thres || (*tail_unsent == 0 && - MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { - ASSERT(tcp->tcp_mdt); - return (1); /* success; do large send */ - } - if (num_burst_seg == 0) - break; /* success; burst count reached */ + break; /* - * Calculate the maximum payload length we can send in *one* + * Calculate the maximum payload length we can send at one * time. */ if (do_lso_send) { /* - * Check whether need to do LSO any more. + * Check whether be able to to do LSO for the current + * available data. 
*/ if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { lso_usable = MIN(tcp->tcp_lso_max, *usable); @@ -20787,7 +15918,10 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); - +#ifdef DEBUG + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, + do_lso_send); +#endif /* * Adjust num_burst_seg here. */ @@ -20817,7 +15951,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * If the retransmit timer is not running * we start it so that we will retransmit - * in the case when the the receiver has + * in the case when the receiver has * decremented the window. */ if (*snxt == tcp->tcp_snxt && @@ -20838,7 +15972,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } } - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; /* * The reason to adjust len here is that we need to set flags @@ -20849,19 +15983,25 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *usable -= len; /* Approximate - can be adjusted later */ if (*usable > 0) - tcph->th_flags[0] = TH_ACK; + tcpha->tha_flags = TH_ACK; else - tcph->th_flags[0] = (TH_ACK | TH_PUSH); + tcpha->tha_flags = (TH_ACK | TH_PUSH); /* - * Prime pump for IP's checksumming on our behalf + * Prime pump for IP's checksumming on our behalf. * Include the adjustment for a source route if any. + * In case of LSO, the partial pseudo-header checksum should + * exclusive TCP length, so zero tha_sum before IP calculate + * pseudo-header checksum for partial checksum offload. 
*/ - sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - - U32_TO_ABE32(*snxt, tcph->th_seq); + if (do_lso_send) { + sum = 0; + } else { + sum = len + tcp_hdr_len + connp->conn_sum; + sum = (sum >> 16) + (sum & 0xFFFF); + } + tcpha->tha_sum = htons(sum); + tcpha->tha_seq = htonl(*snxt); /* * Branch off to tcp_xmit_mp() if any of the VALID bits is @@ -20907,8 +16047,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, (*xmit_tail)->b_rptr = prev_rptr; if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); } mp1 = mp->b_cont; @@ -20927,7 +16065,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); continue; } @@ -20942,18 +16080,18 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *tail_unsent -= len; if (len <= mss) /* LSO is unusable */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else + } else { tcp->tcp_ip6h->ip6_plen = - htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); + htons(len - IPV6_HDR_LEN); + } + mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp->b_rptr = rptr; @@ -20983,21 +16121,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (len <= mss) /* LSO is unusable (!do_lso_send) */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else - 
tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + } else { + tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); + } mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } - len = tcp_hdr_len; + len = total_hdr_len; /* * There are four reasons to allocate a new hdr mblk: * 1) The bytes above us are in use by another packet @@ -21008,24 +16146,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, rptr = mp->b_rptr - len; if (!OK_32PTR(rptr) || ((db = mp->b_datap), db->db_ref != 2) || - rptr < db->db_base + ire_fp_mp_len) { + rptr < db->db_base) { /* NOTE: we assume allocb returns an OK_32PTR */ must_alloc:; - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp1->b_cont = mp; mp = mp1; /* Leave room for Link Level header */ - len = tcp_hdr_len; - rptr = - &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len]; + len = total_hdr_len; + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_wptr = &rptr[len]; } @@ -21057,18 +16192,17 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Excess data in mblk; can we split it? - * If MDT is enabled for the connection, + * If LSO is enabled for the connection, * keep on splitting as this is a transient * send path. */ - if (!do_lso_send && !tcp->tcp_mdt && - (spill + nmpsz > 0)) { + if (!do_lso_send && (spill + nmpsz > 0)) { /* * Don't split if stream head was * told to break up larger writes * into smaller ones. 
*/ - if (tcp->tcp_maxpsz > 0) + if (tcp->tcp_maxpsz_multiplier > 0) break; /* @@ -21096,8 +16230,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (mp1 == NULL) { *tail_unsent = spill; freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } } @@ -21119,11 +16251,12 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Adjust the checksum */ - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + + ixa->ixa_ip_hdr_length); sum += spill; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - if (tcp->tcp_ipversion == IPV4_VERSION) { + tcpha->tha_sum = htons(sum); + if (connp->conn_ipversion == IPV4_VERSION) { sum = ntohs( ((ipha_t *)rptr)->ipha_length) + spill; @@ -21136,311 +16269,55 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, ((ip6_t *)rptr)->ip6_plen = htons(sum); } + ixa->ixa_pktlen += spill; *tail_unsent = 0; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + ixa->ixa_flags &= ~IXAF_REACH_CONF; } + /* + * Append LSO information, both flags and mss, to the mp. + */ if (do_lso_send) { - tcp_lsosend_data(tcp, mp, ire, ill, mss, - num_lso_seg); - tcp->tcp_obsegs += num_lso_seg; + lso_info_set(mp, mss, HW_LSO); + ixa->ixa_fragsize = IP_MAXPACKET; + ixa->ixa_extra_ident = num_lso_seg - 1; + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, + boolean_t, B_TRUE); + + tcp_send_data(tcp, mp); + + /* + * Restore values of ixa_fragsize and ixa_extra_ident. 
+ */ + ixa->ixa_fragsize = ixa->ixa_pmtu; + ixa->ixa_extra_ident = 0; + tcp->tcp_obsegs += num_lso_seg; TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); BUMP_LOCAL(tcp->tcp_obsegs); } } - if (ire != NULL) - IRE_REFRELE(ire); return (0); } -/* Unlink and return any mblk that looks like it contains a MDT info */ -static mblk_t * -tcp_mdt_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? */ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - return (mp); -} - -/* MDT info update routine, called when IP notifies us about MDT */ -static void -tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) -{ - boolean_t prev_state; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort MDT on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new driver doesn't support MDT, or in situation - * where MDT usage on the link-layer has been switched off. - * IP would not have sent us the initial MDT_IOC_INFO_UPDATE - * if the link-layer doesn't support MDT, and if it does, it - * will indicate that the feature is to be turned on. - */ - prev_state = tcp->tcp_mdt; - tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); - if (!tcp->tcp_mdt && !first) { - TCP_STAT(tcps, tcp_mdt_conn_halted3); - ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - - /* - * We currently only support MDT on simple TCP/{IPv4,IPv6}, - * so disable MDT otherwise. The checks are done here - * and in tcp_wput_data(). 
- */ - if (tcp->tcp_mdt && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) - tcp->tcp_mdt = B_FALSE; - - if (tcp->tcp_mdt) { - if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { - cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " - "version (%d), expected version is %d", - mdt_capab->ill_mdt_version, MDT_VERSION_2); - tcp->tcp_mdt = B_FALSE; - return; - } - - /* - * We need the driver to be able to handle at least three - * spans per packet in order for tcp MDT to be utilized. - * The first is for the header portion, while the rest are - * needed to handle a packet that straddles across two - * virtually non-contiguous buffers; a typical tcp packet - * therefore consists of only two spans. Note that we take - * a zero as "don't care". - */ - if (mdt_capab->ill_mdt_span_limit > 0 && - mdt_capab->ill_mdt_span_limit < 3) { - tcp->tcp_mdt = B_FALSE; - return; - } - - /* a zero means driver wants default value */ - tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, - tcps->tcps_mdt_max_pbufs); - if (tcp->tcp_mdt_max_pld == 0) - tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs; - - /* ensure 32-bit alignment */ - tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min, - mdt_capab->ill_mdt_hdr_head), 4); - tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min, - mdt_capab->ill_mdt_hdr_tail), 4); - - if (!first && !prev_state) { - TCP_STAT(tcps, tcp_mdt_conn_resumed2); - ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - } -} - -/* Unlink and return any mblk that looks like it contains a LSO info */ -static mblk_t * -tcp_lso_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? 
*/ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != LSO_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - - return (mp); -} - -/* LSO info update routine, called when IP notifies us about LSO */ -static void -tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort LSO on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new NIC/driver doesn't support LSO, or in situation - * where LSO usage on the link-layer has been switched off. - * IP would not have sent us the initial LSO_IOC_INFO_UPDATE - * if the link-layer doesn't support LSO, and if it does, it - * will indicate that the feature is to be turned on. - */ - tcp->tcp_lso = (lso_capab->ill_lso_on != 0); - TCP_STAT(tcps, tcp_lso_enabled); - - /* - * We currently only support LSO on simple TCP/IPv4, - * so disable LSO otherwise. The checks are done here - * and in tcp_wput_data(). - */ - if (tcp->tcp_lso && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION)) { - tcp->tcp_lso = B_FALSE; - TCP_STAT(tcps, tcp_lso_disabled); - } else { - tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, - lso_capab->ill_lso_max); - } -} - -static void -tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) -{ - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL); - - /* - * We may be in the fastpath here, and although we essentially do - * similar checks as in ip_bind_connected{_v6}/ip_xxinfo_return, - * we try to keep things as brief as possible. 
After all, these - * are only best-effort checks, and we do more thorough ones prior - * to calling tcp_send()/tcp_multisend(). - */ - if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) && - check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && - ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && - !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && - CONN_IS_LSO_MD_FASTPATH(connp)) { - if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_lso_ok = B_TRUE; - - ASSERT(ill->ill_lso_capab != NULL); - if (!ill->ill_lso_capab->ill_lso_on) { - ill->ill_lso_capab->ill_lso_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "LSO for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_lso_update(tcp, ill->ill_lso_capab); - } else if (ipst->ips_ip_multidata_outbound && - ILL_MDT_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_mdt_ok = B_TRUE; - - ASSERT(ill->ill_mdt_capab != NULL); - if (!ill->ill_mdt_capab->ill_mdt_on) { - ill->ill_mdt_capab->ill_mdt_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "MDT for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); - } - } - - /* - * The goal is to reduce the number of generated tcp segments by - * setting the maxpsz multiplier to 0; this will have an affect on - * tcp_maxpsz_set(). With this behavior, tcp will pack more data - * into each packet, up to SMSS bytes. Doing this reduces the number - * of outbound segments and incoming ACKs, thus allowing for better - * network and system performance. In contrast the legacy behavior - * may result in sending less than SMSS size, because the last mblk - * for some packets may have more data than needed to make up SMSS, - * and the legacy code refused to "split" it. 
- * - * We apply the new behavior on following situations: - * - * 1) Loopback connections, - * 2) Connections in which the remote peer is not on local subnet, - * 3) Local subnet connections over the bge interface (see below). - * - * Ideally, we would like this behavior to apply for interfaces other - * than bge. However, doing so would negatively impact drivers which - * perform dynamic mapping and unmapping of DMA resources, which are - * increased by setting the maxpsz multiplier to 0 (more mblks per - * packet will be generated by tcp). The bge driver does not suffer - * from this, as it copies the mblks into pre-mapped buffers, and - * therefore does not require more I/O resources than before. - * - * Otherwise, this behavior is present on all network interfaces when - * the destination endpoint is non-local, since reducing the number - * of packets in general is good for the network. - * - * TODO We need to remove this hard-coded conditional for bge once - * a better "self-tuning" mechanism, or a way to comprehend - * the driver transmit strategy is devised. Until the solution - * is found and well understood, we live with this hack. - */ - if (!tcp_static_maxpsz && - (tcp->tcp_loopback || !tcp->tcp_localnet || - (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { - /* override the default value */ - tcp->tcp_maxpsz = 0; - - ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " - "interface %s\n", (void *)connp, tcp->tcp_maxpsz, - ill != NULL ? ill->ill_name : ipif_loopback_name)); - } - - /* set the stream head parameters accordingly */ - (void) tcp_maxpsz_set(tcp, B_TRUE); -} - /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp) { uchar_t fval = *mp->b_rptr; mblk_t *tail; - queue_t *q = tcp->tcp_wq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* TODO: How should flush interact with urgent data? 
*/ if ((fval & FLUSHW) && tcp->tcp_xmit_head && @@ -21473,7 +16350,7 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) } /* * We have no unsent data, so unsent must be less than - * tcp_xmit_lowater, so re-enable flow. + * conn_sndlowat, so re-enable flow. */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { @@ -21501,12 +16378,12 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) { - mblk_t *mp1; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + mblk_t *mp1; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - queue_t *q = tcp->tcp_wq; - int error; - uint_t addrlen; + uint_t addrlen; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* Make sure it is one of ours. */ switch (iocp->ioc_cmd) { @@ -21514,7 +16391,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) case TI_GETPEERNAME: break; default: - CALL_IP_WPUT(tcp->tcp_connp, q, mp); + ip_wput_nondata(q, mp); return; } switch (mi_copy_state(q, mp, &mp1)) { @@ -21541,43 +16418,56 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) } STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + if (STRUCT_FGET(sb, maxlen) < addrlen) { mi_copy_done(q, mp, EINVAL); return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - if (mp1 == NULL) - return; - switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = tcp_do_getsockname(tcp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = tcp_do_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); + if (tcp->tcp_state < TCPS_SYN_RCVD) { + mi_copy_done(q, mp, ENOTCONN); + return; + } break; } + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; } + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } static void tcp_use_pure_tpi(tcp_t *tcp) { + conn_t *connp = tcp->tcp_connp; + #ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq; + tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; #else - tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; + tcp->tcp_acceptor_id = connp->conn_dev; #endif /* * Insert this socket into the acceptor hash. 
@@ -21595,11 +16485,11 @@ tcp_use_pure_tpi(tcp_t *tcp) */ /* ARGSUSED */ static void -tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) +tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_wq; struct iocblk *iocp; ASSERT(DB_TYPE(mp) == M_IOCTL); @@ -21617,17 +16507,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { - case TCP_IOC_DEFAULT_Q: - /* Wants to be the default wq. */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); - return; - } - tcp_def_q_set(tcp, mp); - return; case _SIOCSOCKFALLBACK: /* * Either sockmod is about to be popped and the socket @@ -21650,7 +16529,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) qreply(q, mp); return; } - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(q, mp); } /* @@ -21658,14 +16537,14 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ static void -tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) +tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; union T_primitives *tprim = (union T_primitives *)mp->b_rptr; - uchar_t *rptr; - t_scalar_t type; - cred_t *cr; + uchar_t *rptr; + t_scalar_t type; + cred_t *cr; /* * Try and ASSERT the minimum possible references on the @@ -21684,7 +16563,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; if (type == T_EXDATA_REQ) { - tcp_output_urgent(connp, mp, arg2); + tcp_output_urgent(connp, mp, arg2, NULL); } else if (type != T_DATA_REQ) { 
goto non_urgent_data; } else { @@ -21695,7 +16574,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) } return; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); } @@ -21776,17 +16655,10 @@ non_urgent_data: * for subsequent processing by ip_restart_optmgmt(), which * will do the CONN_DEC_REF(). */ - CONN_INC_REF(connp); if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { - if (svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } else { - if (tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } break; @@ -21804,7 +16676,7 @@ non_urgent_data: * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, T_ORDREL_REQ out of " @@ -21818,7 +16690,7 @@ non_urgent_data: tcp_addr_req(tcp, mp); break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, bogus TPI msg, type %d", tprim->type); @@ -21844,19 +16716,6 @@ tcp_wsrv(queue_t *q) TCP_STAT(tcps, tcp_wsrv_called); } -/* Non overlapping byte exchanger */ -static void -tcp_xchg(uchar_t *a, uchar_t *b, int len) -{ - uchar_t uch; - - while (len-- > 0) { - uch = a[len]; - a[len] = b[len]; - b[len] = uch; - } -} - /* * Send out a control packet on the tcp connection specified. This routine * is typically called where we need a simple ACK or RST generated. 
@@ -21865,50 +16724,51 @@ static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) { uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; ipha_t *ipha = NULL; ip6_t *ip6h = NULL; uint32_t sum; - int tcp_hdr_len; - int tcp_ip_hdr_len; + int total_hdr_len; + int ip_hdr_len; mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Save sum for use in source route later. */ - ASSERT(tcp != NULL); - sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; + sum = connp->conn_ht_ulp_len + connp->conn_sum; + total_hdr_len = connp->conn_ht_iphc_len; + ip_hdr_len = ixa->ixa_ip_hdr_length; /* If a text string is passed in with the request, pass it to strlog. */ - if (str != NULL && tcp->tcp_debug) { + if (str != NULL && connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", str, seq, ack, ctl); } - mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra, + mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, BPRI_MED); if (mp == NULL) { return; } rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_rptr = rptr; - mp->b_wptr = &rptr[tcp_hdr_len]; - bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); + mp->b_wptr = &rptr[total_hdr_len]; + bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); + + ixa->ixa_pktlen = total_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha = (ipha_t *)rptr; - ipha->ipha_length = htons(tcp_hdr_len); + ipha->ipha_length = htons(total_hdr_len); } else { ip6h = (ip6_t *)rptr; - ASSERT(tcp != NULL); - ip6h->ip6_plen = htons(tcp->tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } - tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; - tcph->th_flags[0] = (uint8_t)ctl; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + tcpha->tha_flags = 
(uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); @@ -21917,43 +16777,45 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) */ if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { - mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; + mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; *(mp->b_wptr) = TCPOPT_EOL; - if (tcp->tcp_ipversion == IPV4_VERSION) { - ipha->ipha_length = htons(tcp_hdr_len - + + ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; + + if (connp->conn_ipversion == IPV4_VERSION) { + ipha->ipha_length = htons(total_hdr_len - TCPOPT_REAL_TS_LEN); } else { - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - - TCPOPT_REAL_TS_LEN); + ip6h->ip6_plen = htons(total_hdr_len - + IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); } - tcph->th_offset_and_rsrvd[0] -= (3 << 4); + tcpha->tha_offset_and_reserved -= (3 << 4); sum -= TCPOPT_REAL_TS_LEN; } } if (ctl & TH_ACK) { if (tcp->tcp_snd_ts_ok) { U32_TO_BE32(lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } BUMP_LOCAL(tcp->tcp_obsegs); - U32_TO_BE32(seq, tcph->th_seq); - U32_TO_BE32(ack, tcph->th_ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_ack = htonl(ack); /* * Include the adjustment for a source route if any. 
*/ sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_BE16(sum, tcph->th_sum); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcpha->tha_sum = htons(sum); + tcp_send_data(tcp, mp); } /* @@ -21991,115 +16853,32 @@ tcp_send_rst_chk(tcp_stack_t *tcps) } /* - * Send down the advice IP ioctl to tell IP to mark an IRE temporary. - */ -static void -tcp_ip_ire_mark_advice(tcp_t *tcp) -{ - mblk_t *mp; - ipic_t *ipic; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); - } else { - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - if (mp == NULL) - return; - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* - * Return an IP advice ioctl mblk and set ipic to be the pointer - * to the advice structure. - */ -static mblk_t * -tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) -{ - struct iocblk *ioc; - mblk_t *mp, *mp1; - - mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); - if (mp == NULL) - return (NULL); - bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); - *ipic = (ipic_t *)mp->b_rptr; - (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; - (*ipic)->ipic_addr_offset = sizeof (ipic_t); - - bcopy(addr, *ipic + 1, addr_len); - - (*ipic)->ipic_addr_length = addr_len; - mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; - - mp1 = mkiocb(IP_IOCTL); - if (mp1 == NULL) { - freemsg(mp); - return (NULL); - } - mp1->b_cont = mp; - ioc = (struct iocblk *)mp1->b_rptr; - ioc->ioc_count = sizeof (ipic_t) + addr_len; - - return (mp1); -} - -/* * Generate a reset based on an inbound packet, connp is set by caller * when RST is in response to an unexpected inbound packet for which * there is active tcp state in the system. * * IPSEC NOTE : Try to send the reply with the same protection as it came - * in. We still have the ipsec_mp that the packet was attached to. 
Thus - * the packet will go out at the same level of protection as it came in by - * converting the IPSEC_IN to IPSEC_OUT. + * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. + * That way the packet will go out at the same level of protection as it + * came in with. */ static void -tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, - uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, + ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) { ipha_t *ipha = NULL; ip6_t *ip6h = NULL; ushort_t len; - tcph_t *tcph; + tcpha_t *tcpha; int i; - mblk_t *ipsec_mp; - boolean_t mctl_present; - ipic_t *ipic; ipaddr_t v4addr; in6_addr_t v6addr; - int addr_len; - void *addr; - queue_t *q = tcps->tcps_g_q; - tcp_t *tcp; - cred_t *cr; - pid_t pid; - mblk_t *nmp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - if (tcps->tcps_g_q == NULL) { - /* - * For non-zero stackids the default queue isn't created - * until the first open, thus there can be a need to send - * a reset before then. But we can't do that, hence we just - * drop the packet. Later during boot, when the default queue - * has been setup, a retransmitted packet from the peer - * will result in a reset. 
- */ - ASSERT(tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID); - freemsg(mp); - return; - } - - if (connp != NULL) - tcp = connp->conn_tcp; - else - tcp = Q_TO_TCP(q); + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; + ip_xmit_attr_t ixas, *ixa; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; + boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ + ushort_t port; if (!tcp_send_rst_chk(tcps)) { tcps->tcps_rst_unsent++; @@ -22107,16 +16886,41 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, return; } - if (mp->b_datap->db_type == M_CTL) { - ipsec_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; + /* + * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other + * options from the listener. In that case the caller must ensure that + * we are running on the listener = connp squeue. + * + * We get a safe copy of conn_ixa so we don't need to restore anything + * we or ip_output_simple might change in the ixa. + */ + if (connp != NULL) { + ASSERT(connp->conn_on_sqp); + + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + tcps->tcps_rst_unsent++; + freemsg(mp); + return; + } + need_refrele = B_TRUE; } else { - ipsec_mp = mp; - mctl_present = B_FALSE; + bzero(&ixas, sizeof (ixas)); + ixa = &ixas; + /* + * IXAF_VERIFY_SOURCE is overkill since we know the + * packet was for us. 
+ */ + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; + ixa->ixa_protocol = IPPROTO_TCP; + ixa->ixa_zoneid = ira->ira_zoneid; + ixa->ixa_ifindex = 0; + ixa->ixa_ipst = ipst; + ixa->ixa_cred = kcred; + ixa->ixa_cpid = NOPID; } - if (str && q && tcps->tcps_dbg) { + if (str && tcps->tcps_dbg) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " "flags 0x%x", @@ -22126,20 +16930,12 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, mblk_t *mp1 = copyb(mp); freemsg(mp); mp = mp1; - if (!mp) { - if (mctl_present) - freeb(ipsec_mp); - return; - } else { - if (mctl_present) { - ipsec_mp->b_cont = mp; - } else { - ipsec_mp = mp; - } - } + if (mp == NULL) + goto done; } else if (mp->b_cont) { freemsg(mp->b_cont); mp->b_cont = NULL; + DB_CKSUMFLAGS(mp) = 0; } /* * We skip reversing source route here. @@ -22159,18 +16955,20 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, */ if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || CLASSD(ipha->ipha_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } } else { ip6h = (ip6_t *)mp->b_rptr; if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } /* Remove any extension headers assuming partial overlay */ @@ -22185,13 +16983,13 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ip6h->ip6_nxt = IPPROTO_TCP; } } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if (tcph->th_flags[0] & TH_RST) { - freemsg(ipsec_mp); - return; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + if (tcpha->tha_flags & TH_RST) { + freemsg(mp); + goto done; } - tcph->th_offset_and_rsrvd[0] = (5 << 4); - len = ip_hdr_len + sizeof (tcph_t); + 
tcpha->tha_offset_and_reserved = (5 << 4); + len = ip_hdr_len + sizeof (tcpha_t); mp->b_wptr = &mp->b_rptr[len]; if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { ipha->ipha_length = htons(len); @@ -22201,108 +16999,79 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ipha->ipha_dst = v4addr; ipha->ipha_ident = 0; ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - addr_len = IP_ADDR_LEN; - addr = &v4addr; + ixa->ixa_flags |= IXAF_IS_IPV4; + ixa->ixa_ip_hdr_length = ip_hdr_len; } else { - /* No ip6i_t in this case */ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); /* Swap addresses */ v6addr = ip6h->ip6_src; ip6h->ip6_src = ip6h->ip6_dst; ip6h->ip6_dst = v6addr; ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; - addr_len = IPV6_ADDR_LEN; - addr = &v6addr; - } - tcp_xchg(tcph->th_fport, tcph->th_lport, 2); - U32_TO_BE32(ack, tcph->th_ack); - U32_TO_BE32(seq, tcph->th_seq); - U16_TO_BE16(0, tcph->th_win); - U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); - tcph->th_flags[0] = (uint8_t)ctl; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ira->ira_ruifindex; + } + ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; + } + ixa->ixa_pktlen = len; + + /* Swap the ports */ + port = tcpha->tha_fport; + tcpha->tha_fport = tcpha->tha_lport; + tcpha->tha_lport = port; + + tcpha->tha_ack = htonl(ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_win = 0; + tcpha->tha_sum = htons(sizeof (tcpha_t)); + tcpha->tha_flags = (uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } - /* IP trusts us to set up labels when required. 
*/ - if (is_system_labeled() && (cr = msg_getcred(mp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - int err; - - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) - err = tsol_check_label(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - else - err = tsol_check_label_v6(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - if (mctl_present) - ipsec_mp->b_cont = mp; - else - ipsec_mp = mp; - if (err != 0) { - freemsg(ipsec_mp); - return; - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - } else { - ip6h = (ip6_t *)mp->b_rptr; - } + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; } + ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ - if (mctl_present) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h, zoneid)) { - return; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the RST. + */ + if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + goto done; } + } else { + /* + * This is in clear. The RST message we are building + * here should go out in clear, independent of our policy. + */ + ixa->ixa_flags |= IXAF_NO_IPSEC; } - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - - /* Add the zoneid so ip_output routes it properly */ - if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) { - freemsg(ipsec_mp); - return; - } - ipsec_mp = nmp; /* * NOTE: one might consider tracing a TCP packet here, but * this function has no active TCP state and no tcp structure * that has a trace buffer. 
If we traced here, we would have * to keep a local trace buffer in tcp_record_trace(). - * - * TSol note: The mblk that contains the incoming packet was - * reused by tcp_xmit_listener_reset, so it already contains - * the right credentials and we don't need to call mblk_setcred. - * Also the conn's cred is not right since it is associated - * with tcps_g_q. */ - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); - /* - * Tell IP to mark the IRE used for this destination temporary. - * This way, we can limit our exposure to DoS attack because IP - * creates an IRE for each destination. If there are too many, - * the time to do any routing lookup will be extremely long. And - * the lookup can be in interrupt context. - * - * Note that in normal circumstances, this marking should not - * affect anything. It would be nice if only 1 message is - * needed to inform IP that the IRE created for this RST should - * not be added to the cache table. But there is currently - * not such communication mechanism between TCP and IP. So - * the best we can do now is to send the advice ioctl to IP - * to mark the IRE temporary. 
- */ - if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); + (void) ip_output_simple(mp, ixa); +done: + ixa_cleanup(ixa); + if (need_refrele) { + ASSERT(ixa != &ixas); + ixa_refrele(ixa); } } @@ -22313,9 +17082,11 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, static int tcp_xmit_end(tcp_t *tcp) { - ipic_t *ipic; - mblk_t *mp; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + iulp_t uinfo; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_state < TCPS_SYN_RCVD || tcp->tcp_state > TCPS_CLOSE_WAIT) { @@ -22337,7 +17108,7 @@ tcp_xmit_end(tcp_t *tcp) tcp->tcp_fss, B_FALSE, NULL, B_FALSE); if (mp) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } else { /* * Couldn't allocate msg. Pretend we got it out. @@ -22373,66 +17144,49 @@ tcp_xmit_end(tcp_t *tcp) return (0); /* - * NOTE: should not update if source routes i.e. if tcp_remote if - * different from the destination. + * We do not have a good algorithm to update ssthresh at this time. + * So don't do any update. + */ + bzero(&uinfo, sizeof (uinfo)); + uinfo.iulp_rtt = tcp->tcp_rtt_sa; + uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + + /* + * Note that uinfo is kept for conn_faddr in the DCE. Could update even + * if source routed but we don't. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { + if (connp->conn_ipversion == IPV4_VERSION) { + if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); + (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); } else { - if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, + uint_t ifindex; + + if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &tcp->tcp_ip6h->ip6_dst))) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - - /* Record route attributes in the IRE for use by future connections. */ - if (mp == NULL) - return (0); - - /* - * We do not have a good algorithm to update ssthresh at this time. - * So don't do any update. - */ - ipic->ipic_rtt = tcp->tcp_rtt_sa; - ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); - - return (0); -} + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { + ip_xmit_attr_t *ixa = connp->conn_ixa; -/* ARGSUSED */ -void -tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - mblk_t *mp1; - tcp_t *tcp = connp->conn_tcp; - tcp_xmit_reset_event_t *eventp; - - ASSERT(mp->b_datap->db_type == M_PROTO && - MBLKL(mp) == sizeof (tcp_xmit_reset_event_t)); + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (ixa->ixa_nce != NULL) { + ifindex = ixa->ixa_nce->nce_common->ncec_ill-> + ill_phyint->phyint_ifindex; + } else { + return (0); + } + } - if (tcp->tcp_state != TCPS_LISTEN) { - freemsg(mp); - return; + (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, + ipst); } - - mp1 = mp->b_cont; - mp->b_cont = NULL; - eventp = (tcp_xmit_reset_event_t *)mp->b_rptr; - ASSERT(eventp->tcp_xre_tcps->tcps_netstack == - connp->conn_netstack); - - tcp_xmit_listeners_reset(mp1, eventp->tcp_xre_iphdrlen, - eventp->tcp_xre_zoneid, eventp->tcp_xre_tcps, 
connp); - freemsg(mp); + return (0); } /* @@ -22442,45 +17196,25 @@ tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) * Note that we are reusing the incoming mp to construct the outgoing RST. */ void -tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, + conn_t *connp) { uchar_t *rptr; uint32_t seg_len; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; uint32_t seg_ack; uint_t flags; - mblk_t *ipsec_mp; ipha_t *ipha; ip6_t *ip6h; - boolean_t mctl_present = B_FALSE; - boolean_t check = B_TRUE; boolean_t policy_present; + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; TCP_STAT(tcps, tcp_no_listener); - ipsec_mp = mp; - - if (mp->b_datap->db_type == M_CTL) { - ipsec_in_t *ii; - - mctl_present = B_TRUE; - mp = mp->b_cont; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (ii->ipsec_in_dont_check) { - check = B_FALSE; - if (!ii->ipsec_in_secure) { - freeb(ipsec_mp); - mctl_present = B_FALSE; - ipsec_mp = mp; - } - } - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t *)mp->b_rptr; @@ -22491,41 +17225,39 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, ip6h = (ip6_t *)mp->b_rptr; } - if (check && policy_present) { + if (policy_present) { /* * The conn_t parameter is NULL because we already know * nobody's home. 
*/ - ipsec_mp = ipsec_check_global_policy( - ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present, - tcps->tcps_netstack); - if (ipsec_mp == NULL) + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, + ira, ns); + if (mp == NULL) return; } - if (is_system_labeled() && !tsol_can_reply_error(mp)) { + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { DTRACE_PROBE2( tx__ip__log__error__nolistener__tcp, char *, "Could not reply with RST to mp(1)", mblk_t *, mp); ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); - freemsg(ipsec_mp); + freemsg(mp); return; } rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = BE32_TO_U32(tcph->th_seq); - seg_ack = BE32_TO_U32(tcph->th_ack); - flags = tcph->th_flags[0]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); + flags = tcpha->tha_flags; - seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); + seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); if (flags & TH_RST) { - freemsg(ipsec_mp); + freemsg(mp); } else if (flags & TH_ACK) { - tcp_xmit_early_reset("no tcp, reset", - ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps, - connp); + tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, + ira, ipst, connp); } else { if (flags & TH_SYN) { seg_len++; @@ -22537,14 +17269,13 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, * segment is neither. Just drop it on the * floor. 
*/ - freemsg(ipsec_mp); + freemsg(mp); tcps->tcps_rst_unsent++; return; } - tcp_xmit_early_reset("no tcp, reset/ack", - ipsec_mp, 0, seg_seq + seg_len, - TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps, connp); + tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, + seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); } } @@ -22573,14 +17304,16 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, mblk_t *mp1; mblk_t *mp2; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; int32_t num_sack_blk = 0; int32_t sack_opt_len = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* Allocate for our maximum TCP header + link-level */ - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, + BPRI_MED); if (!mp1) return (NULL); data_length = 0; @@ -22646,15 +17379,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; - U32_TO_ABE32(seq, tcph->th_seq); + mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; + tcpha->tha_seq = htonl(seq); /* * Use tcp_unsent to determine if the PUSH bit should be used assumes @@ -22729,14 +17461,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[0] = TCPOPT_MAXSEG; wptr[1] = TCPOPT_MAXSEG_LEN; wptr += 2; - u1 = tcp->tcp_if_mtu - - (tcp->tcp_ipversion == IPV4_VERSION ? 
+ u1 = tcp->tcp_initial_pmtu - + (connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH; U16_TO_BE16(u1, wptr); mp1->b_wptr = wptr + 2; /* Update the offset to cover the additional word */ - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); /* * Note that the following way of filling in @@ -22763,7 +17495,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, ASSERT(tcp->tcp_ts_recent == 0); U32_TO_BE32(0L, wptr); mp1->b_wptr += TCPOPT_REAL_TS_LEN; - tcph->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += (3 << 4); } @@ -22819,7 +17551,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_WS_LEN; wptr[3] = (uchar_t)tcp->tcp_rcv_ws; mp1->b_wptr += TCPOPT_REAL_WS_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } if (tcp->tcp_snd_sack_ok) { @@ -22829,7 +17561,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_SACK_PERMITTED; wptr[3] = TCPOPT_SACK_OK_LEN; mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } /* allocb() of adequate mblk assures space */ @@ -22840,9 +17572,9 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, * Get IP set to checksum on our behalf * Include the adjustment for a source route if any. 
*/ - u1 += tcp->tcp_sum; + u1 += connp->conn_sum; u1 = (u1 >> 16) + (u1 & 0xFFFF); - U16_TO_BE16(u1, tcph->th_sum); + tcpha->tha_sum = htons(u1); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && @@ -22878,10 +17610,10 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, u1 < (uint32_t)(64 * 1024)) { flags |= TH_URG; BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); - U32_TO_ABE16(u1, tcph->th_urp); + tcpha->tha_urp = htons(u1); } } - tcph->th_flags[0] = (uchar_t)flags; + tcpha->tha_flags = (uchar_t)flags; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -22890,14 +17622,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } } if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -22915,33 +17647,34 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); data_length += (int)(mp1->b_wptr - rptr); - if (tcp->tcp_ipversion == IPV4_VERSION) { + + ixa->ixa_pktlen = data_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { ((ipha_t *)rptr)->ipha_length = htons(data_length); } else { - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 
- sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(data_length - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); } /* * Prime pump for IP * Include the adjustment for a source route if any. */ - data_length -= tcp->tcp_ip_hdr_len; - data_length += tcp->tcp_sum; + data_length -= ixa->ixa_ip_hdr_length; + data_length += connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23012,7 +17745,7 @@ tcp_ack_timer(void *arg) BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } } @@ -23023,6 +17756,7 @@ tcp_ack_mp(tcp_t *tcp) { uint32_t seq_no; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * There are a few cases to be considered while setting the sequence no. 
@@ -23058,12 +17792,13 @@ tcp_ack_mp(tcp_t *tcp) /* Generate a simple ACK */ int data_length; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; mblk_t *mp1; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; int32_t num_sack_blk = 0; int32_t sack_opt_len; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Allocate space for TCP + IP headers @@ -23074,34 +17809,34 @@ tcp_ack_mp(tcp_t *tcp) tcp->tcp_num_sack_blk); sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; - tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; + total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; } else { - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } - mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp1) return (NULL); /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); /* copy in prototype TCP + IP header */ rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp_hdr_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); + mp1->b_wptr = rptr + total_hdr_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; /* Set the TCP sequence number. */ - U32_TO_ABE32(seq_no, tcph->th_seq); + tcpha->tha_seq = htonl(seq_no); /* Set up the TCP flag field. 
*/ - tcph->th_flags[0] = (uchar_t)TH_ACK; + tcpha->tha_flags = (uchar_t)TH_ACK; if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -23111,14 +17846,15 @@ tcp_ack_mp(tcp_t *tcp) uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -23136,34 +17872,33 @@ tcp_ack_mp(tcp_t *tcp) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) - << 4); + tcpha->tha_offset_and_reserved += + ((num_sack_blk * 2 + 1) << 4); } - if (tcp->tcp_ipversion == IPV4_VERSION) { - ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); + ixa->ixa_pktlen = total_hdr_len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); } else { - /* Check for ip6i_t header in sticky hdrs */ - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? - sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } /* * Prime pump for checksum calculation in IP. Include the * adjustment for a source route if any. 
*/ - data_length = tcp_tcp_hdr_len + tcp->tcp_sum; + data_length = tcp_hdr_len + connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23183,6 +17918,8 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) tcp_t **tcpp; tcp_t *tcpnext; tcp_t *tcphash; + conn_t *connp = tcp->tcp_connp; + conn_t *connext; if (tcp->tcp_ptpbhn != NULL) { ASSERT(!caller_holds_lock); @@ -23199,7 +17936,7 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) if (tcphash != NULL) { /* Look for an entry using the same port */ while ((tcphash = tcpp[0]) != NULL && - tcp->tcp_lport != tcphash->tcp_lport) + connp->conn_lport != tcphash->tcp_connp->conn_lport) tcpp = &(tcphash->tcp_bind_hash); /* The port was not found, just add to the end */ @@ -23219,14 +17956,19 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) * INADDR_ANY. 
*/ tcpnext = tcphash; + connext = tcpnext->tcp_connp; tcphash = NULL; - if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { - while ((tcpnext = tcpp[0]) != NULL && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) - tcpp = &(tcpnext->tcp_bind_hash_port); - - if (tcpnext) { + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { + while ((tcpnext = tcpp[0]) != NULL) { + connext = tcpnext->tcp_connp; + if (!V6_OR_V4_INADDR_ANY( + connext->conn_bound_addr_v6)) + tcpp = &(tcpnext->tcp_bind_hash_port); + else + break; + } + if (tcpnext != NULL) { tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; tcphash = tcpnext->tcp_bind_hash; if (tcphash != NULL) { @@ -23263,6 +18005,7 @@ tcp_bind_hash_remove(tcp_t *tcp) tcp_t *tcpnext; kmutex_t *lockp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_ptpbhn == NULL) return; @@ -23271,8 +18014,9 @@ tcp_bind_hash_remove(tcp_t *tcp) * Extract the lock pointer in case there are concurrent * hash_remove's for this instance. 
*/ - ASSERT(tcp->tcp_lport != 0); - lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; + ASSERT(connp->conn_lport != 0); + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)].tf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -23548,7 +18292,7 @@ tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, *sys_errorp = 0; *do_disconnectp = 0; - error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, + error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, opt_offset, cr, &tcp_opt_obj, NULL, &is_absreq_failure); @@ -23663,238 +18407,6 @@ tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) return (0); } -/* ARGSUSED */ -static int -tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) -{ - bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); - return (0); -} - -/* - * Make sure we wait until the default queue is setup, yet allow - * tcp_g_q_create() to open a TCP stream. - * We need to allow tcp_g_q_create() do do an open - * of tcp, hence we compare curhread. - * All others have to wait until the tcps_g_q has been - * setup. 
- */ -void -tcp_g_q_setup(tcp_stack_t *tcps) -{ - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - if (tcps->tcps_g_q_creator == NULL) { - /* This thread will set it up */ - tcps->tcps_g_q_creator = curthread; - mutex_exit(&tcps->tcps_g_q_lock); - tcp_g_q_create(tcps); - mutex_enter(&tcps->tcps_g_q_lock); - ASSERT(tcps->tcps_g_q_creator == curthread); - tcps->tcps_g_q_creator = NULL; - cv_signal(&tcps->tcps_g_q_cv); - ASSERT(tcps->tcps_g_q != NULL); - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - /* Everybody but the creator has to wait */ - if (tcps->tcps_g_q_creator != curthread) { - while (tcps->tcps_g_q == NULL) - cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock); - } - mutex_exit(&tcps->tcps_g_q_lock); -} - -#define IP "ip" - -#define TCP6DEV "/devices/pseudo/tcp6@0:tcp6" - -/* - * Create a default tcp queue here instead of in strplumb - */ -void -tcp_g_q_create(tcp_stack_t *tcps) -{ - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - int rval; - cred_t *cr; - major_t IP_MAJ; - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_create()\n"); -#endif - - IP_MAJ = ddi_name_to_major(IP); - - ASSERT(tcps->tcps_g_q_creator == curthread); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = zone_get_kcred(netstackid_to_zoneid( - tcps->tcps_netstack->netstack_stackid)); - ASSERT(cr != NULL); - /* - * We set the tcp default queue to IPv6 because IPv4 falls - * back to IPv6 when it can't find a client, but - * IPv6 does not fall back to IPv4. 
- */ - error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: open of TCP6DEV failed error %d\n", - error); -#endif - goto out; - } - - /* - * This ioctl causes the tcp framework to cache a pointer to - * this stream, so we don't want to close the stream after - * this operation. - * Use the kernel credentials that are for the zone we're in. - */ - error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, - (intptr_t)0, FKIOCTL, cr, &rval); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed " - "error %d\n", error); -#endif - goto out; - } - tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */ - lh = NULL; -out: - /* Close layered handles */ - if (li) - ldi_ident_release(li); - /* Keep cred around until _inactive needs it */ - tcps->tcps_g_q_cr = cr; -} - -/* - * We keep tcp_g_q set until all other tcp_t's in the zone - * has gone away, and then when tcp_g_q_inactive() is called - * we clear it. - */ -void -tcp_g_q_destroy(tcp_stack_t *tcps) -{ -#ifdef NS_DEBUG - (void) printf("tcp_g_q_destroy()for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - - if (tcps->tcps_g_q == NULL) { - return; /* Nothing to cleanup */ - } - /* - * Drop reference corresponding to the default queue. - * This reference was added from tcp_open when the default queue - * was created, hence we compensate for this extra drop in - * tcp_g_q_close. If the refcnt drops to zero here it means - * the default queue was the last one to be open, in which - * case, then tcp_g_q_inactive will be - * called as a result of the refrele. - */ - TCPS_REFRELE(tcps); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * Run by tcp_q_q_inactive using a taskq. 
- */ -static void -tcp_g_q_close(void *arg) -{ - tcp_stack_t *tcps = arg; - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - cred_t *cr; - major_t IP_MAJ; - - IP_MAJ = ddi_name_to_major(IP); - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n", - tcps->tcps_netstack->netstack_stackid, - tcps->tcps_netstack->netstack_refcnt); -#endif - lh = tcps->tcps_g_q_lh; - if (lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 1); - ASSERT(tcps->tcps_g_q != NULL); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_inactive: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = tcps->tcps_g_q_cr; - tcps->tcps_g_q_cr = NULL; - ASSERT(cr != NULL); - - /* - * Make sure we can break the recursion when tcp_close decrements - * the reference count causing g_q_inactive to be called again. - */ - tcps->tcps_g_q_lh = NULL; - - /* close the default queue */ - (void) ldi_close(lh, FREAD|FWRITE, cr); - /* - * At this point in time tcps and the rest of netstack_t might - * have been deleted. - */ - tcps = NULL; - - /* Close layered handles */ - ldi_ident_release(li); - crfree(cr); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * - * Have to ensure that the ldi routines are not used by an - * interrupt thread by using a taskq. 
- */ -void -tcp_g_q_inactive(tcp_stack_t *tcps) -{ - if (tcps->tcps_g_q_lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 0); - TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */ - - if (servicing_interrupt()) { - (void) taskq_dispatch(tcp_taskq, tcp_g_q_close, - (void *) tcps, TQ_SLEEP); - } else { - tcp_g_q_close(tcps); - } -} - /* * Called by IP when IP is loaded into the kernel */ @@ -23909,10 +18421,6 @@ tcp_ddi_g_init(void) sizeof (tcp_sack_info_t), 0, tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); - tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", - TCP_MAX_COMBINED_HEADER_LENGTH, 0, - tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); - mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); /* Initialize the random number generator */ @@ -23923,9 +18431,6 @@ tcp_ddi_g_init(void) tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); - tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, - TASKQ_PREPOPULATE); - tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); /* @@ -23933,8 +18438,7 @@ tcp_ddi_g_init(void) * destroyed in the kernel, so we can maintain the * set of tcp_stack_t's. 
*/ - netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown, - tcp_stack_fini); + netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); } @@ -23956,8 +18460,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_netstack = ns; /* Initialize locks */ - mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); @@ -24018,6 +18520,11 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) major = mod_name_to_major(INET_NAME); error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); ASSERT(error == 0); + tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); + ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); + cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); + return (tcps); } @@ -24035,22 +18542,8 @@ tcp_ddi_g_destroy(void) kmem_cache_destroy(tcp_timercache); kmem_cache_destroy(tcp_sack_info_cache); - kmem_cache_destroy(tcp_iphc_cache); netstack_unregister(NS_TCP); - taskq_destroy(tcp_taskq); -} - -/* - * Shut down the TCP stack instance. 
- */ -/* ARGSUSED */ -static void -tcp_stack_shutdown(netstackid_t stackid, void *arg) -{ - tcp_stack_t *tcps = (tcp_stack_t *)arg; - - tcp_g_q_destroy(tcps); } /* @@ -24062,17 +18555,16 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcp_stack_t *tcps = (tcp_stack_t *)arg; int i; + freeb(tcps->tcps_ixa_cleanup_mp); + tcps->tcps_ixa_cleanup_mp = NULL; + cv_destroy(&tcps->tcps_ixa_cleanup_cv); + mutex_destroy(&tcps->tcps_ixa_cleanup_lock); + nd_free(&tcps->tcps_g_nd); kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); tcps->tcps_params = NULL; kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); tcps->tcps_wroff_xtra_param = NULL; - kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_head_param = NULL; - kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_tail_param = NULL; - kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_max_pbufs_param = NULL; for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); @@ -24091,8 +18583,6 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcps->tcps_acceptor_fanout = NULL; mutex_destroy(&tcps->tcps_iss_key_lock); - mutex_destroy(&tcps->tcps_g_q_lock); - cv_destroy(&tcps->tcps_g_q_cv); mutex_destroy(&tcps->tcps_epriv_port_lock); ip_drop_unregister(&tcps->tcps_dropper); @@ -24120,6 +18610,7 @@ tcp_iss_init(tcp_t *tcp) struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; uint32_t answer[4]; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); tcp->tcp_iss = tcps->tcps_iss_incr_extra; @@ -24128,16 +18619,9 @@ tcp_iss_init(tcp_t *tcp) mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &arg.src); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, - 
&arg.dst); - } else { - arg.src = tcp->tcp_ip6h->ip6_src; - arg.dst = tcp->tcp_ip6h->ip6_dst; - } + arg.ports = connp->conn_ports; + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; @@ -24220,27 +18704,16 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; cl_tcpi.cl_tcpi_version = CL_TCPI_V1; - cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; + cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; cl_tcpi.cl_tcpi_state = tcp->tcp_state; - cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; - cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; - /* - * The macros tcp_laddr and tcp_faddr give the IPv4 - * addresses. They are copied implicitly below as - * mapped addresses. - */ - cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - cl_tcpi.cl_tcpi_faddr = - tcp->tcp_ipha->ipha_dst; - } else { - cl_tcpi.cl_tcpi_faddr_v6 = - tcp->tcp_ip6h->ip6_dst; - } + cl_tcpi.cl_tcpi_lport = connp->conn_lport; + cl_tcpi.cl_tcpi_fport = connp->conn_fport; + cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; + cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; /* * If the callback returns non-zero @@ -24302,35 +18775,35 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, /* * Check if a tcp structure matches the info in acp. */ -#define TCP_AC_ADDR_MATCH(acp, tcp) \ +#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ (((acp)->ac_local.ss_family == AF_INET) ? 
\ ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ - TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ + TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ - TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ + TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ (TCP_AC_V4LPORT((acp)) == 0 || \ - TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V4RPORT((acp)) == 0 || \ - TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ - (acp)->ac_end >= (tcp)->tcp_state) : \ + TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ + (acp)->ac_end >= (tcp)->tcp_state) : \ ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ - &(tcp)->tcp_ip_src_v6)) && \ + &(connp)->conn_laddr_v6)) && \ (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ - &(tcp)->tcp_remote_v6)) && \ + &(connp)->conn_faddr_v6)) && \ (TCP_AC_V6LPORT((acp)) == 0 || \ - TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V6RPORT((acp)) == 0 || \ - TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ + TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ (acp)->ac_end >= (tcp)->tcp_state)) -#define TCP_AC_MATCH(acp, tcp) \ +#define TCP_AC_MATCH(acp, connp, tcp) \ (((acp)->ac_zoneid == ALL_ZONES || \ - (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ - TCP_AC_ADDR_MATCH(acp, tcp) : 0) + (acp)->ac_zoneid == (connp)->conn_zoneid) ? 
\ + TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) /* * Build a message containing a tcp_ioc_abort_conn_t structure @@ -24346,8 +18819,6 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (mp == NULL) return (NULL); - mp->b_datap->db_type = M_CTL; - *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + sizeof (uint32_t)); @@ -24359,17 +18830,17 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (acp->ac_local.ss_family == AF_INET) { tacp->ac_local.ss_family = AF_INET; tacp->ac_remote.ss_family = AF_INET; - TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; - TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; - TCP_AC_V4LPORT(tacp) = tp->tcp_lport; - TCP_AC_V4RPORT(tacp) = tp->tcp_fport; + TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; + TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; + TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; } else { tacp->ac_local.ss_family = AF_INET6; tacp->ac_remote.ss_family = AF_INET6; - TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; - TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; - TCP_AC_V6LPORT(tacp) = tp->tcp_lport; - TCP_AC_V6RPORT(tacp) = tp->tcp_fport; + TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; + TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; + TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; } mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); return (mp); @@ -24419,14 +18890,32 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) } /* - * Called inside tcp_rput when a message built using + * Called using SQ_FILL when a message built using * tcp_ioctl_abort_build_msg is put into a queue. * Note that when we get here there is no wildcard in acp any more. 
*/ +/* ARGSUSED2 */ static void -tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) +tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { - tcp_ioc_abort_conn_t *acp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + tcp_ioc_abort_conn_t *acp; + + /* + * Don't accept any input on a closed tcp as this TCP logically does + * not exist on the system. Don't proceed further with this TCP. + * For eg. this packet could trigger another close of this tcp + * which would be disastrous for tcp_refcnt. tcp_close_detached / + * tcp_clean_death / tcp_closei_local must be called at most once + * on a TCP. + */ + if (tcp->tcp_state == TCPS_CLOSED || + tcp->tcp_state == TCPS_BOUND) { + freemsg(mp); + return; + } acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); if (tcp->tcp_state <= acp->ac_end) { @@ -24468,12 +18957,17 @@ startover: for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { tcp = tconnp->conn_tcp; - if (TCP_AC_MATCH(acp, tcp)) { - CONN_INC_REF(tcp->tcp_connp); + /* + * We are missing a check on sin6_scope_id for linklocals here, + * but current usage is just for aborting based on zoneid + * for shared-IP zones. 
+ */ + if (TCP_AC_MATCH(acp, tconnp, tcp)) { + CONN_INC_REF(tconnp); mp = tcp_ioctl_abort_build_msg(acp, tcp); if (mp == NULL) { err = ENOMEM; - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(tconnp); break; } mp->b_prev = (mblk_t *)tcp; @@ -24501,8 +18995,9 @@ startover: listhead = listhead->b_next; tcp = (tcp_t *)mp->b_prev; mp->b_next = mp->b_prev = NULL; - SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input, - tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET); + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, + tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_ABORT_BUCKET); } *count += nmatch; @@ -24669,7 +19164,7 @@ out: */ void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, - uint32_t seg_ack, int seg_len, tcph_t *tcph) + uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -24677,17 +19172,18 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, tcp_opt_t tcpopt; uint_t flags; uint32_t new_swnd = 0; - conn_t *connp; + conn_t *nconnp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - flags = (unsigned int)tcph->th_flags[0] & 0xFF; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + flags = (unsigned int)tcpha->tha_flags & 0xFF; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 
0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); goto done; @@ -24770,17 +19266,10 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; + arg.ports = connp->conn_ports; /* We use MAPPED addresses in tcp_iss_init */ - arg.src = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED( - tcp->tcp_ipha->ipha_dst, - &arg.dst); - } else { - arg.dst = - tcp->tcp_ip6h->ip6_dst; - } + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); @@ -24813,21 +19302,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, */ if (tcp_clean_death(tcp, 0, 27) == -1) goto done; - /* - * We will come back to tcp_rput_data - * on the global queue. Packets destined - * for the global queue will be checked - * with global policy. But the policy for - * this packet has already been checked as - * this was destined for the detached - * connection. We need to bypass policy - * check this time by attaching a dummy - * ipsec_in with ipsec_in_dont_check set. 
- */ - connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) { + nconnp = ipcl_classify(mp, ira, ipst); + if (nconnp != NULL) { TCP_STAT(tcps, tcp_time_wait_syn_success); - tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); + /* Drops ref on nconnp */ + tcp_reinput(nconnp, mp, ira, ipst); return; } goto done; @@ -24905,11 +19384,6 @@ process_ack: tcp->tcp_rnxt, TH_ACK); } done: - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - TCP_STAT(tcps, tcp_time_wait_syn_fail); - } freemsg(mp); } @@ -24965,11 +19439,12 @@ tcp_timer_callback(void *arg) tcpt = (tcp_timer_t *)mp->b_rptr; connp = tcpt->connp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, - SQ_FILL, SQTAG_TCP_TIMER); + NULL, SQ_FILL, SQTAG_TCP_TIMER); } +/* ARGSUSED */ static void -tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) +tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { tcp_timer_t *tcpt; conn_t *connp = (conn_t *)arg; @@ -24983,7 +19458,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) * If the TCP has reached the closed state, don't proceed any * further. This TCP logically does not exist on the system. * tcpt_proc could for example access queues, that have already - * been qprocoff'ed off. Also see comments at the start of tcp_input + * been qprocoff'ed off. 
*/ if (tcp->tcp_state != TCPS_CLOSED) { (*tcpt->tcpt_proc)(connp); @@ -25148,26 +19623,9 @@ tcp_setqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_TRUE); - tcp->tcp_flow_stopped = B_TRUE; - } else { - queue_t *q = tcp->tcp_wq; - - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); - if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - tcp->tcp_flow_stopped = B_TRUE; - mutex_exit(QLOCK(q)); - TCP_STAT(tcps, tcp_flwctl_on); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_setqfull(connp, &tcp->tcp_flow_stopped); + if (tcp->tcp_flow_stopped) + TCP_STAT(tcps, tcp_flwctl_on); } void @@ -25177,27 +19635,7 @@ tcp_clrqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_FALSE); - tcp->tcp_flow_stopped = B_FALSE; - } else { - queue_t *q = tcp->tcp_wq; - - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); - if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - tcp->tcp_flow_stopped = B_FALSE; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_clrqfull(connp, &tcp->tcp_flow_stopped); } /* @@ -25246,10 +19684,7 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) tcp_stat_t template = { { "tcp_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, - { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, - { "tcp_ip_output", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 }, { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, @@ -25287,37 +19722,14 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) { 
"tcp_timermp_freed", KSTAT_DATA_UINT64 }, { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ire_null1", KSTAT_DATA_UINT64 }, - { "tcp_ire_null", KSTAT_DATA_UINT64 }, - { "tcp_ip_send", KSTAT_DATA_UINT64 }, - { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, - { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, - { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, @@ -25490,7 +19902,7 @@ tcp_kstat_update(kstat_t *kp, int rw) connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; switch (tcp_snmp_state(tcp)) { case 
MIB2_TCP_established: @@ -25565,48 +19977,6 @@ tcp_kstat_update(kstat_t *kp, int rw) return (0); } -void -tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) -{ - uint16_t hdr_len; - ipha_t *ipha; - uint8_t *nexthdrp; - tcph_t *tcph; - tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - - /* Already has an eager */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - TCP_STAT(tcps, tcp_reinput_syn); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER); - return; - } - - switch (IPH_HDR_VERSION(mp->b_rptr)) { - case IPV4_VERSION: - ipha = (ipha_t *)mp->b_rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - break; - case IPV6_VERSION: - if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, - &hdr_len, &nexthdrp)) { - CONN_DEC_REF(connp); - freemsg(mp); - return; - } - break; - } - - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - } - - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_REINPUT); -} - static int tcp_squeue_switch(int val) { @@ -25653,278 +20023,20 @@ tcp_squeue_add(squeue_t *sqp) tcp_time_wait->tcp_free_list_cnt = 0; } -static int -tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error, cred_t *cr, pid_t pid) +/* + * On a labeled system we have some protocols above TCP, such as RPC, which + * appear to assume that every mblk in a chain has a db_credp. + */ +static void +tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *ire_mp = NULL; - mblk_t *syn_mp; - mblk_t *mdti; - mblk_t *lsoi; - int retval; - tcph_t *tcph; - cred_t *ecr; - ts_label_t *tsl; - uint32_t mss; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (error == 0) { - /* - * Adapt Multidata information, if any. The - * following tcp_mdt_update routine will free - * the message. 
- */ - if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) { - tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> - b_rptr)->mdt_capab, B_TRUE); - freemsg(mdti); - } - - /* - * Check to update LSO information with tcp, and - * tcp_lso_update routine will free the message. - */ - if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) { - tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> - b_rptr)->lso_capab); - freemsg(lsoi); - } - - /* Get the IRE, if we had requested for it */ - if (mp != NULL) - ire_mp = tcp_ire_mp(&mp); - - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - CL_INET_CONNECT(tcp->tcp_connp, tcp, B_TRUE, retval); - if (retval != 0) { - error = EADDRINUSE; - goto bind_failed; - } - } else { - if (ire_mp != NULL) - freeb(ire_mp); - goto after_syn_sent; - } - - retval = tcp_adapt_ire(tcp, ire_mp); - if (ire_mp != NULL) - freeb(ire_mp); - if (retval == 0) { - error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? - ENETUNREACH : EADDRNOTAVAIL); - goto ipcl_rm; - } - /* - * Don't let an endpoint connect to itself. - * Also checked in tcp_connect() but that - * check can't handle the case when the - * local IP address is INADDR_ANY. - */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if ((tcp->tcp_ipha->ipha_dst == - tcp->tcp_ipha->ipha_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } else { - if (IN6_ARE_ADDR_EQUAL( - &tcp->tcp_ip6h->ip6_dst, - &tcp->tcp_ip6h->ip6_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } - ASSERT(tcp->tcp_state == TCPS_SYN_SENT); - /* - * This should not be possible! Just for - * defensive coding... 
- */ - if (tcp->tcp_state != TCPS_SYN_SENT) - goto after_syn_sent; - - if (is_system_labeled() && - !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { - error = EHOSTUNREACH; - goto ipcl_rm; - } - - /* - * tcp_adapt_ire() does not adjust - * for TCP/IP header length. - */ - mss = tcp->tcp_mss - tcp->tcp_hdr_len; - - /* - * Just make sure our rwnd is at - * least tcp_recv_hiwat_mss * MSS - * large, and round up to the nearest - * MSS. - * - * We do the round up here because - * we need to get the interface - * MTU first before we can do the - * round up. - */ - tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), - tcps->tcps_recv_hiwat_minmss * mss); - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; - tcp_set_ws_value(tcp); - U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), - tcp->tcp_tcph->th_win); - if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) - tcp->tcp_snd_ws_ok = B_TRUE; - - /* - * Set tcp_snd_ts_ok to true - * so that tcp_xmit_mp will - * include the timestamp - * option in the SYN segment. - */ - if (tcps->tcps_tstamp_always || - (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { - tcp->tcp_snd_ts_ok = B_TRUE; - } - - /* - * tcp_snd_sack_ok can be set in - * tcp_adapt_ire() if the sack metric - * is set. So check it here also. - */ - if (tcps->tcps_sack_permitted == 2 || - tcp->tcp_snd_sack_ok) { - if (tcp->tcp_sack_info == NULL) { - tcp->tcp_sack_info = - kmem_cache_alloc(tcp_sack_info_cache, - KM_SLEEP); - } - tcp->tcp_snd_sack_ok = B_TRUE; - } + ASSERT(is_system_labeled()); + ASSERT(ira->ira_cred != NULL); - /* - * Should we use ECN? Note that the current - * default value (SunOS 5.9) of tcp_ecn_permitted - * is 1. The reason for doing this is that there - * are equipments out there that will drop ECN - * enabled IP packets. Setting it to 1 avoids - * compatibility problems. 
- */ - if (tcps->tcps_ecn_permitted == 2) - tcp->tcp_ecn_ok = B_TRUE; - - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, - tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (syn_mp) { - /* - * cr contains the cred from the thread calling - * connect(). - * - * If no thread cred is available, use the - * socket creator's cred instead. If still no - * cred, drop the request rather than risk a - * panic on production systems. - */ - if (cr == NULL) { - cr = CONN_CRED(connp); - pid = tcp->tcp_cpid; - ASSERT(cr != NULL); - if (cr != NULL) { - mblk_setcred(syn_mp, cr, pid); - } else { - error = ECONNABORTED; - goto ipcl_rm; - } - - /* - * If an effective security label exists for - * the connection, create a copy of the thread's - * cred but with the effective label attached. - */ - } else if (is_system_labeled() && - connp->conn_effective_cred != NULL && - (tsl = crgetlabel(connp-> - conn_effective_cred)) != NULL) { - if ((ecr = copycred_from_tslabel(cr, - tsl, KM_NOSLEEP)) == NULL) { - error = ENOMEM; - goto ipcl_rm; - } - mblk_setcred(syn_mp, ecr, pid); - crfree(ecr); - - /* - * Default to using the thread's cred unchanged. - */ - } else { - mblk_setcred(syn_mp, cr, pid); - } - - /* - * We must bump the generation before sending the syn - * to ensure that we use the right generation in case - * this thread issues a "connected" up call. - */ - SOCK_CONNID_BUMP(tcp->tcp_connid); - - tcp_send_data(tcp, tcp->tcp_wq, syn_mp); - } - after_syn_sent: - if (mp != NULL) { - ASSERT(mp->b_cont == NULL); - freeb(mp); - } - return (error); - } else { - /* error */ - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_post_ip_bind: error == %d", error); - } - if (mp != NULL) { - freeb(mp); - } + while (mp != NULL) { + mblk_setcred(mp, ira->ira_cred, NOPID); + mp = mp->b_cont; } - -ipcl_rm: - /* - * Need to unbind with classifier since we were just - * told that our bind succeeded. a.k.a error == 0 at the entry. 
- */ - tcp->tcp_hard_bound = B_FALSE; - tcp->tcp_hard_binding = B_FALSE; - - ipcl_hash_remove(connp); - -bind_failed: - tcp->tcp_state = TCPS_IDLE; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = 0; - else - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - /* - * Copy of the src addr. in tcp_t is needed since - * the lookup funcs. can only look at tcp_t - */ - V6_SET_ZERO(tcp->tcp_ip_src_v6); - - tcph = tcp->tcp_tcph; - tcph->th_lport[0] = 0; - tcph->th_lport[1] = 0; - tcp_bind_hash_remove(tcp); - bzero(&connp->u_port, sizeof (connp->u_port)); - /* blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - - conn_delete_ire(tcp->tcp_connp, NULL); - - return (error); } static int @@ -25936,16 +20048,16 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, boolean_t user_specified; in_port_t allocated_port; in_port_t requested_port = *requested_port_ptr; - conn_t *connp; + conn_t *connp = tcp->tcp_connp; zone_t *zone; tcp_stack_t *tcps = tcp->tcp_tcps; - in6_addr_t v6addr = tcp->tcp_ip_src_v6; + in6_addr_t v6addr = connp->conn_laddr_v6; /* * XXX It's up to the caller to specify bind_to_req_port_only or not. */ - if (cr == NULL) - cr = tcp->tcp_cred; + ASSERT(cr != NULL); + /* * Get a valid port (within the anonymous range and should not * be a privileged one) to use if the user has not given a port. @@ -25961,7 +20073,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlptype = mlptSingle; mlp_port = requested_port; if (requested_port == 0) { - requested_port = tcp->tcp_anon_priv_bind ? + requested_port = connp->conn_anon_priv_bind ? tcp_get_next_priv_port(tcp) : tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); @@ -25975,7 +20087,6 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, * this socket and RPC is MLP in this zone, then give him an * anonymous MLP. 
*/ - connp = tcp->tcp_connp; if (connp->conn_anon_mlp && is_system_labeled()) { zone = crgetzone(cr); addrtype = tsol_mlp_addr_type( @@ -26016,7 +20127,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (priv) { if (secpolicy_net_privaddr(cr, requested_port, IPPROTO_TCP) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for port %d", @@ -26044,7 +20155,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (mlptype != mlptSingle) { if (secpolicy_net_bindmlp(cr) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for multilevel port %d", @@ -26068,7 +20179,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlpzone = tsol_mlp_findzone(IPPROTO_TCP, htons(mlp_port)); if (connp->conn_zoneid != mlpzone) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: attempt to bind port " @@ -26083,10 +20194,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (!user_specified) { int err; - err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, requested_port, B_TRUE); if (err != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: cannot establish anon " @@ -26101,17 +20212,18 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, } allocated_port = tcp_bindi(tcp, requested_port, &v6addr, - tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); + connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, + user_specified); if (allocated_port == 0) { connp->conn_mlp_type = mlptSingle; if (connp->conn_anon_port) { connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + (void) tsol_mlp_anon(zone, mlptype, 
connp->conn_proto, requested_port, B_FALSE); } if (bind_to_req_port_only) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: requested addr busy"); @@ -26119,7 +20231,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (-TADDRBUSY); } else { /* If we are out of ports, fail the bind. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: out of ports?"); @@ -26133,6 +20245,9 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (0); } +/* + * Check the address and check/pick a local port number. + */ static int tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, boolean_t bind_to_req_port_only) @@ -26140,18 +20255,22 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; sin_t *sin; sin6_t *sin6; - in_port_t requested_port; + in_port_t requested_port; ipaddr_t v4addr; in6_addr_t v6addr; - uint_t ipversion; - int error = 0; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint_t scopeid = 0; + int error = 0; + ip_xmit_attr_t *ixa = connp->conn_ixa; ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); if (tcp->tcp_state == TCPS_BOUND) { return (0); } else if (tcp->tcp_state > TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26161,7 +20280,7 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, ASSERT(sa != NULL && len != 0); if (!OK_32PTR((char *)sa)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address parameter, " @@ -26171,38 +20290,48 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, 
cred_t *cr, return (-TPROTO); } + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + switch (len) { case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; - /* - * With sockets sockfs will accept bogus sin_family in - * bind() and replace it with the family used in the socket - * call. - */ - if (sin->sin_family != AF_INET || - tcp->tcp_family != AF_INET) { - return (EAFNOSUPPORT); - } requested_port = ntohs(sin->sin_port); - ipversion = IPV4_VERSION; v4addr = sin->sin_addr.s_addr; IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, + B_FALSE); + } break; case sizeof (sin6_t): /* Complete IPv6 address */ sin6 = (sin6_t *)sa; - if (sin6->sin6_family != AF_INET6 || - tcp->tcp_family != AF_INET6) { - return (EAFNOSUPPORT); - } - requested_port = ntohs(sin6->sin6_port); - ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? - IPV4_VERSION : IPV6_VERSION; v6addr = sin6->sin6_addr; + requested_port = ntohs(sin6->sin6_port); + if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6addr, + zoneid, ipst, B_FALSE, scopeid); + } + } break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address length, %d", len); } @@ -26210,34 +20339,32 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, /* return (-TBADADDR); */ } - tcp->tcp_bound_source_v6 = v6addr; + /* Is the local address a valid unicast address? 
*/ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); - /* Check for change in ipversion */ - if (tcp->tcp_ipversion != ipversion) { - ASSERT(tcp->tcp_family == AF_INET6); - error = (ipversion == IPV6_VERSION) ? - tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); - if (error) { - return (ENOMEM); - } - } - - /* - * Initialize family specific fields. Copy of the src addr. - * in tcp_t is needed for the lookup funcs. - */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp->tcp_ip6h->ip6_src = v6addr; + connp->conn_bound_addr_v6 = v6addr; + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - tcp->tcp_ip_src_v6 = v6addr; + + connp->conn_laddr_v6 = v6addr; + connp->conn_saddr_v6 = v6addr; bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; error = tcp_bind_select_lport(tcp, &requested_port, bind_to_req_port_only, cr); - + if (error != 0) { + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + } return (error); } @@ -26253,7 +20380,7 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; if (tcp->tcp_state >= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26265,19 +20392,8 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (error); ASSERT(tcp->tcp_state == TCPS_BOUND); - tcp->tcp_conn_req_max = 0; - - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, 0, B_FALSE); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - 
error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, 0, B_FALSE); - } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (0); } int @@ -26337,7 +20453,14 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, ipaddr_t *dstaddrp; in_port_t dstport; uint_t srcid; - int error = 0; + int error; + uint32_t mss; + mblk_t *syn_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + int32_t oldstate; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + oldstate = tcp->tcp_state; switch (len) { default: @@ -26351,7 +20474,7 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (sin->sin_port == 0) { return (-TBADADDR); } - if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { + if (connp->conn_ipv6_v6only) { return (EAFNOSUPPORT); } break; @@ -26365,23 +20488,18 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, } /* * If we're connecting to an IPv4-mapped IPv6 address, we need to - * make sure that the template IP header in the tcp structure is an - * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We + * make sure that the conn_ipversion is IPV4_VERSION. We * need to this before we call tcp_bindi() so that the port lookup * code will look for ports in the correct port space (IPv4 and * IPv6 have separate port spaces). 
*/ - if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV6_VERSION && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - int err = 0; + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); - err = tcp_header_init_ipv4(tcp); - if (err != 0) { - error = ENOMEM; - goto connect_failed; - } - if (tcp->tcp_lport != 0) - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_ipversion = IPV4_VERSION; } switch (tcp->tcp_state) { @@ -26399,43 +20517,147 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, */ /* FALLTHRU */ case TCPS_BOUND: - if (tcp->tcp_family == AF_INET6) { - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - return (tcp_connect_ipv6(tcp, - &sin6->sin6_addr, - sin6->sin6_port, sin6->sin6_flowinfo, - sin6->__sin6_src_id, sin6->sin6_scope_id, - cr, pid)); - } + break; + default: + return (-TOUTSTATE); + } + + /* + * We update our cred/cpid based on the caller of connect + */ + if (connp->conn_cred != cr) { + crhold(cr); + crfree(connp->conn_cred); + connp->conn_cred = cr; + } + connp->conn_cpid = pid; + + /* Cache things in the ixa without any refhold */ + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + if (connp->conn_family == AF_INET6) { + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, + sin6->sin6_port, sin6->sin6_flowinfo, + sin6->__sin6_src_id, sin6->sin6_scope_id); + } else { /* * Destination adress is mapped IPv6 address. * Source bound address should be unspecified or * IPv6 mapped address as well. 
*/ if (!IN6_IS_ADDR_UNSPECIFIED( - &tcp->tcp_bound_source_v6) && - !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { + &connp->conn_bound_addr_v6) && + !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { return (EADDRNOTAVAIL); } dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); dstport = sin6->sin6_port; srcid = sin6->__sin6_src_id; - } else { - dstaddrp = &sin->sin_addr.s_addr; - dstport = sin->sin_port; - srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, + srcid); } + } else { + dstaddrp = &sin->sin_addr.s_addr; + dstport = sin->sin_port; + srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); + } - error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr, - pid); - break; - default: - return (-TOUTSTATE); + if (error != 0) + goto connect_failed; + + CL_INET_CONNECT(connp, B_TRUE, error); + if (error != 0) + goto connect_failed; + + /* connect succeeded */ + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); + tcp->tcp_active_open = 1; + + /* + * tcp_set_destination() does not adjust for TCP/IP header length. + */ + mss = tcp->tcp_mss - connp->conn_ht_iphc_len; + + /* + * Just make sure our rwnd is at least rcvbuf * MSS large, and round up + * to the nearest MSS. + * + * We do the round up here because we need to get the interface MTU + * first before we can do the round up. + */ + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), + tcps->tcps_recv_hiwat_minmss * mss); + connp->conn_rcvbuf = tcp->tcp_rwnd; + tcp_set_ws_value(tcp); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) + tcp->tcp_snd_ws_ok = B_TRUE; + + /* + * Set tcp_snd_ts_ok to true + * so that tcp_xmit_mp will + * include the timestamp + * option in the SYN segment. 
+ */ + if (tcps->tcps_tstamp_always || + (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { + tcp->tcp_snd_ts_ok = B_TRUE; } + /* - * Note: Code below is the "failure" case + * tcp_snd_sack_ok can be set in + * tcp_set_destination() if the sack metric + * is set. So check it here also. + */ + if (tcps->tcps_sack_permitted == 2 || + tcp->tcp_snd_sack_ok) { + if (tcp->tcp_sack_info == NULL) { + tcp->tcp_sack_info = kmem_cache_alloc( + tcp_sack_info_cache, KM_SLEEP); + } + tcp->tcp_snd_sack_ok = B_TRUE; + } + + /* + * Should we use ECN? Note that the current + * default value (SunOS 5.9) of tcp_ecn_permitted + * is 1. The reason for doing this is that there + * are equipments out there that will drop ECN + * enabled IP packets. Setting it to 1 avoids + * compatibility problems. */ + if (tcps->tcps_ecn_permitted == 2) + tcp->tcp_ecn_ok = B_TRUE; + + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, + tcp->tcp_iss, B_FALSE, NULL, B_FALSE); + if (syn_mp != NULL) { + /* + * We must bump the generation before sending the syn + * to ensure that we use the right generation in case + * this thread issues a "connected" up call. 
+ */ + SOCK_CONNID_BUMP(tcp->tcp_connid); + tcp_send_data(tcp, syn_mp); + } + + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (0); + connect_failed: + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + tcp->tcp_state = oldstate; if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); return (error); @@ -26446,7 +20668,6 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; squeue_t *sqp = connp->conn_sqp; int error; @@ -26455,7 +20676,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { return (error); } @@ -26493,7 +20714,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, } } - if (tcp->tcp_loopback) { + if (connp->conn_tcp->tcp_loopback) { struct sock_proto_props sopp; sopp.sopp_flags = SOCKOPT_LOOPBACK; @@ -26521,7 +20742,7 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return (NULL); } - connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp); + connp = tcp_create_common(credp, isv6, B_TRUE, errorp); if (connp == NULL) { return (NULL); } @@ -26578,8 +20799,8 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, connp->conn_upcalls = sock_upcalls; connp->conn_upper_handle = sock_handle; - ASSERT(connp->conn_tcp->tcp_recv_hiwater != 0 && - connp->conn_tcp->tcp_recv_hiwater == connp->conn_tcp->tcp_rwnd); + ASSERT(connp->conn_rcvbuf != 0 && + connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); } @@ -26663,7 +20884,7 @@ 
tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, /* * Squeue Flow Control */ - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -26680,12 +20901,11 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, CONN_INC_REF(connp); if (msg->msg_flags & MSG_OOB) { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_output_urgent, connp, tcp_squeue_flag, - SQTAG_TCP_OUTPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, - connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } return (0); @@ -26698,9 +20918,9 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ void -tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) +tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; uint32_t msize; @@ -26739,7 +20959,7 @@ tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) tcp_wput_data(tcp, mp, B_TRUE); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) @@ -26752,24 +20972,24 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, ASSERT(cr != NULL); ASSERT(tcp != NULL); + if (tcp->tcp_state < TCPS_SYN_RCVD) + return (ENOTCONN); - return (tcp_do_getpeername(tcp, addr, addrlenp)); + return (conn_getpeername(connp, addr, addrlenp)); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); ASSERT(connp->conn_upper_handle != NULL); - - return (tcp_do_getsockname(tcp, addr, addrlenp)); + return (conn_getsockname(connp, addr, addrlenp)); } /* @@ -26809,8 +21029,8 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, RD(q)->q_ptr = WR(q)->q_ptr = connp; - connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q); - connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q); + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); WR(q)->q_qinfo = &tcp_sock_winit; @@ -26830,11 +21050,11 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, stropt_mp->b_wptr += sizeof (struct stroptions); stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : + stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); if (tcp->tcp_snd_sack_ok) stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = tcp->tcp_recv_hiwater; + stropt->so_hiwat = connp->conn_rcvbuf; stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); putnext(RD(q), stropt_mp); @@ -26845,15 +21065,17 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen); - error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen); + (void) tcp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, CRED()); + error = tcp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, CRED()); if (error != 0) faddrlen = 0; opts = 0; - if (tcp->tcp_oobinline) + if (connp->conn_oobinline) opts |= SO_OOBINLINE; - if (tcp->tcp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; /* @@ -26868,6 +21090,7 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, while ((mp = tcp->tcp_rcv_list) != NULL) { tcp->tcp_rcv_list = 
mp->b_next; mp->b_next = NULL; + /* We never do fallback for kernel RPC */ putnext(q, mp); } tcp->tcp_rcv_last_head = NULL; @@ -26908,7 +21131,7 @@ tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) * Sockfs guarantees that the listener will not be closed * during fallback. So we can safely use the listener's queue. */ - putnext(listener->tcp_rq, mp); + putnext(listener->tcp_connp->conn_rq, mp); } int @@ -26987,7 +21210,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* ARGSUSED */ static void -tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) +tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -27002,7 +21225,7 @@ tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_shutdown_output() out of state %s", @@ -27036,7 +21259,7 @@ tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, - connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); + connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, SOCK_OPCTL_SHUT_SEND, 0); @@ -27109,7 +21332,7 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, */ goto do_listen; } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_listen: bad state, %d", tcp->tcp_state); } @@ -27121,15 +21344,14 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, sin6_t *sin6; ASSERT(IPCL_IS_NONSTR(connp)); - /* Do an implicit bind: Request for a generic port. 
*/ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { len = sizeof (sin_t); sin = (sin_t *)&addr; *sin = sin_null; sin->sin_family = AF_INET; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); len = sizeof (sin6_t); sin6 = (sin6_t *)&addr; *sin6 = sin6_null; @@ -27171,23 +21393,42 @@ do_listen: } /* - * We can call ip_bind directly, the processing continues - * in tcp_post_ip_bind(). - * * We need to make sure that the conn_recv is set to a non-null * value before we insert the conn into the classifier table. * This is to avoid a race with an incoming packet which does an * ipcl_classify(). + * We initially set it to tcp_input_listener_unbound to try to + * pick a good squeue for the listener when the first SYN arrives. + * tcp_input_listener_unbound sets it to tcp_input_listener on that + * first SYN. */ - connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_laddr_v4(connp, NULL, - IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE); - } else { - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE); + connp->conn_recv = tcp_input_listener_unbound; + + /* Insert the listener in the classifier table */ + error = ip_laddr_fanout_insert(connp); + if (error != 0) { + /* Undo the bind - release the port number */ + tcp->tcp_state = TCPS_IDLE; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_ports = 0; + + if (connp->conn_anon_port) { + zone_t *zone; + + zone = crgetzone(cr); + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, connp->conn_mlp_type, + connp->conn_proto, connp->conn_lport, B_FALSE); + } + connp->conn_mlp_type = mlptSingle; + + tcp_bind_hash_remove(tcp); + return (error); } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (error); } void @@ -27222,7 +21463,7 @@ 
tcp_clr_flowctrl(sock_lower_handle_t proto_handle) if (tcp->tcp_fused) { tcp_fuse_backenable(tcp); } else { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above * ESTABLISHED state and the increase of the rcv window @@ -27253,10 +21494,28 @@ tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: - case TCP_IOC_DEFAULT_Q: case _SIOCSOCKFALLBACK: case TCP_IOC_ABORT_CONN: case TI_GETPEERNAME: diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 3ee909cc4d..313b024943 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -69,50 +69,6 @@ boolean_t do_tcp_fusion = B_TRUE; /* - * Return true if this connection needs some IP functionality - */ -static boolean_t -tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns) -{ - ipsec_stack_t *ipss = ns->netstack_ipsec; - - /* - * If ire is not cached, do not use fusion - */ - if (tcp->tcp_connp->conn_ire_cache == NULL) { - /* - * There is no need to hold conn_lock here because when called - * from tcp_fuse() there can be no window where conn_ire_cache - * can change. This is not true when called from - * tcp_fuse_output() as conn_ire_cache can become null just - * after the check. 
It will be necessary to recheck for a NULL - * conn_ire_cache in tcp_fuse_output() to avoid passing a - * stale ill pointer to FW_HOOKS. - */ - return (B_TRUE); - } - if (tcp->tcp_ipversion == IPV4_VERSION) { - if (tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) - return (B_TRUE); - if (CONN_OUTBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss)) - return (B_TRUE); - if (CONN_INBOUND_POLICY_PRESENT(tcp->tcp_connp, ipss)) - return (B_TRUE); - } else { - if (tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) - return (B_TRUE); - if (CONN_OUTBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss)) - return (B_TRUE); - if (CONN_INBOUND_POLICY_PRESENT_V6(tcp->tcp_connp, ipss)) - return (B_TRUE); - } - if (!CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)) - return (B_TRUE); - return (B_FALSE); -} - - -/* * This routine gets called by the eager tcp upon changing state from * SYN_RCVD to ESTABLISHED. It fuses a direct path between itself * and the active connect tcp such that the regular tcp processings @@ -124,10 +80,10 @@ tcp_loopback_needs_ip(tcp_t *tcp, netstack_t *ns) * same squeue as the one given to the active connect tcp during open. 
*/ void -tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) +tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcpha_t *tcpha) { - conn_t *peer_connp, *connp = tcp->tcp_connp; - tcp_t *peer_tcp; + conn_t *peer_connp, *connp = tcp->tcp_connp; + tcp_t *peer_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; netstack_t *ns; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; @@ -136,20 +92,16 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) ASSERT(tcp->tcp_loopback); ASSERT(tcp->tcp_loopback_peer == NULL); /* - * We need to inherit tcp_recv_hiwater of the listener tcp, + * We need to inherit conn_rcvbuf of the listener tcp, * but we can't really use tcp_listener since we get here after - * sending up T_CONN_IND and tcp_wput_accept() may be called + * sending up T_CONN_IND and tcp_tli_accept() may be called * independently, at which point tcp_listener is cleared; * this is why we use tcp_saved_listener. The listener itself * is guaranteed to be around until tcp_accept_finish() is called * on this eager -- this won't happen until we're done since we're * inside the eager's perimeter now. - * - * We can also get called in the case were a connection needs - * to be re-fused. In this case tcp_saved_listener will be - * NULL but tcp_refuse will be true. */ - ASSERT(tcp->tcp_saved_listener != NULL || tcp->tcp_refuse); + ASSERT(tcp->tcp_saved_listener != NULL); /* * Lookup peer endpoint; search for the remote endpoint having * the reversed address-port quadruplet in ESTABLISHED state, @@ -157,12 +109,12 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) * is applied accordingly for loopback address, but not for * local address since we want fusion to happen across Zones. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { peer_connp = ipcl_conn_tcp_lookup_reversed_ipv4(connp, - (ipha_t *)iphdr, tcph, ipst); + (ipha_t *)iphdr, tcpha, ipst); } else { peer_connp = ipcl_conn_tcp_lookup_reversed_ipv6(connp, - (ip6_t *)iphdr, tcph, ipst); + (ip6_t *)iphdr, tcpha, ipst); } /* @@ -202,28 +154,20 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) /* * Fuse the endpoints; we perform further checks against both * tcp endpoints to ensure that a fusion is allowed to happen. - * In particular we bail out for non-simple TCP/IP or if IPsec/ - * IPQoS policy/kernel SSL exists. We also need to check if - * the connection is quiescent to cover the case when we are - * trying to re-enable fusion after IPobservability is turned off. + * In particular we bail out if kernel SSL exists. */ ns = tcps->tcps_netstack; ipst = ns->netstack_ip; if (!tcp->tcp_unfusable && !peer_tcp->tcp_unfusable && - !tcp_loopback_needs_ip(tcp, ns) && - !tcp_loopback_needs_ip(peer_tcp, ns) && - tcp->tcp_kssl_ent == NULL && - tcp->tcp_xmit_head == NULL && peer_tcp->tcp_xmit_head == NULL && - !IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst)) { + (tcp->tcp_kssl_ent == NULL) && (tcp->tcp_xmit_head == NULL) && + (peer_tcp->tcp_xmit_head == NULL)) { mblk_t *mp; - queue_t *peer_rq = peer_tcp->tcp_rq; + queue_t *peer_rq = peer_connp->conn_rq; ASSERT(!TCP_IS_DETACHED(peer_tcp)); - ASSERT(tcp->tcp_fused_sigurg_mp == NULL || - (!IPCL_IS_NONSTR(connp) && tcp->tcp_refuse)); - ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL || - (!IPCL_IS_NONSTR(peer_connp) && peer_tcp->tcp_refuse)); + ASSERT(tcp->tcp_fused_sigurg_mp == NULL); + ASSERT(peer_tcp->tcp_fused_sigurg_mp == NULL); ASSERT(tcp->tcp_kssl_ctx == NULL); /* @@ -272,54 +216,40 @@ tcp_fuse(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph) tcp_timers_stop(tcp); tcp_timers_stop(peer_tcp); - if (!tcp->tcp_refuse) { - /* - * Set receive buffer and max packet size for the - * active open tcp. 
- * eager's values will be set in tcp_accept_finish. - */ - - (void) tcp_rwnd_set(peer_tcp, - peer_tcp->tcp_recv_hiwater); + /* + * Set receive buffer and max packet size for the + * active open tcp. + * eager's values will be set in tcp_accept_finish. + */ + (void) tcp_rwnd_set(peer_tcp, peer_tcp->tcp_connp->conn_rcvbuf); - /* - * Set the write offset value to zero since we won't - * be needing any room for TCP/IP headers. - */ - if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { - struct stroptions *stropt; + /* + * Set the write offset value to zero since we won't + * be needing any room for TCP/IP headers. + */ + if (!IPCL_IS_NONSTR(peer_tcp->tcp_connp)) { + struct stroptions *stropt; - DB_TYPE(mp) = M_SETOPTS; - mp->b_wptr += sizeof (*stropt); + DB_TYPE(mp) = M_SETOPTS; + mp->b_wptr += sizeof (*stropt); - stropt = (struct stroptions *)mp->b_rptr; - stropt->so_flags = SO_WROFF; - stropt->so_wroff = 0; + stropt = (struct stroptions *)mp->b_rptr; + stropt->so_flags = SO_WROFF; + stropt->so_wroff = 0; - /* Send the options up */ - putnext(peer_rq, mp); - } else { - struct sock_proto_props sopp; + /* Send the options up */ + putnext(peer_rq, mp); + } else { + struct sock_proto_props sopp; - /* The peer is a non-STREAMS end point */ - ASSERT(IPCL_IS_TCP(peer_connp)); + /* The peer is a non-STREAMS end point */ + ASSERT(IPCL_IS_TCP(peer_connp)); - sopp.sopp_flags = SOCKOPT_WROFF; - sopp.sopp_wroff = 0; - (*peer_connp->conn_upcalls->su_set_proto_props) - (peer_connp->conn_upper_handle, &sopp); - } - } else { - /* - * Endpoints are being re-fused, so options will not - * be sent up. In case of STREAMS, free the stroptions - * mblk. 
- */ - if (!IPCL_IS_NONSTR(connp)) - freemsg(mp); + sopp.sopp_flags = SOCKOPT_WROFF; + sopp.sopp_wroff = 0; + (*peer_connp->conn_upcalls->su_set_proto_props) + (peer_connp->conn_upper_handle, &sopp); } - tcp->tcp_refuse = B_FALSE; - peer_tcp->tcp_refuse = B_FALSE; } else { TCP_STAT(tcps, tcp_fusion_unqualified); } @@ -374,12 +304,12 @@ tcp_unfuse(tcp_t *tcp) * when called from tcp_rcv_drain(). */ if (!TCP_IS_DETACHED(tcp)) { - (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, + (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, &tcp->tcp_fused_sigurg_mp); } if (!TCP_IS_DETACHED(peer_tcp)) { - (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, - &peer_tcp->tcp_fused_sigurg_mp); + (void) tcp_fuse_rcv_drain(peer_tcp->tcp_connp->conn_rq, + peer_tcp, &peer_tcp->tcp_fused_sigurg_mp); } /* Lift up any flow-control conditions */ @@ -398,12 +328,12 @@ tcp_unfuse(tcp_t *tcp) mutex_exit(&peer_tcp->tcp_non_sq_lock); /* - * Update th_seq and th_ack in the header template + * Update tha_seq and tha_ack in the header template */ - U32_TO_ABE32(tcp->tcp_snxt, tcp->tcp_tcph->th_seq); - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); - U32_TO_ABE32(peer_tcp->tcp_snxt, peer_tcp->tcp_tcph->th_seq); - U32_TO_ABE32(peer_tcp->tcp_rnxt, peer_tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_seq = htonl(tcp->tcp_snxt); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); + peer_tcp->tcp_tcpha->tha_seq = htonl(peer_tcp->tcp_snxt); + peer_tcp->tcp_tcpha->tha_ack = htonl(peer_tcp->tcp_rnxt); /* Unfuse the endpoints */ peer_tcp->tcp_fused = tcp->tcp_fused = B_FALSE; @@ -509,59 +439,28 @@ tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) boolean_t tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) { - tcp_t *peer_tcp = tcp->tcp_loopback_peer; - boolean_t flow_stopped, peer_data_queued = B_FALSE; - boolean_t urgent = (DB_TYPE(mp) != M_DATA); - boolean_t push = B_TRUE; - mblk_t *mp1 = mp; - ill_t *ilp, *olp; - ipif_t *iifp, *oifp; - ipha_t *ipha; - ip6_t *ip6h; - tcph_t *tcph; - uint_t 
ip_hdr_len; - uint32_t seq; - uint32_t recv_size = send_size; + conn_t *connp = tcp->tcp_connp; + tcp_t *peer_tcp = tcp->tcp_loopback_peer; + conn_t *peer_connp = peer_tcp->tcp_connp; + boolean_t flow_stopped, peer_data_queued = B_FALSE; + boolean_t urgent = (DB_TYPE(mp) != M_DATA); + boolean_t push = B_TRUE; + mblk_t *mp1 = mp; + uint_t ip_hdr_len; + uint32_t recv_size = send_size; tcp_stack_t *tcps = tcp->tcp_tcps; netstack_t *ns = tcps->tcps_netstack; ip_stack_t *ipst = ns->netstack_ip; + ipsec_stack_t *ipss = ns->netstack_ipsec; + iaflags_t ixaflags = connp->conn_ixa->ixa_flags; + boolean_t do_ipsec, hooks_out, hooks_in, ipobs_enabled; ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL && peer_tcp->tcp_loopback_peer == tcp); - ASSERT(tcp->tcp_connp->conn_sqp == peer_tcp->tcp_connp->conn_sqp); + ASSERT(connp->conn_sqp == peer_connp->conn_sqp); ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - /* If this connection requires IP, unfuse and use regular path */ - if (tcp_loopback_needs_ip(tcp, ns) || - tcp_loopback_needs_ip(peer_tcp, ns) || - IPP_ENABLED(IPP_LOCAL_OUT|IPP_LOCAL_IN, ipst) || - (tcp->tcp_ipversion == IPV4_VERSION && - ipst->ips_ip4_observe.he_interested) || - (tcp->tcp_ipversion == IPV6_VERSION && - ipst->ips_ip6_observe.he_interested)) { - TCP_STAT(tcps, tcp_fusion_aborted); - tcp->tcp_refuse = B_TRUE; - peer_tcp->tcp_refuse = B_TRUE; - - bcopy(peer_tcp->tcp_tcph, &tcp->tcp_saved_tcph, - sizeof (tcph_t)); - bcopy(tcp->tcp_tcph, &peer_tcp->tcp_saved_tcph, - sizeof (tcph_t)); - if (tcp->tcp_ipversion == IPV4_VERSION) { - bcopy(peer_tcp->tcp_ipha, &tcp->tcp_saved_ipha, - sizeof (ipha_t)); - bcopy(tcp->tcp_ipha, &peer_tcp->tcp_saved_ipha, - sizeof (ipha_t)); - } else { - bcopy(peer_tcp->tcp_ip6h, &tcp->tcp_saved_ip6h, - sizeof (ip6_t)); - bcopy(tcp->tcp_ip6h, &peer_tcp->tcp_saved_ip6h, - sizeof (ip6_t)); - } - goto unfuse; - } - if (send_size == 0) { freemsg(mp); return (B_TRUE); @@ -578,123 +477,74 @@ 
tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) mp1 = mp->b_cont; } - if (tcp->tcp_ipversion == IPV4_VERSION && - (HOOKS4_INTERESTED_LOOPBACK_IN(ipst) || - HOOKS4_INTERESTED_LOOPBACK_OUT(ipst)) || - tcp->tcp_ipversion == IPV6_VERSION && - (HOOKS6_INTERESTED_LOOPBACK_IN(ipst) || - HOOKS6_INTERESTED_LOOPBACK_OUT(ipst))) { - /* - * Build ip and tcp header to satisfy FW_HOOKS. - * We only build it when any hook is present. - */ + /* + * Check that we are still using an IRE_LOCAL or IRE_LOOPBACK before + * further processes. + */ + if (!ip_output_verify_local(connp->conn_ixa)) + goto unfuse; + + /* + * Build IP and TCP header in case we have something that needs the + * headers. Those cases are: + * 1. IPsec + * 2. IPobs + * 3. FW_HOOKS + * + * If tcp_xmit_mp() fails to dupb() the message, unfuse the connection + * and back to regular path. + */ + if (ixaflags & IXAF_IS_IPV4) { + do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || + CONN_INBOUND_POLICY_PRESENT(peer_connp, ipss); + + hooks_out = HOOKS4_INTERESTED_LOOPBACK_OUT(ipst); + hooks_in = HOOKS4_INTERESTED_LOOPBACK_IN(ipst); + ipobs_enabled = (ipst->ips_ip4_observe.he_interested != 0); + } else { + do_ipsec = (ixaflags & IXAF_IPSEC_SECURE) || + CONN_INBOUND_POLICY_PRESENT_V6(peer_connp, ipss); + + hooks_out = HOOKS6_INTERESTED_LOOPBACK_OUT(ipst); + hooks_in = HOOKS6_INTERESTED_LOOPBACK_IN(ipst); + ipobs_enabled = (ipst->ips_ip6_observe.he_interested != 0); + } + + /* We do logical 'or' for efficiency */ + if (ipobs_enabled | do_ipsec | hooks_in | hooks_out) { if ((mp1 = tcp_xmit_mp(tcp, mp1, tcp->tcp_mss, NULL, NULL, tcp->tcp_snxt, B_TRUE, NULL, B_FALSE)) == NULL) /* If tcp_xmit_mp fails, use regular path */ goto unfuse; /* - * The ipif and ill can be safely referenced under the - * protection of conn_lock - see head of function comment for - * conn_get_held_ipif(). It is necessary to check that both - * the ipif and ill can be looked up (i.e. not condemned). If - * not, bail out and unfuse this connection. 
+ * Leave all IP relevant processes to ip_output_process_local(), + * which handles IPsec, IPobs, and FW_HOOKS. */ - mutex_enter(&peer_tcp->tcp_connp->conn_lock); - if ((peer_tcp->tcp_connp->conn_ire_cache == NULL) || - (peer_tcp->tcp_connp->conn_ire_cache->ire_marks & - IRE_MARK_CONDEMNED) || - ((oifp = peer_tcp->tcp_connp->conn_ire_cache->ire_ipif) - == NULL) || - (!IPIF_CAN_LOOKUP(oifp)) || - ((olp = oifp->ipif_ill) == NULL) || - (ill_check_and_refhold(olp) != 0)) { - mutex_exit(&peer_tcp->tcp_connp->conn_lock); - goto unfuse; - } - mutex_exit(&peer_tcp->tcp_connp->conn_lock); - - /* PFHooks: LOOPBACK_OUT */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - ipha = (ipha_t *)mp1->b_rptr; - - DTRACE_PROBE4(ip4__loopback__out__start, - ill_t *, NULL, ill_t *, olp, - ipha_t *, ipha, mblk_t *, mp1); - FW_HOOKS(ipst->ips_ip4_loopback_out_event, - ipst->ips_ipv4firewall_loopback_out, - NULL, olp, ipha, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip4__loopback__out__end, mblk_t *, mp1); - } else { - ip6h = (ip6_t *)mp1->b_rptr; - - DTRACE_PROBE4(ip6__loopback__out__start, - ill_t *, NULL, ill_t *, olp, - ip6_t *, ip6h, mblk_t *, mp1); - FW_HOOKS6(ipst->ips_ip6_loopback_out_event, - ipst->ips_ipv6firewall_loopback_out, - NULL, olp, ip6h, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip6__loopback__out__end, mblk_t *, mp1); - } - ill_refrele(olp); + mp1 = ip_output_process_local(mp1, connp->conn_ixa, hooks_out, + hooks_in, do_ipsec ? peer_connp : NULL); + /* If the message is dropped for any reason. */ if (mp1 == NULL) goto unfuse; /* - * The ipif and ill can be safely referenced under the - * protection of conn_lock - see head of function comment for - * conn_get_held_ipif(). It is necessary to check that both - * the ipif and ill can be looked up (i.e. not condemned). If - * not, bail out and unfuse this connection. + * Data length might have been changed by FW_HOOKS. + * We assume that the first mblk contains the TCP/IP headers. 
*/ - mutex_enter(&tcp->tcp_connp->conn_lock); - if ((tcp->tcp_connp->conn_ire_cache == NULL) || - (tcp->tcp_connp->conn_ire_cache->ire_marks & - IRE_MARK_CONDEMNED) || - ((iifp = tcp->tcp_connp->conn_ire_cache->ire_ipif) - == NULL) || - (!IPIF_CAN_LOOKUP(iifp)) || - ((ilp = iifp->ipif_ill) == NULL) || - (ill_check_and_refhold(ilp) != 0)) { - mutex_exit(&tcp->tcp_connp->conn_lock); - goto unfuse; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - - /* PFHooks: LOOPBACK_IN */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - DTRACE_PROBE4(ip4__loopback__in__start, - ill_t *, ilp, ill_t *, NULL, - ipha_t *, ipha, mblk_t *, mp1); - FW_HOOKS(ipst->ips_ip4_loopback_in_event, - ipst->ips_ipv4firewall_loopback_in, - ilp, NULL, ipha, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip4__loopback__in__end, mblk_t *, mp1); - ill_refrele(ilp); - if (mp1 == NULL) - goto unfuse; - - ip_hdr_len = IPH_HDR_LENGTH(ipha); - } else { - DTRACE_PROBE4(ip6__loopback__in__start, - ill_t *, ilp, ill_t *, NULL, - ip6_t *, ip6h, mblk_t *, mp1); - FW_HOOKS6(ipst->ips_ip6_loopback_in_event, - ipst->ips_ipv6firewall_loopback_in, - ilp, NULL, ip6h, mp1, mp1, 0, ipst); - DTRACE_PROBE1(ip6__loopback__in__end, mblk_t *, mp1); - ill_refrele(ilp); - if (mp1 == NULL) - goto unfuse; - - ip_hdr_len = ip_hdr_length_v6(mp1, ip6h); - } + if (hooks_in || hooks_out) { + tcpha_t *tcpha; + + ip_hdr_len = (ixaflags & IXAF_IS_IPV4) ? + IPH_HDR_LENGTH((ipha_t *)mp1->b_rptr) : + ip_hdr_length_v6(mp1, (ip6_t *)mp1->b_rptr); - /* Data length might be changed by FW_HOOKS */ - tcph = (tcph_t *)&mp1->b_rptr[ip_hdr_len]; - seq = ABE32_TO_U32(tcph->th_seq); - recv_size += seq - tcp->tcp_snxt; + tcpha = (tcpha_t *)&mp1->b_rptr[ip_hdr_len]; + ASSERT((uchar_t *)tcpha + sizeof (tcpha_t) <= + mp1->b_wptr); + recv_size += htonl(tcpha->tha_seq) - tcp->tcp_snxt; + + } /* * The message duplicated by tcp_xmit_mp is freed. @@ -712,7 +562,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) * detached we use tcp_rcv_enqueue() instead. 
Queued data will be * drained when the accept completes (in tcp_accept_finish()). */ - if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + if (IPCL_IS_NONSTR(peer_connp) && !TCP_IS_DETACHED(peer_tcp)) { int error; int flags = 0; @@ -720,18 +570,18 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) if ((tcp->tcp_valid_bits & TCP_URG_VALID) && (tcp->tcp_urg == tcp->tcp_snxt)) { flags = MSG_OOB; - (*peer_tcp->tcp_connp->conn_upcalls->su_signal_oob) - (peer_tcp->tcp_connp->conn_upper_handle, 0); + (*peer_connp->conn_upcalls->su_signal_oob) + (peer_connp->conn_upper_handle, 0); tcp->tcp_valid_bits &= ~TCP_URG_VALID; } - if ((*peer_tcp->tcp_connp->conn_upcalls->su_recv)( - peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size, + if ((*peer_connp->conn_upcalls->su_recv)( + peer_connp->conn_upper_handle, mp, recv_size, flags, &error, &push) < 0) { ASSERT(error != EOPNOTSUPP); peer_data_queued = B_TRUE; } } else { - if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && + if (IPCL_IS_NONSTR(peer_connp) && (tcp->tcp_valid_bits & TCP_URG_VALID) && (tcp->tcp_urg == tcp->tcp_snxt)) { /* @@ -744,7 +594,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) return (B_TRUE); } - tcp_rcv_enqueue(peer_tcp, mp, recv_size); + tcp_rcv_enqueue(peer_tcp, mp, recv_size, + tcp->tcp_connp->conn_cred); /* In case it wrapped around and also to keep it constant */ peer_tcp->tcp_rwnd += recv_size; @@ -764,22 +615,21 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) mutex_enter(&tcp->tcp_non_sq_lock); flow_stopped = tcp->tcp_flow_stopped; if ((TCP_IS_DETACHED(peer_tcp) && - (peer_tcp->tcp_rcv_cnt >= peer_tcp->tcp_recv_hiwater)) || + (peer_tcp->tcp_rcv_cnt >= peer_connp->conn_rcvbuf)) || (!TCP_IS_DETACHED(peer_tcp) && - !IPCL_IS_NONSTR(peer_tcp->tcp_connp) && - !canputnext(peer_tcp->tcp_rq))) { + !IPCL_IS_NONSTR(peer_connp) && !canputnext(peer_connp->conn_rq))) { peer_data_queued = B_TRUE; } if (!flow_stopped && (peer_data_queued || - (TCP_UNSENT_BYTES(tcp) >= 
tcp->tcp_xmit_hiwater))) { + (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf))) { tcp_setqfull(tcp); flow_stopped = B_TRUE; TCP_STAT(tcps, tcp_fusion_flowctl); DTRACE_PROBE3(tcp__fuse__output__flowctl, tcp_t *, tcp, uint_t, send_size, uint_t, peer_tcp->tcp_rcv_cnt); } else if (flow_stopped && !peer_data_queued && - (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater)) { + (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat)) { tcp_clrqfull(tcp); TCP_STAT(tcps, tcp_fusion_backenabled); flow_stopped = B_FALSE; @@ -818,13 +668,14 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) /* * For TLI-based streams, a thread in tcp_accept_swap() * can race with us. That thread will ensure that the - * correct peer_tcp->tcp_rq is globally visible before - * peer_tcp->tcp_detached is visible as clear, but we - * must also ensure that the load of tcp_rq cannot be - * reordered to be before the tcp_detached check. + * correct peer_connp->conn_rq is globally visible + * before peer_tcp->tcp_detached is visible as clear, + * but we must also ensure that the load of conn_rq + * cannot be reordered to be before the tcp_detached + * check. */ membar_consumer(); - (void) tcp_fuse_rcv_drain(peer_tcp->tcp_rq, peer_tcp, + (void) tcp_fuse_rcv_drain(peer_connp->conn_rq, peer_tcp, NULL); } } @@ -928,11 +779,11 @@ tcp_fuse_rcv_drain(queue_t *q, tcp_t *tcp, mblk_t **sigurg_mpp) tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = tcp->tcp_connp->conn_rcvbuf; mutex_enter(&peer_tcp->tcp_non_sq_lock); if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <= - peer_tcp->tcp_xmit_lowater)) { + peer_tcp->tcp_connp->conn_sndlowat)) { tcp_clrqfull(peer_tcp); TCP_STAT(tcps, tcp_fusion_backenabled); } @@ -964,8 +815,8 @@ tcp_fuse_set_rcv_hiwat(tcp_t *tcp, size_t rwnd) * Record high water mark, this is used for flow-control * purposes in tcp_fuse_output(). 
*/ - tcp->tcp_recv_hiwater = rwnd; - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_connp->conn_rcvbuf = rwnd; + tcp->tcp_rwnd = rwnd; return (rwnd); } @@ -976,12 +827,13 @@ int tcp_fuse_maxpsz(tcp_t *tcp) { tcp_t *peer_tcp = tcp->tcp_loopback_peer; - uint_t sndbuf = tcp->tcp_xmit_hiwater; + conn_t *connp = tcp->tcp_connp; + uint_t sndbuf = connp->conn_sndbuf; uint_t maxpsz = sndbuf; ASSERT(tcp->tcp_fused); ASSERT(peer_tcp != NULL); - ASSERT(peer_tcp->tcp_recv_hiwater != 0); + ASSERT(peer_tcp->tcp_connp->conn_rcvbuf != 0); /* * In the fused loopback case, we want the stream head to split * up larger writes into smaller chunks for a more accurate flow- @@ -990,8 +842,8 @@ tcp_fuse_maxpsz(tcp_t *tcp) * We round up the buffer to system page size due to the lack of * TCP MSS concept in Fusion. */ - if (maxpsz > peer_tcp->tcp_recv_hiwater) - maxpsz = peer_tcp->tcp_recv_hiwater; + if (maxpsz > peer_tcp->tcp_connp->conn_rcvbuf) + maxpsz = peer_tcp->tcp_connp->conn_rcvbuf; maxpsz = P2ROUNDUP_TYPED(maxpsz, PAGESIZE, uint_t) >> 1; return (maxpsz); @@ -1013,12 +865,12 @@ tcp_fuse_backenable(tcp_t *tcp) peer_tcp->tcp_connp->conn_sqp); if (tcp->tcp_rcv_list != NULL) - (void) tcp_fuse_rcv_drain(tcp->tcp_rq, tcp, NULL); + (void) tcp_fuse_rcv_drain(tcp->tcp_connp->conn_rq, tcp, NULL); mutex_enter(&peer_tcp->tcp_non_sq_lock); if (peer_tcp->tcp_flow_stopped && (TCP_UNSENT_BYTES(peer_tcp) <= - peer_tcp->tcp_xmit_lowater)) { + peer_tcp->tcp_connp->conn_sndlowat)) { tcp_clrqfull(peer_tcp); } mutex_exit(&peer_tcp->tcp_non_sq_lock); diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 75fa36196a..5d9051aed1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -56,20 +56,21 @@ * For the Kernel SSL proxy * * Routines in this file are called on tcp's incoming path, - * tcp_rput_data() mainly, and right before the message is + * tcp_input_data() mainly, and right before the message is * to be 
putnext()'ed upstreams. */ static void tcp_kssl_input_callback(void *, mblk_t *, kssl_cmd_t); -static void tcp_kssl_input_asynch(void *, mblk_t *, void *); +static void tcp_kssl_input_asynch(void *, mblk_t *, void *, + ip_recv_attr_t *); -extern void tcp_output(void *, mblk_t *, void *); +extern void tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); extern int tcp_squeue_flag; /* - * tcp_rput_data() calls this routine for all packet destined to a + * tcp_input_data() calls this routine for all packet destined to a * connection to the SSL port, when the SSL kernel proxy is configured * to intercept and process those packets. * A packet may carry multiple SSL records, so the function @@ -84,7 +85,7 @@ extern int tcp_squeue_flag; * which could decrement the conn/tcp reference before we get to increment it. */ void -tcp_kssl_input(tcp_t *tcp, mblk_t *mp) +tcp_kssl_input(tcp_t *tcp, mblk_t *mp, cred_t *cr) { struct conn_s *connp = tcp->tcp_connp; tcp_t *listener; @@ -97,15 +98,26 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) boolean_t is_v4; void *addr; + if (is_system_labeled() && mp != NULL) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. 
+ */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + /* First time here, allocate the SSL context */ if (tcp->tcp_kssl_ctx == NULL) { ASSERT(tcp->tcp_kssl_pending); - is_v4 = (tcp->tcp_ipversion == IPV4_VERSION); + is_v4 = (connp->conn_ipversion == IPV4_VERSION); if (is_v4) { - addr = &tcp->tcp_ipha->ipha_dst; + addr = &connp->conn_faddr_v4; } else { - addr = &tcp->tcp_ip6h->ip6_dst; + addr = &connp->conn_faddr_v6; } if (kssl_init_context(tcp->tcp_kssl_ent, @@ -146,7 +158,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) mutex_enter(&tcp->tcp_non_sq_lock); tcp->tcp_squeue_bytes += msgdsize(outmp); mutex_exit(&tcp->tcp_non_sq_lock); - tcp_output(connp, outmp, NULL); + tcp_output(connp, outmp, NULL, NULL); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -194,7 +206,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) tci->PRIM_type = T_SSL_PROXY_CONN_IND; /* - * The code below is copied from tcp_rput_data() + * The code below is copied from tcp_input_data * delivering the T_CONN_IND on a TCPS_SYN_RCVD, * and all conn ref cnt comments apply. */ @@ -214,7 +226,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_FILL, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } } @@ -240,11 +252,12 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) if (tcp->tcp_listener != NULL) { DTRACE_PROBE1(kssl_mblk__input_rcv_enqueue, mblk_t *, outmp); - tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp)); + tcp_rcv_enqueue(tcp, outmp, msgdsize(outmp), + NULL); } else { DTRACE_PROBE1(kssl_mblk__input_putnext, mblk_t *, outmp); - putnext(tcp->tcp_rq, outmp); + putnext(connp->conn_rq, outmp); } /* * We're at a phase where records are sent upstreams, @@ -283,7 +296,7 @@ no_can_do: tci->PRIM_type = T_SSL_PROXY_CONN_IND; /* - * The code below is copied from tcp_rput_data() + * The code below is copied from tcp_input_data * delivering the T_CONN_IND on a TCPS_SYN_RCVD, * and all conn ref cnt comments apply. 
*/ @@ -303,12 +316,12 @@ no_can_do: SQUEUE_ENTER_ONE( listener->tcp_connp->conn_sqp, ind_mp, tcp_send_conn_ind, - listener->tcp_connp, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } } if (mp != NULL) - tcp_rcv_enqueue(tcp, mp, msgdsize(mp)); + tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL); break; } mp = NULL; @@ -351,7 +364,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) } CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - tcp_squeue_flag, SQTAG_TCP_OUTPUT); + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); /* FALLTHROUGH */ case KSSL_CMD_NONE: @@ -363,9 +376,9 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) * Keep accumulating if not yet accepted. */ if (tcp->tcp_listener != NULL) { - tcp_rcv_enqueue(tcp, mp, msgdsize(mp)); + tcp_rcv_enqueue(tcp, mp, msgdsize(mp), NULL); } else { - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); } break; @@ -383,7 +396,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) if ((sqmp = allocb(1, BPRI_MED)) != NULL) { CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, sqmp, tcp_kssl_input_asynch, - connp, SQ_FILL, SQTAG_TCP_KSSL_INPUT); + connp, NULL, SQ_FILL, SQTAG_TCP_KSSL_INPUT); } else { DTRACE_PROBE(kssl_err__allocb_failed); } @@ -396,7 +409,7 @@ tcp_kssl_input_callback(void *arg, mblk_t *mp, kssl_cmd_t kssl_cmd) */ /* ARGSUSED */ void -tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2) +tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -409,6 +422,6 @@ tcp_kssl_input_asynch(void *arg, mblk_t *mp, void *arg2) * while we're away */ if (tcp->tcp_kssl_ctx != NULL) { - tcp_kssl_input(tcp, NULL); + tcp_kssl_input(tcp, NULL, NULL); } } diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index fa2529a5ac..d15ff4ffcd 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c 
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -39,12 +39,7 @@ #include <netinet/tcp.h> #include <inet/optcom.h> - -extern int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, - int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk); +#include <inet/tcp_impl.h> /* * Table of all known options handled on a TCP protocol stack. @@ -55,161 +50,165 @@ extern int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, */ opdes_t tcp_opt_arr[] = { -{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_LINGER, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct linger), 0 }, -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, 
SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, -{ TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ 
TCP_NODELAY, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (uint_t), +{ TCP_MAXSEG, IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t), 536 }, { TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT, +{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ TCP_CORK, 
IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ 
IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), 255*8, + (OP_VARLEN|OP_NODEFAULT), 255*8, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (struct ip6_mtuinfo), -1 /* not initialized */ }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, + sizeof (int), 0 }, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, /* Enable receipt of ancillary data */ -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, 
OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, }; @@ -247,7 +246,6 @@ optdb_obj_t tcp_opt_obj = { tcp_opt_default, /* TCP default value function pointer */ tcp_tpi_opt_get, /* TCP get function pointer */ tcp_tpi_opt_set, /* TCP set function pointer */ - B_TRUE, /* TCP is tpi provider */ TCP_OPT_ARR_CNT, /* TCP option database count of entries */ tcp_opt_arr, /* TCP option database */ TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index bec2b3256f..1b7c87736a 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -70,41 +70,6 @@ extern "C" { } /* - * Before caching the conn IRE, we need to make sure 
certain TCP - * states are in sync with the ire. The mismatch could occur if the - * TCP state has been set in tcp_adapt_ire() using a different IRE, - * e.g if an address was not present during an initial connect(), - * tcp_adapt_ire() will set the state using the interface route. - * Subsequently, if the address is added to the local machine, the - * retransmitted SYN will get the correct (loopback) IRE, but the TCP - * state (tcp_loopback and tcp_localnet) will remain out of sync. - * This is especially an issue with TCP fusion which relies on the - * TCP state to be accurate. - * - * This check/change should be made only if the TCP is not yet in - * the established state, else it would lead to inconsistencies. - */ -#define TCP_CHECK_IREINFO(tcp, ire) { \ - if ((tcp)->tcp_state < TCPS_ESTABLISHED) { \ - if (((ire)->ire_type & (IRE_LOOPBACK | \ - IRE_LOCAL)) && !(tcp)->tcp_loopback) { \ - (tcp)->tcp_loopback = B_TRUE; \ - } else if ((tcp)->tcp_loopback && \ - !((ire)->ire_type & (IRE_LOOPBACK | IRE_LOCAL))) { \ - (tcp)->tcp_loopback = B_FALSE; \ - } \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - (tcp)->tcp_localnet = \ - ((ire)->ire_gateway_addr == 0); \ - } else { \ - (tcp)->tcp_localnet = \ - IN6_IS_ADDR_UNSPECIFIED( \ - &(ire)->ire_gateway_addr_v6); \ - } \ - } \ -} - -/* * Write-side flow-control is implemented via the per instance STREAMS * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s) * and clearing QFULL and calling qbackenable() to restart the flow based @@ -205,18 +170,19 @@ typedef struct tcpparam_s { #define tcps_keepalive_abort_interval_high tcps_params[59].tcp_param_max #define tcps_keepalive_abort_interval tcps_params[59].tcp_param_val #define tcps_keepalive_abort_interval_low tcps_params[59].tcp_param_min +#define tcps_dev_flow_ctl tcps_params[60].tcp_param_val extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; extern int tcp_maxpsz_set(tcp_t *, boolean_t); extern void tcp_timers_stop(tcp_t 
*); -extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t); +extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *); extern void tcp_push_timer(void *); extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); -extern void tcp_fuse(tcp_t *, uchar_t *, tcph_t *); +extern void tcp_fuse(tcp_t *, uchar_t *, tcpha_t *); extern void tcp_unfuse(tcp_t *); extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t); extern void tcp_fuse_output_urg(tcp_t *, mblk_t *); @@ -242,6 +208,11 @@ extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, extern sock_downcalls_t sock_tcp_downcalls; +extern int tcp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int tcp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 2c151894eb..a254da4b43 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -42,9 +42,6 @@ typedef struct tcp_stat { kstat_named_t tcp_time_wait; kstat_named_t tcp_time_wait_syn; kstat_named_t tcp_time_wait_syn_success; - kstat_named_t tcp_time_wait_syn_fail; - kstat_named_t tcp_reinput_syn; - kstat_named_t tcp_ip_output; kstat_named_t tcp_detach_non_time_wait; kstat_named_t tcp_detach_time_wait; kstat_named_t tcp_time_wait_reap; @@ -82,37 +79,14 @@ typedef struct tcp_stat { kstat_named_t tcp_timermp_freed; kstat_named_t tcp_push_timer_cnt; kstat_named_t tcp_ack_timer_cnt; - kstat_named_t tcp_ire_null1; - kstat_named_t tcp_ire_null; - kstat_named_t tcp_ip_send; - kstat_named_t tcp_ip_ire_send; kstat_named_t tcp_wsrv_called; kstat_named_t tcp_flwctl_on; kstat_named_t tcp_timer_fire_early; kstat_named_t tcp_timer_fire_miss; kstat_named_t tcp_rput_v6_error; - 
kstat_named_t tcp_out_sw_cksum; - kstat_named_t tcp_out_sw_cksum_bytes; kstat_named_t tcp_zcopy_on; kstat_named_t tcp_zcopy_off; kstat_named_t tcp_zcopy_backoff; - kstat_named_t tcp_zcopy_disable; - kstat_named_t tcp_mdt_pkt_out; - kstat_named_t tcp_mdt_pkt_out_v4; - kstat_named_t tcp_mdt_pkt_out_v6; - kstat_named_t tcp_mdt_discarded; - kstat_named_t tcp_mdt_conn_halted1; - kstat_named_t tcp_mdt_conn_halted2; - kstat_named_t tcp_mdt_conn_halted3; - kstat_named_t tcp_mdt_conn_resumed1; - kstat_named_t tcp_mdt_conn_resumed2; - kstat_named_t tcp_mdt_legacy_small; - kstat_named_t tcp_mdt_legacy_all; - kstat_named_t tcp_mdt_legacy_ret; - kstat_named_t tcp_mdt_allocfail; - kstat_named_t tcp_mdt_addpdescfail; - kstat_named_t tcp_mdt_allocd; - kstat_named_t tcp_mdt_linked; kstat_named_t tcp_fusion_flowctl; kstat_named_t tcp_fusion_backenabled; kstat_named_t tcp_fusion_urg; @@ -154,15 +128,6 @@ struct tcp_stack { mib2_tcp_t tcps_mib; - /* Protected by tcps_g_q_lock */ - queue_t *tcps_g_q; /* Default queue */ - uint_t tcps_refcnt; /* Total number of tcp_t's */ - kmutex_t tcps_g_q_lock; - kcondvar_t tcps_g_q_cv; - kthread_t *tcps_g_q_creator; - struct __ldi_handle *tcps_g_q_lh; - cred_t *tcps_g_q_cr; /* For _inactive close call */ - /* * Extra privileged ports. In host byte order. * Protected by tcp_epriv_port_lock. @@ -182,9 +147,6 @@ struct tcp_stack { caddr_t tcps_g_nd; struct tcpparam_s *tcps_params; /* ndd parameters */ struct tcpparam_s *tcps_wroff_xtra_param; - struct tcpparam_s *tcps_mdt_head_param; - struct tcpparam_s *tcps_mdt_tail_param; - struct tcpparam_s *tcps_mdt_max_pbufs_param; /* Hint not protected by any lock */ uint_t tcps_next_port_to_try; @@ -222,6 +184,11 @@ struct tcp_stack { /* The number of RST not sent because of the rate limit. 
*/ uint32_t tcps_rst_unsent; ldi_ident_t tcps_ldi_ident; + + /* Used to synchronize access when reclaiming memory */ + mblk_t *tcps_ixa_cleanup_mp; + kmutex_t tcps_ixa_cleanup_lock; + kcondvar_t tcps_ixa_cleanup_cv; }; typedef struct tcp_stack tcp_stack_t; diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index d0bab511b0..e18fc57f40 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -26,12 +26,9 @@ #include <sys/types.h> #include <sys/stream.h> -#include <sys/dlpi.h> -#include <sys/pattr.h> #include <sys/stropts.h> #include <sys/strlog.h> #include <sys/strsun.h> -#include <sys/time.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> #include <sys/timod.h> @@ -41,7 +38,9 @@ #include <sys/suntpi.h> #include <sys/xti_inet.h> #include <sys/kmem.h> +#include <sys/cred_impl.h> #include <sys/policy.h> +#include <sys/priv.h> #include <sys/ucred.h> #include <sys/zone.h> @@ -57,12 +56,11 @@ #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <netinet/udp.h> -#include <net/if.h> -#include <net/route.h> #include <inet/common.h> #include <inet/ip.h> #include <inet/ip_impl.h> +#include <inet/ipsec_impl.h> #include <inet/ip6.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> @@ -74,34 +72,25 @@ #include <inet/optcom.h> #include <inet/snmpcom.h> #include <inet/kstatcom.h> -#include <inet/udp_impl.h> #include <inet/ipclassifier.h> -#include <inet/ipsec_impl.h> -#include <inet/ipp_common.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> #include <sys/ethernet.h> -/* - * The ipsec_info.h header file is here since it has the definition for the - * M_CTL message types used by IP to convey information to the ULP. The - * ipsec_info.h needs the pfkeyv2.h, hence the latter's presence. 
- */ -#include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> - #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> #include <rpc/pmap_prot.h> +#include <inet/udp_impl.h> + /* * Synchronization notes: * * UDP is MT and uses the usual kernel synchronization primitives. There are 2 - * locks, the fanout lock (uf_lock) and the udp endpoint lock udp_rwlock. - * We also use conn_lock when updating things that affect the IP classifier - * lookup. - * The lock order is udp_rwlock -> uf_lock and is udp_rwlock -> conn_lock. + * locks, the fanout lock (uf_lock) and conn_lock. conn_lock + * protects the contents of the udp_t. uf_lock protects the address and the + * fanout information. + * The lock order is conn_lock -> uf_lock. * * The fanout lock uf_lock: * When a UDP endpoint is bound to a local port, it is inserted into @@ -114,11 +103,6 @@ * from the bind hash list only when it is being unbound or being closed. * The per bucket lock also protects a UDP endpoint's state changes. * - * The udp_rwlock: - * This protects most of the other fields in the udp_t. The exact list of - * fields which are protected by each of the above locks is documented in - * the udp_t structure definition. - * * Plumbing notes: * UDP is always a device driver. 
For compatibility with mibopen() code * it is possible to I_PUSH "udp", but that results in pushing a passthrough @@ -133,41 +117,32 @@ /* For /etc/system control */ uint_t udp_bind_fanout_size = UDP_BIND_FANOUT_SIZE; -/* Option processing attrs */ -typedef struct udpattrs_s { - union { - ip6_pkt_t *udpattr_ipp6; /* For V6 */ - ip4_pkt_t *udpattr_ipp4; /* For V4 */ - } udpattr_ippu; -#define udpattr_ipp6 udpattr_ippu.udpattr_ipp6 -#define udpattr_ipp4 udpattr_ippu.udpattr_ipp4 - mblk_t *udpattr_mb; - boolean_t udpattr_credset; -} udpattrs_t; - static void udp_addr_req(queue_t *q, mblk_t *mp); static void udp_tpi_bind(queue_t *q, mblk_t *mp); static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp); static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock); -static int udp_build_hdrs(udp_t *udp); +static int udp_build_hdr_template(conn_t *, const in6_addr_t *, + const in6_addr_t *, in_port_t, uint32_t); static void udp_capability_req(queue_t *q, mblk_t *mp); static int udp_tpi_close(queue_t *q, int flags); +static void udp_close_free(conn_t *); static void udp_tpi_connect(queue_t *q, mblk_t *mp); static void udp_tpi_disconnect(queue_t *q, mblk_t *mp); static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, - int sys_error); -static void udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, - t_scalar_t tlierr, int unixerr); + int sys_error); +static void udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, + t_scalar_t tlierr, int sys_error); static int udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void udp_icmp_error(conn_t *, mblk_t *); -static void udp_icmp_error_ipv6(conn_t *, mblk_t *); +static void udp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void 
udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); static void udp_info_req(queue_t *q, mblk_t *mp); -static void udp_input(void *, mblk_t *, void *); +static void udp_input(void *, mblk_t *, void *, ip_recv_attr_t *); static void udp_lrput(queue_t *, mblk_t *); static void udp_lwput(queue_t *, mblk_t *); static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, @@ -176,24 +151,34 @@ static int udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); -static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp, - int *errorp, udpattrs_t *udpattrs); static boolean_t udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name); +int udp_opt_set(conn_t *connp, uint_t optset_context, + int level, int name, uint_t inlen, + uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr); +int udp_opt_get(conn_t *connp, int level, int name, + uchar_t *ptr); +static int udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, + pid_t pid); +static int udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, + pid_t pid, ip_xmit_attr_t *ixa); +static int udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, + sin6_t *sin6, ushort_t ipversion, cred_t *cr, pid_t, + ip_xmit_attr_t *ixa); static int udp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t udp_param_register(IDP *ndp, udpparam_t *udppa, int cnt); static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, - ipha_t *ipha); -static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, - t_scalar_t destlen, t_scalar_t err); +static mblk_t *udp_prepend_hdr(conn_t *, ip_xmit_attr_t *, const ip_pkt_t *, + const in6_addr_t *, const in6_addr_t *, in_port_t, uint32_t, mblk_t *, + int *); +static mblk_t *udp_prepend_header_template(conn_t *, 
ip_xmit_attr_t *, + mblk_t *, const in6_addr_t *, in_port_t, uint32_t, int *); +static void udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err); +static void udp_ud_err_connected(conn_t *, t_scalar_t); static void udp_tpi_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); -static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, - int *, boolean_t, struct nmsghdr *, cred_t *, pid_t); -static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, - int *error, struct nmsghdr *msg, cred_t *cr, pid_t pid); static void udp_wput_other(queue_t *q, mblk_t *mp); static void udp_wput_iocdata(queue_t *q, mblk_t *mp); static void udp_wput_fallback(queue_t *q, mblk_t *mp); @@ -208,11 +193,9 @@ static void *udp_kstat2_init(netstackid_t, udp_stat_t *); static void udp_kstat2_fini(netstackid_t, kstat_t *); static int udp_kstat_update(kstat_t *kp, int rw); -static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t); -static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *, - cred_t *, pid_t); -static void udp_ulp_recv(conn_t *, mblk_t *); +/* Common routines for TPI and socket module */ +static void udp_ulp_recv(conn_t *, mblk_t *, uint_t, ip_recv_attr_t *); /* Common routine for TPI and socket module */ static conn_t *udp_do_open(cred_t *, boolean_t, int); @@ -220,30 +203,20 @@ static void udp_do_close(conn_t *); static int udp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, boolean_t); static int udp_do_unbind(conn_t *); -static int udp_do_getsockname(udp_t *, struct sockaddr *, uint_t *); -static int udp_do_getpeername(udp_t *, struct sockaddr *, uint_t *); int udp_getsockname(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); int udp_getpeername(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); static int udp_do_connect(conn_t *, const struct sockaddr *, socklen_t, - cred_t *cr); -static int udp_post_ip_bind_connect(udp_t 
*, mblk_t *, int); + cred_t *, pid_t); #define UDP_RECV_HIWATER (56 * 1024) #define UDP_RECV_LOWATER 128 #define UDP_XMIT_HIWATER (56 * 1024) #define UDP_XMIT_LOWATER 1024 -/* - * The following is defined in tcp.c - */ -extern int (*cl_inet_connect2)(netstackid_t stack_id, - uint8_t protocol, boolean_t is_outgoing, - sa_family_t addr_family, - uint8_t *laddrp, in_port_t lport, - uint8_t *faddrp, in_port_t fport, void *args); +#pragma inline(udp_output_connected, udp_output_newdst, udp_output_lastdst) /* * Checks if the given destination addr/port is allowed out. @@ -251,7 +224,7 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id, * Called for each connect() and for sendto()/sendmsg() to a different * destination. * For connect(), called in udp_connect(). - * For sendto()/sendmsg(), called in udp_output_v{4,6}(). + * For sendto()/sendmsg(), called in udp_output_newdst(). * * This macro assumes that the cl_inet_connect2 hook is not NULL. * Please check this before calling this macro. 
@@ -260,25 +233,26 @@ extern int (*cl_inet_connect2)(netstackid_t stack_id, * CL_INET_UDP_CONNECT(conn_t cp, udp_t *udp, boolean_t is_outgoing, * in6_addr_t *faddrp, in_port_t (or uint16_t) fport, int err); */ -#define CL_INET_UDP_CONNECT(cp, udp, is_outgoing, faddrp, fport, err) { \ +#define CL_INET_UDP_CONNECT(cp, is_outgoing, faddrp, fport, err) { \ (err) = 0; \ /* \ * Running in cluster mode - check and register active \ * "connection" information \ */ \ - if ((udp)->udp_ipversion == IPV4_VERSION) \ + if ((cp)->conn_ipversion == IPV4_VERSION) \ (err) = (*cl_inet_connect2)( \ (cp)->conn_netstack->netstack_stackid, \ IPPROTO_UDP, is_outgoing, AF_INET, \ - (uint8_t *)&((udp)->udp_v6src._S6_un._S6_u32[3]), \ - (udp)->udp_port, \ - (uint8_t *)&((faddrp)->_S6_un._S6_u32[3]), \ + (uint8_t *)&((cp)->conn_laddr_v4), \ + (cp)->conn_lport, \ + (uint8_t *)&(V4_PART_OF_V6(*faddrp)), \ (in_port_t)(fport), NULL); \ else \ (err) = (*cl_inet_connect2)( \ (cp)->conn_netstack->netstack_stackid, \ IPPROTO_UDP, is_outgoing, AF_INET6, \ - (uint8_t *)&((udp)->udp_v6src), (udp)->udp_port, \ + (uint8_t *)&((cp)->conn_laddr_v6), \ + (cp)->conn_lport, \ (uint8_t *)(faddrp), (in_port_t)(fport), NULL); \ } @@ -387,6 +361,8 @@ udpparam_t udp_param_arr[] = { { 0, (1<<30), UDP_XMIT_LOWATER, "udp_xmit_lowat"}, { UDP_RECV_LOWATER, (1<<30), UDP_RECV_HIWATER, "udp_recv_hiwat"}, { 65536, (1<<30), 2*1024*1024, "udp_max_buf"}, + { 0, 1, 0, "udp_pmtu_discovery" }, + { 0, 1, 0, "udp_sendto_ignerr" }, }; /* END CSTYLED */ @@ -451,9 +427,10 @@ retry: static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) { - udp_t *udpnext; - kmutex_t *lockp; - udp_stack_t *us = udp->udp_us; + udp_t *udpnext; + kmutex_t *lockp; + udp_stack_t *us = udp->udp_us; + conn_t *connp = udp->udp_connp; if (udp->udp_ptpbhn == NULL) return; @@ -462,9 +439,9 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) * Extract the lock pointer in case there are concurrent * hash_remove's for this instance. 
*/ - ASSERT(udp->udp_port != 0); + ASSERT(connp->conn_lport != 0); if (!caller_holds_lock) { - lockp = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + lockp = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)].uf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -486,8 +463,10 @@ udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock) static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) { + conn_t *connp = udp->udp_connp; udp_t **udpp; udp_t *udpnext; + conn_t *connext; ASSERT(MUTEX_HELD(&uf->uf_lock)); ASSERT(udp->udp_ptpbhn == NULL); @@ -503,11 +482,11 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) * specific address get preference over those binding to * INADDR_ANY. */ - if (V6_OR_V4_INADDR_ANY(udp->udp_bound_v6src) && - !V6_OR_V4_INADDR_ANY(udpnext->udp_bound_v6src)) { + connext = udpnext->udp_connp; + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { while ((udpnext = udpp[0]) != NULL && - !V6_OR_V4_INADDR_ANY( - udpnext->udp_bound_v6src)) { + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { udpp = &(udpnext->udp_bind_hash); } if (udpnext != NULL) @@ -525,10 +504,9 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) * This routine is called to handle each O_T_BIND_REQ/T_BIND_REQ message * passed to udp_wput. * It associates a port number and local address with the stream. - * The O_T_BIND_REQ/T_BIND_REQ is passed downstream to ip with the UDP - * protocol type (IPPROTO_UDP) placed in the message following the address. - * A T_BIND_ACK message is passed upstream when ip acknowledges the request. - * (Called as writer.) + * It calls IP to verify the local IP address, and calls IP to insert + * the conn_t in the fanout table. + * If everything is ok it then sends the T_BIND_ACK back up. * * Note that UDP over IPv4 and IPv6 sockets can use the same port number * without setting SO_REUSEADDR. 
This is needed so that they @@ -580,10 +558,10 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) } /* * Reallocate the message to make sure we have enough room for an - * address and the protocol type. + * address. */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); - if (!mp1) { + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); + if (mp1 == NULL) { udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } @@ -597,7 +575,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) switch (tbr->ADDR_length) { case 0: /* Request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -605,7 +583,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) mp->b_wptr = (uchar_t *)&sin[1]; sa = (struct sockaddr *)sin; } else { - ASSERT(udp->udp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -622,7 +600,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - if (udp->udp_family != AF_INET || + if (connp->conn_family != AF_INET || sa->sa_family != AF_INET) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; @@ -636,7 +614,7 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - if (udp->udp_family != AF_INET6 || + if (connp->conn_family != AF_INET6 || sa->sa_family != AF_INET6) { udp_err_ack(q, mp, TSYSERR, EAFNOSUPPORT); return; @@ -669,29 +647,21 @@ udp_tpi_bind(queue_t *q, mblk_t *mp) * This routine handles each T_CONN_REQ message passed to udp. It * associates a default destination address with the stream. 
* - * This routine sends down a T_BIND_REQ to IP with the following mblks: - * T_BIND_REQ - specifying local and remote address/port - * IRE_DB_REQ_TYPE - to get an IRE back containing ire_type and src - * T_OK_ACK - for the T_CONN_REQ - * T_CONN_CON - to keep the TPI user happy - * - * The connect completes in udp_do_connect. - * When a T_BIND_ACK is received information is extracted from the IRE - * and the two appended messages are sent to the TPI user. - * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will - * convert it to an error ack for the appropriate primitive. + * After various error checks are completed, udp_connect() lays + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we send up the T_OK_ACK reply message. */ static void udp_tpi_connect(queue_t *q, mblk_t *mp) { - udp_t *udp; conn_t *connp = Q_TO_CONN(q); int error; socklen_t len; struct sockaddr *sa; struct T_conn_req *tcr; cred_t *cr; - + pid_t pid; /* * All Solaris components should pass a db_credp * for this TPI message, hence we ASSERT. @@ -699,14 +669,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * like a TPI message sent by some other kernel * component, we check and return an error. */ - cr = msg_getcred(mp, NULL); + cr = msg_getcred(mp, &pid); ASSERT(cr != NULL); if (cr == NULL) { udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - udp = connp->conn_udp; tcr = (struct T_conn_req *)mp->b_rptr; /* A bit of sanity checking */ @@ -724,7 +693,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. 
*/ len = tcr->DEST_length; switch (tcr->DEST_length) { @@ -743,13 +712,13 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) break; } - error = proto_verify_ip_addr(udp->udp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { udp_err_ack(q, mp, TSYSERR, error); return; } - error = udp_do_connect(connp, sa, len, cr); + error = udp_do_connect(connp, sa, len, cr, pid); if (error != 0) { if (error < 0) udp_err_ack(q, mp, -error, 0); @@ -761,7 +730,7 @@ udp_tpi_connect(queue_t *q, mblk_t *mp) * We have to send a connection confirmation to * keep TLI happy. */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { mp1 = mi_tpi_conn_con(NULL, (char *)sa, sizeof (sin_t), NULL, 0); } else { @@ -810,72 +779,14 @@ done: return (0); } -/* - * Called in the close path to quiesce the conn - */ -void -udp_quiesce_conn(conn_t *connp) -{ - udp_t *udp = connp->conn_udp; - - if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) { - /* - * Running in cluster mode - register unbind information - */ - if (udp->udp_ipversion == IPV4_VERSION) { - (*cl_inet_unbind)( - connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET, - (uint8_t *)(&(V4_PART_OF_V6(udp->udp_v6src))), - (in_port_t)udp->udp_port, NULL); - } else { - (*cl_inet_unbind)( - connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET6, - (uint8_t *)(&(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); - } - } - - udp_bind_hash_remove(udp, B_FALSE); - -} - -void +static void udp_close_free(conn_t *connp) { udp_t *udp = connp->conn_udp; /* If there are any options associated with the stream, free them. 
*/ - if (udp->udp_ip_snd_options != NULL) { - mi_free((char *)udp->udp_ip_snd_options); - udp->udp_ip_snd_options = NULL; - udp->udp_ip_snd_options_len = 0; - } - - if (udp->udp_ip_rcv_options != NULL) { - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options = NULL; - udp->udp_ip_rcv_options_len = 0; - } - - /* Free memory associated with sticky options */ - if (udp->udp_sticky_hdrs_len != 0) { - kmem_free(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len); - udp->udp_sticky_hdrs = NULL; - udp->udp_sticky_hdrs_len = 0; - } - if (udp->udp_last_cred != NULL) { - crfree(udp->udp_last_cred); - udp->udp_last_cred = NULL; - } - if (udp->udp_effective_cred != NULL) { - crfree(udp->udp_effective_cred); - udp->udp_effective_cred = NULL; - } - - ip6_pkt_free(&udp->udp_sticky_ipp); + if (udp->udp_recv_ipp.ipp_fields != 0) + ip_pkt_free(&udp->udp_recv_ipp); /* * Clear any fields which the kmem_cache constructor clears. @@ -892,59 +803,48 @@ static int udp_do_disconnect(conn_t *connp) { udp_t *udp; - mblk_t *ire_mp; udp_fanout_t *udpf; udp_stack_t *us; int error; udp = connp->conn_udp; us = udp->udp_us; - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - udp->udp_pending_op = T_DISCON_REQ; - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; mutex_enter(&udpf->uf_lock); - udp->udp_v6src = udp->udp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; udp->udp_state = TS_IDLE; mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) { - /* Rebuild 
the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (error); - } - } + /* Remove any remnants of mapped address binding */ + if (connp->conn_family == AF_INET6) + connp->conn_ipversion = IPV6_VERSION; - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - mutex_enter(&udpf->uf_lock); - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - - rw_exit(&udp->udp_rwlock); - - if (udp->udp_family == AF_INET6) { - error = ip_proto_bind_laddr_v6(connp, &ire_mp, IPPROTO_UDP, - &udp->udp_bound_v6src, udp->udp_port, B_TRUE); - } else { - error = ip_proto_bind_laddr_v4(connp, &ire_mp, IPPROTO_UDP, - V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, B_TRUE); - } + connp->conn_v6lastdst = ipv6_all_zeros; + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - return (udp_post_ip_bind_connect(udp, ire_mp, error)); + /* + * Tell IP to remove the full binding and revert + * to the local address binding. 
+ */ + return (ip_laddr_fanout_insert(connp)); } - static void udp_tpi_disconnect(queue_t *q, mblk_t *mp) { @@ -981,12 +881,9 @@ int udp_disconnect(conn_t *connp) { int error; - udp_t *udp = connp->conn_udp; - - udp->udp_dgram_errind = B_FALSE; + connp->conn_dgram_errind = B_FALSE; error = udp_do_disconnect(connp); - if (error < 0) error = proto_tlitosyserr(-error); @@ -1003,8 +900,8 @@ udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) /* Shorthand to generate and send TPI error acks to our client */ static void -udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, - int sys_error) +udp_err_ack_prim(queue_t *q, mblk_t *mp, t_scalar_t primitive, + t_scalar_t t_error, int sys_error) { struct T_error_ack *teackp; @@ -1018,7 +915,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, } } -/*ARGSUSED*/ +/*ARGSUSED2*/ static int udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { @@ -1033,7 +930,7 @@ udp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) @@ -1072,7 +969,7 @@ udp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (0); } -/* ARGSUSED */ +/* ARGSUSED1 */ static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) @@ -1109,39 +1006,41 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, #define ICMP_MIN_UDP_HDR 4 /* - * udp_icmp_error is called by udp_input to process ICMP msgs. passed up by IP. + * udp_icmp_input is called as conn_recvicmp to process ICMP messages. * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ +/* ARGSUSED2 */ static void -udp_icmp_error(conn_t *connp, mblk_t *mp) +udp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - udpha_t *udpha; - sin_t sin; - sin6_t sin6; - mblk_t *mp1; - int error = 0; - udp_t *udp = connp->conn_udp; + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + udpha_t *udpha; + sin_t sin; + sin6_t sin6; + mblk_t *mp1; + int error = 0; + udp_t *udp = connp->conn_udp; - mp1 = NULL; ipha = (ipha_t *)mp->b_rptr; ASSERT(OK_32PTR(mp->b_rptr)); if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); - udp_icmp_error_ipv6(connp, mp); + udp_icmp_error_ipv6(connp, mp, ira); return; } ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); + ASSERT(IPH_HDR_LENGTH(ipha) == ira->ira_ip_hdr_length); + iph_hdr_length = ira->ira_ip_hdr_length; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - ipha = (ipha_t *)&icmph[1]; + ipha = (ipha_t *)&icmph[1]; /* Inner IP header */ /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); @@ -1150,11 +1049,41 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { - case ICMP_FRAGMENTATION_NEEDED: + case ICMP_FRAGMENTATION_NEEDED: { + ipha_t *ipha; + ip_xmit_attr_t *ixa; /* * IP has already adjusted the path MTU. + * But we need to adjust DF for IPv4. */ + if (connp->conn_ipversion != IPV4_VERSION) + break; + + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL || ixa->ixa_ire == NULL) { + /* + * Some other thread holds conn_ixa. We will + * redo this on the next ICMP too big. 
+ */ + if (ixa != NULL) + ixa_refrele(ixa); + break; + } + (void) ip_get_pmtu(ixa); + + mutex_enter(&connp->conn_lock); + ipha = (ipha_t *)connp->conn_ht_iphc; + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; + } + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); break; + } case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: error = ECONNREFUSED; @@ -1177,25 +1106,24 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. */ - if (!udp->udp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } - - switch (udp->udp_family) { + switch (connp->conn_family) { case AF_INET: sin = sin_null; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ipha->ipha_dst; sin.sin_port = udpha->uha_dst_port; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin.sin_port == udp->udp_dstport && + if (sin.sin_port == connp->conn_fport && sin.sin_addr.s_addr == - V4_PART_OF_V6(udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + connp->conn_faddr_v4) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1204,10 +1132,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin_t *)&udp->udp_delayed_addr) = sin; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } break; case AF_INET6: @@ -1216,12 +1146,12 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &sin6.sin6_addr); sin6.sin6_port = udpha->uha_dst_port; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, 
RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin6.sin6_port == udp->udp_dstport && + if (sin6.sin6_port == connp->conn_fport && IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1230,17 +1160,16 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin6_t *)&udp->udp_delayed_addr) = sin6; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } break; } - if (mp1 != NULL) - putnext(connp->conn_rq, mp1); done: - ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -1251,7 +1180,7 @@ done: * ICMPv6 header. */ static void -udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) +udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; @@ -1265,12 +1194,19 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) udp_stack_t *us = udp->udp_us; outer_ip6h = (ip6_t *)mp->b_rptr; +#ifdef DEBUG if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); else iph_hdr_length = IPV6_HDR_LEN; + ASSERT(iph_hdr_length == ira->ira_ip_hdr_length); +#endif + /* Skip past the outer IP and ICMP headers */ + iph_hdr_length = ira->ira_ip_hdr_length; icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; - ip6h = (ip6_t *)&icmp6[1]; + + /* Skip past the inner IP and find the ULP header */ + ip6h = (ip6_t *)&icmp6[1]; /* Inner IP header */ if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { freemsg(mp); return; @@ -1308,7 +1244,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * information, send up an empty message containing an * IPV6_PATHMTU ancillary data item. 
*/ - if (!udp->udp_ipv6_recvpathmtu) + if (!connp->conn_ipv6_recvpathmtu) break; udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + @@ -1334,7 +1270,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6 = (sin6_t *)&tudi[1]; bzero(sin6, sizeof (sin6_t)); sin6->sin6_family = AF_INET6; - sin6->sin6_addr = udp->udp_v6dst; + sin6->sin6_addr = connp->conn_faddr_v6; toh = (struct T_opthdr *)&sin6[1]; toh->level = IPPROTO_IPV6; @@ -1352,8 +1288,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - udp_ulp_recv(connp, newmp); - + udp_ulp_recv(connp, newmp, msgdsize(newmp), ira); return; } case ICMP6_TIME_EXCEEDED: @@ -1378,7 +1313,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * Deliver T_UDERROR_IND when the application has asked for it. * The socket layer enables this automatically when connected. */ - if (!udp->udp_dgram_errind) { + if (!connp->conn_dgram_errind) { freemsg(mp); return; } @@ -1390,12 +1325,12 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; if (IPCL_IS_NONSTR(connp)) { - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { - if (sin6.sin6_port == udp->udp_dstport && + if (sin6.sin6_port == connp->conn_fport && IN6_ARE_ADDR_EQUAL(&sin6.sin6_addr, - &udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); + &connp->conn_faddr_v6)) { + mutex_exit(&connp->conn_lock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); goto done; @@ -1404,7 +1339,7 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) udp->udp_delayed_error = error; *((sin6_t *)&udp->udp_delayed_addr) = sin6; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); } else { mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); @@ -1412,7 +1347,6 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) putnext(connp->conn_rq, mp1); } done: - 
ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -1426,11 +1360,12 @@ done: static void udp_addr_req(queue_t *q, mblk_t *mp) { - sin_t *sin; - sin6_t *sin6; + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; udp_t *udp = Q_TO_UDP(q); + conn_t *connp = udp->udp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -1446,7 +1381,13 @@ udp_addr_req(queue_t *q, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; - rw_enter(&udp->udp_rwlock, RW_READER); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + mutex_enter(&connp->conn_lock); /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. @@ -1456,91 +1397,23 @@ udp_addr_req(queue_t *q, mblk_t *mp) * Fill in local address first */ taa->LOCADDR_offset = sizeof (*taa); - if (udp->udp_family == AF_INET) { - taa->LOCADDR_length = sizeof (sin_t); - sin = (sin_t *)&taa[1]; - /* Fill zeroes and then initialize non-zero fields */ - *sin = sin_null; - sin->sin_family = AF_INET; - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src, - sin->sin_addr.s_addr); - } else { - /* - * INADDR_ANY - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - IN6_V4MAPPED_TO_IPADDR(&udp->udp_bound_v6src, - sin->sin_addr.s_addr); - } - sin->sin_port = udp->udp_port; - ackmp->b_wptr = (uchar_t *)&sin[1]; - if (udp->udp_state == TS_DATA_XFER) { - /* - * connected, fill remote address too - */ - taa->REMADDR_length = sizeof (sin_t); - /* assumed 32-bit alignment */ - taa->REMADDR_offset = taa->LOCADDR_offset + - taa->LOCADDR_length; - - sin = (sin_t *)(ackmp->b_rptr + - taa->REMADDR_offset); - /* initialize */ - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_v6dst); - sin->sin_port = udp->udp_dstport; - ackmp->b_wptr = (uchar_t *)&sin[1]; - } - } else { - taa->LOCADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&taa[1]; - /* Fill zeroes and then initialize non-zero fields */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin6->sin6_addr = udp->udp_v6src; - } else { - /* - * UNSPECIFIED - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = - udp->udp_bound_v6src; - } - sin6->sin6_port = udp->udp_port; - ackmp->b_wptr = (uchar_t *)&sin6[1]; - if (udp->udp_state == TS_DATA_XFER) { - /* - * connected, fill remote address too - */ - taa->REMADDR_length = sizeof (sin6_t); - /* assumed 32-bit alignment */ - taa->REMADDR_offset = taa->LOCADDR_offset + - taa->LOCADDR_length; - - sin6 = (sin6_t *)(ackmp->b_rptr + - taa->REMADDR_offset); - /* initialize */ - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_port = udp->udp_dstport; - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - rw_exit(&udp->udp_rwlock); + if (udp->udp_state == TS_DATA_XFER) { + /* + * connected, fill remote address too + */ + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + mutex_exit(&connp->conn_lock); ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); qreply(q, ackmp); } @@ -1548,7 +1421,9 @@ udp_addr_req(queue_t *q, mblk_t *mp) static void udp_copy_info(struct T_info_ack *tap, udp_t *udp) { - if (udp->udp_family == AF_INET) { + conn_t *connp = udp->udp_connp; + + if (connp->conn_family == AF_INET) { *tap = udp_g_t_info_ack_ipv4; } else { *tap = udp_g_t_info_ack_ipv6; @@ -1632,20 +1507,15 @@ udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * This is the open routine for udp. It allocates a udp_t structure for * the stream and, on the first open of the module, creates an ND table. 
*/ -/*ARGSUSED2*/ static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, boolean_t isv6) { - int error; udp_t *udp; conn_t *connp; dev_t conn_dev; - udp_stack_t *us; vmem_t *minor_arena; - TRACE_1(TR_FAC_UDP, TR_UDP_OPEN, "udp_open: q %p", q); - /* If the stream is already open, return immediately. */ if (q->q_ptr != NULL) return (0); @@ -1685,7 +1555,6 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (ENOMEM); } udp = connp->conn_udp; - us = udp->udp_us; *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); connp->conn_dev = conn_dev; @@ -1699,39 +1568,27 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, connp->conn_rq = q; connp->conn_wq = WR(q); - rw_enter(&udp->udp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_UDP); + /* + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + ASSERT(connp->conn_proto == IPPROTO_UDP); ASSERT(connp->conn_udp == udp); ASSERT(udp->udp_connp == connp); if (flag & SO_SOCKSTR) { - connp->conn_flags |= IPCL_SOCKET; udp->udp_issocket = B_TRUE; } - q->q_hiwat = us->us_recv_hiwat; - WR(q)->q_hiwat = us->us_xmit_hiwat; - WR(q)->q_lowat = us->us_xmit_lowat; + WR(q)->q_hiwat = connp->conn_sndbuf; + WR(q)->q_lowat = connp->conn_sndlowat; qprocson(q); - if (udp->udp_family == AF_INET6) { - /* Build initial header template for transmit */ - if ((error = udp_build_hdrs(udp)) != 0) { - rw_exit(&udp->udp_rwlock); - qprocsoff(q); - inet_minor_free(minor_arena, conn_dev); - ipcl_conn_destroy(connp); - return (error); - } - } - rw_exit(&udp->udp_rwlock); - /* Set the Stream head write offset and high watermark. */ - (void) proto_set_tx_wroff(q, connp, - udp->udp_max_hdr_len + us->us_wroff_extra); - /* XXX udp_set_rcv_hiwat() doesn't hold the lock, is it a bug??? 
*/ - (void) proto_set_rx_hiwat(q, connp, udp_set_rcv_hiwat(udp, q->q_hiwat)); + (void) proto_set_tx_wroff(q, connp, connp->conn_wroff); + (void) proto_set_rx_hiwat(q, connp, + udp_set_rcv_hiwat(udp, connp->conn_rcvbuf)); mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -1753,7 +1610,6 @@ udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) * This routine gets default values of certain options whose default * values are maintained by protcol specific code */ -/* ARGSUSED */ int udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { @@ -1791,456 +1647,127 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) /* * This routine retrieves the current status of socket options. - * It returns the size of the option retrieved. + * It returns the size of the option retrieved, or -1. */ -static int -udp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) +int +udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, + uchar_t *ptr) { - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; int *i1 = (int *)ptr; - ip6_pkt_t *ipp = &udp->udp_sticky_ipp; + udp_t *udp = connp->conn_udp; int len; + conn_opt_arg_t coas; + int retval; - ASSERT(RW_READ_HELD(&udp->udp_rwlock)); - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_DEBUG: - *i1 = udp->udp_debug; - break; /* goto sizeof (int) option return */ - case SO_REUSEADDR: - *i1 = udp->udp_reuseaddr; - break; /* goto sizeof (int) option return */ - case SO_TYPE: - *i1 = SOCK_DGRAM; - break; /* goto sizeof (int) option return */ + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + /* + * We assume that the optcom framework has checked for the set + * of levels and names that are supported, hence we don't worry + * about rejecting based on that. 
+ * First check for UDP specific handling, then pass to common routine. + */ + switch (level) { + case IPPROTO_IP: /* - * The following three items are available here, - * but are only meaningful to IP. + * Only allow IPv4 option processing on IPv4 sockets. */ - case SO_DONTROUTE: - *i1 = udp->udp_dontroute; - break; /* goto sizeof (int) option return */ - case SO_USELOOPBACK: - *i1 = udp->udp_useloopback; - break; /* goto sizeof (int) option return */ - case SO_BROADCAST: - *i1 = udp->udp_broadcast; - break; /* goto sizeof (int) option return */ - - case SO_SNDBUF: - *i1 = udp->udp_xmit_hiwat; - break; /* goto sizeof (int) option return */ - case SO_RCVBUF: - *i1 = udp->udp_rcv_disply_hiwat; - break; /* goto sizeof (int) option return */ - case SO_DGRAM_ERRIND: - *i1 = udp->udp_dgram_errind; - break; /* goto sizeof (int) option return */ - case SO_RECVUCRED: - *i1 = udp->udp_recvucred; - break; /* goto sizeof (int) option return */ - case SO_TIMESTAMP: - *i1 = udp->udp_timestamp; - break; /* goto sizeof (int) option return */ - case SO_ANON_MLP: - *i1 = connp->conn_anon_mlp; - break; /* goto sizeof (int) option return */ - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones; - break; /* goto sizeof (int) option return */ - case SO_EXCLBIND: - *i1 = udp->udp_exclbind ? 
SO_EXCLBIND : 0; - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_UDP; - break; - case SO_DOMAIN: - *i1 = udp->udp_family; - break; - default: - return (-1); - } - break; - case IPPROTO_IP: - if (udp->udp_family != AF_INET) + if (connp->conn_family != AF_INET) return (-1); + switch (name) { case IP_OPTIONS: case T_IP_OPTIONS: - len = udp->udp_ip_rcv_options_len - udp->udp_label_len; - if (len > 0) { - bcopy(udp->udp_ip_rcv_options + - udp->udp_label_len, ptr, len); - } - return (len); - case IP_TOS: - case T_IP_TOS: - *i1 = (int)udp->udp_type_of_service; - break; /* goto sizeof (int) option return */ - case IP_TTL: - *i1 = (int)udp->udp_ttl; - break; /* goto sizeof (int) option return */ - case IP_DHCPINIT_IF: - return (-EINVAL); - case IP_NEXTHOP: - case IP_RECVPKTINFO: - /* - * This also handles IP_PKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have the same value. - * Differentiation is based on the size of the argument - * passed in. - * This option is handled in IP which will return an - * error for IP_PKTINFO as it's not supported as a - * sticky option. 
- */ - return (-EINVAL); - case IP_MULTICAST_IF: - /* 0 address if not set */ - *(ipaddr_t *)ptr = udp->udp_multicast_if_addr; - return (sizeof (ipaddr_t)); - case IP_MULTICAST_TTL: - *(uchar_t *)ptr = udp->udp_multicast_ttl; - return (sizeof (uchar_t)); - case IP_MULTICAST_LOOP: - *ptr = connp->conn_multicast_loop; - return (sizeof (uint8_t)); - case IP_RECVOPTS: - *i1 = udp->udp_recvopts; - break; /* goto sizeof (int) option return */ - case IP_RECVDSTADDR: - *i1 = udp->udp_recvdstaddr; - break; /* goto sizeof (int) option return */ - case IP_RECVIF: - *i1 = udp->udp_recvif; - break; /* goto sizeof (int) option return */ - case IP_RECVSLLA: - *i1 = udp->udp_recvslla; - break; /* goto sizeof (int) option return */ - case IP_RECVTTL: - *i1 = udp->udp_recvttl; - break; /* goto sizeof (int) option return */ - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - return (-1); - case IP_BOUND_IF: - /* Zero if not set */ - *i1 = udp->udp_bound_if; - break; /* goto sizeof (int) option return */ - case IP_UNSPEC_SRC: - *i1 = udp->udp_unspec_source; - break; /* goto sizeof (int) option return */ - case IP_BROADCAST_TTL: - *(uchar_t *)ptr = connp->conn_broadcast_ttl; - return (sizeof (uchar_t)); - default: - return (-1); - } - break; - case IPPROTO_IPV6: - if (udp->udp_family != AF_INET6) - return (-1); - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int)udp->udp_ttl; - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_IF: - /* 0 index if not set */ - *i1 = udp->udp_multicast_if_index; - break; /* goto sizeof (int) option return */ - case IPV6_MULTICAST_HOPS: - *i1 = udp->udp_multicast_ttl; - break; /* goto sizeof (int) 
option return */ - case IPV6_MULTICAST_LOOP: - *i1 = connp->conn_multicast_loop; - break; /* goto sizeof (int) option return */ - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* cannot "get" the value for these */ - return (-1); - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = udp->udp_bound_if; - break; /* goto sizeof (int) option return */ - case IPV6_UNSPEC_SRC: - *i1 = udp->udp_unspec_source; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - *i1 = udp->udp_ip_recvpktinfo; - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - *i1 = udp->udp_ipv6_recvtclass; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPATHMTU: - *i1 = udp->udp_ipv6_recvpathmtu; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - *i1 = udp->udp_ipv6_recvhoplimit; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - *i1 = udp->udp_ipv6_recvhopopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - *i1 = udp->udp_ipv6_recvdstopts; - break; /* goto sizeof (int) option return */ - case _OLD_IPV6_RECVDSTOPTS: - *i1 = udp->udp_old_ipv6_recvdstopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - *i1 = udp->udp_ipv6_recvrthdrdstopts; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - *i1 = udp->udp_ipv6_recvrthdr; - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); - } - case IPV6_TCLASS: - if (ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; /* goto sizeof (int) option return */ - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= udp->udp_label_len_v6) + mutex_enter(&connp->conn_lock); + if (!(udp->udp_recv_ipp.ipp_fields & + IPPF_IPV4_OPTIONS)) { + mutex_exit(&connp->conn_lock); return (0); - /* - * The cipso/label option is added by kernel. - * User is not usually aware of this option. - * We copy out the hbh opt after the label option. 
- */ - bcopy((char *)ipp->ipp_hopopts + udp->udp_label_len_v6, - ptr, ipp->ipp_hopoptslen - udp->udp_label_len_v6); - if (udp->udp_label_len_v6 > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - udp->udp_label_len_v6 + 7) / 8 - 1; } - return (ipp->ipp_hopoptslen - udp->udp_label_len_v6); - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); - case IPV6_PATHMTU: - return (ip_fill_mtuinfo(&udp->udp_v6dst, - udp->udp_dstport, (struct ip6_mtuinfo *)ptr, - us->us_netstack)); - default: - return (-1); + + len = udp->udp_recv_ipp.ipp_ipv4_options_len; + ASSERT(len != 0); + bcopy(udp->udp_recv_ipp.ipp_ipv4_options, ptr, len); + mutex_exit(&connp->conn_lock); + return (len); } break; case IPPROTO_UDP: switch (name) { - case UDP_ANONPRIVBIND: - *i1 = udp->udp_anon_priv_bind; - break; - case UDP_EXCLBIND: - *i1 = udp->udp_exclbind ? UDP_EXCLBIND : 0; - break; - case UDP_RCVHDR: - *i1 = udp->udp_rcvhdr ? 1 : 0; - break; case UDP_NAT_T_ENDPOINT: + mutex_enter(&connp->conn_lock); *i1 = udp->udp_nat_t_endpoint; - break; - default: - return (-1); + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_RCVHDR: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_rcvhdr ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } - break; - default: - return (-1); } - return (sizeof (int)); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } +/* + * This routine retrieves the current status of socket options. 
+ * It returns the size of the option retrieved, or -1. + */ int udp_tpi_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { - udp_t *udp; - int err; - - udp = Q_TO_UDP(q); + conn_t *connp = Q_TO_CONN(q); + int err; - rw_enter(&udp->udp_rwlock, RW_READER); - err = udp_opt_get(Q_TO_CONN(q), level, name, ptr); - rw_exit(&udp->udp_rwlock); + err = udp_opt_get(connp, level, name, ptr); return (err); } /* * This routine sets socket options. */ -/* ARGSUSED */ -static int -udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, - uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, cred_t *cr, - void *thisdg_attrs, boolean_t checkonly) +int +udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, + uint_t inlen, uchar_t *invalp, cred_t *cr, boolean_t checkonly) { - udpattrs_t *attrs = thisdg_attrs; - int *i1 = (int *)invalp; - boolean_t onoff = (*i1 == 0) ? 0 : 1; - udp_t *udp = connp->conn_udp; + conn_t *connp = coa->coa_connp; + ip_xmit_attr_t *ixa = coa->coa_ixa; + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; - int error; - uint_t newlen; - size_t sth_wroff; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + int error; - ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); + ASSERT(MUTEX_NOT_HELD(&coa->coa_connp->conn_lock)); /* - * For fixed length options, no sanity check - * of passed in length is done. It is assumed *_optcom_req() - * routines do the right thing. + * First do UDP specific sanity checks and handle UDP specific + * options. Note that some IPPROTO_UDP options are handled + * by conn_opt_set. */ switch (level) { case SOL_SOCKET: switch (name) { - case SO_REUSEADDR: - if (!checkonly) { - udp->udp_reuseaddr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_DEBUG: - if (!checkonly) - udp->udp_debug = onoff; - break; - /* - * The following three items are available here, - * but are only meaningful to IP. 
- */ - case SO_DONTROUTE: - if (!checkonly) { - udp->udp_dontroute = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_USELOOPBACK: - if (!checkonly) { - udp->udp_useloopback = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_BROADCAST: - if (!checkonly) { - udp->udp_broadcast = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case SO_SNDBUF: if (*i1 > us->us_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - udp->udp_xmit_hiwat = *i1; - connp->conn_wq->q_hiwat = *i1; - } break; case SO_RCVBUF: if (*i1 > us->us_max_buf) { - *outlenp = 0; return (ENOBUFS); } - if (!checkonly) { - int size; - - udp->udp_rcv_disply_hiwat = *i1; - size = udp_set_rcv_hiwat(udp, *i1); - rw_exit(&udp->udp_rwlock); - (void) proto_set_rx_hiwat(connp->conn_rq, connp, - size); - rw_enter(&udp->udp_rwlock, RW_WRITER); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - udp->udp_dgram_errind = onoff; - break; - case SO_RECVUCRED: - if (!checkonly) - udp->udp_recvucred = onoff; - break; - case SO_ALLZONES: - /* - * "soft" error (negative) - * option not handled at this level - * Do not modify *outlenp. - */ - return (-EINVAL); - case SO_TIMESTAMP: - if (!checkonly) - udp->udp_timestamp = onoff; - break; - case SO_ANON_MLP: - case SO_MAC_EXEMPT: - case SO_MAC_IMPLICIT: - PASS_OPT_TO_IP(connp); break; + case SCM_UCRED: { struct ucred_s *ucr; - cred_t *cr, *newcr; + cred_t *newcr; ts_label_t *tsl; /* @@ -2250,20 +1777,18 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, */ if (connp->conn_mlp_type == mlptSingle) break; + ucr = (struct ucred_s *)invalp; if (inlen != ucredsize || ucr->uc_labeloff < sizeof (*ucr) || ucr->uc_labeloff + sizeof (bslabel_t) > inlen) return (EINVAL); if (!checkonly) { - mblk_t *mb; - pid_t cpid; - - if (attrs == NULL || - (mb = attrs->udpattr_mb) == NULL) - return (EINVAL); - if ((cr = msg_getcred(mb, &cpid)) == NULL) - cr = udp->udp_connp->conn_cred; + /* + * Set ixa_tsl to the new label. 
+ * We assume that crgetzoneid doesn't change + * as part of the SCM_UCRED. + */ ASSERT(cr != NULL); if ((tsl = crgetlabel(cr)) == NULL) return (EINVAL); @@ -2271,778 +1796,75 @@ udp_do_opt_set(conn_t *connp, int level, int name, uint_t inlen, tsl->tsl_doi, KM_NOSLEEP); if (newcr == NULL) return (ENOSR); - mblk_setcred(mb, newcr, cpid); - attrs->udpattr_credset = B_TRUE; - crfree(newcr); - } - break; - } - case SO_EXCLBIND: - if (!checkonly) - udp->udp_exclbind = onoff; - break; - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. - */ - return (0); - default: - *outlenp = 0; - return (EINVAL); - } - break; - case IPPROTO_IP: - if (udp->udp_family != AF_INET) { - *outlenp = 0; - return (ENOPROTOOPT); - } - switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - /* Save options for use by IP. */ - newlen = inlen + udp->udp_label_len; - if ((inlen & 0x3) || newlen > IP_MAX_OPT_LENGTH) { - *outlenp = 0; - return (EINVAL); - } - if (checkonly) - break; - - /* - * Update the stored options taking into account - * any CIPSO option which we should not overwrite. - */ - if (!tsol_option_set(&udp->udp_ip_snd_options, - &udp->udp_ip_snd_options_len, - udp->udp_label_len, invalp, inlen)) { - *outlenp = 0; - return (ENOMEM); - } - - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; - rw_exit(&udp->udp_rwlock); - (void) proto_set_tx_wroff(connp->conn_rq, connp, - sth_wroff); - rw_enter(&udp->udp_rwlock, RW_WRITER); - break; - - case IP_TTL: - if (!checkonly) { - udp->udp_ttl = (uchar_t)*i1; - } - break; - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - udp->udp_type_of_service = (uchar_t)*i1; - } - break; - case IP_MULTICAST_IF: { - /* - * TODO should check OPTMGMT reply and undo this if - * there is an error. 
- */ - struct in_addr *inap = (struct in_addr *)invalp; - if (!checkonly) { - udp->udp_multicast_if_addr = - inap->s_addr; - PASS_OPT_TO_IP(connp); - } - break; - } - case IP_MULTICAST_TTL: - if (!checkonly) - udp->udp_multicast_ttl = *invalp; - break; - case IP_MULTICAST_LOOP: - if (!checkonly) { - connp->conn_multicast_loop = *invalp; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVOPTS: - if (!checkonly) - udp->udp_recvopts = onoff; - break; - case IP_RECVDSTADDR: - if (!checkonly) - udp->udp_recvdstaddr = onoff; - break; - case IP_RECVIF: - if (!checkonly) { - udp->udp_recvif = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVSLLA: - if (!checkonly) { - udp->udp_recvslla = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_RECVTTL: - if (!checkonly) - udp->udp_recvttl = onoff; - break; - case IP_PKTINFO: { - /* - * This also handles IP_RECVPKTINFO. - * IP_PKTINFO and IP_RECVPKTINFO have same value. - * Differentiation is based on the size of the - * argument passed in. - */ - struct in_pktinfo *pktinfop; - ip4_pkt_t *attr_pktinfop; - - if (checkonly) - break; - - if (inlen == sizeof (int)) { - /* - * This is IP_RECVPKTINFO option. - * Keep a local copy of whether this option is - * set or not and pass it down to IP for - * processing. - */ - - udp->udp_ip_recvpktinfo = onoff; - return (-EINVAL); - } - - if (attrs == NULL || - (attr_pktinfop = attrs->udpattr_ipp4) == NULL) { + ASSERT(newcr->cr_label != NULL); /* - * sticky option or no buffer to return - * the results. + * Move the hold on the cr_label to ixa_tsl by + * setting cr_label to NULL. Then release newcr. 
*/ - return (EINVAL); - } - - if (inlen != sizeof (struct in_pktinfo)) - return (EINVAL); - - pktinfop = (struct in_pktinfo *)invalp; - - /* - * At least one of the values should be specified - */ - if (pktinfop->ipi_ifindex == 0 && - pktinfop->ipi_spec_dst.s_addr == INADDR_ANY) { - return (EINVAL); - } - - attr_pktinfop->ip4_addr = pktinfop->ipi_spec_dst.s_addr; - attr_pktinfop->ip4_ill_index = pktinfop->ipi_ifindex; - - break; - } - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - case IP_BLOCK_SOURCE: - case IP_UNBLOCK_SOURCE: - case IP_ADD_SOURCE_MEMBERSHIP: - case IP_DROP_SOURCE_MEMBERSHIP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - case IP_SEC_OPT: - case IP_NEXTHOP: - case IP_DHCPINIT_IF: - /* - * "soft" error (negative) - * option not handled at this level - * Do not modify *outlenp. - */ - return (-EINVAL); - case IP_BOUND_IF: - if (!checkonly) { - udp->udp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IP_UNSPEC_SRC: - if (!checkonly) { - udp->udp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IP_BROADCAST_TTL: - if (!checkonly) - connp->conn_broadcast_ttl = *invalp; - break; - default: - *outlenp = 0; - return (EINVAL); - } - break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - boolean_t sticky; - - if (udp->udp_family != AF_INET6) { - *outlenp = 0; - return (ENOPROTOOPT); - } - /* - * Deal with both sticky options and ancillary data - */ - sticky = B_FALSE; - if (attrs == NULL || (ipp = attrs->udpattr_ipp6) == - NULL) { - /* sticky options, or none */ - ipp = &udp->udp_sticky_ipp; - sticky = B_TRUE; - } - - switch (name) { - case IPV6_MULTICAST_IF: - if (!checkonly) { - udp->udp_multicast_if_index = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if 
(!checkonly) { - if (*i1 == -1) { - udp->udp_ttl = ipp->ipp_unicast_hops = - us->us_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = udp->udp_ttl; - } else { - udp->udp_ttl = ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - *outlenp = 0; - return (error); - } - } - break; - case IPV6_MULTICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - udp->udp_multicast_ttl = - ipp->ipp_multicast_hops = - IP_DEFAULT_MULTICAST_TTL; - ipp->ipp_fields &= ~IPPF_MULTICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = udp->udp_multicast_ttl; - } else { - udp->udp_multicast_ttl = - ipp->ipp_multicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_MULTICAST_HOPS; - } - } - break; - case IPV6_MULTICAST_LOOP: - if (*i1 != 0 && *i1 != 1) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - connp->conn_multicast_loop = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - case MCAST_JOIN_GROUP: - case MCAST_LEAVE_GROUP: - case MCAST_BLOCK_SOURCE: - case MCAST_UNBLOCK_SOURCE: - case MCAST_JOIN_SOURCE_GROUP: - case MCAST_LEAVE_SOURCE_GROUP: - /* - * "soft" error (negative) - * option not handled at this level - * Note: Do not modify *outlenp - */ - return (-EINVAL); - case IPV6_BOUND_IF: - if (!checkonly) { - udp->udp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_UNSPEC_SRC: - if (!checkonly) { - udp->udp_unspec_source = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ - case IPV6_RECVPKTINFO: - if (!checkonly) { - udp->udp_ip_recvpktinfo = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVTCLASS: - if (!checkonly) { - udp->udp_ipv6_recvtclass = onoff; - 
PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVPATHMTU: - if (!checkonly) { - udp->udp_ipv6_recvpathmtu = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - udp->udp_ipv6_recvhoplimit = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - udp->udp_ipv6_recvhopopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - udp->udp_ipv6_recvdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) - udp->udp_old_ipv6_recvdstopts = onoff; - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - udp->udp_ipv6_recvrthdrdstopts = onoff; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - udp->udp_ipv6_recvrthdr = onoff; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set sticky options or ancillary data. - * If sticky options, (re)build any extension headers - * that might be needed as a result. - */ - case IPV6_PKTINFO: - /* - * The source address and ifindex are verified - * in ip_opt_set(). For ancillary data the - * source address is checked in ip_wput_v6. 
- */ - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - ipp->ipp_sticky_ignored |= - (IPPF_IFINDEX|IPPF_ADDR); - } else { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPLIMIT: - if (sticky) - return (EINVAL); - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_HOPLIMIT; - ipp->ipp_sticky_ignored |= IPPF_HOPLIMIT; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_hoplimit = - us->us_ipv6_hoplimit; - else - ipp->ipp_hoplimit = *i1; - ipp->ipp_fields |= IPPF_HOPLIMIT; - } - break; - case IPV6_TCLASS: - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - ipp->ipp_sticky_ignored |= IPPF_TCLASS; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) - ipp->ipp_tclass = 0; - else - ipp->ipp_tclass = *i1; - ipp->ipp_fields |= IPPF_TCLASS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. 
- */ - if (inlen != 0 && inlen != sizeof (sin6_t)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - ipp->ipp_sticky_ignored |= IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) { - return (EAFNOSUPPORT); - } - if (IN6_IS_ADDR_V4MAPPED( - &sin6->sin6_addr)) - return (EADDRNOTAVAIL); - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, - sticky ? udp->udp_label_len_v6 : 0); - if (error != 0) - return (error); - if (ipp->ipp_hopoptslen == 0) { - ipp->ipp_fields &= ~IPPF_HOPOPTS; - ipp->ipp_sticky_ignored |= IPPF_HOPOPTS; - } else { - ipp->ipp_fields |= IPPF_HOPOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTDSTOPTS) != 0) { - kmem_free(ipp->ipp_rtdstopts, - ipp->ipp_rtdstoptslen); - ipp->ipp_rtdstopts = NULL; - ipp->ipp_rtdstoptslen = 0; - } - - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_RTDSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTDSTOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_DSTOPTS) != 0) { - kmem_free(ipp->ipp_dstopts, - ipp->ipp_dstoptslen); - ipp->ipp_dstopts = NULL; - ipp->ipp_dstoptslen = 0; - } - ipp->ipp_fields &= ~IPPF_DSTOPTS; - ipp->ipp_sticky_ignored |= IPPF_DSTOPTS; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_DSTOPTS; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); - } - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - if (inlen == 0) { - if (sticky && - (ipp->ipp_fields & IPPF_RTHDR) != 0) { - kmem_free(ipp->ipp_rthdr, - ipp->ipp_rthdrlen); - ipp->ipp_rthdr = NULL; - ipp->ipp_rthdrlen = 0; - } - ipp->ipp_fields &= ~IPPF_RTHDR; - ipp->ipp_sticky_ignored |= IPPF_RTHDR; - } else { - error = optcom_pkt_set(invalp, inlen, sticky, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (error != 0) - return (error); - ipp->ipp_fields |= IPPF_RTHDR; - } - if (sticky) { - error = udp_build_hdrs(udp); - if (error != 0) - return (error); + ip_xmit_attr_replace_tsl(ixa, newcr->cr_label); + ixa->ixa_flags |= IXAF_UCRED_TSL; + newcr->cr_label = NULL; + crfree(newcr); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; } - break; + /* Fully handled this option. */ + return (0); } - - case IPV6_DONTFRAG: - if (checkonly) - break; - - if (onoff) { - ipp->ipp_fields |= IPPF_DONTFRAG; - } else { - ipp->ipp_fields &= ~IPPF_DONTFRAG; - } - break; - - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; - - case IPV6_SEC_OPT: - case IPV6_SRC_PREFERENCES: - case IPV6_V6ONLY: - /* Handled at the IP level */ - return (-EINVAL); - default: - *outlenp = 0; - return (EINVAL); } break; - } /* end IPPROTO_IPV6 */ case IPPROTO_UDP: switch (name) { - case UDP_ANONPRIVBIND: - if ((error = secpolicy_net_privaddr(cr, 0, - IPPROTO_UDP)) != 0) { - *outlenp = 0; - return (error); - } - if (!checkonly) { - udp->udp_anon_priv_bind = onoff; - } - break; - case UDP_EXCLBIND: - if (!checkonly) - udp->udp_exclbind = onoff; - break; - case UDP_RCVHDR: - if (!checkonly) - udp->udp_rcvhdr = onoff; - break; case UDP_NAT_T_ENDPOINT: if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { - *outlenp = 0; 
return (error); } /* - * Use udp_family instead so we can avoid ambiguitites + * Use conn_family instead so we can avoid ambiguitites * with AF_INET6 sockets that may switch from IPv4 * to IPv6. */ - if (udp->udp_family != AF_INET) { - *outlenp = 0; + if (connp->conn_family != AF_INET) { return (EAFNOSUPPORT); } if (!checkonly) { - int size; - + mutex_enter(&connp->conn_lock); udp->udp_nat_t_endpoint = onoff; - - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; - - /* Also, adjust wroff */ - if (onoff) { - udp->udp_max_hdr_len += - sizeof (uint32_t); - } - size = udp->udp_max_hdr_len + - us->us_wroff_extra; - (void) proto_set_tx_wroff(connp->conn_rq, connp, - size); + mutex_exit(&connp->conn_lock); + coa->coa_changed |= COA_HEADER_CHANGED; + coa->coa_changed |= COA_WROFF_CHANGED; } - break; - default: - *outlenp = 0; - return (EINVAL); + /* Fully handled this option. */ + return (0); + case UDP_RCVHDR: + mutex_enter(&connp->conn_lock); + udp->udp_rcvhdr = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; - default: - *outlenp = 0; - return (EINVAL); - } - /* - * Common case of OK return with outval same as inval. - */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - (void) bcopy(invalp, outvalp, inlen); } - *outlenp = inlen; - return (0); + error = conn_opt_set(coa, level, name, inlen, invalp, + checkonly, cr); + return (error); } +/* + * This routine sets socket options. 
+ */ int -udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr) +udp_opt_set(conn_t *connp, uint_t optset_context, int level, + int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { - int error; + udp_t *udp = connp->conn_udp; + int err; + conn_opt_arg_t coas, *coa; boolean_t checkonly; + udp_stack_t *us = udp->udp_us; - error = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: checkonly = B_TRUE; @@ -3056,7 +1878,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (inlen == 0) { *outlenp = 0; - goto done; + return (0); } break; case SETFN_OPTCOM_NEGOTIATE: @@ -3074,8 +1896,7 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!udp_opt_allow_udr_set(level, name)) { *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } break; default: @@ -3083,99 +1904,326 @@ udp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * We should never get here */ *outlenp = 0; - error = EINVAL; - goto done; + return (EINVAL); } ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - error = udp_do_opt_set(connp, level, name, inlen, invalp, outlenp, - outvalp, cr, thisdg_attrs, checkonly); -done: - return (error); + if (thisdg_attrs != NULL) { + /* Options from T_UNITDATA_REQ */ + coa = (conn_opt_arg_t *)thisdg_attrs; + ASSERT(coa->coa_connp == connp); + ASSERT(coa->coa_ixa != NULL); + ASSERT(coa->coa_ipp != NULL); + ASSERT(coa->coa_ancillary); + } else { + coa = &coas; + coas.coa_connp = connp; + /* Get a reference on conn_ixa to prevent concurrent mods */ + coas.coa_ixa = conn_get_ixa(connp, B_TRUE); + if (coas.coa_ixa == NULL) { + *outlenp = 0; + return (ENOMEM); + } + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; 
+ } + + err = udp_do_opt_set(coa, level, name, inlen, invalp, + cr, checkonly); + if (err != 0) { +errout: + if (!coa->coa_ancillary) + ixa_refrele(coa->coa_ixa); + *outlenp = 0; + return (err); + } + /* Handle DHCPINIT here outside of lock */ + if (level == IPPROTO_IP && name == IP_DHCPINIT_IF) { + uint_t ifindex; + ill_t *ill; + + ifindex = *(uint_t *)invalp; + if (ifindex == 0) { + ill = NULL; + } else { + ill = ill_lookup_on_ifindex(ifindex, B_FALSE, + coa->coa_ixa->ixa_ipst); + if (ill == NULL) { + err = ENXIO; + goto errout; + } + + mutex_enter(&ill->ill_lock); + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + err = ENXIO; + goto errout; + } + if (IS_VNI(ill)) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + err = EINVAL; + goto errout; + } + } + mutex_enter(&connp->conn_lock); + + if (connp->conn_dhcpinit_ill != NULL) { + /* + * We've locked the conn so conn_cleanup_ill() + * cannot clear conn_dhcpinit_ill -- so it's + * safe to access the ill. + */ + ill_t *oill = connp->conn_dhcpinit_ill; + + ASSERT(oill->ill_dhcpinit != 0); + atomic_dec_32(&oill->ill_dhcpinit); + ill_set_inputfn(connp->conn_dhcpinit_ill); + connp->conn_dhcpinit_ill = NULL; + } + + if (ill != NULL) { + connp->conn_dhcpinit_ill = ill; + atomic_inc_32(&ill->ill_dhcpinit); + ill_set_inputfn(ill); + mutex_exit(&connp->conn_lock); + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + } else { + mutex_exit(&connp->conn_lock); + } + } + + /* + * Common case of OK return with outval same as inval. + */ + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } + *outlenp = inlen; + + /* + * If this was not ancillary data, then we rebuild the headers, + * update the IRE/NCE, and IPsec as needed. + * Since the label depends on the destination we go through + * ip_set_destination first. 
+ */ + if (coa->coa_ancillary) { + return (0); + } + + if (coa->coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t saddr, faddr, nexthop; + in_port_t fport; + + /* + * We clear lastdst to make sure we pick up the change + * next time sending. + * If we are connected we re-cache the information. + * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + mutex_enter(&connp->conn_lock); + connp->conn_v6lastdst = ipv6_all_zeros; + + ip_attr_nexthop(coa->coa_ipp, coa->coa_ixa, + &connp->conn_faddr_v6, &nexthop); + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + mutex_exit(&connp->conn_lock); + + if (!IN6_IS_ADDR_UNSPECIFIED(&faddr) && + !IN6_IS_ADDR_V4MAPPED_ANY(&faddr)) { + (void) ip_attr_connect(connp, coa->coa_ixa, + &saddr, &faddr, &nexthop, fport, NULL, NULL, + IPDF_ALLOW_MCBC | IPDF_VERIFY_DST); + } + } + + ixa_refrele(coa->coa_ixa); + + if (coa->coa_changed & COA_HEADER_CHANGED) { + /* + * Rebuild the header template if we are connected. + * Otherwise clear conn_v6lastdst so we rebuild the header + * in the data path. 
+ */ + mutex_enter(&connp->conn_lock); + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + err = udp_build_hdr_template(connp, + &connp->conn_saddr_v6, &connp->conn_faddr_v6, + connp->conn_fport, connp->conn_flowinfo); + if (err != 0) { + mutex_exit(&connp->conn_lock); + return (err); + } + } else { + connp->conn_v6lastdst = ipv6_all_zeros; + } + mutex_exit(&connp->conn_lock); + } + if (coa->coa_changed & COA_RCVBUF_CHANGED) { + (void) proto_set_rx_hiwat(connp->conn_rq, connp, + connp->conn_rcvbuf); + } + if ((coa->coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coa->coa_changed & COA_WROFF_CHANGED) { + /* Increase wroff if needed */ + uint_t wroff; + + mutex_enter(&connp->conn_lock); + wroff = connp->conn_ht_iphc_allocated + us->us_wroff_extra; + if (udp->udp_nat_t_endpoint) + wroff += sizeof (uint32_t); + if (wroff > connp->conn_wroff) { + connp->conn_wroff = wroff; + mutex_exit(&connp->conn_lock); + (void) proto_set_tx_wroff(connp->conn_rq, connp, wroff); + } else { + mutex_exit(&connp->conn_lock); + } + } + return (err); } -/* ARGSUSED */ +/* This routine sets socket options. */ int udp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { - conn_t *connp = Q_TO_CONN(q); + conn_t *connp = Q_TO_CONN(q); int error; - udp_t *udp = connp->conn_udp; - rw_enter(&udp->udp_rwlock, RW_WRITER); error = udp_opt_set(connp, optset_context, level, name, inlen, invalp, outlenp, outvalp, thisdg_attrs, cr); - rw_exit(&udp->udp_rwlock); return (error); } /* - * Update udp_sticky_hdrs based on udp_sticky_ipp, udp_v6src, and udp_ttl. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension - * headers, and the udp header. - * Returns failure if can't allocate memory. 
+ * Setup IP and UDP headers. + * Returns NULL on allocation failure, in which case data_mp is freed. */ -static int -udp_build_hdrs(udp_t *udp) +mblk_t * +udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, + const in6_addr_t *v6src, const in6_addr_t *v6dst, in_port_t dstport, + uint32_t flowinfo, mblk_t *data_mp, int *errorp) { - udp_stack_t *us = udp->udp_us; - uchar_t *hdrs; - uint_t hdrs_len; - ip6_t *ip6h; - ip6i_t *ip6i; - udpha_t *udpha; - ip6_pkt_t *ipp = &udp->udp_sticky_ipp; - size_t sth_wroff; - conn_t *connp = udp->udp_connp; - - ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); - ASSERT(connp != NULL); + mblk_t *mp; + udpha_t *udpha; + udp_stack_t *us = connp->conn_netstack->netstack_udp; + uint_t data_len; + uint32_t cksum; + udp_t *udp = connp->conn_udp; + boolean_t insert_spi = udp->udp_nat_t_endpoint; + uint_t ulp_hdr_len; - hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE; - ASSERT(hdrs_len != 0); - if (hdrs_len != udp->udp_sticky_hdrs_len) { - /* Need to reallocate */ - hdrs = kmem_alloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); + data_len = msgdsize(data_mp); + ulp_hdr_len = UDPH_SIZE; + if (insert_spi) + ulp_hdr_len += sizeof (uint32_t); - if (udp->udp_sticky_hdrs_len != 0) { - kmem_free(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len); - } - udp->udp_sticky_hdrs = hdrs; - udp->udp_sticky_hdrs_len = hdrs_len; + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, + ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); + if (mp == NULL) { + ASSERT(*errorp != 0); + return (NULL); } - ip_build_hdrs_v6(udp->udp_sticky_hdrs, - udp->udp_sticky_hdrs_len - UDPH_SIZE, ipp, IPPROTO_UDP); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)udp->udp_sticky_hdrs; - ip6h = (ip6_t *)&ip6i[1]; + data_len += ulp_hdr_len; + ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; + + udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); + 
udpha->uha_src_port = connp->conn_lport; + udpha->uha_dst_port = dstport; + udpha->uha_checksum = 0; + udpha->uha_length = htons(data_len); + + /* + * If there was a routing option/header then conn_prepend_hdr + * has massaged it and placed the pseudo-header checksum difference + * in the cksum argument. + * + * Setup header length and prepare for ULP checksum done in IP. + * + * We make it easy for IP to include our pseudo header + * by putting our length in uha_checksum. + * The IP source, destination, and length have already been set by + * conn_prepend_hdr. + */ + cksum += data_len; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + + ASSERT(ntohs(ipha->ipha_length) == ixa->ixa_pktlen); + + /* IP does the checksum if uha_checksum is non-zero */ + if (us->us_do_checksum) { + if (cksum == 0) + udpha->uha_checksum = 0xffff; + else + udpha->uha_checksum = htons(cksum); + } else { + udpha->uha_checksum = 0; + } } else { - ip6h = (ip6_t *)udp->udp_sticky_hdrs; + ip6_t *ip6h = (ip6_t *)mp->b_rptr; + + ASSERT(ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN == ixa->ixa_pktlen); + if (cksum == 0) + udpha->uha_checksum = 0xffff; + else + udpha->uha_checksum = htons(cksum); } - if (!(ipp->ipp_fields & IPPF_ADDR)) - ip6h->ip6_src = udp->udp_v6src; + /* Insert all-0s SPI now. 
*/ + if (insert_spi) + *((uint32_t *)(udpha + 1)) = 0; - udpha = (udpha_t *)(udp->udp_sticky_hdrs + hdrs_len - UDPH_SIZE); - udpha->uha_src_port = udp->udp_port; + return (mp); +} - /* Try to get everything in a single mblk */ - if (hdrs_len > udp->udp_max_hdr_len) { - udp->udp_max_hdr_len = hdrs_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; - rw_exit(&udp->udp_rwlock); - (void) proto_set_tx_wroff(udp->udp_connp->conn_rq, - udp->udp_connp, sth_wroff); - rw_enter(&udp->udp_rwlock, RW_WRITER); - } +static int +udp_build_hdr_template(conn_t *connp, const in6_addr_t *v6src, + const in6_addr_t *v6dst, in_port_t dstport, uint32_t flowinfo) +{ + udpha_t *udpha; + int error; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + /* + * We clear lastdst to make sure we don't use the lastdst path + * next time sending since we might not have set v6dst yet. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + + error = conn_build_hdr_template(connp, UDPH_SIZE, 0, v6src, v6dst, + flowinfo); + if (error != 0) + return (error); + + /* + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum. + */ + udpha = (udpha_t *)connp->conn_ht_ulp; + udpha->uha_src_port = connp->conn_lport; + udpha->uha_dst_port = dstport; + udpha->uha_checksum = 0; + udpha->uha_length = htons(UDPH_SIZE); /* Filled in later */ return (0); } @@ -3252,189 +2300,6 @@ udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) return (0); } -/* - * Copy hop-by-hop option from ipp->ipp_hopopts to the buffer provided (with - * T_opthdr) and return the number of bytes copied. 'dbuf' may be NULL to - * just count the length needed for allocation. If 'dbuf' is non-NULL, - * then it's assumed to be allocated to be large enough. - * - * Returns zero if trimming of the security option causes all options to go - * away. 
- */ -static size_t -copy_hop_opts(const ip6_pkt_t *ipp, uchar_t *dbuf) -{ - struct T_opthdr *toh; - size_t hol = ipp->ipp_hopoptslen; - ip6_hbh_t *dstopt = NULL; - const ip6_hbh_t *srcopt = ipp->ipp_hopopts; - size_t tlen, olen, plen; - boolean_t deleting; - const struct ip6_opt *sopt, *lastpad; - struct ip6_opt *dopt; - - if ((toh = (struct T_opthdr *)dbuf) != NULL) { - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPOPTS; - toh->status = 0; - dstopt = (ip6_hbh_t *)(toh + 1); - } - - /* - * If labeling is enabled, then skip the label option - * but get other options if there are any. - */ - if (is_system_labeled()) { - dopt = NULL; - if (dstopt != NULL) { - /* will fill in ip6h_len later */ - dstopt->ip6h_nxt = srcopt->ip6h_nxt; - dopt = (struct ip6_opt *)(dstopt + 1); - } - sopt = (const struct ip6_opt *)(srcopt + 1); - hol -= sizeof (*srcopt); - tlen = sizeof (*dstopt); - lastpad = NULL; - deleting = B_FALSE; - /* - * This loop finds the first (lastpad pointer) of any number of - * pads that preceeds the security option, then treats the - * security option as though it were a pad, and then finds the - * next non-pad option (or end of list). - * - * It then treats the entire block as one big pad. To preserve - * alignment of any options that follow, or just the end of the - * list, it computes a minimal new padding size that keeps the - * same alignment for the next option. - * - * If it encounters just a sequence of pads with no security - * option, those are copied as-is rather than collapsed. - * - * Note that to handle the end of list case, the code makes one - * loop with 'hol' set to zero. 
- */ - for (;;) { - if (hol > 0) { - if (sopt->ip6o_type == IP6OPT_PAD1) { - if (lastpad == NULL) - lastpad = sopt; - sopt = (const struct ip6_opt *) - &sopt->ip6o_len; - hol--; - continue; - } - olen = sopt->ip6o_len + sizeof (*sopt); - if (olen > hol) - olen = hol; - if (sopt->ip6o_type == IP6OPT_PADN || - sopt->ip6o_type == ip6opt_ls) { - if (sopt->ip6o_type == ip6opt_ls) - deleting = B_TRUE; - if (lastpad == NULL) - lastpad = sopt; - sopt = (const struct ip6_opt *) - ((const char *)sopt + olen); - hol -= olen; - continue; - } - } else { - /* if nothing was copied at all, then delete */ - if (tlen == sizeof (*dstopt)) - return (0); - /* last pass; pick up any trailing padding */ - olen = 0; - } - if (deleting) { - /* - * compute aligning effect of deleted material - * to reproduce with pad. - */ - plen = ((const char *)sopt - - (const char *)lastpad) & 7; - tlen += plen; - if (dopt != NULL) { - if (plen == 1) { - dopt->ip6o_type = IP6OPT_PAD1; - } else if (plen > 1) { - plen -= sizeof (*dopt); - dopt->ip6o_type = IP6OPT_PADN; - dopt->ip6o_len = plen; - if (plen > 0) - bzero(dopt + 1, plen); - } - dopt = (struct ip6_opt *) - ((char *)dopt + plen); - } - deleting = B_FALSE; - lastpad = NULL; - } - /* if there's uncopied padding, then copy that now */ - if (lastpad != NULL) { - olen += (const char *)sopt - - (const char *)lastpad; - sopt = lastpad; - lastpad = NULL; - } - if (dopt != NULL && olen > 0) { - bcopy(sopt, dopt, olen); - dopt = (struct ip6_opt *)((char *)dopt + olen); - } - if (hol == 0) - break; - tlen += olen; - sopt = (const struct ip6_opt *) - ((const char *)sopt + olen); - hol -= olen; - } - /* go back and patch up the length value, rounded upward */ - if (dstopt != NULL) - dstopt->ip6h_len = (tlen - 1) >> 3; - } else { - tlen = hol; - if (dstopt != NULL) - bcopy(srcopt, dstopt, hol); - } - - tlen += sizeof (*toh); - if (toh != NULL) - toh->len = tlen; - - return (tlen); -} - -/* - * Update udp_rcv_opt_len from the packet. 
- * Called when options received, and when no options received but - * udp_ip_recv_opt_len has previously recorded options. - */ -static void -udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len) -{ - /* Save the options if any */ - if (opt_len > 0) { - if (opt_len > udp->udp_ip_rcv_options_len) { - /* Need to allocate larger buffer */ - if (udp->udp_ip_rcv_options_len != 0) - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options_len = 0; - udp->udp_ip_rcv_options = - (uchar_t *)mi_alloc(opt_len, BPRI_HI); - if (udp->udp_ip_rcv_options != NULL) - udp->udp_ip_rcv_options_len = opt_len; - } - if (udp->udp_ip_rcv_options_len != 0) { - bcopy(opt, udp->udp_ip_rcv_options, opt_len); - /* Adjust length if we are resusing the space */ - udp->udp_ip_rcv_options_len = opt_len; - } - } else if (udp->udp_ip_rcv_options_len != 0) { - /* Clear out previously recorded options */ - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options = NULL; - udp->udp_ip_rcv_options_len = 0; - } -} - static mblk_t * udp_queue_fallback(udp_t *udp, mblk_t *mp) { @@ -3466,15 +2331,15 @@ udp_queue_fallback(udp_t *udp, mblk_t *mp) * TPI, then we'll queue the mp for later processing. */ static void -udp_ulp_recv(conn_t *connp, mblk_t *mp) +udp_ulp_recv(conn_t *connp, mblk_t *mp, uint_t len, ip_recv_attr_t *ira) { if (IPCL_IS_NONSTR(connp)) { udp_t *udp = connp->conn_udp; int error; + ASSERT(len == msgdsize(mp)); if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { + (connp->conn_upper_handle, mp, len, 0, &error, NULL) < 0) { mutex_enter(&udp->udp_recv_lock); if (error == ENOSPC) { /* @@ -3500,282 +2365,170 @@ udp_ulp_recv(conn_t *connp, mblk_t *mp) } ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock)); } else { + if (is_system_labeled()) { + ASSERT(ira->ira_cred != NULL); + /* + * Provide for protocols above UDP such as RPC + * NOPID leaves db_cpid unchanged. 
+ */ + mblk_setcred(mp, ira->ira_cred, NOPID); + } + putnext(connp->conn_rq, mp); } } +/* + * This is the inbound data path. + * IP has already pulled up the IP plus UDP headers and verified alignment + * etc. + */ /* ARGSUSED2 */ static void -udp_input(void *arg1, mblk_t *mp, void *arg2) +udp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - conn_t *connp = (conn_t *)arg1; + conn_t *connp = (conn_t *)arg1; struct T_unitdata_ind *tudi; uchar_t *rptr; /* Pointer to IP header */ int hdr_length; /* Length of IP+UDP headers */ - int opt_len; int udi_size; /* Size of T_unitdata_ind */ - int mp_len; + int pkt_len; udp_t *udp; udpha_t *udpha; - int ipversion; - ip6_pkt_t ipp; + ip_pkt_t ipps; ip6_t *ip6h; - ip6i_t *ip6i; mblk_t *mp1; - mblk_t *options_mp = NULL; - ip_pktinfo_t *pinfo = NULL; - cred_t *cr = NULL; - pid_t cpid; - uint32_t udp_ip_rcv_options_len; - udp_bits_t udp_bits; - cred_t *rcr = connp->conn_cred; - udp_stack_t *us; + uint32_t udp_ipv4_options_len; + crb_t recv_ancillary; + udp_stack_t *us; ASSERT(connp->conn_flags & IPCL_UDPCONN); udp = connp->conn_udp; us = udp->udp_us; rptr = mp->b_rptr; - ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_CTL); + + ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(OK_32PTR(rptr)); + ASSERT(ira->ira_pktlen == msgdsize(mp)); + pkt_len = ira->ira_pktlen; /* - * IP should have prepended the options data in an M_CTL - * Check M_CTL "type" to make sure are not here bcos of - * a valid ICMP message + * Get a snapshot of these and allow other threads to change + * them after that. We need the same recv_ancillary when determining + * the size as when adding the ancillary data items. */ - if (DB_TYPE(mp) == M_CTL) { - if (MBLKL(mp) == sizeof (ip_pktinfo_t) && - ((ip_pktinfo_t *)mp->b_rptr)->ip_pkt_ulp_type == - IN_PKTINFO) { - /* - * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information - * has been prepended to the packet by IP. 
We need to - * extract the mblk and adjust the rptr - */ - pinfo = (ip_pktinfo_t *)mp->b_rptr; - options_mp = mp; - mp = mp->b_cont; - rptr = mp->b_rptr; - UDP_STAT(us, udp_in_pktinfo); - } else { - /* - * ICMP messages. - */ - udp_icmp_error(connp, mp); - return; - } - } + mutex_enter(&connp->conn_lock); + udp_ipv4_options_len = udp->udp_recv_ipp.ipp_ipv4_options_len; + recv_ancillary = connp->conn_recv_ancillary; + mutex_exit(&connp->conn_lock); + + hdr_length = ira->ira_ip_hdr_length; - mp_len = msgdsize(mp); /* - * This is the inbound data path. - * First, we check to make sure the IP version number is correct, - * and then pull the IP and UDP headers into the first mblk. + * IP inspected the UDP header thus all of it must be in the mblk. + * UDP length check is performed for IPv6 packets and IPv4 packets + * to check if the size of the packet as specified + * by the UDP header is the same as the length derived from the IP + * header. */ + udpha = (udpha_t *)(rptr + hdr_length); + if (pkt_len != ntohs(udpha->uha_length) + hdr_length) + goto tossit; - /* Initialize regardless if ipversion is IPv4 or IPv6 */ - ipp.ipp_fields = 0; + hdr_length += UDPH_SIZE; + ASSERT(MBLKL(mp) >= hdr_length); /* IP did a pullup */ - ipversion = IPH_HDR_VERSION(rptr); + /* Initialize regardless of IP version */ + ipps.ipp_fields = 0; - rw_enter(&udp->udp_rwlock, RW_READER); - udp_ip_rcv_options_len = udp->udp_ip_rcv_options_len; - udp_bits = udp->udp_bits; - rw_exit(&udp->udp_rwlock); + if (((ira->ira_flags & IRAF_IPV4_OPTIONS) || + udp_ipv4_options_len > 0) && + connp->conn_family == AF_INET) { + int err; - switch (ipversion) { - case IPV4_VERSION: - ASSERT(MBLKL(mp) >= sizeof (ipha_t)); - ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); - hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE; - opt_len = hdr_length - (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE); - if ((opt_len > 0 || udp_ip_rcv_options_len > 0) && - udp->udp_family == AF_INET) { - /* - * Record/update udp_ip_rcv_options with 
the lock - * held. Not needed for AF_INET6 sockets - * since they don't support a getsockopt of IP_OPTIONS. - */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp_save_ip_rcv_opt(udp, rptr + IP_SIMPLE_HDR_LENGTH, - opt_len); - rw_exit(&udp->udp_rwlock); - } - /* Handle IPV6_RECVPKTINFO even for IPv4 packet. */ - if ((udp->udp_family == AF_INET6) && (pinfo != NULL) && - udp->udp_ip_recvpktinfo) { - if (pinfo->ip_pkt_flags & IPF_RECVIF) { - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; - } - } - break; - case IPV6_VERSION: /* - * IPv6 packets can only be received by applications - * that are prepared to receive IPv6 addresses. - * The IP fanout must ensure this. + * Record/update udp_recv_ipp with the lock + * held. Not needed for AF_INET6 sockets + * since they don't support a getsockopt of IP_OPTIONS. */ - ASSERT(udp->udp_family == AF_INET6); + mutex_enter(&connp->conn_lock); + err = ip_find_hdr_v4((ipha_t *)rptr, &udp->udp_recv_ipp, + B_TRUE); + if (err != 0) { + /* Allocation failed. 
Drop packet */ + mutex_exit(&connp->conn_lock); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpInErrors); + return; + } + mutex_exit(&connp->conn_lock); + } - ip6h = (ip6_t *)rptr; - ASSERT((uchar_t *)&ip6h[1] <= mp->b_wptr); + if (recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); + ASSERT(MBLKL(mp) >= sizeof (ipha_t)); + ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); + ASSERT(ira->ira_ip_hdr_length == IPH_HDR_LENGTH(rptr)); - if (ip6h->ip6_nxt != IPPROTO_UDP) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipps, B_FALSE); + } else { uint8_t nexthdrp; - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i = (ip6i_t *)ip6h; - if ((uchar_t *)&ip6i[1] > mp->b_wptr) - goto tossit; - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - if (MBLKL(mp) < (IPV6_HDR_LEN + UDPH_SIZE)) - goto tossit; - ip6h = (ip6_t *)rptr; - mp_len = msgdsize(mp); - } + + ASSERT(IPH_HDR_VERSION(rptr) == IPV6_VERSION); /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. */ - hdr_length = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp) + - UDPH_SIZE; - ASSERT(nexthdrp == IPPROTO_UDP); - } else { - hdr_length = IPV6_HDR_LEN + UDPH_SIZE; - ip6i = NULL; - } - break; - default: - ASSERT(0); - } + ASSERT(connp->conn_family == AF_INET6); - /* - * IP inspected the UDP header thus all of it must be in the mblk. 
- * UDP length check is performed for IPv6 packets and IPv4 packets - * to check if the size of the packet as specified - * by the header is the same as the physical size of the packet. - * FIXME? Didn't IP already check this? - */ - udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); - if ((MBLKL(mp) < hdr_length) || - (mp_len != (ntohs(udpha->uha_length) + hdr_length - UDPH_SIZE))) { - goto tossit; - } + ip6h = (ip6_t *)rptr; - - /* Walk past the headers unless UDP_RCVHDR was set. */ - if (!udp_bits.udpb_rcvhdr) { - mp->b_rptr = rptr + hdr_length; - mp_len -= hdr_length; + /* We don't care about the length, but need the ipp */ + hdr_length = ip_find_hdr_v6(mp, ip6h, B_TRUE, &ipps, + &nexthdrp); + ASSERT(hdr_length == ira->ira_ip_hdr_length); + /* Restore */ + hdr_length = ira->ira_ip_hdr_length + UDPH_SIZE; + ASSERT(nexthdrp == IPPROTO_UDP); + } } /* * This is the inbound data path. Packets are passed upstream as - * T_UNITDATA_IND messages with full IP headers still attached. + * T_UNITDATA_IND messages. */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin_t *sin; ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION); /* * Normally only send up the source address. - * If IP_RECVDSTADDR is set we include the destination IP - * address as an option. With IP_RECVOPTS we include all - * the IP options. + * If any ancillary data items are wanted we add those. 
*/ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (udp_bits.udpb_recvdstaddr) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_addr); - UDP_STAT(us, udp_in_recvdstaddr); - } - - if (udp_bits.udpb_ip_recvpktinfo && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_pktinfo); - UDP_STAT(us, udp_ip_rcvpktinfo); - } - - if ((udp_bits.udpb_recvopts) && opt_len > 0) { - udi_size += sizeof (struct T_opthdr) + opt_len; - UDP_STAT(us, udp_in_recvopts); - } - - /* - * If the IP_RECVSLLA or the IP_RECVIF is set then allocate - * space accordingly - */ - if ((udp_bits.udpb_recvif) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); - UDP_STAT(us, udp_in_recvif); - } - - if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - UDP_STAT(us, udp_in_recvslla); - } - - if ((udp_bits.udpb_recvucred) && - (cr = msg_getcred(mp, &cpid)) != NULL) { - udi_size += sizeof (struct T_opthdr) + ucredsize; - UDP_STAT(us, udp_in_recvucred); - } - - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (udp_bits.udpb_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - UDP_STAT(us, udp_in_timestamp); - } - - /* - * If IP_RECVTTL is set allocate the appropriate sized buffer - */ - if (udp_bits.udpb_recvttl) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); - UDP_STAT(us, udp_in_recvttl); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } /* Allocate a message block for the T_UNITDATA_IND structure. 
*/ mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -3786,7 +2539,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) sin = (sin_t *)&tudi[1]; sin->sin_addr.s_addr = ((ipha_t *)rptr)->ipha_src; sin->sin_port = udpha->uha_src_port; - sin->sin_family = udp->udp_family; + sin->sin_family = connp->conn_family; *(uint32_t *)&sin->sin_zero[0] = 0; *(uint32_t *)&sin->sin_zero[4] = 0; @@ -3795,166 +2548,8 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) * IP_RECVTTL has been set. */ if (udi_size != 0) { - /* - * Copy in destination address before options to avoid - * any padding issues. 
- */ - char *dstopt; - - dstopt = (char *)&sin[1]; - if (udp_bits.udpb_recvdstaddr) { - struct T_opthdr *toh; - ipaddr_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVDSTADDR; - toh->len = sizeof (struct T_opthdr) + - sizeof (ipaddr_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (ipaddr_t *)dstopt; - *dstptr = ((ipha_t *)rptr)->ipha_dst; - dstopt += sizeof (ipaddr_t); - udi_size -= toh->len; - } - - if (udp_bits.udpb_recvopts && opt_len > 0) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVOPTS; - toh->len = sizeof (struct T_opthdr) + opt_len; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(rptr + IP_SIMPLE_HDR_LENGTH, dstopt, - opt_len); - dstopt += opt_len; - udi_size -= toh->len; - } - - if ((udp_bits.udpb_ip_recvpktinfo) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - struct T_opthdr *toh; - struct in_pktinfo *pktinfop; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pktinfop); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pktinfop = (struct in_pktinfo *)dstopt; - pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex; - pktinfop->ipi_spec_dst = - pinfo->ip_pkt_match_addr; - pktinfop->ipi_addr.s_addr = - ((ipha_t *)rptr)->ipha_dst; - - dstopt += sizeof (struct in_pktinfo); - udi_size -= toh->len; - } - - if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - - struct T_opthdr *toh; - struct sockaddr_dl *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVSLLA; - toh->len = sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (struct sockaddr_dl *)dstopt; - bcopy(&pinfo->ip_pkt_slla, dstptr, - sizeof (struct sockaddr_dl)); - dstopt += sizeof (struct 
sockaddr_dl); - udi_size -= toh->len; - } - - if ((udp_bits.udpb_recvif) && (pinfo != NULL) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - - struct T_opthdr *toh; - uint_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVIF; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint_t *)dstopt; - *dstptr = pinfo->ip_pkt_ifindex; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - - if (cr != NULL) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_UCRED; - toh->len = sizeof (struct T_opthdr) + ucredsize; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - (void) cred2ucred(cr, cpid, dstopt, rcr); - dstopt += ucredsize; - udi_size -= toh->len; - } - - if (udp_bits.udpb_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (char *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (char *)toh + toh->len; - udi_size -= toh->len; - } - - /* - * CAUTION: - * Due to aligment issues - * Processing of IP_RECVTTL option - * should always be the last. Adding - * any option processing after this will - * cause alignment panic. 
- */ - if (udp_bits.udpb_recvttl) { - struct T_opthdr *toh; - uint8_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVTTL; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint8_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint8_t *)dstopt; - *dstptr = ((ipha_t *)rptr)->ipha_ttl; - dstopt += sizeof (uint8_t); - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin[1], udi_size); } } else { sin6_t *sin6; @@ -3968,89 +2563,21 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) */ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); - if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS| - IPPF_RTHDR|IPPF_IFINDEX)) { - if ((udp_bits.udpb_ipv6_recvhopopts) && - (ipp.ipp_fields & IPPF_HOPOPTS)) { - size_t hlen; - - UDP_STAT(us, udp_in_recvhopopts); - hlen = copy_hop_opts(&ipp, NULL); - if (hlen == 0) - ipp.ipp_fields &= ~IPPF_HOPOPTS; - udi_size += hlen; - } - if (((udp_bits.udpb_ipv6_recvdstopts) || - udp_bits.udpb_old_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - UDP_STAT(us, udp_in_recvdstopts); - } - if ((((udp_bits.udpb_ipv6_recvdstopts) && - udp_bits.udpb_ipv6_recvrthdr && - (ipp.ipp_fields & IPPF_RTHDR)) || - (udp_bits.udpb_ipv6_recvrthdrdstopts)) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - UDP_STAT(us, udp_in_recvrtdstopts); - } - if ((udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR)) { - udi_size += sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - UDP_STAT(us, udp_in_recvrthdr); - } - if ((udp_bits.udpb_ip_recvpktinfo) && - (ipp.ipp_fields & IPPF_IFINDEX)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in6_pktinfo); - UDP_STAT(us, udp_in_recvpktinfo); - } - - } - if 
((udp_bits.udpb_recvucred) && - (cr = msg_getcred(mp, &cpid)) != NULL) { - udi_size += sizeof (struct T_opthdr) + ucredsize; - UDP_STAT(us, udp_in_recvucred); - } - - /* - * If SO_TIMESTAMP is set allocate the appropriate sized - * buffer. Since gethrestime() expects a pointer aligned - * argument, we allocate space necessary for extra - * alignment (even though it might not be used). - */ - if (udp_bits.udpb_timestamp) { - udi_size += sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - UDP_STAT(us, udp_in_timestamp); - } - - if (udp_bits.udpb_ipv6_recvhoplimit) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - UDP_STAT(us, udp_in_recvhoplimit); - } - - if (udp_bits.udpb_ipv6_recvtclass) { - udi_size += sizeof (struct T_opthdr) + sizeof (int); - UDP_STAT(us, udp_in_recvtclass); + if (recv_ancillary.crb_all != 0) { + udi_size += conn_recvancillary_size(connp, + recv_ancillary, ira, mp, &ipps); } mp1 = allocb(udi_size, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; + mp1->b_datap->db_type = M_PROTO; + tudi = (struct T_unitdata_ind *)mp1->b_rptr; + mp1->b_wptr = (uchar_t *)tudi + udi_size; tudi->PRIM_type = T_UNITDATA_IND; tudi->SRC_length = sizeof (sin6_t); tudi->SRC_offset = sizeof (struct T_unitdata_ind); @@ -4059,7 +2586,7 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin6_t)); tudi->OPT_length = udi_size; sin6 = (sin6_t *)&tudi[1]; - if (ipversion == IPV4_VERSION) { + if (ira->ira_flags & IRAF_IS_IPV4) { in6_addr_t v6dst; IN6_IPADDR_TO_V4MAPPED(((ipha_t *)rptr)->ipha_src, @@ -4069,196 +2596,43 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) sin6->sin6_flowinfo = 0; sin6->sin6_scope_id = 0; sin6->__sin6_src_id = 
ip_srcid_find_addr(&v6dst, - connp->conn_zoneid, us->us_netstack); + IPCL_ZONEID(connp), us->us_netstack); } else { + ip6h = (ip6_t *)rptr; + sin6->sin6_addr = ip6h->ip6_src; /* No sin6_flowinfo per API */ sin6->sin6_flowinfo = 0; - /* For link-scope source pass up scope id */ - if ((ipp.ipp_fields & IPPF_IFINDEX) && - IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - sin6->sin6_scope_id = ipp.ipp_ifindex; + /* For link-scope pass up scope id */ + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) + sin6->sin6_scope_id = ira->ira_ruifindex; else sin6->sin6_scope_id = 0; sin6->__sin6_src_id = ip_srcid_find_addr( - &ip6h->ip6_dst, connp->conn_zoneid, + &ip6h->ip6_dst, IPCL_ZONEID(connp), us->us_netstack); } sin6->sin6_port = udpha->uha_src_port; - sin6->sin6_family = udp->udp_family; + sin6->sin6_family = connp->conn_family; if (udi_size != 0) { - uchar_t *dstopt; - - dstopt = (uchar_t *)&sin6[1]; - if ((udp_bits.udpb_ip_recvpktinfo) && - (ipp.ipp_fields & IPPF_IFINDEX)) { - struct T_opthdr *toh; - struct in6_pktinfo *pkti; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pkti); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pkti = (struct in6_pktinfo *)dstopt; - if (ipversion == IPV6_VERSION) - pkti->ipi6_addr = ip6h->ip6_dst; - else - IN6_IPADDR_TO_V4MAPPED( - ((ipha_t *)rptr)->ipha_dst, - &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp.ipp_ifindex; - dstopt += sizeof (*pkti); - udi_size -= toh->len; - } - if (udp_bits.udpb_ipv6_recvhoplimit) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPLIMIT; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - if (ipversion == IPV6_VERSION) - *(uint_t *)dstopt = ip6h->ip6_hops; - else - *(uint_t *)dstopt = - ((ipha_t *)rptr)->ipha_ttl; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if 
(udp_bits.udpb_ipv6_recvtclass) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_TCLASS; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - if (ipversion == IPV6_VERSION) { - *(uint_t *)dstopt = - IPV6_FLOW_TCLASS(ip6h->ip6_flow); - } else { - ipha_t *ipha = (ipha_t *)rptr; - *(uint_t *)dstopt = - ipha->ipha_type_of_service; - } - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvhopopts) && - (ipp.ipp_fields & IPPF_HOPOPTS)) { - size_t hlen; - - hlen = copy_hop_opts(&ipp, dstopt); - dstopt += hlen; - udi_size -= hlen; - } - if ((udp_bits.udpb_ipv6_recvdstopts) && - (udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR) && - (ipp.ipp_fields & IPPF_RTDSTOPTS)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rtdstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rtdstopts, dstopt, - ipp.ipp_rtdstoptslen); - dstopt += ipp.ipp_rtdstoptslen; - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvrthdr) && - (ipp.ipp_fields & IPPF_RTHDR)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDR; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_rthdrlen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_rthdr, dstopt, ipp.ipp_rthdrlen); - dstopt += ipp.ipp_rthdrlen; - udi_size -= toh->len; - } - if ((udp_bits.udpb_ipv6_recvdstopts) && - (ipp.ipp_fields & IPPF_DSTOPTS)) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (struct T_opthdr) + - ipp.ipp_dstoptslen; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(ipp.ipp_dstopts, dstopt, - ipp.ipp_dstoptslen); - 
dstopt += ipp.ipp_dstoptslen; - udi_size -= toh->len; - } - if (cr != NULL) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_UCRED; - toh->len = sizeof (struct T_opthdr) + ucredsize; - toh->status = 0; - (void) cred2ucred(cr, cpid, &toh[1], rcr); - dstopt += toh->len; - udi_size -= toh->len; - } - if (udp_bits.udpb_timestamp) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_TIMESTAMP; - toh->len = sizeof (struct T_opthdr) + - sizeof (timestruc_t) + _POINTER_ALIGNMENT; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - /* Align for gethrestime() */ - dstopt = (uchar_t *)P2ROUNDUP((intptr_t)dstopt, - sizeof (intptr_t)); - gethrestime((timestruc_t *)dstopt); - dstopt = (uchar_t *)toh + toh->len; - udi_size -= toh->len; - } - - /* Consumed all of allocated space */ - ASSERT(udi_size == 0); + conn_recvancillary_add(connp, recv_ancillary, ira, + &ipps, (uchar_t *)&sin6[1], udi_size); } -#undef sin6 - /* No IP_RECVDSTADDR for IPv6. */ } - BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams); - if (options_mp != NULL) - freeb(options_mp); - - udp_ulp_recv(connp, mp); + /* Walk past the headers unless IP_RECVHDR was set. 
*/ + if (!udp->udp_rcvhdr) { + mp->b_rptr = rptr + hdr_length; + pkt_len -= hdr_length; + } + BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams); + udp_ulp_recv(connp, mp1, pkt_len, ira); return; tossit: freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); BUMP_MIB(&us->us_udp_mib, udpInErrors); } @@ -4386,23 +2760,34 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) needattr = B_TRUE; break; } + mutex_enter(&connp->conn_lock); + if (udp->udp_state == TS_DATA_XFER && + connp->conn_ixa->ixa_tsl != NULL) { + ts_label_t *tsl; + + tsl = connp->conn_ixa->ixa_tsl; + mlp.tme_flags |= MIB2_TMEF_IS_LABELED; + mlp.tme_doi = label2doi(tsl); + mlp.tme_label = *label2bslabel(tsl); + needattr = B_TRUE; + } + mutex_exit(&connp->conn_lock); /* * Create an IPv4 table entry for IPv4 entries and also * any IPv6 entries which are bound to in6addr_any * (i.e. anything a IPv4 peer could connect/send to). */ - if (udp->udp_ipversion == IPV4_VERSION || + if (connp->conn_ipversion == IPV4_VERSION || (udp->udp_state <= TS_IDLE && - IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src))) { + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { ude.udpEntryInfo.ue_state = state; /* * If in6addr_any this will set it to * INADDR_ANY */ - ude.udpLocalAddress = - V4_PART_OF_V6(udp->udp_v6src); - ude.udpLocalPort = ntohs(udp->udp_port); + ude.udpLocalAddress = connp->conn_laddr_v4; + ude.udpLocalPort = ntohs(connp->conn_lport); if (udp->udp_state == TS_DATA_XFER) { /* * Can potentially get here for @@ -4414,9 +2799,9 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) * this part of the code. */ ude.udpEntryInfo.ue_RemoteAddress = - V4_PART_OF_V6(udp->udp_v6dst); + connp->conn_faddr_v4; ude.udpEntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); + ntohs(connp->conn_fport); } else { ude.udpEntryInfo.ue_RemoteAddress = 0; ude.udpEntryInfo.ue_RemotePort = 0; @@ -4429,10 +2814,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) */ ude.udpInstance = (uint32_t)(uintptr_t)udp; ude.udpCreationProcess = - (udp->udp_open_pid < 0) ? 
+ (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - udp->udp_open_pid; - ude.udpCreationTime = udp->udp_open_time; + connp->conn_cpid; + ude.udpCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp_conn_ctl->b_cont, &mp_conn_tail, (char *)&ude, sizeof (ude)); @@ -4442,16 +2827,24 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) mp_attr_ctl->b_cont, &mp_attr_tail, (char *)&mlp, sizeof (mlp)); } - if (udp->udp_ipversion == IPV6_VERSION) { + if (connp->conn_ipversion == IPV6_VERSION) { ude6.udp6EntryInfo.ue_state = state; - ude6.udp6LocalAddress = udp->udp_v6src; - ude6.udp6LocalPort = ntohs(udp->udp_port); - ude6.udp6IfIndex = udp->udp_bound_if; + ude6.udp6LocalAddress = connp->conn_laddr_v6; + ude6.udp6LocalPort = ntohs(connp->conn_lport); + mutex_enter(&connp->conn_lock); + if (connp->conn_ixa->ixa_flags & + IXAF_SCOPEID_SET) { + ude6.udp6IfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + ude6.udp6IfIndex = connp->conn_bound_if; + } + mutex_exit(&connp->conn_lock); if (udp->udp_state == TS_DATA_XFER) { ude6.udp6EntryInfo.ue_RemoteAddress = - udp->udp_v6dst; + connp->conn_faddr_v6; ude6.udp6EntryInfo.ue_RemotePort = - ntohs(udp->udp_dstport); + ntohs(connp->conn_fport); } else { ude6.udp6EntryInfo.ue_RemoteAddress = sin6_null.sin6_addr; @@ -4464,10 +2857,10 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) */ ude6.udp6Instance = (uint32_t)(uintptr_t)udp; ude6.udp6CreationProcess = - (udp->udp_open_pid < 0) ? + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - udp->udp_open_pid; - ude6.udp6CreationTime = udp->udp_open_time; + connp->conn_cpid; + ude6.udp6CreationTime = connp->conn_open_time; (void) snmp_append_data2(mp6_conn_ctl->b_cont, &mp6_conn_tail, (char *)&ude6, @@ -4548,39 +2941,34 @@ udp_snmp_set(queue_t *q, t_scalar_t level, t_scalar_t name, * passed in mp. This message is freed. 
*/ static void -udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, - t_scalar_t err) +udp_ud_err(queue_t *q, mblk_t *mp, t_scalar_t err) { struct T_unitdata_req *tudr; mblk_t *mp1; + uchar_t *destaddr; + t_scalar_t destlen; uchar_t *optaddr; t_scalar_t optlen; - if (DB_TYPE(mp) == M_DATA) { - ASSERT(destaddr != NULL && destlen != 0); - optaddr = NULL; - optlen = 0; - } else { - if ((mp->b_wptr < mp->b_rptr) || - (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { - goto done; - } - tudr = (struct T_unitdata_req *)mp->b_rptr; - destaddr = mp->b_rptr + tudr->DEST_offset; - if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || - destaddr + tudr->DEST_length < mp->b_rptr || - destaddr + tudr->DEST_length > mp->b_wptr) { - goto done; - } - optaddr = mp->b_rptr + tudr->OPT_offset; - if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || - optaddr + tudr->OPT_length < mp->b_rptr || - optaddr + tudr->OPT_length > mp->b_wptr) { - goto done; - } - destlen = tudr->DEST_length; - optlen = tudr->OPT_length; + if ((mp->b_wptr < mp->b_rptr) || + (MBLKL(mp)) < sizeof (struct T_unitdata_req)) { + goto done; } + tudr = (struct T_unitdata_req *)mp->b_rptr; + destaddr = mp->b_rptr + tudr->DEST_offset; + if (destaddr < mp->b_rptr || destaddr >= mp->b_wptr || + destaddr + tudr->DEST_length < mp->b_rptr || + destaddr + tudr->DEST_length > mp->b_wptr) { + goto done; + } + optaddr = mp->b_rptr + tudr->OPT_offset; + if (optaddr < mp->b_rptr || optaddr >= mp->b_wptr || + optaddr + tudr->OPT_length < mp->b_rptr || + optaddr + tudr->OPT_length > mp->b_wptr) { + goto done; + } + destlen = tudr->DEST_length; + optlen = tudr->OPT_length; mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, (char *)optaddr, optlen, err); @@ -4685,1093 +3073,721 @@ retry: return (port); } +/* + * Handle T_UNITDATA_REQ with options. Both IPv4 and IPv6 + * Either tudr_mp or msg is set. If tudr_mp we take ancillary data from + * the TPI options, otherwise we take them from msg_control. 
+ * If both sin and sin6 is set it is a connected socket and we use conn_faddr. + * Always consumes mp; never consumes tudr_mp. + */ static int -udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst) +udp_output_ancillary(conn_t *connp, sin_t *sin, sin6_t *sin6, mblk_t *mp, + mblk_t *tudr_mp, struct nmsghdr *msg, cred_t *cr, pid_t pid) { - int err; - cred_t *cred; - cred_t *orig_cred = NULL; - cred_t *effective_cred = NULL; - uchar_t opt_storage[IP_MAX_OPT_LENGTH]; - udp_t *udp = Q_TO_UDP(wq); + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; + int error; + ip_xmit_attr_t *ixa; + ip_pkt_t *ipp; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; + uint32_t flowinfo; + uint_t srcid; + int is_absreq_failure = 0; + conn_opt_arg_t coas, *coa; - /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. - */ - cred = orig_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); + ASSERT(tudr_mp != NULL || msg != NULL); /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. tsol_check_dest() - * may create a new effective cred for this message with a - * modified label or label flags. Note that we use the cred/label - * from the message to handle MLP + * Get ixa before checking state to handle a disconnect race. + * + * We need an exclusive copy of conn_ixa since the ancillary data + * options might modify it. That copy has no pointers hence we + * need to set them up once we've parsed the ancillary data. 
*/ - if ((err = tsol_check_dest(cred, &dst, IPV4_VERSION, - udp->udp_connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); + } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label(cred, dst, opt_storage, - us->us_netstack->netstack_ip)) != 0) - goto done; + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_options(&udp->udp_ip_snd_options, - &udp->udp_ip_snd_options_len, &udp->udp_label_len, - opt_storage)) != 0) + /* Get a copy of conn_xmit_ipp since the options might change it */ + ipp = kmem_zalloc(sizeof (*ipp), KM_NOSLEEP); + if (ipp == NULL) { + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); + } + mutex_enter(&connp->conn_lock); + error = ip_pkt_copy(&connp->conn_xmit_ipp, ipp, KM_NOSLEEP); + mutex_exit(&connp->conn_lock); + if (error != 0) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); goto done; + } /* - * Save the destination address and creds we used to - * generate the security label text. + * Parse the options and update ixa and ipp as a result. + * Note that ixa_tsl can be updated if SCM_UCRED. + * ixa_refrele/ixa_inactivate will release any reference on ixa_tsl. 
*/ - if (cred != udp->udp_effective_cred) { - if (udp->udp_effective_cred != NULL) - crfree(udp->udp_effective_cred); - crhold(cred); - udp->udp_effective_cred = cred; - } - if (orig_cred != udp->udp_last_cred) { - if (udp->udp_last_cred != NULL) - crfree(udp->udp_last_cred); - crhold(orig_cred); - udp->udp_last_cred = orig_cred; - } -done: - if (effective_cred != NULL) - crfree(effective_cred); - if (err != 0) { - DTRACE_PROBE4( - tx__ip__log__info__updatelabel__udp, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, wq, char *, opt_storage, mblk_t *, mp); - } - return (err); -} + coa = &coas; + coa->coa_connp = connp; + coa->coa_ixa = ixa; + coa->coa_ipp = ipp; + coa->coa_ancillary = B_TRUE; + coa->coa_changed = 0; -static mblk_t * -udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, - uint_t srcid, int *error, boolean_t insert_spi, struct nmsghdr *msg, - cred_t *cr, pid_t pid) -{ - udp_t *udp = connp->conn_udp; - mblk_t *mp1 = mp; - mblk_t *mp2; - ipha_t *ipha; - int ip_hdr_length; - uint32_t ip_len; - udpha_t *udpha; - boolean_t lock_held = B_FALSE; - in_port_t uha_src_port; - udpattrs_t attrs; - uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; - uint32_t ip_snd_opt_len = 0; - ip4_pkt_t pktinfo; - ip4_pkt_t *pktinfop = &pktinfo; - ip_opt_info_t optinfo; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - udp_stack_t *us = udp->udp_us; - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - queue_t *q = connp->conn_wq; - ire_t *ire; - in6_addr_t v6dst; - boolean_t update_lastdst = B_FALSE; - - *error = 0; - pktinfop->ip4_ill_index = 0; - pktinfop->ip4_addr = INADDR_ANY; - optinfo.ip_opt_flags = 0; - optinfo.ip_opt_ill_index = 0; + if (msg != NULL) { + error = process_auxiliary_options(connp, msg->msg_control, + msg->msg_controllen, coa, &udp_opt_obj, udp_opt_set, cr); + } else { + struct T_unitdata_req *tudr; - if (v4dst == INADDR_ANY) - v4dst = htonl(INADDR_LOOPBACK); + tudr = (struct T_unitdata_req *)tudr_mp->b_rptr; 
+ ASSERT(tudr->PRIM_type == T_UNITDATA_REQ); + error = tpi_optcom_buf(connp->conn_wq, tudr_mp, + &tudr->OPT_length, tudr->OPT_offset, cr, &udp_opt_obj, + coa, &is_absreq_failure); + } + if (error != 0) { + /* + * Note: No special action needed in this + * module for "is_absreq_failure" + */ + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; + } + ASSERT(is_absreq_failure == 0); + mutex_enter(&connp->conn_lock); /* - * If options passed in, feed it for verification and handling + * If laddr is unspecified then we look at sin6_src_id. + * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - attrs.udpattr_credset = B_FALSE; - if (IPCL_IS_NONSTR(connp)) { - if (msg->msg_controllen != 0) { - attrs.udpattr_ipp4 = pktinfop; - attrs.udpattr_mb = mp; - - rw_enter(&udp->udp_rwlock, RW_WRITER); - *error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - &attrs, &udp_opt_obj, udp_opt_set, cr); - rw_exit(&udp->udp_rwlock); - if (*error) - goto done; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else if (sin6 != NULL) { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - } else { - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req *) - mp->b_rptr)->OPT_length != 0) { - 
attrs.udpattr_ipp4 = pktinfop; - attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, - &attrs) < 0) - goto done; - /* - * Note: success in processing options. - * mp option buffer represented by - * OPT_length/offset now potentially modified - * and contain option setting results - */ - ASSERT(*error == 0); - } + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; + else + ixa->ixa_flags &= ~IXAF_IS_IPV4; + } else { + /* Connected case */ + v6dst = connp->conn_faddr_v6; + dstport = connp->conn_fport; + flowinfo = connp->conn_flowinfo; } + mutex_exit(&connp->conn_lock); - /* mp1 points to the M_DATA mblk carrying the packet */ - ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); - - /* - * Determine whether we need to mark the mblk with the user's - * credentials. - * If labeled then sockfs would have already done this. - */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - - ire = connp->conn_ire_cache; - if (CLASSD(v4dst) || (ire == NULL) || (ire->ire_addr != v4dst) || - (ire->ire_type & (IRE_BROADCAST | IRE_LOCAL | IRE_LOOPBACK))) { - if (cr != NULL && msg_getcred(mp, NULL) == NULL) - mblk_setcred(mp, cr, pid); + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (ipp->ipp_fields & IPPF_ADDR)) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } } - rw_enter(&udp->udp_rwlock, RW_READER); - lock_held = B_TRUE; + ip_attr_nexthop(ipp, ixa, &v6dst, &v6nexthop); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC); - /* - * Cluster and TSOL note: - * udp.udp_v6lastdst is shared by Cluster and TSOL - * udp.udp_lastdstport is used by Cluster - * - * Both Cluster and TSOL need to update the dest addr and/or port. - * Updating is done after both Cluster and TSOL checks, protected - * by conn_lock. - */ - mutex_enter(&connp->conn_lock); - - if (cl_inet_connect2 != NULL && - (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) || - V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst || - udp->udp_lastdstport != port)) { - mutex_exit(&connp->conn_lock); - *error = 0; - IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, port, *error); - if (*error != 0) { - *error = EHOSTUNREACH; - goto done; + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. 
+ */ + error = 0; + break; } - update_lastdst = B_TRUE; - mutex_enter(&connp->conn_lock); + /* FALLTHRU */ + default: + failed: + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } /* - * Check if our saved options are valid; update if not. - * TSOL Note: Since we are not in WRITER mode, UDP packets - * to different destination may require different labels, - * or worse, UDP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, ip_snd_options, - * and ip_snd_options_len are consistent for the current - * destination and are updated atomically. + * We might be going to a different destination than last time, + * thus check that TX allows the communication and compute any + * needed label. + * + * TSOL Note: We have an exclusive ipp and ixa for this thread so we + * don't have to worry about concurrent threads. */ if (is_system_labeled()) { - cred_t *credp; - pid_t cpid; - /* Using UDP MLP requires SCM_UCRED from user */ if (connp->conn_mlp_type != mlptSingle && - !attrs.udpattr_credset) { - mutex_exit(&connp->conn_lock); - DTRACE_PROBE4( - tx__ip__log__info__output__udp, - char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)", - mblk_t *, mp, udpattrs_t *, &attrs, queue_t *, q); - *error = EINVAL; + !((ixa->ixa_flags & IXAF_UCRED_TSL))) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + error = ECONNREFUSED; + freemsg(mp); goto done; } /* - * Update label option for this UDP socket if - * - the destination has changed, - * - the UDP socket is MLP, or - * - the cred attached to the mblk changed. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. 
+ * + * Tsol note: We have ipp structure local to this thread so + * no locking is needed. */ - credp = msg_getcred(mp, &cpid); - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6lastdst) || - V4_PART_OF_V6(udp->udp_v6lastdst) != v4dst || - connp->conn_mlp_type != mlptSingle || - credp != udp->udp_last_cred) { - if ((*error = udp_update_label(q, mp, v4dst)) != 0) { - mutex_exit(&connp->conn_lock); - goto done; - } - update_lastdst = B_TRUE; + error = conn_update_label(connp, ixa, &v6dst, ipp); + if (error != 0) { + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } - - /* - * Attach the effective cred to the mblk to ensure future - * routing decisions will be based on it's label. - */ - mblk_setcred(mp, udp->udp_effective_cred, cpid); } - if (update_lastdst) { - IN6_IPADDR_TO_V4MAPPED(v4dst, &udp->udp_v6lastdst); - udp->udp_lastdstport = port; + mp = udp_prepend_hdr(connp, ixa, ipp, &v6src, &v6dst, dstport, + flowinfo, mp, &error); + if (mp == NULL) { + ASSERT(error != 0); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + goto done; } - if (udp->udp_ip_snd_options_len > 0) { - ip_snd_opt_len = udp->udp_ip_snd_options_len; - bcopy(udp->udp_ip_snd_options, ip_snd_opt, ip_snd_opt_len); + if (ixa->ixa_pktlen > IP_MAXPACKET) { + error = EMSGSIZE; + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + goto done; } - mutex_exit(&connp->conn_lock); + /* We're done. Pass the packet to ip. */ + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - /* Add an IP header */ - ip_hdr_length = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + ip_snd_opt_len + - (insert_spi ? 
sizeof (uint32_t) : 0); - ipha = (ipha_t *)&mp1->b_rptr[-ip_hdr_length]; - if (DB_REF(mp1) != 1 || (uchar_t *)ipha < DB_BASE(mp1) || - !OK_32PTR(ipha)) { - mp2 = allocb(ip_hdr_length + us->us_wroff_extra, BPRI_LO); - if (mp2 == NULL) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "allocbfail2"); - *error = ENOMEM; - goto done; - } - mp2->b_wptr = DB_LIM(mp2); - mp2->b_cont = mp1; - mp1 = mp2; - if (DB_TYPE(mp) != M_DATA) - mp->b_cont = mp1; - else - mp = mp1; - ipha = (ipha_t *)(mp1->b_wptr - ip_hdr_length); - } - ip_hdr_length -= (UDPH_SIZE + (insert_spi ? sizeof (uint32_t) : 0)); -#ifdef _BIG_ENDIAN - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((((IP_VERSION << 4) | (ip_hdr_length>>2)) << 8) | - udp->udp_type_of_service); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (udp->udp_ttl << 8) | IPPROTO_UDP; -#else - /* Set version, header length, and tos */ - *(uint16_t *)&ipha->ipha_version_and_hdr_length = - ((udp->udp_type_of_service << 8) | - ((IP_VERSION << 4) | (ip_hdr_length>>2))); - /* Set ttl and protocol */ - *(uint16_t *)&ipha->ipha_ttl = (IPPROTO_UDP << 8) | udp->udp_ttl; -#endif - if (pktinfop->ip4_addr != INADDR_ANY) { - ipha->ipha_src = pktinfop->ip4_addr; - optinfo.ip_opt_flags = IP_VERIFY_SRC; - } else { + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: /* - * Copy our address into the packet. If this is zero, - * first look at __sin6_src_id for a hint. If we leave the - * source as INADDR_ANY then ip will fill in the real source - * address. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6src, ipha->ipha_src); - if (srcid != 0 && ipha->ipha_src == INADDR_ANY) { - in6_addr_t v6src; - - ip_srcid_find_id(srcid, &v6src, connp->conn_zoneid, - us->us_netstack); - IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); - } - } - uha_src_port = udp->udp_port; - if (ip_hdr_length == IP_SIMPLE_HDR_LENGTH) { - rw_exit(&udp->udp_rwlock); - lock_held = B_FALSE; - } - - if (pktinfop->ip4_ill_index != 0) { - optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } +done: + ixa_refrele(ixa); + ip_pkt_free(ipp); + kmem_free(ipp, sizeof (*ipp)); + return (error); +} - ipha->ipha_fragment_offset_and_flags = 0; - ipha->ipha_ident = 0; - - mp1->b_rptr = (uchar_t *)ipha; - - ASSERT((uintptr_t)(mp1->b_wptr - (uchar_t *)ipha) <= - (uintptr_t)UINT_MAX); +/* + * Handle sending an M_DATA for a connected socket. + * Handles both IPv4 and IPv6. + */ +static int +udp_output_connected(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid) +{ + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int error; + ip_xmit_attr_t *ixa; - /* Determine length of packet */ - ip_len = (uint32_t)(mp1->b_wptr - (uchar_t *)ipha); - if ((mp2 = mp1->b_cont) != NULL) { - do { - ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); - ip_len += (uint32_t)MBLKL(mp2); - } while ((mp2 = mp2->b_cont) != NULL); - } /* - * If the size of the packet is greater than the maximum allowed by - * ip, return an error. 
Passing this down could cause panics because - * the size will have wrapped and be inconsistent with the msg size. + * If no other thread is using conn_ixa this just gets a reference to + * conn_ixa. Otherwise we get a safe copy of conn_ixa. */ - if (ip_len > IP_MAXPACKET) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "IP length exceeded"); - *error = EMSGSIZE; - goto done; + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (ENOMEM); } - ipha->ipha_length = htons((uint16_t)ip_len); - ip_len -= ip_hdr_length; - ip_len = htons((uint16_t)ip_len); - udpha = (udpha_t *)(((uchar_t *)ipha) + ip_hdr_length); - - /* Insert all-0s SPI now. */ - if (insert_spi) - *((uint32_t *)(udpha + 1)) = 0; - /* - * Copy in the destination address - */ - ipha->ipha_dst = v4dst; - - /* - * Set ttl based on IP_MULTICAST_TTL to match IPv6 logic. - */ - if (CLASSD(v4dst)) - ipha->ipha_ttl = udp->udp_multicast_ttl; - - udpha->uha_dst_port = port; - udpha->uha_src_port = uha_src_port; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - if (ip_snd_opt_len > 0) { - uint32_t cksum; + mutex_enter(&connp->conn_lock); + mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_saddr_v6, + connp->conn_fport, connp->conn_flowinfo, &error); - bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len); - lock_held = B_FALSE; - rw_exit(&udp->udp_rwlock); - /* - * Massage source route putting first source route in ipha_dst. - * Ignore the destination in T_unitdata_req. - * Create a checksum adjustment for a source route, if any. 
- */ - cksum = ip_massage_options(ipha, us->us_netstack); - cksum = (cksum & 0xFFFF) + (cksum >> 16); - cksum -= ((ipha->ipha_dst >> 16) & 0xFFFF) + - (ipha->ipha_dst & 0xFFFF); - if ((int)cksum < 0) - cksum--; - cksum = (cksum & 0xFFFF) + (cksum >> 16); - /* - * IP does the checksum if uha_checksum is non-zero, - * We make it easy for IP to include our pseudo header - * by putting our length in uha_checksum. - */ - cksum += ip_len; - cksum = (cksum & 0xFFFF) + (cksum >> 16); - /* There might be a carry. */ - cksum = (cksum & 0xFFFF) + (cksum >> 16); -#ifdef _LITTLE_ENDIAN - if (us->us_do_checksum) - ip_len = (cksum << 16) | ip_len; -#else - if (us->us_do_checksum) - ip_len = (ip_len << 16) | cksum; - else - ip_len <<= 16; -#endif - } else { - /* - * IP does the checksum if uha_checksum is non-zero, - * We make it easy for IP to include our pseudo header - * by putting our length in uha_checksum. - */ - if (us->us_do_checksum) - ip_len |= (ip_len << 16); -#ifndef _LITTLE_ENDIAN - else - ip_len <<= 16; -#endif + if (mp == NULL) { + ASSERT(error != 0); + mutex_exit(&connp->conn_lock); + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (error); } - ASSERT(!lock_held); - /* Set UDP length and checksum */ - *((uint32_t *)&udpha->uha_length) = ip_len; - if (DB_TYPE(mp) != M_DATA) { - cred_t *cr; - pid_t cpid; + /* + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we need to fill in any pointers in it. 
+ */ + if (ixa->ixa_ire == NULL) { + in6_addr_t faddr, saddr; + in6_addr_t nexthop; + in_port_t fport; + + saddr = connp->conn_saddr_v6; + faddr = connp->conn_faddr_v6; + fport = connp->conn_fport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &faddr, &nexthop); + mutex_exit(&connp->conn_lock); - /* Move any cred from the T_UNITDATA_REQ to the packet */ - cr = msg_extractcred(mp, &cpid); - if (cr != NULL) { - if (mp1->b_datap->db_credp != NULL) - crfree(mp1->b_datap->db_credp); - mp1->b_datap->db_credp = cr; - mp1->b_datap->db_cpid = cpid; + error = ip_attr_connect(connp, ixa, &saddr, &faddr, &nexthop, + fport, NULL, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | + IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; + } + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); } - ASSERT(mp != mp1); - freeb(mp); + } else { + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); } - - /* mp has been consumed and we'll return success */ - ASSERT(*error == 0); - mp = NULL; + ASSERT(ixa->ixa_ire != NULL); /* We're done. Pass the packet to ip. 
*/ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "end"); - - if ((connp->conn_flags & IPCL_CHECK_POLICY) != 0 || - CONN_OUTBOUND_POLICY_PRESENT(connp, ipss) || - connp->conn_dontroute || - connp->conn_outgoing_ill != NULL || optinfo.ip_opt_flags != 0 || - optinfo.ip_opt_ill_index != 0 || - ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT, ipst) || - ipst->ips_ip_g_mrouter != NULL) { - UDP_STAT(us, udp_ip_send); - ip_output_options(connp, mp1, connp->conn_wq, IP_WPUT, - &optinfo); - } else { - udp_send_data(udp, connp->conn_wq, mp1, ipha); - } -done: - if (lock_held) - rw_exit(&udp->udp_rwlock); - if (*error != 0) { - ASSERT(mp != NULL); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + break; } - return (mp); + ixa_refrele(ixa); + return (error); } -static void -udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) +/* + * Handle sending an M_DATA to the last destination. + * Handles both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +udp_output_lastdst(conn_t *connp, mblk_t *mp, cred_t *cr, pid_t pid, + ip_xmit_attr_t *ixa) { - conn_t *connp = udp->udp_connp; - ipaddr_t src, dst; - ire_t *ire; - ipif_t *ipif = NULL; - mblk_t *ire_fp_mp; - boolean_t retry_caching; - udp_stack_t *us = udp->udp_us; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - dst = ipha->ipha_dst; - src = ipha->ipha_src; - ASSERT(ipha->ipha_ident == 0); - - if (CLASSD(dst)) { - int err; - - ipif = conn_get_held_ipif(connp, - &connp->conn_multicast_ipif, &err); - - if (ipif == NULL || ipif->ipif_isv6 || - (ipif->ipif_ill->ill_phyint->phyint_flags & - PHYI_LOOPBACK)) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ip_send); - ip_output(connp, mp, q, IP_WPUT); - return; - } - } + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + int error; - retry_caching = B_FALSE; - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); - if (ire == NULL || ire->ire_addr != dst || - (ire->ire_marks & IRE_MARK_CONDEMNED)) { - retry_caching = B_TRUE; - } else if (CLASSD(dst) && (ire->ire_type & IRE_CACHE)) { - ill_t *stq_ill = (ill_t *)ire->ire_stq->q_ptr; + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; - ASSERT(ipif != NULL); - if (!IS_ON_SAME_LAN(stq_ill, ipif->ipif_ill)) - retry_caching = B_TRUE; - } + mp = udp_prepend_header_template(connp, ixa, mp, &connp->conn_v6lastsrc, + connp->conn_lastdstport, connp->conn_lastflowinfo, &error); - if (!retry_caching) { - ASSERT(ire != NULL); - IRE_REFHOLD(ire); + if (mp == NULL) { + ASSERT(error != 0); mutex_exit(&connp->conn_lock); - } else { - boolean_t cached = B_FALSE; + ixa_refrele(ixa); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return (error); + } - connp->conn_ire_cache = NULL; + /* + * In case we got a safe copy of conn_ixa, or if opt_set made us a new + * safe copy, then we 
need to fill in any pointers in it. + */ + if (ixa->ixa_ire == NULL) { + in6_addr_t lastdst, lastsrc; + in6_addr_t nexthop; + in_port_t lastport; + + lastsrc = connp->conn_v6lastsrc; + lastdst = connp->conn_v6lastdst; + lastport = connp->conn_lastdstport; + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &lastdst, &nexthop); mutex_exit(&connp->conn_lock); - /* Release the old ire */ - if (ire != NULL) { - IRE_REFRELE_NOTR(ire); - ire = NULL; - } - - if (CLASSD(dst)) { - ASSERT(ipif != NULL); - ire = ire_ctable_lookup(dst, 0, 0, ipif, - connp->conn_zoneid, msg_getlabel(mp), - MATCH_IRE_ILL, ipst); - } else { - ASSERT(ipif == NULL); - ire = ire_cache_lookup(dst, connp->conn_zoneid, - msg_getlabel(mp), ipst); - } - - if (ire == NULL) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ire_null); - ip_output(connp, mp, q, IP_WPUT); - return; - } - IRE_REFHOLD_NOTR(ire); - - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp) && connp->conn_ire_cache == NULL && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - irb_t *irb = ire->ire_bucket; - + error = ip_attr_connect(connp, ixa, &lastsrc, &lastdst, + &nexthop, lastport, NULL, NULL, IPDF_ALLOW_MCBC | + IPDF_VERIFY_DST | IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * IRE's created for non-connection oriented transports - * are normally initialized with IRE_MARK_TEMPORARY set - * in the ire_marks. These IRE's are preferentially - * reaped when the hash chain length in the cache - * bucket exceeds the maximum value specified in - * ip[6]_ire_max_bucket_cnt. This can severely affect - * UDP performance if IRE cache entries that we need - * to reuse are continually removed. To remedy this, - * when we cache the IRE in the conn_t, we remove the - * IRE_MARK_TEMPORARY bit from the ire_marks if it was - * set. + * IXAF_VERIFY_SOURCE tells us to pick a better source. 
+ * Don't have the application see that errno */ - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - rw_enter(&irb->irb_lock, RW_WRITER); - if (ire->ire_marks & IRE_MARK_TEMPORARY) { - ire->ire_marks &= ~IRE_MARK_TEMPORARY; - irb->irb_tmp_ire_cnt--; - } - rw_exit(&irb->irb_lock); + error = ENETUNREACH; + goto failed; + case ENETDOWN: + /* + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. + */ + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } - connp->conn_ire_cache = ire; - cached = B_TRUE; + /* FALLTHRU */ + default: + failed: + ixa_refrele(ixa); + freemsg(mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); } + } else { + /* Done with conn_t */ mutex_exit(&connp->conn_lock); - - /* - * We can continue to use the ire but since it was not - * cached, we should drop the extra reference. - */ - if (!cached) - IRE_REFRELE_NOTR(ire); } - ASSERT(ire != NULL && ire->ire_ipversion == IPV4_VERSION); - ASSERT(!CLASSD(dst) || ipif != NULL); - /* - * Check if we can take the fast-path. - * Note that "incomplete" ire's (where the link-layer for next hop - * is not resolved, or where the fast-path header in nce_fp_mp is not - * available yet) are sent down the legacy (slow) path - */ - if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) || - (ire->ire_flags & RTF_MULTIRT) || (ire->ire_stq == NULL) || - (ire->ire_max_frag < ntohs(ipha->ipha_length)) || - ((ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL)) || - connp->conn_nexthop_set || (MBLKL(ire_fp_mp) > MBLKHEAD(mp))) { - if (ipif != NULL) - ipif_refrele(ipif); - UDP_STAT(us, udp_ip_ire_send); - IRE_REFRELE(ire); - ip_output(connp, mp, q, IP_WPUT); - return; - } + /* We're done. Pass the packet to ip. 
 */ + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - if (src == INADDR_ANY && !connp->conn_unspec_src) { - if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC)) - ipha->ipha_src = ipif->ipif_src_addr; + error = conn_ip_output(mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: + break; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; + break; + case EADDRNOTAVAIL: + /* + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno + */ + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); + /* + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. + */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; else - ipha->ipha_src = ire->ire_src_addr; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + break; } - - if (ipif != NULL) - ipif_refrele(ipif); - - udp_xmit(connp->conn_wq, mp, ire, connp, connp->conn_zoneid); + ixa_refrele(ixa); + return (error); } -static void -udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) + +/* + * Prepend the header template and then fill in the source and + * flowinfo. The caller needs to handle the destination address since + * its setting is different if rthdr or source route. + * + * Returns NULL if allocation failed or if the packet would exceed IP_MAXPACKET. + * When it returns NULL it sets errorp. 
+ */ +static mblk_t * +udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, + const in6_addr_t *v6src, in_port_t dstport, uint32_t flowinfo, int *errorp) { - ipaddr_t src, dst; - ill_t *ill; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint16_t *up; - uint32_t cksum, hcksum_txflags; - queue_t *dev_q; - udp_t *udp = connp->conn_udp; - ipha_t *ipha = (ipha_t *)mp->b_rptr; + udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - boolean_t ll_multicast = B_FALSE; - boolean_t direct_send; - - dev_q = ire->ire_stq->q_next; - ASSERT(dev_q != NULL); + boolean_t insert_spi = udp->udp_nat_t_endpoint; + uint_t pktlen; + uint_t alloclen; + uint_t copylen; + uint8_t *iph; + uint_t ip_hdr_length; + udpha_t *udpha; + uint32_t cksum; + ip_pkt_t *ipp; - ill = ire_to_ill(ire); - ASSERT(ill != NULL); + ASSERT(MUTEX_HELD(&connp->conn_lock)); /* - * For the direct send case, if resetting of conn_direct_blocked - * was missed, it is still ok because the putq() would enable - * the queue and write service will drain it out. + * Copy the header template and leave space for an SPI */ - direct_send = ILL_DIRECT_CAPABLE(ill); - - /* is queue flow controlled? 
*/ - if ((!direct_send) && (q->q_first != NULL || connp->conn_draining || - DEV_Q_FLOW_BLOCKED(dev_q))) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - if (ipst->ips_ip_output_queue) { - DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp); - (void) putq(connp->conn_wq, mp); - } else { - freemsg(mp); - } - ire_refrele(ire); - return; - } - - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - ASSERT(MBLKHEAD(mp) >= ire_fp_mp_len); - - dst = ipha->ipha_dst; - src = ipha->ipha_src; - - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } else { - hcksum_txflags = 0; - } - - /* pseudo-header checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); - up = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - if (*up != 0) { - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, - mp, ipha, up, IPPROTO_UDP, IP_SIMPLE_HDR_LENGTH, - ntohs(ipha->ipha_length), cksum); - - /* Software checksum? */ - if (DB_CKSUMFLAGS(mp) == 0) { - UDP_STAT(us, udp_out_sw_cksum); - UDP_STAT_UPDATE(us, udp_out_sw_cksum_bytes, - ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); - } - } - - if (!CLASSD(dst)) { - ipha->ipha_fragment_offset_and_flags |= - (uint32_t)htons(ire->ire_frag_flag); - } - - /* Calculate IP header checksum if hardware isn't capable */ - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); + copylen = connp->conn_ht_iphc_len; + alloclen = copylen + (insert_spi ? 
sizeof (uint32_t) : 0); + pktlen = alloclen + msgdsize(mp); + if (pktlen > IP_MAXPACKET) { + freemsg(mp); + *errorp = EMSGSIZE; + return (NULL); } + ixa->ixa_pktlen = pktlen; - if (CLASSD(dst)) { - if (ilm_lookup_ill(ill, dst, ALL_ZONES) != NULL) { - ip_multicast_loopback(q, ill, mp, - connp->conn_multicast_loop ? 0 : - IP_FF_NO_MCAST_LOOP, zoneid); - } + /* check/fix buffer config, setup pointers into it */ + iph = mp->b_rptr - alloclen; + if (DB_REF(mp) != 1 || iph < DB_BASE(mp) || !OK_32PTR(iph)) { + mblk_t *mp1; - /* If multicast TTL is 0 then we are done */ - if (ipha->ipha_ttl == 0) { + mp1 = allocb(alloclen + us->us_wroff_extra, BPRI_MED); + if (mp1 == NULL) { freemsg(mp); - ire_refrele(ire); - return; + *errorp = ENOMEM; + return (NULL); } - ll_multicast = B_TRUE; + mp1->b_wptr = DB_LIM(mp1); + mp1->b_cont = mp; + mp = mp1; + iph = (mp->b_wptr - alloclen); } + mp->b_rptr = iph; + bcopy(connp->conn_ht_iphc, iph, copylen); + ip_hdr_length = (uint_t)(connp->conn_ht_ulp - connp->conn_ht_iphc); - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); + ixa->ixa_ip_hdr_length = ip_hdr_length; + udpha = (udpha_t *)(iph + ip_hdr_length); - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, ill, ipha, mp, mp, - ll_multicast, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - if (ipst->ips_ip4_observe.he_interested && mp != NULL) { - zoneid_t szone; - - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. 
- */ - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= ire_fp_mp_len; - } + /* + * Setup header length and prepare for ULP checksum done in IP. + * udp_build_hdr_template has already massaged any routing header + * and placed the result in conn_sum. + * + * We make it easy for IP to include our pseudo header + * by putting our length in uha_checksum. + */ + cksum = pktlen - ip_hdr_length; + udpha->uha_length = htons(cksum); - if (mp == NULL) - goto bail; + cksum += connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + ipp = &connp->conn_xmit_ipp; + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)iph; - if (direct_send) { - uintptr_t cookie; - ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + ipha->ipha_length = htons((uint16_t)pktlen); - cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, - (uintptr_t)connp, 0); - if (cookie != NULL) { - idl_tx_list_t *idl_txl; + /* IP does the checksum if uha_checksum is non-zero */ + if (us->us_do_checksum) + udpha->uha_checksum = htons(cksum); - /* - * Flow controlled. - */ - DTRACE_PROBE2(non__null__cookie, uintptr_t, - cookie, conn_t *, connp); - idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; - mutex_enter(&idl_txl->txl_lock); - /* - * Check again after holding txl_lock to see if Tx - * ring is still blocked and only then insert the - * connp into the drain list. 
- */ - if (connp->conn_direct_blocked || - (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, - cookie) == 0)) { - mutex_exit(&idl_txl->txl_lock); - goto bail; - } - if (idl_txl->txl_cookie != NULL && - idl_txl->txl_cookie != cookie) { - DTRACE_PROBE2(udp__xmit__collision, - uintptr_t, cookie, - uintptr_t, idl_txl->txl_cookie); - UDP_STAT(us, udp_cookie_coll); - } else { - connp->conn_direct_blocked = B_TRUE; - idl_txl->txl_cookie = cookie; - conn_drain_insert(connp, idl_txl); - DTRACE_PROBE1(udp__xmit__insert, - conn_t *, connp); - } - mutex_exit(&idl_txl->txl_lock); + /* if IP_PKTINFO specified an address it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(ipp->ipp_addr_v4 != INADDR_ANY); + ipha->ipha_src = ipp->ipp_addr_v4; + } else { + IN6_V4MAPPED_TO_IPADDR(v6src, ipha->ipha_src); + } } else { - DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp); - putnext(ire->ire_stq, mp); - } -bail: - IRE_REFRELE(ire); -} + ip6_t *ip6h = (ip6_t *)iph; -static boolean_t -udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst) -{ - udp_t *udp = Q_TO_UDP(wq); - int err; - cred_t *cred; - cred_t *orig_cred; - cred_t *effective_cred = NULL; - uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; - udp_stack_t *us = udp->udp_us; - - /* - * All Solaris components should pass a db_credp - * for this message, hence we ASSERT. - * On production kernels we return an error to be robust against - * random streams modules sitting on top of us. - */ - cred = orig_cred = msg_getcred(mp, NULL); - ASSERT(cred != NULL); - if (cred == NULL) - return (EINVAL); - - /* - * Verify the destination is allowed to receive packets at - * the security label of the message data. tsol_check_dest() - * may create a new effective cred for this message with a - * modified label or label flags. Note that we use the - * cred/label from the message to handle MLP. 
- */ - if ((err = tsol_check_dest(cred, dst, IPV6_VERSION, - udp->udp_connp->conn_mac_mode, &effective_cred)) != 0) - goto done; - if (effective_cred != NULL) - cred = effective_cred; - - /* - * Calculate the security label to be placed in the text - * of the message (if any). - */ - if ((err = tsol_compute_label_v6(cred, dst, opt_storage, - us->us_netstack->netstack_ip)) != 0) - goto done; - - /* - * Insert the security label in the cached ip options, - * removing any old label that may exist. - */ - if ((err = tsol_update_sticky(&udp->udp_sticky_ipp, - &udp->udp_label_len_v6, opt_storage)) != 0) - goto done; + ip6h->ip6_plen = htons((uint16_t)(pktlen - IPV6_HDR_LEN)); + udpha->uha_checksum = htons(cksum); - /* - * Save the destination address and cred we used to - * generate the security label text. - */ - if (cred != udp->udp_effective_cred) { - if (udp->udp_effective_cred != NULL) - crfree(udp->udp_effective_cred); - crhold(cred); - udp->udp_effective_cred = cred; - } - if (orig_cred != udp->udp_last_cred) { - if (udp->udp_last_cred != NULL) - crfree(udp->udp_last_cred); - crhold(orig_cred); - udp->udp_last_cred = orig_cred; + /* if IP_PKTINFO specified an address it wins over bind() */ + if ((ipp->ipp_fields & IPPF_ADDR) && + !IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)); + ip6h->ip6_src = ipp->ipp_addr; + } else { + ip6h->ip6_src = *v6src; + } + ip6h->ip6_vcf = + (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | + (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); + if (ipp->ipp_fields & IPPF_TCLASS) { + /* Overrides the class part of flowinfo */ + ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, + ipp->ipp_tclass); + } } -done: - if (effective_cred != NULL) - crfree(effective_cred); 
*/ + if (insert_spi) + *((uint32_t *)(udpha + 1)) = 0; - if (err != 0) { - DTRACE_PROBE4( - tx__ip__log__drop__updatelabel__udp6, - char *, "queue(1) failed to update options(2) on mp(3)", - queue_t *, wq, char *, opt_storage, mblk_t *, mp); - } - return (err); + udpha->uha_dst_port = dstport; + return (mp); } -static int -udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr, - pid_t pid) +/* + * Send a T_UDERR_IND in response to an M_DATA + */ +static void +udp_ud_err_connected(conn_t *connp, t_scalar_t error) { - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; - ipaddr_t v4dst; - in_port_t dstport; - boolean_t mapped_addr; struct sockaddr_storage ss; sin_t *sin; sin6_t *sin6; struct sockaddr *addr; socklen_t addrlen; - int error; - boolean_t insert_spi = udp->udp_nat_t_endpoint; - - /* M_DATA for connected socket */ - - ASSERT(udp->udp_issocket); - UDP_DBGSTAT(us, udp_data_conn); + mblk_t *mp1; mutex_enter(&connp->conn_lock); - if (udp->udp_state != TS_DATA_XFER) { - mutex_exit(&connp->conn_lock); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); - UDP_STAT(us, udp_out_err_notconn); - freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: connp %p (%S)", connp, - "not-connected; address required"); - return (EDESTADDRREQ); - } - - mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst); - if (mapped_addr) - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst); - /* Initialize addr and addrlen as if they're passed in */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = (sin_t *)&ss; + *sin = sin_null; sin->sin_family = AF_INET; - dstport = sin->sin_port = udp->udp_dstport; - ASSERT(mapped_addr); - sin->sin_addr.s_addr = v4dst; + sin->sin_port = connp->conn_fport; + sin->sin_addr.s_addr = connp->conn_faddr_v4; addr = (struct sockaddr *)sin; addrlen = sizeof (*sin); } else { sin6 = (sin6_t *)&ss; + *sin6 = sin6_null; sin6->sin6_family = AF_INET6; - dstport = sin6->sin6_port = udp->udp_dstport; - 
sin6->sin6_flowinfo = udp->udp_flowinfo; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_scope_id = 0; + sin6->sin6_port = connp->conn_fport; + sin6->sin6_flowinfo = connp->conn_flowinfo; + sin6->sin6_addr = connp->conn_faddr_v6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6) && + (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = connp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } sin6->__sin6_src_id = 0; addr = (struct sockaddr *)sin6; addrlen = sizeof (*sin6); } mutex_exit(&connp->conn_lock); - if (mapped_addr) { - /* - * Handle both AF_INET and AF_INET6; the latter - * for IPV4 mapped destination addresses. Note - * here that both addr and addrlen point to the - * corresponding struct depending on the address - * family of the socket. - */ - mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error, - insert_spi, msg, cr, pid); - } else { - mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, pid); - } - if (error == 0) { - ASSERT(mp == NULL); - return (0); - } - - UDP_STAT(us, udp_out_err_output); - ASSERT(mp != NULL); - if (IPCL_IS_NONSTR(connp)) { - freemsg(mp); - return (error); - } else { - /* mp is freed by the following routine */ - udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, - (t_scalar_t)addrlen, (t_scalar_t)error); - return (0); - } -} - -/* ARGSUSED */ -static int -udp_send_not_connected(conn_t *connp, mblk_t *mp, struct sockaddr *addr, - socklen_t addrlen, struct nmsghdr *msg, cred_t *cr, pid_t pid) -{ - - udp_t *udp = connp->conn_udp; - boolean_t insert_spi = udp->udp_nat_t_endpoint; - int error = 0; - sin6_t *sin6; - sin_t *sin; - uint_t srcid; - uint16_t port; - ipaddr_t v4dst; - - - ASSERT(addr != NULL); - - switch (udp->udp_family) { - case AF_INET6: - sin6 = (sin6_t *)addr; - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - /* - * Destination is a non-IPv4-compatible IPv6 address. - * Send out an IPv6 format packet. 
- */ - mp = udp_output_v6(connp, mp, sin6, &error, msg, cr, - pid); - if (error != 0) - goto ud_error; - - return (0); - } - /* - * If the local address is not zero or a mapped address - * return an error. It would be possible to send an IPv4 - * packet but the response would never make it back to the - * application since it is bound to a non-mapped address. - */ - if (!IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - error = EADDRNOTAVAIL; - goto ud_error; - } - /* Send IPv4 packet without modifying udp_ipversion */ - /* Extract port and ipaddr */ - port = sin6->sin6_port; - IN6_V4MAPPED_TO_IPADDR(&sin6->sin6_addr, v4dst); - srcid = sin6->__sin6_src_id; - break; - - case AF_INET: - sin = (sin_t *)addr; - /* Extract port and ipaddr */ - port = sin->sin_port; - v4dst = sin->sin_addr.s_addr; - srcid = 0; - break; - } - - mp = udp_output_v4(connp, mp, v4dst, port, srcid, &error, insert_spi, - msg, cr, pid); - - if (error == 0) { - ASSERT(mp == NULL); - return (0); - } - -ud_error: - ASSERT(mp != NULL); - - return (error); + mp1 = mi_tpi_uderror_ind((char *)addr, addrlen, NULL, 0, error); + if (mp1 != NULL) + putnext(connp->conn_rq, mp1); } /* @@ -5788,15 +3804,20 @@ ud_error: void udp_wput(queue_t *q, mblk_t *mp) { + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; int error = 0; - struct sockaddr *addr; + struct sockaddr *addr = NULL; socklen_t addrlen; udp_stack_t *us = udp->udp_us; - - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, - "udp_wput_start: queue %p mp %p", q, mp); + struct T_unitdata_req *tudr; + mblk_t *data_mp; + ushort_t ipversion; + cred_t *cr; + pid_t pid; /* * We directly handle several cases here: T_UNITDATA_REQ message @@ -5805,910 +3826,612 @@ udp_wput(queue_t *q, mblk_t *mp) */ switch (DB_TYPE(mp)) { case M_DATA: - /* - * Quick check for error cases. 
Checks will be done again - * under the lock later on - */ if (!udp->udp_issocket || udp->udp_state != TS_DATA_XFER) { /* Not connected; address is required */ BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_DBGSTAT(us, udp_data_notconn); UDP_STAT(us, udp_out_err_notconn); freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: connp %p (%S)", connp, - "not-connected; address required"); return; } - (void) udp_send_connected(connp, mp, NULL, NULL, -1); + /* + * All Solaris components should pass a db_credp + * for this message, hence we ASSERT. + * On production kernels we return an error to be robust against + * random streams modules sitting on top of us. + */ + cr = msg_getcred(mp, &pid); + ASSERT(cr != NULL); + if (cr == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(mp); + return; + } + ASSERT(udp->udp_issocket); + UDP_DBGSTAT(us, udp_data_conn); + error = udp_output_connected(connp, mp, cr, pid); + if (error != 0) { + UDP_STAT(us, udp_out_err_output); + if (connp->conn_rq != NULL) + udp_ud_err_connected(connp, (t_scalar_t)error); +#ifdef DEBUG + printf("udp_output_connected returned %d\n", error); +#endif + } return; case M_PROTO: - case M_PCPROTO: { - struct T_unitdata_req *tudr; - - ASSERT((uintptr_t)MBLKL(mp) <= (uintptr_t)INT_MAX); + case M_PCPROTO: tudr = (struct T_unitdata_req *)mp->b_rptr; - - /* Handle valid T_UNITDATA_REQ here */ - if (MBLKL(mp) >= sizeof (*tudr) && - ((t_primp_t)mp->b_rptr)->type == T_UNITDATA_REQ) { - if (mp->b_cont == NULL) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EPROTO; - goto ud_error; - } - - if (!MBLKIN(mp, 0, tudr->DEST_offset + - tudr->DEST_length)) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "badaddr"); - error = EADDRNOTAVAIL; - goto ud_error; - } - /* - * If a port has not been bound to the stream, fail. 
- * This is not a problem when sockfs is directly - * above us, because it will ensure that the socket - * is first bound before allowing data to be sent. - */ - if (udp->udp_state == TS_UNBND) { - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: q %p (%S)", q, "outstate"); - error = EPROTO; - goto ud_error; - } - addr = (struct sockaddr *) - &mp->b_rptr[tudr->DEST_offset]; - addrlen = tudr->DEST_length; - if (tudr->OPT_length != 0) - UDP_STAT(us, udp_out_opt); - break; + if (MBLKL(mp) < sizeof (*tudr) || + ((t_primp_t)mp->b_rptr)->type != T_UNITDATA_REQ) { + udp_wput_other(q, mp); + return; } - /* FALLTHRU */ - } + break; + default: udp_wput_other(q, mp); return; } - ASSERT(addr != NULL); - error = udp_send_not_connected(connp, mp, addr, addrlen, NULL, NULL, - -1); - if (error != 0) { -ud_error: - UDP_STAT(us, udp_out_err_output); - ASSERT(mp != NULL); - /* mp is freed by the following routine */ - udp_ud_err(q, mp, (uchar_t *)addr, (t_scalar_t)addrlen, - (t_scalar_t)error); + /* Handle valid T_UNITDATA_REQ here */ + data_mp = mp->b_cont; + if (data_mp == NULL) { + error = EPROTO; + goto ud_error2; } -} + mp->b_cont = NULL; -/* ARGSUSED */ -static void -udp_wput_fallback(queue_t *wq, mblk_t *mp) -{ -#ifdef DEBUG - cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n"); -#endif - freemsg(mp); -} - - -/* - * udp_output_v6(): - * Assumes that udp_wput did some sanity checking on the destination - * address. 
- */ -static mblk_t * -udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error, - struct nmsghdr *msg, cred_t *cr, pid_t pid) -{ - ip6_t *ip6h; - ip6i_t *ip6i; /* mp1->b_rptr even if no ip6i_t */ - mblk_t *mp1 = mp; - mblk_t *mp2; - int udp_ip_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - size_t ip_len; - udpha_t *udph; - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; - queue_t *q = connp->conn_wq; - ip6_pkt_t ipp_s; /* For ancillary data options */ - ip6_pkt_t *ipp = &ipp_s; - ip6_pkt_t *tipp; /* temporary ipp */ - uint32_t csum = 0; - uint_t ignore = 0; - uint_t option_exists = 0, is_sticky = 0; - uint8_t *cp; - uint8_t *nxthdr_ptr; - in6_addr_t ip6_dst; - in_port_t port; - udpattrs_t attrs; - boolean_t opt_present; - ip6_hbh_t *hopoptsptr = NULL; - uint_t hopoptslen = 0; - boolean_t is_ancillary = B_FALSE; - size_t sth_wroff = 0; - ire_t *ire; - boolean_t update_lastdst = B_FALSE; - - *error = 0; - - /* - * If the local address is a mapped address return - * an error. - * It would be possible to send an IPv6 packet but the - * response would never make it back to the application - * since it is bound to a mapped address. - */ - if (IN6_IS_ADDR_V4MAPPED(&udp->udp_v6src)) { - *error = EADDRNOTAVAIL; - goto done; + if (!MBLKIN(mp, 0, tudr->DEST_offset + tudr->DEST_length)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - ipp->ipp_fields = 0; - ipp->ipp_sticky_ignored = 0; - /* - * If TPI options passed in, feed it for verification and handling + * All Solaris components should pass a db_credp + * for this TPI message, hence we should ASSERT. + * However, RPC (svc_clts_ksend) does this odd thing where it + * passes the options from a T_UNITDATA_IND unchanged in a + * T_UNITDATA_REQ. While that is the right thing to do for + * some options, SCM_UCRED being the key one, this also makes it + * pass down IP_RECVDSTADDR. Hence we can't ASSERT here. 
*/ - attrs.udpattr_credset = B_FALSE; - opt_present = B_FALSE; - if (IPCL_IS_NONSTR(connp)) { - if (msg->msg_controllen != 0) { - attrs.udpattr_ipp6 = ipp; - attrs.udpattr_mb = mp; - - rw_enter(&udp->udp_rwlock, RW_WRITER); - *error = process_auxiliary_options(connp, - msg->msg_control, msg->msg_controllen, - &attrs, &udp_opt_obj, udp_opt_set, cr); - rw_exit(&udp->udp_rwlock); - if (*error) - goto done; - ASSERT(*error == 0); - opt_present = B_TRUE; - } - } else { - if (DB_TYPE(mp) != M_DATA) { - mp1 = mp->b_cont; - if (((struct T_unitdata_req *) - mp->b_rptr)->OPT_length != 0) { - attrs.udpattr_ipp6 = ipp; - attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, - &attrs) < 0) { - goto done; - } - ASSERT(*error == 0); - opt_present = B_TRUE; - } - } + cr = msg_getcred(mp, &pid); + if (cr == NULL) { + cr = connp->conn_cred; + pid = connp->conn_cpid; } /* - * Determine whether we need to mark the mblk with the user's - * credentials. - * If labeled then sockfs would have already done this. + * If a port has not been bound to the stream, fail. + * This is not a problem when sockfs is directly + * above us, because it will ensure that the socket + * is first bound before allowing data to be sent. */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - ire = connp->conn_ire_cache; - if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr) || (ire == NULL) || - (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &sin6->sin6_addr)) || - (ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK))) { - if (cr != NULL && msg_getcred(mp, NULL) == NULL) - mblk_setcred(mp, cr, pid); - } - - rw_enter(&udp->udp_rwlock, RW_READER); - ignore = ipp->ipp_sticky_ignored; - - /* mp1 points to the M_DATA mblk carrying the packet */ - ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); - - if (sin6->sin6_scope_id != 0 && - IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) { - /* - * IPPF_SCOPE_ID is special. It's neither a sticky - * option nor ancillary data. It needs to be - * explicitly set in options_exists. 
- */ - option_exists |= IPPF_SCOPE_ID; + if (udp->udp_state == TS_UNBND) { + error = EPROTO; + goto ud_error2; } + addr = (struct sockaddr *)&mp->b_rptr[tudr->DEST_offset]; + addrlen = tudr->DEST_length; - /* - * Compute the destination address - */ - ip6_dst = sin6->sin6_addr; - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - ip6_dst = ipv6_loopback; - - port = sin6->sin6_port; - - /* - * Cluster and TSOL notes, Cluster check: - * see comments in udp_output_v4(). - */ - mutex_enter(&connp->conn_lock); - - if (cl_inet_connect2 != NULL && - (!IN6_ARE_ADDR_EQUAL(&ip6_dst, &udp->udp_v6lastdst) || - port != udp->udp_lastdstport)) { - mutex_exit(&connp->conn_lock); - *error = 0; - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &ip6_dst, port, *error); - if (*error != 0) { - *error = EHOSTUNREACH; - rw_exit(&udp->udp_rwlock); - goto done; + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)addr; + if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || + (sin6->sin6_family != AF_INET6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - update_lastdst = B_TRUE; - mutex_enter(&connp->conn_lock); - } - /* - * If we're not going to the same destination as last time, then - * recompute the label required. This is done in a separate routine to - * avoid blowing up our stack here. - * - * TSOL Note: Since we are not in WRITER mode, UDP packets - * to different destination may require different labels, - * or worse, UDP packets to same IP address may require - * different labels due to use of shared all-zones address. - * We use conn_lock to ensure that lastdst, sticky ipp_hopopts, - * and sticky ipp_hopoptslen are consistent for the current - * destination and are updated atomically. - */ - if (is_system_labeled()) { - cred_t *credp; - pid_t cpid; + srcid = sin6->__sin6_src_id; + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. 
+ */ - /* Using UDP MLP requires SCM_UCRED from user */ - if (connp->conn_mlp_type != mlptSingle && - !attrs.udpattr_credset) { - DTRACE_PROBE4( - tx__ip__log__info__output__udp6, - char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)", - mblk_t *, mp1, udpattrs_t *, &attrs, queue_t *, q); - *error = EINVAL; - rw_exit(&udp->udp_rwlock); - mutex_exit(&connp->conn_lock); - goto done; - } - /* - * update label option for this UDP socket if - * - the destination has changed, - * - the UDP socket is MLP, or - * - the cred attached to the mblk changed. - */ - credp = msg_getcred(mp, &cpid); - if (opt_present || - !IN6_ARE_ADDR_EQUAL(&udp->udp_v6lastdst, &ip6_dst) || - connp->conn_mlp_type != mlptSingle || - credp != udp->udp_last_cred) { - if ((*error = udp_update_label_v6(q, mp, &ip6_dst)) - != 0) { - rw_exit(&udp->udp_rwlock); - mutex_exit(&connp->conn_lock); - goto done; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. + */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - update_lastdst = B_TRUE; - } - /* - * Attach the effective cred to the mblk to ensure future - * routing decisions will be based on it's label. - */ - mblk_setcred(mp, udp->udp_effective_cred, cpid); - } - if (update_lastdst) { - udp->udp_v6lastdst = ip6_dst; - udp->udp_lastdstport = port; - } + UDP_DBGSTAT(us, udp_out_ipv6); - /* - * If there's a security label here, then we ignore any options the - * user may try to set. We keep the peer's label as a hidden sticky - * option. We make a private copy of this label before releasing the - * lock so that label is kept consistent with the destination addr. 
- */ - if (udp->udp_label_len_v6 > 0) { - ignore &= ~IPPF_HOPOPTS; - ipp->ipp_fields &= ~IPPF_HOPOPTS; - } + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + ipversion = IPV6_VERSION; + } else { + if (connp->conn_ipv6_v6only) { + error = EADDRNOTAVAIL; + goto ud_error2; + } - if ((udp->udp_sticky_ipp.ipp_fields == 0) && (ipp->ipp_fields == 0)) { - /* No sticky options nor ancillary data. */ - mutex_exit(&connp->conn_lock); - goto no_options; - } + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an + * IPv4 packet but the response would never make it + * back to the application since it is bound to a + * non-mapped address. + */ + if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + error = EADDRNOTAVAIL; + goto ud_error2; + } + UDP_DBGSTAT(us, udp_out_mapped); - /* - * Go through the options figuring out where each is going to - * come from and build two masks. The first mask indicates if - * the option exists at all. The second mask indicates if the - * option is sticky or ancillary. 
- */ - if (!(ignore & IPPF_HOPOPTS)) { - if (ipp->ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - udp_ip_hdr_len += ipp->ipp_hopoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_HOPOPTS) { - option_exists |= IPPF_HOPOPTS; - is_sticky |= IPPF_HOPOPTS; - ASSERT(udp->udp_sticky_ipp.ipp_hopoptslen != 0); - hopoptsptr = kmem_alloc( - udp->udp_sticky_ipp.ipp_hopoptslen, KM_NOSLEEP); - if (hopoptsptr == NULL) { - *error = ENOMEM; - mutex_exit(&connp->conn_lock); - goto done; + if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) { + V4_PART_OF_V6(sin6->sin6_addr) = + htonl(INADDR_LOOPBACK); } - hopoptslen = udp->udp_sticky_ipp.ipp_hopoptslen; - bcopy(udp->udp_sticky_ipp.ipp_hopopts, hopoptsptr, - hopoptslen); - udp_ip_hdr_len += hopoptslen; + ipversion = IPV4_VERSION; } - } - mutex_exit(&connp->conn_lock); - if (!(ignore & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - udp_ip_hdr_len += ipp->ipp_rthdrlen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTHDR) { - option_exists |= IPPF_RTHDR; - is_sticky |= IPPF_RTHDR; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rthdrlen; - } - } + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. 
+ */ + if (udp->udp_state == TS_DATA_XFER && + !conn_same_as_last_v6(connp, sin6)) { + error = EISCONN; + goto ud_error2; + } + UDP_STAT(us, udp_out_opt); + error = udp_output_ancillary(connp, NULL, sin6, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (!(ignore & IPPF_RTDSTOPTS) && (option_exists & IPPF_RTHDR)) { - if (ipp->ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - udp_ip_hdr_len += ipp->ipp_rtdstoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_RTDSTOPTS) { - option_exists |= IPPF_RTDSTOPTS; - is_sticky |= IPPF_RTDSTOPTS; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_rtdstoptslen; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v6 to handle concurrent + * send* calls on a socket. + */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + UDP_DBGSTAT(us, udp_out_lastdst); + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + UDP_DBGSTAT(us, udp_out_diffdst); + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, data_mp, NULL, + sin6, ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } - } - - if (!(ignore & IPPF_DSTOPTS)) { - if (ipp->ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - udp_ip_hdr_len += ipp->ipp_dstoptslen; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DSTOPTS) { - option_exists |= IPPF_DSTOPTS; - is_sticky |= IPPF_DSTOPTS; - udp_ip_hdr_len += udp->udp_sticky_ipp.ipp_dstoptslen; + if (error == 0) { + freeb(mp); + return; } - } + break; - if (!(ignore & IPPF_IFINDEX)) { - if (ipp->ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - } else if 
(udp->udp_sticky_ipp.ipp_fields & IPPF_IFINDEX) { - option_exists |= IPPF_IFINDEX; - is_sticky |= IPPF_IFINDEX; + case AF_INET: + sin = (sin_t *)addr; + if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || + (sin->sin_family != AF_INET)) { + error = EADDRNOTAVAIL; + goto ud_error2; } - } + UDP_DBGSTAT(us, udp_out_ipv4); + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + ipversion = IPV4_VERSION; - if (!(ignore & IPPF_ADDR)) { - if (ipp->ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_ADDR) { - option_exists |= IPPF_ADDR; - is_sticky |= IPPF_ADDR; - } - } + srcid = 0; + if (tudr->OPT_length != 0) { + /* + * If we are connected then the destination needs to be + * the same as the connected one. + */ + if (udp->udp_state == TS_DATA_XFER && + !conn_same_as_last_v4(connp, sin)) { + error = EISCONN; + goto ud_error2; + } + UDP_STAT(us, udp_out_opt); + error = udp_output_ancillary(connp, sin, NULL, + data_mp, mp, NULL, cr, pid); + } else { + ip_xmit_attr_t *ixa; - if (!(ignore & IPPF_DONTFRAG)) { - if (ipp->ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_DONTFRAG) { - option_exists |= IPPF_DONTFRAG; - is_sticky |= IPPF_DONTFRAG; + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've + * checked conn_same_as_last_v4 to handle concurrent + * send* calls on a socket. 
+ */ + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + error = ENOMEM; + goto ud_error2; + } + mutex_enter(&connp->conn_lock); + + if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + UDP_DBGSTAT(us, udp_out_lastdst); + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, data_mp, cr, + pid, ixa); + } else { + UDP_DBGSTAT(us, udp_out_diffdst); + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, data_mp, sin, + NULL, ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); } - } - - if (!(ignore & IPPF_USE_MIN_MTU)) { - if (ipp->ipp_fields & IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - } else if (udp->udp_sticky_ipp.ipp_fields & - IPPF_USE_MIN_MTU) { - option_exists |= IPPF_USE_MIN_MTU; - is_sticky |= IPPF_USE_MIN_MTU; + if (error == 0) { + freeb(mp); + return; } + break; } + UDP_STAT(us, udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(q, mp, (t_scalar_t)error); + return; - if (!(ignore & IPPF_HOPLIMIT) && (ipp->ipp_fields & IPPF_HOPLIMIT)) - option_exists |= IPPF_HOPLIMIT; - /* IPV6_HOPLIMIT can never be sticky */ - ASSERT(!(udp->udp_sticky_ipp.ipp_fields & IPPF_HOPLIMIT)); +ud_error2: + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + freemsg(data_mp); + UDP_STAT(us, udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(q, mp, (t_scalar_t)error); +} - if (!(ignore & IPPF_UNICAST_HOPS) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_UNICAST_HOPS)) { - option_exists |= IPPF_UNICAST_HOPS; - is_sticky |= IPPF_UNICAST_HOPS; - } +/* + * Handle the case of the IP address, port, flow label being different + * for both IPv4 and IPv6. + * + * NOTE: The caller must hold conn_lock and we drop it here. 
+ */ +static int +udp_output_newdst(conn_t *connp, mblk_t *data_mp, sin_t *sin, sin6_t *sin6, + ushort_t ipversion, cred_t *cr, pid_t pid, ip_xmit_attr_t *ixa) +{ + uint_t srcid; + uint32_t flowinfo; + udp_t *udp = connp->conn_udp; + int error = 0; + ip_xmit_attr_t *oldixa; + udp_stack_t *us = udp->udp_us; + in6_addr_t v6src; + in6_addr_t v6dst; + in6_addr_t v6nexthop; + in_port_t dstport; - if (!(ignore & IPPF_MULTICAST_HOPS) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_MULTICAST_HOPS)) { - option_exists |= IPPF_MULTICAST_HOPS; - is_sticky |= IPPF_MULTICAST_HOPS; - } + ASSERT(MUTEX_HELD(&connp->conn_lock)); + ASSERT(ixa != NULL); + /* + * We hold conn_lock across all the use and modifications of + * the conn_lastdst, conn_ixa, and conn_xmit_ipp to ensure that they + * stay consistent. + */ - if (!(ignore & IPPF_TCLASS)) { - if (ipp->ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - } else if (udp->udp_sticky_ipp.ipp_fields & IPPF_TCLASS) { - option_exists |= IPPF_TCLASS; - is_sticky |= IPPF_TCLASS; - } + ASSERT(cr != NULL); + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); } - if (!(ignore & IPPF_NEXTHOP) && - (udp->udp_sticky_ipp.ipp_fields & IPPF_NEXTHOP)) { - option_exists |= IPPF_NEXTHOP; - is_sticky |= IPPF_NEXTHOP; + /* + * If we are connected then the destination needs to be the + * same as the connected one, which is not the case here since we + * checked for that above. + */ + if (udp->udp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; } -no_options: + /* In case previous destination was multicast or multirt */ + ip_attr_newdst(ixa); /* - * If any options carried in the ip6i_t were specified, we - * need to account for the ip6i_t in the data we'll be sending - * down. + * If laddr is unspecified then we look at sin6_src_id. 
+ * We will give precedence to a source address set with IPV6_PKTINFO + * (aka IPPF_ADDR) but that is handled in build_hdrs. However, we don't + * want ip_attr_connect to select a source (since it can fail) when + * IPV6_PKTINFO is specified. + * If this doesn't result in a source address then we get a source + * from ip_attr_connect() below. */ - if (option_exists & IPPF_HAS_IP6I) - udp_ip_hdr_len += sizeof (ip6i_t); - - /* check/fix buffer config, setup pointers into it */ - ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len]; - if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) || - !OK_32PTR(ip6h)) { - - /* Try to get everything in a single mblk next time */ - if (udp_ip_hdr_len > udp->udp_max_hdr_len) { - udp->udp_max_hdr_len = udp_ip_hdr_len; - sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + v6src = connp->conn_saddr_v6; + if (sin != NULL) { + IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6dst); + dstport = sin->sin_port; + flowinfo = 0; + srcid = 0; + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + if (srcid != 0 && V4_PART_OF_V6(&v6src) == INADDR_ANY) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); } - - mp2 = allocb(udp_ip_hdr_len + us->us_wroff_extra, BPRI_LO); - if (mp2 == NULL) { - *error = ENOMEM; - rw_exit(&udp->udp_rwlock); - goto done; + ixa->ixa_flags |= IXAF_IS_IPV4; + } else { + v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; + flowinfo = sin6->sin6_flowinfo; + srcid = sin6->__sin6_src_id; + if (IN6_IS_ADDR_LINKSCOPE(&v6dst) && sin6->sin6_scope_id != 0) { + ixa->ixa_scopeid = sin6->sin6_scope_id; + ixa->ixa_flags |= IXAF_SCOPEID_SET; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - mp2->b_wptr = DB_LIM(mp2); - mp2->b_cont = mp1; - mp1 = mp2; - if (DB_TYPE(mp) != M_DATA) - mp->b_cont = mp1; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } + if (IN6_IS_ADDR_V4MAPPED(&v6dst)) + ixa->ixa_flags |= IXAF_IS_IPV4; 
else - mp = mp1; - - ip6h = (ip6_t *)(mp1->b_wptr - udp_ip_hdr_len); + ixa->ixa_flags &= ~IXAF_IS_IPV4; } - mp1->b_rptr = (unsigned char *)ip6h; - ip6i = (ip6i_t *)ip6h; - -#define ANCIL_OR_STICKY_PTR(f) ((is_sticky & f) ? &udp->udp_sticky_ipp : ipp) - if (option_exists & IPPF_HAS_IP6I) { - ip6h = (ip6_t *)&ip6i[1]; - ip6i->ip6i_flags = 0; - ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - - /* sin6_scope_id takes precendence over IPPF_IFINDEX */ - if (option_exists & IPPF_SCOPE_ID) { - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = sin6->sin6_scope_id; - } else if (option_exists & IPPF_IFINDEX) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_IFINDEX); - ASSERT(tipp->ipp_ifindex != 0); - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = tipp->ipp_ifindex; - } - - if (option_exists & IPPF_ADDR) { - /* - * Enable per-packet source address verification if - * IPV6_PKTINFO specified the source address. - * ip6_src is set in the transport's _wput function. - */ - ip6i->ip6i_flags |= IP6I_VERIFY_SRC; - } - - if (option_exists & IPPF_DONTFRAG) { - ip6i->ip6i_flags |= IP6I_DONTFRAG; - } + /* Handle IPV6_PKTINFO setting source address. 
*/ + if (IN6_IS_ADDR_UNSPECIFIED(&v6src) && + (connp->conn_xmit_ipp.ipp_fields & IPPF_ADDR)) { + ip_pkt_t *ipp = &connp->conn_xmit_ipp; - if (option_exists & IPPF_USE_MIN_MTU) { - ip6i->ip6i_flags = IP6I_API_USE_MIN_MTU( - ip6i->ip6i_flags, ipp->ipp_use_min_mtu); + if (ixa->ixa_flags & IXAF_IS_IPV4) { + if (IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; + } else { + if (!IN6_IS_ADDR_V4MAPPED(&ipp->ipp_addr)) + v6src = ipp->ipp_addr; } + } - if (option_exists & IPPF_NEXTHOP) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_NEXTHOP); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_nexthop)); - ip6i->ip6i_flags |= IP6I_NEXTHOP; - ip6i->ip6i_nexthop = tipp->ipp_nexthop; - } + ip_attr_nexthop(&connp->conn_xmit_ipp, ixa, &v6dst, &v6nexthop); + mutex_exit(&connp->conn_lock); + error = ip_attr_connect(connp, ixa, &v6src, &v6dst, &v6nexthop, dstport, + &v6src, NULL, IPDF_ALLOW_MCBC | IPDF_VERIFY_DST | IPDF_IPSEC); + switch (error) { + case 0: + break; + case EADDRNOTAVAIL: /* - * tell IP this is an ip6i_t private header + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - ip6i->ip6i_nxt = IPPROTO_RAW; - } - - /* Initialize IPv6 header */ - ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - bzero(&ip6h->ip6_src, sizeof (ip6h->ip6_src)); - - /* Set the hoplimit of the outgoing packet. */ - if (option_exists & IPPF_HOPLIMIT) { - /* IPV6_HOPLIMIT ancillary data overrides all other settings. 
*/ - ip6h->ip6_hops = ipp->ipp_hoplimit; - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else if (IN6_IS_ADDR_MULTICAST(&sin6->sin6_addr)) { - ip6h->ip6_hops = udp->udp_multicast_ttl; - if (option_exists & IPPF_MULTICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } else { - ip6h->ip6_hops = udp->udp_ttl; - if (option_exists & IPPF_UNICAST_HOPS) - ip6i->ip6i_flags |= IP6I_HOPLIMIT; - } - - if (option_exists & IPPF_ADDR) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_ADDR); - ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&tipp->ipp_addr)); - ip6h->ip6_src = tipp->ipp_addr; - } else { + error = ENETUNREACH; + goto failed; + case ENETDOWN: /* - * The source address was not set using IPV6_PKTINFO. - * First look at the bound source. - * If unspecified fallback to __sin6_src_id. + * Have !ipif_addr_ready address; drop packet silently + * until we can get applications to not send until we + * are ready. */ - ip6h->ip6_src = udp->udp_v6src; - if (sin6->__sin6_src_id != 0 && - IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src)) { - ip_srcid_find_id(sin6->__sin6_src_id, - &ip6h->ip6_src, connp->conn_zoneid, - us->us_netstack); + error = 0; + goto failed; + case EHOSTUNREACH: + case ENETUNREACH: + if (ixa->ixa_ire != NULL) { + /* + * Let conn_ip_output/ire_send_noroute return + * the error and send any local ICMP error. + */ + error = 0; + break; } + /* FALLTHRU */ + failed: + default: + goto ud_error; } - nxthdr_ptr = (uint8_t *)&ip6h->ip6_nxt; - cp = (uint8_t *)&ip6h[1]; /* - * Here's where we have to start stringing together - * any extension headers in the right order: - * Hop-by-hop, destination, routing, and final destination opts. + * Cluster note: we let the cluster hook know that we are sending to a + * new address and/or port. 
*/ - if (option_exists & IPPF_HOPOPTS) { - /* Hop-by-hop options */ - ip6_hbh_t *hbh = (ip6_hbh_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_HOPOPTS); - if (hopoptslen == 0) { - hopoptsptr = tipp->ipp_hopopts; - hopoptslen = tipp->ipp_hopoptslen; - is_ancillary = B_TRUE; - } - - *nxthdr_ptr = IPPROTO_HOPOPTS; - nxthdr_ptr = &hbh->ip6h_nxt; - - bcopy(hopoptsptr, cp, hopoptslen); - cp += hopoptslen; - - if (hopoptsptr != NULL && !is_ancillary) { - kmem_free(hopoptsptr, hopoptslen); - hopoptsptr = NULL; - hopoptslen = 0; + if (cl_inet_connect2 != NULL) { + CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error); + if (error != 0) { + error = EHOSTUNREACH; + goto ud_error; } } - /* - * En-route destination options - * Only do them if there's a routing header as well - */ - if (option_exists & IPPF_RTDSTOPTS) { - ip6_dest_t *dst = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTDSTOPTS); - - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dst->ip6d_nxt; - bcopy(tipp->ipp_rtdstopts, cp, tipp->ipp_rtdstoptslen); - cp += tipp->ipp_rtdstoptslen; - } - /* - * Routing header next - */ - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rt = (ip6_rthdr_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_RTHDR); - - *nxthdr_ptr = IPPROTO_ROUTING; - nxthdr_ptr = &rt->ip6r_nxt; - - bcopy(tipp->ipp_rthdr, cp, tipp->ipp_rthdrlen); - cp += tipp->ipp_rthdrlen; - } + mutex_enter(&connp->conn_lock); /* - * Do ultimate destination options + * While we dropped the lock some other thread might have connected + * this socket. If so we bail out with EISCONN to ensure that the + * connecting thread is the one that updates conn_ixa, conn_ht_* + * and conn_*last*. 
*/ - if (option_exists & IPPF_DSTOPTS) { - ip6_dest_t *dest = (ip6_dest_t *)cp; - tipp = ANCIL_OR_STICKY_PTR(IPPF_DSTOPTS); - - *nxthdr_ptr = IPPROTO_DSTOPTS; - nxthdr_ptr = &dest->ip6d_nxt; - - bcopy(tipp->ipp_dstopts, cp, tipp->ipp_dstoptslen); - cp += tipp->ipp_dstoptslen; + if (udp->udp_state == TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + error = EISCONN; + goto ud_error; } - /* - * Now set the last header pointer to the proto passed in - */ - ASSERT((int)(cp - (uint8_t *)ip6i) == (udp_ip_hdr_len - UDPH_SIZE)); - *nxthdr_ptr = IPPROTO_UDP; - - /* Update UDP header */ - udph = (udpha_t *)((uchar_t *)ip6i + udp_ip_hdr_len - UDPH_SIZE); - udph->uha_dst_port = sin6->sin6_port; - udph->uha_src_port = udp->udp_port; /* - * Copy in the destination address + * We need to rebuild the headers if + * - we are labeling packets (could be different for different + * destinations) + * - we have a source route (or routing header) since we need to + * massage that to get the pseudo-header checksum + * - the IP version is different than the last time + * - a socket option with COA_HEADER_CHANGED has been set which + * set conn_v6lastdst to zero. + * + * Otherwise the prepend function will just update the src, dst, + * dstport, and flow label. */ - ip6h->ip6_dst = ip6_dst; - - ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (sin6->sin6_flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - if (option_exists & IPPF_TCLASS) { - tipp = ANCIL_OR_STICKY_PTR(IPPF_TCLASS); - ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, - tipp->ipp_tclass); - } - rw_exit(&udp->udp_rwlock); - - if (option_exists & IPPF_RTHDR) { - ip6_rthdr_t *rth; - + if (is_system_labeled()) { + /* TX MLP requires SCM_UCRED and don't have that here */ + if (connp->conn_mlp_type != mlptSingle) { + mutex_exit(&connp->conn_lock); + error = ECONNREFUSED; + goto ud_error; + } /* - * Perform any processing needed for source routing. 
- * We know that all extension headers will be in the same mblk - * as the IPv6 header. + * Check whether Trusted Solaris policy allows communication + * with this host, and pretend that the destination is + * unreachable if not. + * Compute any needed label and place it in ipp_label_v4/v6. + * + * Later conn_build_hdr_template/conn_prepend_hdr takes + * ipp_label_v4/v6 to form the packet. + * + * Tsol note: Since we hold conn_lock we know no other + * thread manipulates conn_xmit_ipp. */ - rth = ip_find_rthdr_v6(ip6h, mp1->b_wptr); - if (rth != NULL && rth->ip6r_segleft != 0) { - if (rth->ip6r_type != IPV6_RTHDR_TYPE_0) { - /* - * Drop packet - only support Type 0 routing. - * Notify the application as well. - */ - *error = EPROTO; - goto done; - } - - /* - * rth->ip6r_len is twice the number of - * addresses in the header. Thus it must be even. - */ - if (rth->ip6r_len & 0x1) { - *error = EPROTO; - goto done; - } - /* - * Shuffle the routing header and ip6_dst - * addresses, and get the checksum difference - * between the first hop (in ip6_dst) and - * the destination (in the last routing hdr entry). - */ - csum = ip_massage_options_v6(ip6h, rth, - us->us_netstack); - /* - * Verify that the first hop isn't a mapped address. - * Routers along the path need to do this verification - * for subsequent hops. 
- */ - if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_dst)) { - *error = EADDRNOTAVAIL; - goto done; + error = conn_update_label(connp, ixa, &v6dst, + &connp->conn_xmit_ipp); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + /* Rebuild the header template */ + error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else if ((connp->conn_xmit_ipp.ipp_fields & + (IPPF_IPV4_OPTIONS|IPPF_RTHDR)) || + ipversion != connp->conn_lastipversion || + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_v6lastdst)) { + /* Rebuild the header template */ + error = udp_build_hdr_template(connp, &v6src, &v6dst, dstport, + flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto ud_error; + } + } else { + /* Simply update the destination address if no source route */ + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)connp->conn_ht_iphc; + + IN6_V4MAPPED_TO_IPADDR(&v6dst, ipha->ipha_dst); + if (ixa->ixa_flags & IXAF_PMTU_IPV4_DF) { + ipha->ipha_fragment_offset_and_flags |= + IPH_DF_HTONS; + } else { + ipha->ipha_fragment_offset_and_flags &= + ~IPH_DF_HTONS; } - - cp += (rth->ip6r_len + 1)*8; + } else { + ip6_t *ip6h = (ip6_t *)connp->conn_ht_iphc; + ip6h->ip6_dst = v6dst; } } - /* count up length of UDP packet */ - ip_len = (mp1->b_wptr - (unsigned char *)ip6h) - IPV6_HDR_LEN; - if ((mp2 = mp1->b_cont) != NULL) { - do { - ASSERT((uintptr_t)MBLKL(mp2) <= (uintptr_t)UINT_MAX); - ip_len += (uint32_t)MBLKL(mp2); - } while ((mp2 = mp2->b_cont) != NULL); - } - /* - * If the size of the packet is greater than the maximum allowed by - * ip, return an error. Passing this down could cause panics because - * the size will have wrapped and be inconsistent with the msg size. - */ - if (ip_len > IP_MAXPACKET) { - *error = EMSGSIZE; - goto done; - } - - /* Store the UDP length. 
Subtract length of extension hdrs */ - udph->uha_length = htons(ip_len + IPV6_HDR_LEN - - (int)((uchar_t *)udph - (uchar_t *)ip6h)); - - /* - * We make it easy for IP to include our pseudo header - * by putting our length in uh_checksum, modified (if - * we have a routing header) by the checksum difference - * between the ultimate destination and first hop addresses. - * Note: UDP over IPv6 must always checksum the packet. + * Remember the dst/dstport etc which corresponds to the built header + * template and conn_ixa. */ - csum += udph->uha_length; - csum = (csum & 0xFFFF) + (csum >> 16); - udph->uha_checksum = (uint16_t)csum; - -#ifdef _LITTLE_ENDIAN - ip_len = htons(ip_len); -#endif - ip6h->ip6_plen = ip_len; - - if (DB_TYPE(mp) != M_DATA) { - cred_t *cr; - pid_t cpid; - - /* Move any cred from the T_UNITDATA_REQ to the packet */ - cr = msg_extractcred(mp, &cpid); - if (cr != NULL) { - if (mp1->b_datap->db_credp != NULL) - crfree(mp1->b_datap->db_credp); - mp1->b_datap->db_credp = cr; - mp1->b_datap->db_cpid = cpid; - } + oldixa = conn_replace_ixa(connp, ixa); + connp->conn_v6lastdst = v6dst; + connp->conn_lastipversion = ipversion; + connp->conn_lastdstport = dstport; + connp->conn_lastflowinfo = flowinfo; + connp->conn_lastscopeid = ixa->ixa_scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + + data_mp = udp_prepend_header_template(connp, ixa, data_mp, &v6src, + dstport, flowinfo, &error); + + /* Done with conn_t */ + mutex_exit(&connp->conn_lock); + ixa_refrele(oldixa); - ASSERT(mp != mp1); - freeb(mp); + if (data_mp == NULL) { + ASSERT(error != 0); + goto ud_error; } - /* mp has been consumed and we'll return success */ - ASSERT(*error == 0); - mp = NULL; - - /* We're done. Pass the packet to IP */ + /* We're done. Pass the packet to ip. 
*/ BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); - ip_output_v6(connp, mp1, q, IP_WPUT); -done: - if (sth_wroff != 0) { - (void) proto_set_tx_wroff(RD(q), connp, - udp->udp_max_hdr_len + us->us_wroff_extra); - } - if (hopoptsptr != NULL && !is_ancillary) { - kmem_free(hopoptsptr, hopoptslen); - hopoptsptr = NULL; - } - if (*error != 0) { - ASSERT(mp != NULL); - BUMP_MIB(&us->us_udp_mib, udpOutErrors); - } - return (mp); -} - - -static int -i_udp_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - if (udp->udp_state != TS_DATA_XFER) - return (ENOTCONN); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_dstport; - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst); + error = conn_ip_output(data_mp, ixa); + /* No udpOutErrors if an error since IP increases its error counter */ + switch (error) { + case 0: break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_dstport; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_flowinfo = udp->udp_flowinfo; + case EWOULDBLOCK: + (void) ixa_check_drain_insert(connp, ixa); + error = 0; break; - } - - return (0); -} - -static int -udp_getmyname(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_port; - + case EADDRNOTAVAIL: /* - * If 
udp_v6src is unspecified, we might be bound to broadcast - * / multicast. Use udp_bound_v6src as local address instead - * (that could also still be unspecified). + * IXAF_VERIFY_SOURCE tells us to pick a better source. + * Don't have the application see that errno */ - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src); - } else { - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_bound_v6src); - } - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_port; - sin6->sin6_flowinfo = udp->udp_flowinfo; - + error = ENETUNREACH; + /* FALLTHRU */ + default: + mutex_enter(&connp->conn_lock); /* - * If udp_v6src is unspecified, we might be bound to broadcast - * / multicast. Use udp_bound_v6src as local address instead - * (that could also still be unspecified). + * Clear the source and v6lastdst so we call ip_attr_connect + * for the next packet and try to pick a better source. */ - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) - sin6->sin6_addr = udp->udp_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; else - sin6->sin6_addr = udp->udp_bound_v6src; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); break; } + ixa_refrele(ixa); + return (error); - return (0); +ud_error: + if (ixa != NULL) + ixa_refrele(ixa); + + freemsg(data_mp); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_STAT(us, udp_out_err_output); + return (error); +} + +/* ARGSUSED */ +static void +udp_wput_fallback(queue_t *wq, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "udp_wput_fallback: Message in fallback \n"); +#endif + freemsg(mp); } + /* * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
*/ @@ -6717,7 +4440,8 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) { void *data; mblk_t *datamp = mp->b_cont; - udp_t *udp = Q_TO_UDP(q); + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { @@ -6727,19 +4451,23 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) } data = datamp->b_rptr; - rw_enter(&udp->udp_rwlock, RW_READER); + mutex_enter(&connp->conn_lock); switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = i_udp_getpeername(udp, data, &cmdp->cb_len); + if (udp->udp_state != TS_DATA_XFER) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); break; case TI_GETMYNAME: - cmdp->cb_error = udp_getmyname(udp, data, &cmdp->cb_len); + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); break; default: cmdp->cb_error = EINVAL; break; } - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); qreply(q, mp); } @@ -6747,10 +4475,11 @@ udp_wput_cmdblk(queue_t *q, mblk_t *mp) static void udp_use_pure_tpi(udp_t *udp) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_issocket = B_FALSE; - rw_exit(&udp->udp_rwlock); + conn_t *connp = udp->udp_connp; + mutex_enter(&connp->conn_lock); + udp->udp_issocket = B_FALSE; + mutex_exit(&connp->conn_lock); UDP_STAT(udp->udp_us, udp_sock_fallback); } @@ -6758,20 +4487,13 @@ static void udp_wput_other(queue_t *q, mblk_t *mp) { uchar_t *rptr = mp->b_rptr; - struct datab *db; struct iocblk *iocp; - cred_t *cr; conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; - udp_stack_t *us; - - TRACE_1(TR_FAC_UDP, TR_UDP_WPUT_OTHER_START, - "udp_wput_other_start: q %p", q); - - us = udp->udp_us; - db = mp->b_datap; + udp_stack_t *us = udp->udp_us; + cred_t *cr; - switch (db->db_type) { + switch (mp->b_datap->db_type) { case M_CMD: udp_wput_cmdblk(q, mp); return; @@ -6779,37 +4501,29 @@ udp_wput_other(queue_t *q, mblk_t *mp) case M_PROTO: case M_PCPROTO: if (mp->b_wptr - 
rptr < sizeof (t_scalar_t)) { + /* + * If the message does not contain a PRIM_type, + * throw it away. + */ freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "protoshort"); return; } switch (((t_primp_t)rptr)->type) { case T_ADDR_REQ: udp_addr_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "addrreq"); return; case O_T_BIND_REQ: case T_BIND_REQ: udp_tpi_bind(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "bindreq"); return; case T_CONN_REQ: udp_tpi_connect(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "connreq"); return; case T_CAPABILITY_REQ: udp_capability_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "capabreq"); return; case T_INFO_REQ: udp_info_req(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "inforeq"); return; case T_UNITDATA_REQ: /* @@ -6817,14 +4531,10 @@ udp_wput_other(queue_t *q, mblk_t *mp) * be bad. Valid T_UNITDATA_REQs are handled * in udp_wput. 
*/ - udp_ud_err(q, mp, NULL, 0, EADDRNOTAVAIL); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "unitdatareq"); + udp_ud_err(q, mp, EADDRNOTAVAIL); return; case T_UNBIND_REQ: udp_tpi_unbind(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "unbindreq"); return; case T_SVR4_OPTMGMT_REQ: /* @@ -6842,11 +4552,8 @@ udp_wput_other(queue_t *q, mblk_t *mp) } if (!snmpcom_req(q, mp, udp_snmp_set, ip_snmp_get, cr)) { - (void) svr4_optcom_req(q, - mp, cr, &udp_opt_obj, B_TRUE); + svr4_optcom_req(q, mp, cr, &udp_opt_obj); } - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); return; case T_OPTMGMT_REQ: @@ -6863,34 +4570,24 @@ udp_wput_other(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TSYSERR, EINVAL); return; } - (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj, B_TRUE); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); + tpi_optcom_req(q, mp, cr, &udp_opt_obj); return; case T_DISCON_REQ: udp_tpi_disconnect(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "disconreq"); return; /* The following TPI message is not supported by udp. */ case O_T_CONN_RES: case T_CONN_RES: udp_err_ack(q, mp, TNOTSUPPORT, 0); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "connres/disconreq"); return; - /* The following 3 TPI messages are illegal for udp. */ + /* The following 3 TPI requests are illegal for udp. 
*/ case T_DATA_REQ: case T_EXDATA_REQ: case T_ORDREL_REQ: udp_err_ack(q, mp, TNOTSUPPORT, 0); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "data/exdata/ordrel"); return; default: break; @@ -6914,13 +4611,10 @@ udp_wput_other(queue_t *q, mblk_t *mp) iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; qreply(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, - "getpeername"); return; } /* FALLTHRU */ - case TI_GETMYNAME: { + case TI_GETMYNAME: /* * For TI_GETPEERNAME and TI_GETMYNAME, we first * need to copyin the user's strbuf structure. @@ -6929,17 +4623,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) */ mi_copyin(q, mp, NULL, SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "getmyname"); return; - } case ND_SET: /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, us->us_nd, mp)) { qreply(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "get"); return; } break; @@ -6969,16 +4658,12 @@ udp_wput_other(queue_t *q, mblk_t *mp) break; case M_IOCDATA: udp_wput_iocdata(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "iocdata"); return; default: /* Unrecognized messages are passed through without change. */ break; } - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, - "udp_wput_other_end: q %p (%S)", q, "end"); - ip_output(connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); } /* @@ -6991,9 +4676,9 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) mblk_t *mp1; struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - udp_t *udp = Q_TO_UDP(q); - int error; uint_t addrlen; + conn_t *connp = Q_TO_CONN(q); + udp_t *udp = connp->conn_udp; /* Make sure it is one of ours. 
*/ switch (iocp->ioc_cmd) { @@ -7001,7 +4686,7 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) case TI_GETPEERNAME: break; default: - ip_output(udp->udp_connp, mp, q, IP_WPUT); + ip_wput_nondata(q, mp); return; } @@ -7040,77 +4725,45 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) * address and then we'll copyout the strbuf. */ STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - addrlen = udp->udp_family == AF_INET ? sizeof (sin_t) : sizeof (sin6_t); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + if (STRUCT_FGET(sb, maxlen) < addrlen) { mi_copy_done(q, mp, EINVAL); return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - - if (mp1 == NULL) - return; - - rw_enter(&udp->udp_rwlock, RW_READER); switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = udp_do_getsockname(udp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = udp_do_getpeername(udp, (void *)mp1->b_rptr, &addrlen); + if (udp->udp_state != TS_DATA_XFER) { + mi_copy_done(q, mp, ENOTCONN); + return; + } break; } - rw_exit(&udp->udp_rwlock); - - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); - } -} - -static int -udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, - udpattrs_t *udpattrs) -{ - struct T_unitdata_req *udreqp; - int is_absreq_failure; - cred_t *cr; - - ASSERT(((t_primp_t)mp->b_rptr)->type); - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we should ASSERT. - * However, RPC (svc_clts_ksend) does this odd thing where it - * passes the options from a T_UNITDATA_IND unchanged in a - * T_UNITDATA_REQ. While that is the right thing to do for - * some options, SCM_UCRED being the key one, this also makes it - * pass down IP_RECVDSTADDR. Hence we can't ASSERT here. 
- */ - cr = msg_getcred(mp, NULL); - if (cr == NULL) { - cr = Q_TO_CONN(q)->conn_cred; - } - udreqp = (struct T_unitdata_req *)mp->b_rptr; - - *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, - udreqp->OPT_offset, cr, &udp_opt_obj, - udpattrs, &is_absreq_failure); + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; - if (*errorp != 0) { - /* - * Note: No special action needed in this - * module for "is_absreq_failure" - */ - return (-1); /* failure */ + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; } - ASSERT(is_absreq_failure == 0); - return (0); /* success */ + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } void @@ -7234,34 +4887,19 @@ udp_kstat2_init(netstackid_t stackid, udp_stat_t *us_statisticsp) kstat_t *ksp; udp_stat_t template = { - { "udp_ip_send", KSTAT_DATA_UINT64 }, - { "udp_ip_ire_send", KSTAT_DATA_UINT64 }, - { "udp_ire_null", KSTAT_DATA_UINT64 }, { "udp_sock_fallback", KSTAT_DATA_UINT64 }, - { "udp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "udp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "udp_out_opt", KSTAT_DATA_UINT64 }, { "udp_out_err_notconn", KSTAT_DATA_UINT64 }, { "udp_out_err_output", KSTAT_DATA_UINT64 }, { "udp_out_err_tudr", KSTAT_DATA_UINT64 }, - { "udp_in_pktinfo", KSTAT_DATA_UINT64 }, - { "udp_in_recvdstaddr", KSTAT_DATA_UINT64 }, - { "udp_in_recvopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvif", KSTAT_DATA_UINT64 }, - { "udp_in_recvslla", KSTAT_DATA_UINT64 }, - { "udp_in_recvucred", KSTAT_DATA_UINT64 }, - { "udp_in_recvttl", KSTAT_DATA_UINT64 }, - { "udp_in_recvhopopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvhoplimit", KSTAT_DATA_UINT64 }, - { "udp_in_recvdstopts", KSTAT_DATA_UINT64 }, - { "udp_in_recvrtdstopts", 
KSTAT_DATA_UINT64 }, - { "udp_in_recvrthdr", KSTAT_DATA_UINT64 }, - { "udp_in_recvpktinfo", KSTAT_DATA_UINT64 }, - { "udp_in_recvtclass", KSTAT_DATA_UINT64 }, - { "udp_in_timestamp", KSTAT_DATA_UINT64 }, #ifdef DEBUG { "udp_data_conn", KSTAT_DATA_UINT64 }, { "udp_data_notconn", KSTAT_DATA_UINT64 }, + { "udp_out_lastdst", KSTAT_DATA_UINT64 }, + { "udp_out_diffdst", KSTAT_DATA_UINT64 }, + { "udp_out_ipv6", KSTAT_DATA_UINT64 }, + { "udp_out_mapped", KSTAT_DATA_UINT64 }, + { "udp_out_ipv4", KSTAT_DATA_UINT64 }, #endif }; @@ -7384,8 +5022,6 @@ udp_set_rcv_hiwat(udp_t *udp, size_t size) static void udp_lrput(queue_t *q, mblk_t *mp) { - mblk_t *mp1; - switch (mp->b_datap->db_type) { case M_FLUSH: /* Turn around */ @@ -7396,9 +5032,6 @@ udp_lrput(queue_t *q, mblk_t *mp) } break; } - /* Could receive messages that passed through ar_rput */ - for (mp1 = mp; mp1; mp1 = mp1->b_cont) - mp1->b_prev = mp1->b_next = NULL; freemsg(mp); } @@ -7425,6 +5058,7 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) zoneid_t zoneid; netstack_t *ns; udp_stack_t *us; + int len; ns = netstack_find_by_cred(credp); ASSERT(ns != NULL); @@ -7455,34 +5089,40 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) */ netstack_rele(ns); - rw_enter(&udp->udp_rwlock, RW_WRITER); - ASSERT(connp->conn_ulp == IPPROTO_UDP); + /* + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + ASSERT(connp->conn_proto == IPPROTO_UDP); ASSERT(connp->conn_udp == udp); ASSERT(udp->udp_connp == connp); /* Set the initial state of the stream and the privilege status. 
*/ udp->udp_state = TS_UNBND; + connp->conn_ixa->ixa_flags |= IXAF_VERIFY_SOURCE; if (isv6) { - udp->udp_family = AF_INET6; - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; - udp->udp_ttl = us->us_ipv6_hoplimit; - connp->conn_af_isv6 = B_TRUE; + connp->conn_family = AF_INET6; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; + connp->conn_default_ttl = us->us_ipv6_hoplimit; + len = sizeof (ip6_t) + UDPH_SIZE; } else { - udp->udp_family = AF_INET; - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE; - udp->udp_ttl = us->us_ipv4_ttl; - connp->conn_af_isv6 = B_FALSE; + connp->conn_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; + connp->conn_default_ttl = us->us_ipv4_ttl; + len = sizeof (ipha_t) + UDPH_SIZE; } - udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; - udp->udp_pending_op = -1; - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - connp->conn_zoneid = zoneid; + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + connp->conn_ixa->ixa_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + connp->conn_ixa->ixa_flags |= IXAF_MULTICAST_LOOP | IXAF_SET_ULP_CKSUM; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; - udp->udp_open_time = lbolt64; - udp->udp_open_pid = curproc->p_pid; + connp->conn_zoneid = zoneid; /* * If the caller has the process-wide flag set, then default to MAC @@ -7491,22 +5131,38 @@ udp_do_open(cred_t *credp, boolean_t isv6, int flags) if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_ulp_labeled = is_system_labeled(); + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); udp->udp_us = us; + connp->conn_rcvbuf = us->us_recv_hiwat; + connp->conn_sndbuf = us->us_xmit_hiwat; + 
connp->conn_sndlowat = us->us_xmit_lowat; + connp->conn_rcvlowat = udp_mod_info.mi_lowat; + + connp->conn_wroff = len + us->us_wroff_extra; + connp->conn_so_type = SOCK_DGRAM; + connp->conn_recv = udp_input; + connp->conn_recvicmp = udp_icmp_input; crhold(credp); connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; + /* Cache things in ixa without an extra refhold */ + connp->conn_ixa->ixa_cred = connp->conn_cred; + connp->conn_ixa->ixa_cpid = connp->conn_cpid; + if (is_system_labeled()) + connp->conn_ixa->ixa_tsl = crgetlabel(connp->conn_cred); *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; - rw_exit(&udp->udp_rwlock); + if (us->us_pmtu_discovery) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; return (connp); } -/* ARGSUSED */ sock_lower_handle_t udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, uint_t *smodep, int *errorp, int flags, cred_t *credp) @@ -7539,39 +5195,17 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, ASSERT(us != NULL); udp->udp_issocket = B_TRUE; - connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET; - - /* Set flow control */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - (void) udp_set_rcv_hiwat(udp, us->us_recv_hiwat); - udp->udp_rcv_disply_hiwat = us->us_recv_hiwat; - udp->udp_rcv_lowat = udp_mod_info.mi_lowat; - udp->udp_xmit_hiwat = us->us_xmit_hiwat; - udp->udp_xmit_lowat = us->us_xmit_lowat; - - if (udp->udp_family == AF_INET6) { - /* Build initial header template for transmit */ - if ((*errorp = udp_build_hdrs(udp)) != 0) { - rw_exit(&udp->udp_rwlock); - ipcl_conn_destroy(connp); - return (NULL); - } - } - rw_exit(&udp->udp_rwlock); + connp->conn_flags |= IPCL_NONSTR; - connp->conn_flow_cntrld = B_FALSE; - - ASSERT(us->us_ldi_ident != NULL); - - if ((*errorp = ip_create_helper_stream(connp, us->us_ldi_ident)) != 0) { - ip1dbg(("udp_create: create of IP helper stream failed\n")); - udp_do_close(connp); - return (NULL); - } + /* + * 
Set flow control + * Since this conn_t/udp_t is not yet visible to anybody else we don't + * need to lock anything. + */ + (void) udp_set_rcv_hiwat(udp, connp->conn_rcvbuf); + udp->udp_rcv_disply_hiwat = connp->conn_rcvbuf; - /* Set the send flow control */ - connp->conn_wq->q_hiwat = us->us_xmit_hiwat; - connp->conn_wq->q_lowat = us->us_xmit_lowat; + connp->conn_flow_cntrld = B_FALSE; mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; @@ -7583,14 +5217,12 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return ((sock_lower_handle_t)connp); } -/* ARGSUSED */ +/* ARGSUSED3 */ void udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; struct sock_proto_props sopp; /* All Solaris components should pass a cred for this operation. */ @@ -7599,14 +5231,15 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, connp->conn_upcalls = sock_upcalls; connp->conn_upper_handle = sock_handle; - sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | + sopp.sopp_flags = SOCKOPT_WROFF | SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | SOCKOPT_MAXBLK | SOCKOPT_MAXPSZ | SOCKOPT_MINPSZ; - sopp.sopp_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + sopp.sopp_wroff = connp->conn_wroff; sopp.sopp_maxblk = INFPSZ; - sopp.sopp_rxhiwat = udp->udp_rcv_hiwat; + sopp.sopp_rxhiwat = connp->conn_rcvbuf; + sopp.sopp_rxlowat = connp->conn_rcvlowat; sopp.sopp_maxaddrlen = sizeof (sin6_t); sopp.sopp_maxpsz = - (udp->udp_family == AF_INET) ? UDP_MAXPACKET_IPV4 : + (connp->conn_family == AF_INET) ? UDP_MAXPACKET_IPV4 : UDP_MAXPACKET_IPV6; sopp.sopp_minpsz = (udp_mod_info.mi_minpsz == 1) ? 
0 : udp_mod_info.mi_minpsz; @@ -7618,9 +5251,32 @@ udp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, static void udp_do_close(conn_t *connp) { + udp_t *udp; + ASSERT(connp != NULL && IPCL_IS_UDP(connp)); + udp = connp->conn_udp; + + if (cl_inet_unbind != NULL && udp->udp_state == TS_IDLE) { + /* + * Running in cluster mode - register unbind information + */ + if (connp->conn_ipversion == IPV4_VERSION) { + (*cl_inet_unbind)( + connp->conn_netstack->netstack_stackid, + IPPROTO_UDP, AF_INET, + (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)), + (in_port_t)connp->conn_lport, NULL); + } else { + (*cl_inet_unbind)( + connp->conn_netstack->netstack_stackid, + IPPROTO_UDP, AF_INET6, + (uint8_t *)&(connp->conn_laddr_v6), + (in_port_t)connp->conn_lport, NULL); + } + } + + udp_bind_hash_remove(udp, B_FALSE); - udp_quiesce_conn(connp); ip_quiesce_conn(connp); if (!IPCL_IS_NONSTR(connp)) { @@ -7642,6 +5298,7 @@ udp_do_close(conn_t *connp) * future. */ ASSERT(connp->conn_ref == 1); + if (!IPCL_IS_NONSTR(connp)) { inet_minor_free(connp->conn_minor_arena, connp->conn_dev); } else { @@ -7652,7 +5309,7 @@ udp_do_close(conn_t *connp) ipcl_conn_destroy(connp); } -/* ARGSUSED */ +/* ARGSUSED1 */ int udp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) { @@ -7671,59 +5328,41 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, { sin_t *sin; sin6_t *sin6; - sin6_t sin6addr; + udp_t *udp = connp->conn_udp; + int error = 0; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ in_port_t port; /* Host byte order */ in_port_t requested_port; /* Host byte order */ int count; + ipaddr_t v4src; /* Set if AF_INET */ in6_addr_t v6src; int loopmax; udp_fanout_t *udpf; in_port_t lport; /* Network byte order */ - udp_t *udp; + uint_t scopeid = 0; + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; boolean_t is_inaddr_any; mlp_type_t addrtype, mlptype; - udp_stack_t *us; - int error 
= 0; - mblk_t *mp = NULL; - - udp = connp->conn_udp; - us = udp->udp_us; - - if (udp->udp_state != TS_UNBND) { - (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "udp_bind: bad state, %u", udp->udp_state); - return (-TOUTSTATE); - } + udp_stack_t *us = udp->udp_us; switch (len) { - case 0: - if (udp->udp_family == AF_INET) { - sin = (sin_t *)&sin6addr; - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = INADDR_ANY; - udp->udp_ipversion = IPV4_VERSION; - } else { - ASSERT(udp->udp_family == AF_INET6); - sin6 = (sin6_t *)&sin6addr; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - V6_SET_ZERO(sin6->sin6_addr); - udp->udp_ipversion = IPV6_VERSION; - } - port = 0; - break; - case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; if (sin == NULL || !OK_32PTR((char *)sin)) return (EINVAL); - if (udp->udp_family != AF_INET || + if (connp->conn_family != AF_INET || sin->sin_family != AF_INET) { return (EAFNOSUPPORT); } + v4src = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4src, &v6src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, zoneid, ipst, + B_TRUE); + } port = ntohs(sin->sin_port); break; @@ -7733,10 +5372,28 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, if (sin6 == NULL || !OK_32PTR((char *)sin6)) return (EINVAL); - if (udp->udp_family != AF_INET6 || + if (connp->conn_family != AF_INET6 || sin6->sin6_family != AF_INET6) { return (EAFNOSUPPORT); } + v6src = sin6->sin6_addr; + if (IN6_IS_ADDR_V4MAPPED(&v6src)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); + if (v4src != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4src, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6src)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6src, + zoneid, ipst, B_TRUE, scopeid); + } + } port = ntohs(sin6->sin6_port); break; @@ -7746,6 
+5403,10 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (-TBADADDR); } + /* Is the local address a valid unicast, multicast, or broadcast? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + requested_port = port; if (requested_port == 0 || !bind_to_req_port_only) @@ -7759,7 +5420,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * doesn't care which port number we bind to. Get one in the * valid range. */ - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = udp_get_next_priv_port(udp); } else { port = udp_update_next_port(udp, @@ -7798,53 +5459,45 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * TPI primitives only 1 at a time and wait for the response before * sending the next primitive. */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_UNBND) { + mutex_exit(&connp->conn_lock); (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: bad state, %u", udp->udp_state); return (-TOUTSTATE); } - /* XXX how to remove the T_BIND_REQ? Should set it before calling */ - udp->udp_pending_op = T_BIND_REQ; /* * Copy the source address into our udp structure. This address * may still be zero; if so, IP will fill in the correct address * each time an outbound packet is passed to it. 
Since the udp is * not yet in the bind hash list, we don't grab the uf_lock to - * change udp_ipversion + * change conn_ipversion */ - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { ASSERT(sin != NULL); - ASSERT(udp->udp_ipversion == IPV4_VERSION); - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - IN6_IPADDR_TO_V4MAPPED(sin->sin_addr.s_addr, &v6src); + ASSERT(connp->conn_ixa->ixa_flags & IXAF_IS_IPV4); } else { - ASSERT(sin6 != NULL); - v6src = sin6->sin6_addr; if (IN6_IS_ADDR_V4MAPPED(&v6src)) { /* - * no need to hold the uf_lock to set the udp_ipversion + * no need to hold the uf_lock to set the conn_ipversion * since we are not yet in the fanout list */ - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_ixa->ixa_flags |= IXAF_IS_IPV4; } else { - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_ixa->ixa_flags &= ~IXAF_IS_IPV4; } } /* - * If udp_reuseaddr is not set, then we have to make sure that + * If conn_reuseaddr is not set, then we have to make sure that * the IP address and port number the application requested * (or we selected for the application) is not being used by * another stream. If another stream is already using the * requested IP address and port, the behavior depends on * "bind_to_req_port_only". If set the bind fails; otherwise we - * search for any an unused port to bind to the the stream. + * search for any an unused port to bind to the stream. 
* * As per the BSD semantics, as modified by the Deering multicast * changes, if udp_reuseaddr is set, then we allow multiple binds @@ -7860,7 +5513,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ count = 0; - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { /* * loopmax = (IPPORT_RESERVED-1) - * us->us_min_anonpriv_port + 1 @@ -7876,6 +5529,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, for (;;) { udp_t *udp1; boolean_t found_exclbind = B_FALSE; + conn_t *connp1; /* * Walk through the list of udp streams bound to @@ -7887,7 +5541,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mutex_enter(&udpf->uf_lock); for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { - if (lport != udp1->udp_port) + connp1 = udp1->udp_connp; + + if (lport != connp1->conn_lport) continue; /* @@ -7896,7 +5552,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. */ - if (!IPCL_BIND_ZONE_MATCH(udp1->udp_connp, connp)) + if (!IPCL_BIND_ZONE_MATCH(connp1, connp)) continue; /* @@ -7918,12 +5574,13 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * For labeled systems, SO_MAC_EXEMPT behaves the same * as UDP_EXCLBIND, except that zoneid is ignored. */ - if (udp1->udp_exclbind || udp->udp_exclbind || + if (connp1->conn_exclbind || connp->conn_exclbind || IPCL_CONNS_MAC(udp1->udp_connp, connp)) { if (V6_OR_V4_INADDR_ANY( - udp1->udp_bound_v6src) || + connp1->conn_bound_addr_v6) || is_inaddr_any || - IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + IN6_ARE_ADDR_EQUAL( + &connp1->conn_bound_addr_v6, &v6src)) { found_exclbind = B_TRUE; break; @@ -7935,7 +5592,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * Check ipversion to allow IPv4 and IPv6 sockets to * have disjoint port number spaces. 
*/ - if (udp->udp_ipversion != udp1->udp_ipversion) { + if (connp->conn_ipversion != connp1->conn_ipversion) { /* * On the first time through the loop, if the @@ -7963,8 +5620,8 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * (non-wildcard, also), keep going. */ if (!is_inaddr_any && - !V6_OR_V4_INADDR_ANY(udp1->udp_bound_v6src) && - !IN6_ARE_ADDR_EQUAL(&udp1->udp_bound_v6src, + !V6_OR_V4_INADDR_ANY(connp1->conn_bound_addr_v6) && + !IN6_ARE_ADDR_EQUAL(&connp1->conn_laddr_v6, &v6src)) { continue; } @@ -7972,7 +5629,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } if (!found_exclbind && - (udp->udp_reuseaddr && requested_port != 0)) { + (connp->conn_reuseaddr && requested_port != 0)) { break; } @@ -7995,12 +5652,11 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * the routine (and exit the loop). * */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); return (-TADDRBUSY); } - if (udp->udp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = udp_get_next_priv_port(udp); } else { if ((count == 0) && (requested_port != 0)) { @@ -8025,66 +5681,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, * there are none available, so send an error * to the user. */ - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); return (-TNOADDR); } } /* * Copy the source address into our udp structure. This address - * may still be zero; if so, ip will fill in the correct address - * each time an outbound packet is passed to it. + * may still be zero; if so, ip_attr_connect will fill in the correct + * address when a packet is about to be sent. * If we are binding to a broadcast or multicast address then - * udp_post_ip_bind_connect will clear the source address - * when udp_do_bind success. 
+ * we just set the conn_bound_addr since we don't want to use + * that as the source address when sending. */ - udp->udp_v6src = udp->udp_bound_v6src = v6src; - udp->udp_port = lport; + connp->conn_bound_addr_v6 = v6src; + connp->conn_laddr_v6 = v6src; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + + switch (laddr_type) { + case IPVL_UNICAST_UP: + case IPVL_UNICAST_DOWN: + connp->conn_saddr_v6 = v6src; + connp->conn_mcbc_bind = B_FALSE; + break; + case IPVL_MCAST: + case IPVL_BCAST: + /* ip_set_destination will pick a source address later */ + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_TRUE; + break; + } + + /* Any errors after this point should use late_error */ + connp->conn_lport = lport; + /* - * Now reset the the next anonymous port if the application requested + * Now reset the next anonymous port if the application requested * an anonymous port, or we handed out the next anonymous port. */ - if ((requested_port == 0) && (!udp->udp_anon_priv_bind)) { + if ((requested_port == 0) && (!connp->conn_anon_priv_bind)) { us->us_next_port_to_try = port + 1; } - /* Initialize the O_T_BIND_REQ/T_BIND_REQ for ip. */ - if (udp->udp_family == AF_INET) { - sin->sin_port = udp->udp_port; + /* Initialize the T_BIND_ACK. 
*/ + if (connp->conn_family == AF_INET) { + sin->sin_port = connp->conn_lport; } else { - sin6->sin6_port = udp->udp_port; - /* Rebuild the header template */ - error = udp_build_hdrs(udp); - if (error != 0) { - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - mutex_exit(&udpf->uf_lock); - return (error); - } + sin6->sin6_port = connp->conn_lport; } udp->udp_state = TS_IDLE; udp_bind_hash_insert(udpf, udp); mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); if (cl_inet_bind) { /* * Running in cluster mode - register bind information */ - if (udp->udp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { (*cl_inet_bind)(connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); + IPPROTO_UDP, AF_INET, (uint8_t *)&v4src, + (in_port_t)connp->conn_lport, NULL); } else { (*cl_inet_bind)(connp->conn_netstack->netstack_stackid, - IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port, NULL); + IPPROTO_UDP, AF_INET6, (uint8_t *)&v6src, + (in_port_t)connp->conn_lport, NULL); } } + mutex_enter(&connp->conn_lock); connp->conn_anon_port = (is_system_labeled() && requested_port == 0); if (is_system_labeled() && (!connp->conn_anon_port || connp->conn_anon_mlp)) { @@ -8092,18 +5764,16 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, zone_t *zone; zone = crgetzone(cr); - connp->conn_mlp_type = udp->udp_recvucred ? mlptBoth : + connp->conn_mlp_type = + connp->conn_recv_ancillary.crb_recvucred ? mlptBoth : mlptSingle; addrtype = tsol_mlp_addr_type( connp->conn_allzones ? 
ALL_ZONES : zone->zone_id, IPV6_VERSION, &v6src, us->us_netstack->netstack_ip); if (addrtype == mlptSingle) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TNOADDR); + error = -TNOADDR; + mutex_exit(&connp->conn_lock); + goto late_error; } mlpport = connp->conn_anon_port ? PMAPPORT : port; mlptype = tsol_mlp_port_type(zone, IPPROTO_UDP, mlpport, @@ -8115,12 +5785,9 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ if (mlptype != mlptSingle && connp->conn_mlp_type == mlptSingle) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (EINVAL); + error = EINVAL; + mutex_exit(&connp->conn_lock); + goto late_error; } /* @@ -8129,18 +5796,15 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, */ if (mlptype != mlptSingle && secpolicy_net_bindmlp(cr) != 0) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: no priv for multilevel port %d", mlpport); } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } /* @@ -8158,7 +5822,7 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mlpzone = tsol_mlp_findzone(IPPROTO_UDP, htons(mlpport)); if (connp->conn_zoneid != mlpzone) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: attempt to bind port " @@ -8167,62 +5831,82 @@ udp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, mlpport, connp->conn_zoneid, mlpzone); } - 
rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } } if (connp->conn_anon_port) { - error = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + error = tsol_mlp_anon(zone, mlptype, connp->conn_proto, port, B_TRUE); if (error != 0) { - if (udp->udp_debug) { + if (connp->conn_debug) { (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_bind: cannot establish anon " "MLP for port %d", port); } - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - connp->conn_anon_port = B_FALSE; - connp->conn_mlp_type = mlptSingle; - return (-TACCES); + error = -TACCES; + mutex_exit(&connp->conn_lock); + goto late_error; } } connp->conn_mlp_type = mlptype; } - if (!V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - /* - * Append a request for an IRE if udp_v6src not - * zero (IPv4 - INADDR_ANY, or IPv6 - all-zeroes address). - */ - mp = allocb(sizeof (ire_t), BPRI_HI); - if (!mp) { - rw_enter(&udp->udp_rwlock, RW_WRITER); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; + /* + * We create an initial header template here to make a subsequent + * sendto have a starting point. Since conn_last_dst is zero the + * first sendto will always follow the 'dst changed' code path. + * Note that we defer massaging options and the related checksum + * adjustment until we have a destination address. 
+ */ + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto late_error; } - if (udp->udp_family == AF_INET6) { - ASSERT(udp->udp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v6(connp, &mp, IPPROTO_UDP, - &udp->udp_bound_v6src, udp->udp_port, B_TRUE); - } else { - ASSERT(!udp->udp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v4(connp, &mp, IPPROTO_UDP, - V4_PART_OF_V6(udp->udp_bound_v6src), udp->udp_port, - B_TRUE); + /* Just in case */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + connp->conn_v6lastdst = ipv6_all_zeros; + mutex_exit(&connp->conn_lock); + + error = ip_laddr_fanout_insert(connp); + if (error != 0) + goto late_error; + + /* Bind succeeded */ + return (0); + +late_error: + /* We had already picked the port number, and then the bind failed */ + mutex_enter(&connp->conn_lock); + udpf = &us->us_bind_fanout[ + UDP_BIND_HASH(connp->conn_lport, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + if (scopeid != 0) { + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } + udp->udp_state = TS_UNBND; + udp_bind_hash_remove(udp, B_TRUE); + connp->conn_lport = 0; + mutex_exit(&udpf->uf_lock); + connp->conn_anon_port = B_FALSE; + connp->conn_mlp_type = mlptSingle; - (void) udp_post_ip_bind_connect(udp, mp, error); + connp->conn_v6lastdst = ipv6_all_zeros; + + /* Restore the header that was built above - different source address */ + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); return (error); } @@ -8256,12 +5940,32 @@ udp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, static int 
udp_implicit_bind(conn_t *connp, cred_t *cr) { + sin6_t sin6addr; + sin_t *sin; + sin6_t *sin6; + socklen_t len; int error; /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - error = udp_do_bind(connp, NULL, 0, cr, B_FALSE); + if (connp->conn_family == AF_INET) { + len = sizeof (struct sockaddr_in); + sin = (sin_t *)&sin6addr; + *sin = sin_null; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = INADDR_ANY; + } else { + ASSERT(connp->conn_family == AF_INET6); + len = sizeof (sin6_t); + sin6 = (sin6_t *)&sin6addr; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + V6_SET_ZERO(sin6->sin6_addr); + } + + error = udp_do_bind(connp, (struct sockaddr *)&sin6addr, len, + cr, B_FALSE); return ((error < 0) ? proto_tlitosyserr(-error) : error); } @@ -8280,137 +5984,51 @@ udp_do_unbind(conn_t *connp) /* * Running in cluster mode - register unbind information */ - if (udp->udp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { (*cl_inet_unbind)( connp->conn_netstack->netstack_stackid, IPPROTO_UDP, AF_INET, - (uint8_t *)(&V4_PART_OF_V6(udp->udp_v6src)), - (in_port_t)udp->udp_port, NULL); + (uint8_t *)(&V4_PART_OF_V6(connp->conn_laddr_v6)), + (in_port_t)connp->conn_lport, NULL); } else { (*cl_inet_unbind)( connp->conn_netstack->netstack_stackid, IPPROTO_UDP, AF_INET6, - (uint8_t *)&(udp->udp_v6src), - (in_port_t)udp->udp_port, NULL); + (uint8_t *)&(connp->conn_laddr_v6), + (in_port_t)connp->conn_lport, NULL); } } - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + mutex_enter(&connp->conn_lock); + /* If a bind has not been done, we can't unbind. */ + if (udp->udp_state == TS_UNBND) { + mutex_exit(&connp->conn_lock); return (-TOUTSTATE); } - udp->udp_pending_op = T_UNBIND_REQ; - rw_exit(&udp->udp_rwlock); - - /* - * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK - * and therefore ip_unbind must never return NULL. 
- */ - ip_unbind(connp); - - /* - * Once we're unbound from IP, the pending operation may be cleared - * here. - */ - rw_enter(&udp->udp_rwlock, RW_WRITER); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); udp_bind_hash_remove(udp, B_TRUE); - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_port = 0; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_mcbc_bind = B_FALSE; + connp->conn_lport = 0; + /* In case we were also connected */ + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; + connp->conn_v6lastdst = ipv6_all_zeros; udp->udp_state = TS_UNBND; - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - rw_exit(&udp->udp_rwlock); - return (0); -} - -static int -udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error) -{ - ire_t *ire; - udp_fanout_t *udpf; - udp_stack_t *us = udp->udp_us; - - ASSERT(udp->udp_pending_op != -1); - rw_enter(&udp->udp_rwlock, RW_WRITER); - if (error == 0) { - /* For udp_do_connect() success */ - /* udp_do_bind() success will do nothing in here */ - /* - * If a broadcast/multicast address was bound, set - * the source address to 0. - * This ensures no datagrams with broadcast address - * as source address are emitted (which would violate - * RFC1122 - Hosts requirements) - * - * Note that when connecting the returned IRE is - * for the destination address and we only perform - * the broadcast check for the source address (it - * is OK to connect to a broadcast/multicast address.) 
- */ - if (ire_mp != NULL && ire_mp->b_datap->db_type == IRE_DB_TYPE) { - ire = (ire_t *)ire_mp->b_rptr; + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); - /* - * Note: we get IRE_BROADCAST for IPv6 to "mark" a - * multicast local address. - */ - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - if (ire->ire_type == IRE_BROADCAST && - udp->udp_state != TS_DATA_XFER) { - ASSERT(udp->udp_pending_op == T_BIND_REQ || - udp->udp_pending_op == O_T_BIND_REQ); - /* - * This was just a local bind to a broadcast - * addr. - */ - mutex_enter(&udpf->uf_lock); - V6_SET_ZERO(udp->udp_v6src); - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } - } - } else { - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); + ip_unbind(connp); - if (udp->udp_state == TS_DATA_XFER) { - /* Connect failed */ - /* Revert back to the bound source */ - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - } else { - /* For udp_do_bind() failed */ - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_state = TS_UNBND; - udp_bind_hash_remove(udp, B_TRUE); - udp->udp_port = 0; - } - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(udp); - } - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - if (ire_mp != NULL) - freeb(ire_mp); - return (error); + return (0); } /* @@ -8418,7 +6036,7 @@ udp_post_ip_bind_connect(udp_t *udp, mblk_t *ire_mp, int error) */ static int udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, - cred_t *cr) + cred_t *cr, pid_t pid) { sin6_t *sin6; sin_t *sin; @@ -8426,12 +6044,16 @@ udp_do_connect(conn_t 
*connp, const struct sockaddr *sa, socklen_t len, ipaddr_t v4dst; uint16_t dstport; uint32_t flowinfo; - mblk_t *ire_mp; udp_fanout_t *udpf; udp_t *udp, *udp1; ushort_t ipversion; udp_stack_t *us; int error; + conn_t *connp1; + ip_xmit_attr_t *ixa; + uint_t scopeid = 0; + uint_t srcid = 0; + in6_addr_t v6src = connp->conn_saddr_v6; udp = connp->conn_udp; us = udp->udp_us; @@ -8451,7 +6073,7 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, v4dst = sin->sin_addr.s_addr; dstport = sin->sin_port; IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - ASSERT(udp->udp_ipversion == IPV4_VERSION); + ASSERT(connp->conn_ipversion == IPV4_VERSION); ipversion = IPV4_VERSION; break; @@ -8459,13 +6081,33 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, sin6 = (sin6_t *)sa; v6dst = sin6->sin6_addr; dstport = sin6->sin6_port; + srcid = sin6->__sin6_src_id; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&v6src)) { + ip_srcid_find_id(srcid, &v6src, IPCL_ZONEID(connp), + connp->conn_netstack); + } if (IN6_IS_ADDR_V4MAPPED(&v6dst)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + /* + * Destination adress is mapped IPv6 address. + * Source bound address should be unspecified or + * IPv6 mapped address as well. + */ + if (!IN6_IS_ADDR_UNSPECIFIED( + &connp->conn_bound_addr_v6) && + !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { + return (EADDRNOTAVAIL); + } IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst); ipversion = IPV4_VERSION; flowinfo = 0; } else { ipversion = IPV6_VERSION; flowinfo = sin6->sin6_flowinfo; + if (IN6_IS_ADDR_LINKLOCAL(&sin6->sin6_addr)) + scopeid = sin6->sin6_scope_id; } break; } @@ -8473,44 +6115,53 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (dstport == 0) return (-TBADADDR); - rw_enter(&udp->udp_rwlock, RW_WRITER); + /* + * If there is a different thread using conn_ixa then we get a new + * copy and cut the old one loose from conn_ixa. 
Otherwise we use + * conn_ixa and prevent any other thread from using/changing it. + * Once connect() is done other threads can use conn_ixa since the + * refcnt will be back at one. + */ + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) + return (ENOMEM); + ASSERT(ixa->ixa_refcnt >= 2); + ASSERT(ixa == connp->conn_ixa); + + mutex_enter(&connp->conn_lock); /* - * This UDP must have bound to a port already before doing a connect. - * TPI mandates that users must send TPI primitives only 1 at a time - * and wait for the response before sending the next primitive. + * This udp_t must have bound to a port already before doing a connect. + * Reject if a connect is in progress (we drop conn_lock during + * udp_do_connect). */ - if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { - rw_exit(&udp->udp_rwlock); + if (udp->udp_state == TS_UNBND || udp->udp_state == TS_WCON_CREQ) { + mutex_exit(&connp->conn_lock); (void) strlog(UDP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "udp_connect: bad state, %u", udp->udp_state); + ixa_refrele(ixa); return (-TOUTSTATE); } - udp->udp_pending_op = T_CONN_REQ; - ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); - - if (ipversion == IPV4_VERSION) { - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; - } else { - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; - } + ASSERT(connp->conn_lport != 0 && udp->udp_ptpbhn != NULL); - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + udpf = &us->us_bind_fanout[UDP_BIND_HASH(connp->conn_lport, us->us_bind_fanout_size)]; mutex_enter(&udpf->uf_lock); if (udp->udp_state == TS_DATA_XFER) { /* Already connected - clear out state */ - udp->udp_v6src = udp->udp_bound_v6src; + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; udp->udp_state = TS_IDLE; } - /* 
- * Create a default IP header with no IP options. - */ - udp->udp_dstport = dstport; - udp->udp_ipversion = ipversion; + connp->conn_fport = dstport; + connp->conn_ipversion = ipversion; if (ipversion == IPV4_VERSION) { /* * Interpret a zero destination to mean loopback. @@ -8520,29 +6171,16 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (v4dst == INADDR_ANY) { v4dst = htonl(INADDR_LOOPBACK); IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); - if (udp->udp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin->sin_addr.s_addr = v4dst; } else { sin6->sin6_addr = v6dst; } } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = 0; - - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * use the address of that interface as our - * source address if no source address has been set. - */ - if (V4_PART_OF_V6(udp->udp_v6src) == INADDR_ANY && - CLASSD(v4dst) && - udp->udp_multicast_if_addr != INADDR_ANY) { - IN6_IPADDR_TO_V4MAPPED(udp->udp_multicast_if_addr, - &udp->udp_v6src); - } + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = 0; } else { - ASSERT(udp->udp_ipversion == IPV6_VERSION); + ASSERT(connp->conn_ipversion == IPV6_VERSION); /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to @@ -8552,82 +6190,133 @@ udp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, v6dst = ipv6_loopback; sin6->sin6_addr = v6dst; } - udp->udp_v6dst = v6dst; - udp->udp_flowinfo = flowinfo; - /* - * If the destination address is multicast and - * an outgoing multicast interface has been set, - * then the ip bind logic will pick the correct source - * address (i.e. matching the outgoing multicast interface). 
- */ + connp->conn_faddr_v6 = v6dst; + connp->conn_flowinfo = flowinfo; + } + mutex_exit(&udpf->uf_lock); + + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + /* + * conn_connect will drop conn_lock and reacquire it. + * To prevent a send* from messing with this udp_t while the lock + * is dropped we set udp_state and clear conn_v6lastdst. + * That will make all send* fail with EISCONN. + */ + connp->conn_v6lastdst = ipv6_all_zeros; + udp->udp_state = TS_WCON_CREQ; + + error = conn_connect(connp, NULL, IPDF_ALLOW_MCBC); + mutex_exit(&connp->conn_lock); + if (error != 0) + goto connect_failed; + + /* + * The addresses have been verified. Time to insert in + * the correct fanout list. 
+ */ + error = ipcl_conn_insert(connp); + if (error != 0) + goto connect_failed; + + mutex_enter(&connp->conn_lock); + error = udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + if (error != 0) { + mutex_exit(&connp->conn_lock); + goto connect_failed; } + udp->udp_state = TS_DATA_XFER; + /* Record this as the "last" send even though we haven't sent any */ + connp->conn_v6lastdst = connp->conn_faddr_v6; + connp->conn_lastipversion = connp->conn_ipversion; + connp->conn_lastdstport = connp->conn_fport; + connp->conn_lastflowinfo = connp->conn_flowinfo; + connp->conn_lastscopeid = scopeid; + connp->conn_lastsrcid = srcid; + /* Also remember a source to use together with lastdst */ + connp->conn_v6lastsrc = v6src; + mutex_exit(&connp->conn_lock); + /* - * Verify that the src/port/dst/port is unique for all - * connections in TS_DATA_XFER + * We've picked a source address above. Now we can + * verify that the src/port/dst/port is unique for all + * connections in TS_DATA_XFER, skipping ourselves. 
*/ + mutex_enter(&udpf->uf_lock); for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { if (udp1->udp_state != TS_DATA_XFER) continue; - if (udp->udp_port != udp1->udp_port || - udp->udp_ipversion != udp1->udp_ipversion || - dstport != udp1->udp_dstport || - !IN6_ARE_ADDR_EQUAL(&udp->udp_v6src, &udp1->udp_v6src) || - !IN6_ARE_ADDR_EQUAL(&v6dst, &udp1->udp_v6dst) || - !(IPCL_ZONE_MATCH(udp->udp_connp, - udp1->udp_connp->conn_zoneid) || - IPCL_ZONE_MATCH(udp1->udp_connp, - udp->udp_connp->conn_zoneid))) + + if (udp1 == udp) + continue; + + connp1 = udp1->udp_connp; + if (connp->conn_lport != connp1->conn_lport || + connp->conn_ipversion != connp1->conn_ipversion || + dstport != connp1->conn_fport || + !IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, + &connp1->conn_laddr_v6) || + !IN6_ARE_ADDR_EQUAL(&v6dst, &connp1->conn_faddr_v6) || + !(IPCL_ZONE_MATCH(connp, connp1->conn_zoneid) || + IPCL_ZONE_MATCH(connp1, connp->conn_zoneid))) continue; mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (-TBADADDR); + error = -TBADADDR; + goto connect_failed; } - if (cl_inet_connect2 != NULL) { - CL_INET_UDP_CONNECT(connp, udp, B_TRUE, &v6dst, dstport, error); + CL_INET_UDP_CONNECT(connp, B_TRUE, &v6dst, dstport, error); if (error != 0) { mutex_exit(&udpf->uf_lock); - udp->udp_pending_op = -1; - rw_exit(&udp->udp_rwlock); - return (-TBADADDR); + error = -TBADADDR; + goto connect_failed; } } - - udp->udp_state = TS_DATA_XFER; mutex_exit(&udpf->uf_lock); - ire_mp = allocb(sizeof (ire_t), BPRI_HI); - if (ire_mp == NULL) { - mutex_enter(&udpf->uf_lock); - udp->udp_state = TS_IDLE; - udp->udp_pending_op = -1; - mutex_exit(&udpf->uf_lock); - rw_exit(&udp->udp_rwlock); - return (ENOMEM); - } - - rw_exit(&udp->udp_rwlock); + ixa_refrele(ixa); + return (0); - ire_mp->b_wptr += sizeof (ire_t); - ire_mp->b_datap->db_type = IRE_DB_REQ_TYPE; +connect_failed: + if (ixa != NULL) + ixa_refrele(ixa); + mutex_enter(&connp->conn_lock); + 
mutex_enter(&udpf->uf_lock); + udp->udp_state = TS_IDLE; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + /* In case the source address was set above */ + if (connp->conn_mcbc_bind) + connp->conn_saddr_v6 = ipv6_all_zeros; + else + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET) { - error = ip_proto_bind_connected_v4(connp, &ire_mp, IPPROTO_UDP, - &V4_PART_OF_V6(udp->udp_v6src), udp->udp_port, - V4_PART_OF_V6(udp->udp_v6dst), udp->udp_dstport, - B_TRUE, B_TRUE, cr); - } else { - error = ip_proto_bind_connected_v6(connp, &ire_mp, IPPROTO_UDP, - &udp->udp_v6src, udp->udp_port, &udp->udp_v6dst, - &udp->udp_sticky_ipp, udp->udp_dstport, B_TRUE, B_TRUE, cr); - } + connp->conn_v6lastdst = ipv6_all_zeros; + connp->conn_flowinfo = 0; - return (udp_post_ip_bind_connect(udp, ire_mp, error)); + (void) udp_build_hdr_template(connp, &connp->conn_saddr_v6, + &connp->conn_faddr_v6, connp->conn_fport, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + return (error); } -/* ARGSUSED */ static int udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) @@ -8636,6 +6325,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, udp_t *udp = connp->conn_udp; int error; boolean_t did_bind = B_FALSE; + pid_t pid = curproc->p_pid; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); @@ -8652,7 +6342,7 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, return (error); } - error = proto_verify_ip_addr(udp->udp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) goto done; @@ -8671,9 +6361,9 @@ udp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* * set SO_DGRAM_ERRIND */ - udp->udp_dgram_errind = B_TRUE; + connp->conn_dgram_errind = B_TRUE; - error = udp_do_connect(connp, sa, len, cr); + error = udp_do_connect(connp, sa, len, cr, pid); if (error != 0 && did_bind) { int unbind_err; @@ -8702,44 +6392,33 @@ done: return (error); } -/* ARGSUSED */ int udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, cred_t *cr) { + sin6_t *sin6; + sin_t *sin = NULL; + uint_t srcid; conn_t *connp = (conn_t *)proto_handle; udp_t *udp = connp->conn_udp; - udp_stack_t *us = udp->udp_us; int error = 0; + udp_stack_t *us = udp->udp_us; + ushort_t ipversion; + pid_t pid = curproc->p_pid; + ip_xmit_attr_t *ixa; ASSERT(DB_TYPE(mp) == M_DATA); /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - /* If labeled then sockfs should have already set db_credp */ - ASSERT(!is_system_labeled() || msg_getcred(mp, NULL) != NULL); - - /* - * If the socket is connected and no change in destination - */ - if (msg->msg_namelen == 0) { - error = udp_send_connected(connp, mp, msg, cr, curproc->p_pid); - if (error == EDESTADDRREQ) - return (error); - else - return (udp->udp_dgram_errind ? error : 0); - } - - /* - * Do an implicit bind if necessary. - */ + /* do an implicit bind if necessary */ if (udp->udp_state == TS_UNBND) { error = udp_implicit_bind(connp, cr); /* * We could be racing with an actual bind, in which case * we would see EPROTO. We cross our fingers and try - * to send. + * to connect. 
*/ if (!(error == 0 || error == EPROTO)) { freemsg(mp); @@ -8747,75 +6426,203 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, } } - rw_enter(&udp->udp_rwlock, RW_WRITER); - - if (msg->msg_name != NULL && udp->udp_state == TS_DATA_XFER) { - rw_exit(&udp->udp_rwlock); - freemsg(mp); + /* Connected? */ + if (msg->msg_name == NULL) { + if (udp->udp_state != TS_DATA_XFER) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EDESTADDRREQ); + } + if (msg->msg_controllen != 0) { + error = udp_output_ancillary(connp, NULL, NULL, mp, + NULL, msg, cr, pid); + } else { + error = udp_output_connected(connp, mp, cr, pid); + } + if (us->us_sendto_ignerr) + return (0); + else + return (error); + } + if (udp->udp_state == TS_DATA_XFER) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, + (struct sockaddr *)msg->msg_name, msg->msg_namelen); + if (error != 0) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (error); + } + switch (connp->conn_family) { + case AF_INET6: + sin6 = (sin6_t *)msg->msg_name; + srcid = sin6->__sin6_src_id; - if (udp->udp_delayed_error != 0) { - boolean_t match; + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + /* + * Destination is a non-IPv4-compatible IPv6 address. + * Send out an IPv6 format packet. + */ - error = udp->udp_delayed_error; - match = B_FALSE; - udp->udp_delayed_error = 0; - switch (udp->udp_family) { - case AF_INET: { - /* Compare just IP address and port */ - sin_t *sin1 = (sin_t *)msg->msg_name; - sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr; + /* + * If the local address is a mapped address return + * an error. + * It would be possible to send an IPv6 packet but the + * response would never make it back to the application + * since it is bound to a mapped address. 
+ */ + if (IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6)) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } + if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) + sin6->sin6_addr = ipv6_loopback; + ipversion = IPV6_VERSION; + } else { + if (connp->conn_ipv6_v6only) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } - if (msg->msg_namelen == sizeof (sin_t) && - sin1->sin_port == sin2->sin_port && - sin1->sin_addr.s_addr == sin2->sin_addr.s_addr) - match = B_TRUE; + /* + * If the local address is not zero or a mapped address + * return an error. It would be possible to send an + * IPv4 packet but the response would never make it + * back to the application since it is bound to a + * non-mapped address. + */ + if (!IN6_IS_ADDR_V4MAPPED(&connp->conn_saddr_v6) && + !IN6_IS_ADDR_UNSPECIFIED(&connp->conn_saddr_v6)) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (EADDRNOTAVAIL); + } - break; + if (V4_PART_OF_V6(sin6->sin6_addr) == INADDR_ANY) { + V4_PART_OF_V6(sin6->sin6_addr) = + htonl(INADDR_LOOPBACK); + } + ipversion = IPV4_VERSION; } - case AF_INET6: { - sin6_t *sin1 = (sin6_t *)msg->msg_name; - sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr; - if (msg->msg_namelen == sizeof (sin6_t) && - sin1->sin6_port == sin2->sin6_port && - IN6_ARE_ADDR_EQUAL(&sin1->sin6_addr, - &sin2->sin6_addr)) - match = B_TRUE; - break; - } - default: - ASSERT(0); + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* calls on a + * socket. 
+ */ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; } + mutex_enter(&connp->conn_lock); + if (udp->udp_delayed_error != 0) { + sin6_t *sin2 = (sin6_t *)&udp->udp_delayed_addr; - *((sin6_t *)&udp->udp_delayed_addr) = sin6_null; + error = udp->udp_delayed_error; + udp->udp_delayed_error = 0; - if (match) { - rw_exit(&udp->udp_rwlock); - freemsg(mp); + /* Compare IP address, port, and family */ + + if (sin6->sin6_port == sin2->sin6_port && + IN6_ARE_ADDR_EQUAL(&sin6->sin6_addr, + &sin2->sin6_addr) && + sin6->sin6_family == sin2->sin6_family) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } + + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = udp_output_ancillary(connp, NULL, sin6, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v6(connp, sin6) && + connp->conn_lastsrcid == srcid && + ipsec_outbound_policy_current(ixa)) { + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, mp, NULL, sin6, + ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (us->us_sendto_ignerr) + return (0); + else return (error); + case AF_INET: + sin = (sin_t *)msg->msg_name; + + ipversion = IPV4_VERSION; + + if (sin->sin_addr.s_addr == INADDR_ANY) + sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); + + /* + * We have to allocate an ip_xmit_attr_t before we grab + * conn_lock and we need to hold conn_lock once we've check + * conn_same_as_last_v6 to handle concurrent send* on a socket. 
+ */ + if (msg->msg_controllen == 0) { + ixa = conn_get_ixa(connp, B_FALSE); + if (ixa == NULL) { + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + return (ENOMEM); + } + } else { + ixa = NULL; } - } + mutex_enter(&connp->conn_lock); + if (udp->udp_delayed_error != 0) { + sin_t *sin2 = (sin_t *)&udp->udp_delayed_addr; - error = proto_verify_ip_addr(udp->udp_family, - (struct sockaddr *)msg->msg_name, msg->msg_namelen); - rw_exit(&udp->udp_rwlock); + error = udp->udp_delayed_error; + udp->udp_delayed_error = 0; - if (error != 0) { - freemsg(mp); - return (error); - } + /* Compare IP address and port */ - error = udp_send_not_connected(connp, mp, - (struct sockaddr *)msg->msg_name, msg->msg_namelen, msg, cr, - curproc->p_pid); - if (error != 0) { - UDP_STAT(us, udp_out_err_output); - freemsg(mp); + if (sin->sin_port == sin2->sin_port && + sin->sin_addr.s_addr == sin2->sin_addr.s_addr) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + if (ixa != NULL) + ixa_refrele(ixa); + return (error); + } + } + if (msg->msg_controllen != 0) { + mutex_exit(&connp->conn_lock); + ASSERT(ixa == NULL); + error = udp_output_ancillary(connp, sin, NULL, mp, + NULL, msg, cr, pid); + } else if (conn_same_as_last_v4(connp, sin) && + ipsec_outbound_policy_current(ixa)) { + /* udp_output_lastdst drops conn_lock */ + error = udp_output_lastdst(connp, mp, cr, pid, ixa); + } else { + /* udp_output_newdst drops conn_lock */ + error = udp_output_newdst(connp, mp, sin, NULL, + ipversion, cr, pid, ixa); + } + ASSERT(MUTEX_NOT_HELD(&connp->conn_lock)); + if (us->us_sendto_ignerr) + return (0); + else + return (error); + default: + return (EINVAL); } - return (udp->udp_dgram_errind ? 
error : 0); } int @@ -8854,8 +6661,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, stropt_mp->b_wptr += sizeof (*stropt); stropt = (struct stroptions *)stropt_mp->b_rptr; stropt->so_flags = SO_WROFF | SO_HIWAT; - stropt->so_wroff = - (ushort_t)(udp->udp_max_hdr_len + udp->udp_us->us_wroff_extra); + stropt->so_wroff = connp->conn_wroff; stropt->so_hiwat = udp->udp_rcv_disply_hiwat; putnext(RD(q), stropt_mp); @@ -8881,9 +6687,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, faddrlen = 0; opts = 0; - if (udp->udp_dgram_errind) + if (connp->conn_dgram_errind) opts |= SO_DGRAM_ERRIND; - if (udp->udp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; (*quiesced_cb)(connp->conn_upper_handle, q, &tca, @@ -8908,9 +6714,9 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * No longer a streams less socket */ - rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&connp->conn_lock); connp->conn_flags &= ~IPCL_NONSTR; - rw_exit(&udp->udp_rwlock); + mutex_exit(&connp->conn_lock); mutex_exit(&udp->udp_recv_lock); @@ -8919,48 +6725,7 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, return (0); } -static int -udp_do_getpeername(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - ASSERT(udp != NULL); - - if (udp->udp_state != TS_DATA_XFER) - return (ENOTCONN); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_dstport; - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6dst); - break; - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_dstport; - sin6->sin6_addr = 
udp->udp_v6dst; - sin6->sin6_flowinfo = udp->udp_flowinfo; - break; - } - - return (0); -} - -/* ARGSUSED */ +/* ARGSUSED3 */ int udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) @@ -8972,104 +6737,29 @@ udp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - ASSERT(udp != NULL); - - rw_enter(&udp->udp_rwlock, RW_READER); - - error = udp_do_getpeername(udp, sa, salenp); - - rw_exit(&udp->udp_rwlock); - + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_DATA_XFER) + error = ENOTCONN; + else + error = conn_getpeername(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } -static int -udp_do_getsockname(udp_t *udp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - ASSERT(udp != NULL); - ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); - - switch (udp->udp_family) { - case AF_INET: - ASSERT(udp->udp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *salenp = sizeof (sin_t); - *sin = sin_null; - sin->sin_family = AF_INET; - if (udp->udp_state == TS_UNBND) { - break; - } - sin->sin_port = udp->udp_port; - - if (!IN6_IS_ADDR_V4MAPPED_ANY(&udp->udp_v6src) && - !IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin->sin_addr.s_addr = V4_PART_OF_V6(udp->udp_v6src); - } else { - /* - * INADDR_ANY - * udp_v6src is not set, we might be bound to - * broadcast/multicast. 
Use udp_bound_v6src as - * local address instead (that could - * also still be INADDR_ANY) - */ - sin->sin_addr.s_addr = - V4_PART_OF_V6(udp->udp_bound_v6src); - } - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *salenp = sizeof (sin6_t); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (udp->udp_state == TS_UNBND) { - break; - } - sin6->sin6_port = udp->udp_port; - - if (!IN6_IS_ADDR_UNSPECIFIED(&udp->udp_v6src)) { - sin6->sin6_addr = udp->udp_v6src; - } else { - /* - * UNSPECIFIED - * udp_v6src is not set, we might be bound to - * broadcast/multicast. Use udp_bound_v6src as - * local address instead (that could - * also still be UNSPECIFIED) - */ - sin6->sin6_addr = udp->udp_bound_v6src; - } - } - return (0); -} - -/* ARGSUSED */ +/* ARGSUSED3 */ int udp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *sa, socklen_t *salenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); - ASSERT(udp != NULL); - rw_enter(&udp->udp_rwlock, RW_READER); - - error = udp_do_getsockname(udp, sa, salenp); - - rw_exit(&udp->udp_rwlock); - + mutex_enter(&connp->conn_lock); + error = conn_getsockname(connp, sa, salenp); + mutex_exit(&connp->conn_lock); return (error); } @@ -9078,7 +6768,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, void *optvalp, socklen_t *optlen, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; t_uscalar_t max_optbuf_len; void *optvalp_buf; @@ -9090,7 +6779,6 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt, - udp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) @@ -9099,28 +6787,22 @@ udp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, } optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); - rw_enter(&udp->udp_rwlock, RW_READER); len = udp_opt_get(connp, level, option_name, optvalp_buf); - rw_exit(&udp->udp_rwlock); - - if (len < 0) { - /* - * Pass on to IP - */ + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, - optvalp, optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } int @@ -9128,7 +6810,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, const void *optvalp, socklen_t optlen, 
cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - udp_t *udp = connp->conn_udp; int error; /* All Solaris components should pass a cred for this operation. */ @@ -9137,7 +6818,6 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt, - udp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -9146,19 +6826,11 @@ udp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, return (error); } - rw_enter(&udp->udp_rwlock, RW_WRITER); error = udp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, NULL, cr); - rw_exit(&udp->udp_rwlock); - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } + ASSERT(error >= 0); return (error); } @@ -9174,7 +6846,7 @@ udp_clr_flowctrl(sock_lower_handle_t proto_handle) mutex_exit(&udp->udp_recv_lock); } -/* ARGSUSED */ +/* ARGSUSED2 */ int udp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) { @@ -9204,6 +6876,27 @@ udp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + udp_stack_t *us = connp->conn_udp->udp_us; + + ASSERT(us->us_ldi_ident != NULL); + + /* + * Create a helper stream for non-STREAMS socket. 
+ */ + error = ip_create_helper_stream(connp, us->us_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index 425d258697..02d9d3f8f8 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -56,227 +56,229 @@ */ opdes_t udp_opt_arr[] = { -{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_DEBUG, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 0 }, -{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ SO_RCVTIMEO, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (struct timeval), 
0 }, -{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_RECVUCRED, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, sizeof (int), +{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ SO_TIMESTAMP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SCM_UCRED, SOL_SOCKET, OA_W, OA_W, OP_NP, OP_VARLEN|OP_NODEFAULT, 512, 0 }, -{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, { T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, -{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 
OP_PASSNEXT, sizeof (int), 0 }, -{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ T_IP_TOS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVOPTS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVDSTADDR, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), +{ IP_RECVIF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVSLLA, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ IP_RECVTTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_MULTICAST_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (struct in_addr), 0 /* INADDR_ANY */ }, -{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_LOOP, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */}, -{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IP_MULTICAST_TTL, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (uchar_t), -1 /* not initialized */ }, -{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_ADD_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_DROP_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_DROP_MEMBERSHIP, 
IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq), -1 /* not initialized */ }, -{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 /* not initialized */ }, -{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 /* not initialized */ }, { IP_ADD_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, { IP_DROP_SOURCE_MEMBERSHIP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct ip_mreq_source), -1 }, + OP_NODEFAULT, sizeof (struct ip_mreq_source), -1 }, -{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_DHCPINIT_IF, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, -{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IP_BROADCAST_TTL, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0, sizeof (uchar_t), 0 /* disabled */ }, { IP_PKTINFO, IPPROTO_IP, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in_pktinfo), -1 /* not initialized */ }, -{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, OP_PASSNEXT, +{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (in_addr_t), -1 /* not initialized */ }, +{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + { 
MCAST_JOIN_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IP, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, -{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_MULTICAST_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_MULTICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */ }, + OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, { IPV6_MULTICAST_LOOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_DEF_FN), sizeof (int), -1 /* not initialized */}, + OP_DEF_FN, sizeof (int), -1 /* not initialized */}, -{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not initialized */ }, { IPV6_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (struct ipv6_mreq), -1 /* not 
initialized */ }, -{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_DEF_FN), +{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN, sizeof (int), -1 /* not initialized */ }, -{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 /* no ifindex */ }, -{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, OP_PASSNEXT, +{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0, sizeof (int), 0 }, { IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (struct in6_pktinfo), -1 /* not initialized */ }, { IPV6_HOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT|OP_VARLEN), + (OP_NODEFAULT|OP_VARLEN), sizeof (sin6_t), -1 /* not initialized */ }, { IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_VARLEN|OP_NODEFAULT), + (OP_VARLEN|OP_NODEFAULT), MAX_EHDR_LEN, -1 /* not initialized */ }, { IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), + OP_NODEFAULT, sizeof (int), -1 /* not initialized */ }, { IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), - sizeof (int), -1 /* not initialized */ }, -{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + OP_NODEFAULT, + sizeof 
(struct ip6_mtuinfo), -1 }, +{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { IPV6_RECVPATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, - OP_PASSNEXT, sizeof (int), 0 }, -{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, + 0, sizeof (int), 0 }, +{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, (OP_PASSNEXT|OP_NODEFAULT), +{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT, sizeof (ipsec_req_t), -1 /* not initialized */ }, -{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, +{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, 
OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT }, { MCAST_JOIN_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_LEAVE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_req), + OP_NODEFAULT, sizeof (struct group_req), -1 /* not initialized */ }, { MCAST_BLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_UNBLOCK_SOURCE, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_JOIN_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, { MCAST_LEAVE_SOURCE_GROUP, IPPROTO_IPV6, OA_X, OA_X, OP_NP, - (OP_PASSNEXT|OP_NODEFAULT), sizeof (struct group_source_req), + OP_NODEFAULT, sizeof (struct group_source_req), -1 /* not initialized */ }, -{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, OP_PASSNEXT, +{ UDP_ANONPRIVBIND, IPPROTO_UDP, OA_R, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 +{ UDP_EXCLBIND, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { UDP_RCVHDR, IPPROTO_UDP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -317,7 +319,6 @@ optdb_obj_t udp_opt_obj = { udp_opt_default, /* UDP default value function pointer */ udp_tpi_opt_get, /* UDP get function pointer */ udp_tpi_opt_set, /* UDP set function pointer */ - B_TRUE, /* UDP is tpi provider */ UDP_OPT_ARR_CNT, /* UDP option database count of entries */ udp_opt_arr, /* UDP option database */ UDP_VALID_LEVELS_CNT, /* UDP valid level count of 
entries */ diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 1b4935f456..4da82a0377 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -51,84 +51,6 @@ extern "C" { #define UDP_MOD_ID 5607 -typedef struct udp_bits_s { - - uint32_t - - udpb_debug : 1, /* SO_DEBUG "socket" option. */ - udpb_dontroute : 1, /* SO_DONTROUTE "socket" option. */ - udpb_broadcast : 1, /* SO_BROADCAST "socket" option. */ - udpb_useloopback : 1, /* SO_USELOOPBACK "socket" option */ - - udpb_reuseaddr : 1, /* SO_REUSEADDR "socket" option. */ - udpb_dgram_errind : 1, /* SO_DGRAM_ERRIND option */ - udpb_recvdstaddr : 1, /* IP_RECVDSTADDR option */ - udpb_recvopts : 1, /* IP_RECVOPTS option */ - - udpb_unspec_source : 1, /* IP*_UNSPEC_SRC option */ - udpb_ip_recvpktinfo : 1, /* IPV6_RECVPKTINFO option */ - udpb_ipv6_recvhoplimit : 1, /* IPV6_RECVHOPLIMIT option */ - udpb_ipv6_recvhopopts : 1, /* IPV6_RECVHOPOPTS option */ - - udpb_ipv6_recvdstopts : 1, /* IPV6_RECVDSTOPTS option */ - udpb_ipv6_recvrthdr : 1, /* IPV6_RECVRTHDR option */ - udpb_ipv6_recvtclass : 1, /* IPV6_RECVTCLASS */ - udpb_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ - - udpb_anon_priv_bind : 1, - udpb_exclbind : 1, /* ``exclusive'' binding */ - udpb_recvif : 1, /* IP_RECVIF option */ - udpb_recvslla : 1, /* IP_RECVSLLA option */ - - udpb_recvttl : 1, /* IP_RECVTTL option */ - udpb_recvucred : 1, /* IP_RECVUCRED option */ - udpb_old_ipv6_recvdstopts : 1, /* old form of IPV6_DSTOPTS */ - udpb_ipv6_recvrthdrdstopts : 1, /* IPV6_RECVRTHDRDSTOPTS */ - - udpb_rcvhdr : 1, /* UDP_RCVHDR option */ - udpb_issocket : 1, /* socket mode; sockfs is on top */ - udpb_timestamp : 1, /* SO_TIMESTAMP "socket" option */ - - udpb_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ - udpb_pad_to_bit_31 : 4; -} udp_bits_t; - -#define udp_debug udp_bits.udpb_debug -#define udp_dontroute udp_bits.udpb_dontroute -#define udp_broadcast udp_bits.udpb_broadcast -#define 
udp_useloopback udp_bits.udpb_useloopback - -#define udp_reuseaddr udp_bits.udpb_reuseaddr -#define udp_dgram_errind udp_bits.udpb_dgram_errind -#define udp_recvdstaddr udp_bits.udpb_recvdstaddr -#define udp_recvopts udp_bits.udpb_recvopts - -#define udp_unspec_source udp_bits.udpb_unspec_source -#define udp_ip_recvpktinfo udp_bits.udpb_ip_recvpktinfo -#define udp_ipv6_recvhoplimit udp_bits.udpb_ipv6_recvhoplimit -#define udp_ipv6_recvhopopts udp_bits.udpb_ipv6_recvhopopts - -#define udp_ipv6_recvdstopts udp_bits.udpb_ipv6_recvdstopts -#define udp_ipv6_recvrthdr udp_bits.udpb_ipv6_recvrthdr -#define udp_ipv6_recvtclass udp_bits.udpb_ipv6_recvtclass -#define udp_ipv6_recvpathmtu udp_bits.udpb_ipv6_recvpathmtu - -#define udp_anon_priv_bind udp_bits.udpb_anon_priv_bind -#define udp_exclbind udp_bits.udpb_exclbind -#define udp_recvif udp_bits.udpb_recvif -#define udp_recvslla udp_bits.udpb_recvslla - -#define udp_recvttl udp_bits.udpb_recvttl -#define udp_recvucred udp_bits.udpb_recvucred -#define udp_old_ipv6_recvdstopts udp_bits.udpb_old_ipv6_recvdstopts -#define udp_ipv6_recvrthdrdstopts udp_bits.udpb_ipv6_recvrthdrdstopts - -#define udp_rcvhdr udp_bits.udpb_rcvhdr -#define udp_issocket udp_bits.udpb_issocket -#define udp_timestamp udp_bits.udpb_timestamp - -#define udp_nat_t_endpoint udp_bits.udpb_nat_t_endpoint - /* * Bind hash list size and hash function. It has to be a power of 2 for * hashing. @@ -148,49 +70,21 @@ typedef struct udp_fanout_s { #endif } udp_fanout_t; -/* - * dev_q is the write side queue of the entity below IP. - * If there is a module below IP, we can't optimize by looking - * at q_first of the queue below IP. 
If the driver is directly - * below IP and if the q_first is NULL, we optimize by not doing - * the canput check - */ -#define DEV_Q_FLOW_BLOCKED(dev_q) \ - (((dev_q)->q_next != NULL || (dev_q)->q_first != NULL) && \ - !canput(dev_q)) - /* Kstats */ typedef struct udp_stat { /* Class "net" kstats */ - kstat_named_t udp_ip_send; - kstat_named_t udp_ip_ire_send; - kstat_named_t udp_ire_null; kstat_named_t udp_sock_fallback; - kstat_named_t udp_out_sw_cksum; - kstat_named_t udp_out_sw_cksum_bytes; kstat_named_t udp_out_opt; kstat_named_t udp_out_err_notconn; kstat_named_t udp_out_err_output; kstat_named_t udp_out_err_tudr; - kstat_named_t udp_in_pktinfo; - kstat_named_t udp_in_recvdstaddr; - kstat_named_t udp_in_recvopts; - kstat_named_t udp_in_recvif; - kstat_named_t udp_in_recvslla; - kstat_named_t udp_in_recvucred; - kstat_named_t udp_in_recvttl; - kstat_named_t udp_in_recvhopopts; - kstat_named_t udp_in_recvhoplimit; - kstat_named_t udp_in_recvdstopts; - kstat_named_t udp_in_recvrtdstopts; - kstat_named_t udp_in_recvrthdr; - kstat_named_t udp_in_recvpktinfo; - kstat_named_t udp_in_recvtclass; - kstat_named_t udp_in_timestamp; - kstat_named_t udp_ip_rcvpktinfo; - kstat_named_t udp_cookie_coll; #ifdef DEBUG kstat_named_t udp_data_conn; kstat_named_t udp_data_notconn; + kstat_named_t udp_out_lastdst; + kstat_named_t udp_out_diffdst; + kstat_named_t udp_out_ipv6; + kstat_named_t udp_out_mapped; + kstat_named_t udp_out_ipv4; #endif } udp_stat_t; @@ -242,79 +136,43 @@ typedef struct udp_stack udp_stack_t; /* Internal udp control structure, one per open stream */ typedef struct udp_s { - krwlock_t udp_rwlock; /* Protects most of udp_t */ - t_scalar_t udp_pending_op; /* The current TPI operation */ /* - * Following fields up to udp_ipversion protected by conn_lock, - * and the fanout lock i.e.uf_lock. Need both locks to change the - * field, either lock is sufficient for reading the field. 
+ * The addresses and ports in the conn_t and udp_state are protected by + * conn_lock and the fanout lock i.e. uf_lock. Need both locks to change + * the fields, either lock is sufficient for reading the field. + * conn_lock also protects the content of udp_t. */ uint32_t udp_state; /* TPI state */ - in_port_t udp_port; /* Port bound to this stream */ - in_port_t udp_dstport; /* Connected port */ - in6_addr_t udp_v6src; /* Source address of this stream */ - in6_addr_t udp_bound_v6src; /* Explicitly bound address */ - in6_addr_t udp_v6dst; /* Connected destination */ - /* - * IP format that packets transmitted from this struct should use. - * Value can be IP4_VERSION or IPV6_VERSION. - */ - ushort_t udp_ipversion; - /* Written to only once at the time of opening the endpoint */ - sa_family_t udp_family; /* Family from socket() call */ - - /* Following protected by udp_rwlock */ - uint32_t udp_flowinfo; /* Connected flow id and tclass */ - uint32_t udp_max_hdr_len; /* For write offset in stream head */ - uint32_t udp_ip_snd_options_len; /* Len of IPv4 options */ - uchar_t *udp_ip_snd_options; /* Ptr to IPv4 options */ - uint32_t udp_ip_rcv_options_len; /* Len of IPv4 options recvd */ - uchar_t *udp_ip_rcv_options; /* Ptr to IPv4 options recvd */ - uchar_t udp_multicast_ttl; /* IP*_MULTICAST_TTL/HOPS */ - ipaddr_t udp_multicast_if_addr; /* IP_MULTICAST_IF option */ - uint_t udp_multicast_if_index; /* IPV6_MULTICAST_IF option */ - int udp_bound_if; /* IP*_BOUND_IF option */ + ip_pkt_t udp_recv_ipp; /* Used for IPv4 options received */ /* Written to only once at the time of opening the endpoint */ conn_t *udp_connp; - /* Following protected by udp_rwlock */ - udp_bits_t udp_bits; /* Bit fields defined above */ - uint8_t udp_type_of_service; /* IP_TOS option */ - uint8_t udp_ttl; /* TTL or hoplimit */ - ip6_pkt_t udp_sticky_ipp; /* Sticky options */ - uint8_t *udp_sticky_hdrs; /* Prebuilt IPv6 hdrs */ - uint_t udp_sticky_hdrs_len; /* Incl. 
ip6h and any ip6i */ + uint32_t + udp_issocket : 1, /* socket mode; sockfs is on top */ + udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ + udp_rcvhdr : 1, /* UDP_RCVHDR option */ + + udp_pad_to_bit_31 : 29; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ struct udp_s **udp_ptpbhn; /* Pointer to previous bind hash next. */ - /* Following protected by udp_rwlock */ kmutex_t udp_recv_lock; /* recv lock */ size_t udp_rcv_disply_hiwat; /* user's view of rcvbuf */ size_t udp_rcv_hiwat; /* receive high watermark */ - size_t udp_rcv_lowat; /* receive low watermark */ - size_t udp_xmit_hiwat; /* Send buffer high watermark */ - size_t udp_xmit_lowat; /* Send buffer low watermark */ - uint_t udp_label_len; /* length of security label */ - uint_t udp_label_len_v6; /* len of v6 security label */ - in6_addr_t udp_v6lastdst; /* most recent destination */ - in_port_t udp_lastdstport; /* most recent dest port */ - cred_t *udp_last_cred; /* most recent credentials */ - cred_t *udp_effective_cred; /* cred with effective label */ - - uint64_t udp_open_time; /* time when this was opened */ - pid_t udp_open_pid; /* process id when this was opened */ + + /* Set at open time and never changed */ udp_stack_t *udp_us; /* Stack instance for zone */ + int udp_delayed_error; mblk_t *udp_fallback_queue_head; mblk_t *udp_fallback_queue_tail; struct sockaddr_storage udp_delayed_addr; } udp_t; -/* UDP Protocol header */ /* UDP Protocol header aligned */ typedef struct udpahdr_s { in_port_t uha_src_port; /* Source port */ @@ -334,6 +192,8 @@ typedef struct udpahdr_s { #define us_xmit_lowat us_param_arr[8].udp_param_value #define us_recv_hiwat us_param_arr[9].udp_param_value #define us_max_buf us_param_arr[10].udp_param_value +#define us_pmtu_discovery us_param_arr[11].udp_param_value +#define us_sendto_ignerr us_param_arr[12].udp_param_value #define UDP_STAT(us, x) ((us)->us_statistics.x.value.ui64++) @@ -348,14 +208,11 @@ typedef struct 
udpahdr_s { extern int udp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int udp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int udp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, - uint_t *, uchar_t *, void *, cred_t *, mblk_t *); + uint_t *, uchar_t *, void *, cred_t *); extern mblk_t *udp_snmp_get(queue_t *, mblk_t *); extern int udp_snmp_set(queue_t *, t_scalar_t, t_scalar_t, uchar_t *, int); -extern void udp_close_free(conn_t *); -extern void udp_quiesce_conn(conn_t *); extern void udp_ddi_g_init(void); extern void udp_ddi_g_destroy(void); -extern void udp_g_q_inactive(udp_stack_t *); extern void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen); extern void udp_wput(queue_t *, mblk_t *); diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 338a1c96d0..79b88ca659 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -1478,7 +1478,7 @@ dld_capab_lso(dld_str_t *dsp, void *data, uint_t flags) lso->lso_flags = 0; /* translate the flag for mac clients */ if ((mac_lso.lso_flags & LSO_TX_BASIC_TCP_IPV4) != 0) - lso->lso_flags |= DLD_LSO_TX_BASIC_TCP_IPV4; + lso->lso_flags |= DLD_LSO_BASIC_TCP_IPV4; dsp->ds_lso = B_TRUE; dsp->ds_lso_max = lso->lso_max; } else { diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c index 902d838ff4..639bb28bcc 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_opt.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_opt.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,9 +29,9 @@ #define rds_max_buf 2097152 opdes_t rds_opt_arr[] = { -{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, -{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, OP_PASSNEXT, sizeof (int), 0 }, +{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_RCVBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, }; /* ARGSUSED */ @@ -79,7 +79,7 @@ rds_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) int rds_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr) { int *i1 = (int *)(uintptr_t)invalp; boolean_t checkonly; @@ -187,7 +187,6 @@ optdb_obj_t rds_opt_obj = { rds_opt_default, /* RDS default value function pointer */ rds_opt_get, /* RDS get function pointer */ rds_opt_set, /* RDS set function pointer */ - B_TRUE, /* RDS is tpi provider */ RDS_OPT_ARR_CNT, /* RDS option database count of entries */ rds_opt_arr, /* RDS option database */ RDS_VALID_LEVELS_CNT, /* RDS valid level count of entries */ diff --git a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c index a4a9c6c8e0..13a1d4bf75 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rdsddi.c +++ b/usr/src/uts/common/io/ib/clients/rds/rdsddi.c @@ -654,11 +654,9 @@ rds_wput_other(queue_t *q, mblk_t *mp) } if (((union T_primitives *)(uintptr_t)rptr)->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(q, mp, cr, &rds_opt_obj, - B_FALSE); + svr4_optcom_req(q, mp, cr, &rds_opt_obj); } else { - (void) tpi_optcom_req(q, mp, cr, &rds_opt_obj, - B_FALSE); + tpi_optcom_req(q, mp, cr, &rds_opt_obj); } return; case T_CONN_REQ: diff --git a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c 
b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c index 8e57cb783d..f9bbcd092f 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rdssubr.c +++ b/usr/src/uts/common/io/ib/clients/rds/rdssubr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ib/clients/rds/rds.h> #include <sys/ib/clients/rds/rds_kstat.h> @@ -135,9 +133,9 @@ rds_init() * kstats */ rds_kstatsp = kstat_create("rds", 0, - "rds_kstat", "misc", KSTAT_TYPE_NAMED, - sizeof (rds_kstat) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); + "rds_kstat", "misc", KSTAT_TYPE_NAMED, + sizeof (rds_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE); if (rds_kstatsp != NULL) { rds_kstatsp->ks_lock = &rds_kstat_mutex; rds_kstatsp->ks_data = (void *)&rds_kstat; @@ -298,17 +296,14 @@ rds_fanout(ipaddr_t local_addr, ipaddr_t rem_addr, boolean_t rds_islocal(ipaddr_t addr) { - ire_t *ire; ip_stack_t *ipst; ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; ASSERT(ipst != NULL); - - ire = ire_ctable_lookup(addr, NULL, IRE_LOCAL | IRE_LOOPBACK | - IRE_BROADCAST, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - netstack_rele(ipst->ips_netstack); - if (ire == NULL) + if (ip_laddr_verify_v4(addr, ALL_ZONES, ipst, B_FALSE) == IPVL_BAD) { + netstack_rele(ipst->ips_netstack); return (B_FALSE); - ire_refrele(ire); + } + netstack_rele(ipst->ips_netstack); return (B_TRUE); } diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c index 944e61a067..3bb7d3a98c 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp.c @@ -26,41 +26,28 @@ #include <sys/types.h> #include <sys/ddi.h> #include <sys/sunddi.h> -#include <sys/stropts.h> -#include <sys/stream.h> -#include <sys/strsun.h> 
#include <sys/strsubr.h> #include <sys/socket.h> -#include <sys/stat.h> #include <net/if_arp.h> #include <net/if_types.h> -#include <sys/file.h> #include <sys/sockio.h> #include <sys/pathname.h> -#include <inet/arp.h> -#include <sys/modctl.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> #include <sys/kstr.h> -#include <sys/tiuser.h> #include <sys/t_kuser.h> extern char cmlog[]; -extern int ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func); -extern void ibcm_arp_pr_arp_ack(mblk_t *mp); -extern void ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp); +extern int ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s, + ibt_ip_addr_t *dst_addr, ibt_ip_addr_t *src_addr); +extern void ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp); -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", datab)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibt_ip_addr_s)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ip_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_ibd_insts_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_prwqn_t)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", iocblk)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", queue)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", sockaddr_in6)) @@ -89,269 +76,6 @@ ibcm_ip_print(char *label, ibt_ip_addr_t *ipaddr) } } -/* - * ibcm_arp_get_ibaddr_cb - */ -static int -ibcm_arp_get_ibaddr_cb(void *arg, int status) -{ - ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_get_ibaddr_cb(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - mutex_enter(&ib_s->lock); - ib_s->status = status; - ib_s->done = B_TRUE; - - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr_cb: SGID %llX:%llX " - "DGID: %llX:%llX", wqnp->sgid.gid_prefix, wqnp->sgid.gid_guid, - wqnp->dgid.gid_prefix, 
wqnp->dgid.gid_guid); - - /* lock is held by the caller. */ - cv_signal(&ib_s->cv); - mutex_exit(&ib_s->lock); - return (0); -} - -/* - * Lower read service procedure (messages coming back from arp/ip). - * Process messages based on queue type. - */ -static int -ibcm_arp_lrsrv(queue_t *q) -{ - mblk_t *mp; - ibcm_arp_streams_t *ib_s = q->q_ptr; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrsrv(%p, ibd_s: 0x%p)", q, ib_s); - - if (WR(q) == ib_s->arpqueue) { - while (mp = getq(q)) { - ibcm_arp_pr_arp_ack(mp); - } - } - - return (0); -} - -/* - * Lower write service procedure. - * Used when lower streams are flow controlled. - */ -static int -ibcm_arp_lwsrv(queue_t *q) -{ - mblk_t *mp; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lwsrv(%p)", q); - - while (mp = getq(q)) { - if (canputnext(q)) { - putnext(q, mp); - } else { - (void) putbq(q, mp); - qenable(q); - break; - } - } - - return (0); -} - -/* - * Lower read put procedure. Arp/ip messages come here. - */ -static int -ibcm_arp_lrput(queue_t *q, mblk_t *mp) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_lrput(0x%p, db_type: %d)", - q, DB_TYPE(mp)); - - switch (DB_TYPE(mp)) { - case M_FLUSH: - /* - * Turn around - */ - if (*mp->b_rptr & FLUSHW) { - *mp->b_rptr &= ~FLUSHR; - qreply(q, mp); - return (0); - } - freemsg(mp); - break; - case M_IOCACK: - case M_IOCNAK: - case M_DATA: - /* - * This could be in interrupt context. 
- * Some of the ibt calls cannot be called in - * interrupt context, so - * put it in the queue and the message will be - * processed by service proccedure - */ - (void) putq(q, mp); - qenable(q); - break; - default: - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_lrput: " - "got unknown msg <0x%x>\n", mp->b_datap->db_type); - ASSERT(0); - break; - } - - return (0); -} - -/* - * Streams write queue module info - */ -static struct module_info ibcm_arp_winfo = { - 0, /* module ID number */ - "ibcm", /* module name */ - 0, /* min packet size */ - INFPSZ, - 49152, /* STREAM queue high water mark -- 49152 */ - 12 /* STREAM queue low water mark -- 12 */ -}; - -/* - * Streams lower write queue, for ibcm/ip requests. - */ -static struct qinit ibcm_arp_lwinit = { - NULL, /* qi_putp */ - ibcm_arp_lwsrv, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &ibcm_arp_winfo, /* module info */ - NULL, /* module statistics struct */ - NULL, - NULL, - STRUIOT_NONE /* stream uio type is standard uiomove() */ -}; - -/* - * Streams lower read queue: read reply messages from ibcm/ip. 
- */ -static struct qinit ibcm_arp_lrinit = { - ibcm_arp_lrput, /* qi_putp */ - ibcm_arp_lrsrv, /* qi_srvp */ - NULL, /* qi_qopen */ - NULL, /* qi_qclose */ - NULL, /* qi_qadmin */ - &ibcm_arp_winfo, /* module info */ - NULL, /* module statistics struct */ - NULL, - NULL, - STRUIOT_NONE /* stream uio type is standard uiomove() */ -}; - - -static int -ibcm_arp_link_driver(ibcm_arp_streams_t *ib_s, char *path, queue_t **q, - vnode_t **dev_vp) -{ - struct stdata *dev_stp; - vnode_t *vp; - int error; - queue_t *rq; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_driver: Enter: %s", path); - - /* open the driver from inside the kernel */ - error = vn_open(path, UIO_SYSSPACE, FREAD|FWRITE, 0, &vp, - 0, NULL); - if (error) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_driver: " - "vn_open('%s') failed\n", path); - return (error); - } - *dev_vp = vp; - - dev_stp = vp->v_stream; - *q = dev_stp->sd_wrq; - - VN_HOLD(vp); - - rq = RD(dev_stp->sd_wrq); - RD(rq)->q_ptr = WR(rq)->q_ptr = ib_s; - setq(rq, &ibcm_arp_lrinit, &ibcm_arp_lwinit, NULL, QMTSAFE, - SQ_CI|SQ_CO, B_FALSE); - - return (0); -} - -extern struct qinit strdata; -extern struct qinit stwdata; - -/* - * Unlink ip, ibcm, icmp6 drivers - */ -/* ARGSUSED */ -static int -ibcm_arp_unlink_driver(queue_t **q, vnode_t **dev_vp) -{ - vnode_t *vp = *dev_vp; - struct stdata *dev_stp = vp->v_stream; - queue_t *wrq, *rq; - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_unlink_driver: Enter: 0x%p", q); - - wrq = dev_stp->sd_wrq; - rq = RD(wrq); - - disable_svc(rq); - wait_svc(rq); - flushq(rq, FLUSHALL); - flushq(WR(rq), FLUSHALL); - - rq->q_ptr = wrq->q_ptr = dev_stp; - - setq(rq, &strdata, &stwdata, NULL, QMTSAFE, SQ_CI|SQ_CO, B_TRUE); - - if ((rc = VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_unlink_driver: VOP_CLOSE " - "failed %d\n", rc); - } - VN_RELE(vp); - - return (0); -} - -static int -ibcm_arp_unlink_drivers(ibcm_arp_streams_t *ib_s) -{ - IBTF_DPRINTF_L4(cmlog, 
"ibcm_arp_unlink_drivers(%p)", ib_s); - - if (ib_s->arpqueue) { - (void) ibcm_arp_unlink_driver(&ib_s->arpqueue, &ib_s->arp_vp); - } - - return (0); -} - -/* - * Link ip, ibtl drivers below ibtl - */ -static int -ibcm_arp_link_drivers(ibcm_arp_streams_t *ib_s) -{ - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_link_drivers(%p)", ib_s); - - if ((rc = ibcm_arp_link_driver(ib_s, "/dev/arp", &ib_s->arpqueue, - &ib_s->arp_vp)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_link_drivers: " - "ibcm_arp_link_driver failed: %d\n", rc); - return (rc); - } - - return (0); -} ibt_status_t ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, @@ -370,21 +94,13 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, mutex_init(&ib_s->lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&ib_s->cv, NULL, CV_DRIVER, NULL); - ret = ibcm_arp_link_drivers(ib_s); - if (ret != 0) { - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: " - "ibcm_arp_link_drivers failed %d", ret); - goto arp_ibaddr_error; - } - mutex_enter(&ib_s->lock); ib_s->done = B_FALSE; mutex_exit(&ib_s->lock); - ret = ibcm_arp_pr_lookup(ib_s, &destaddr, &srcaddr, - ibcm_arp_get_ibaddr_cb); + ret = ibcm_resolver_pr_lookup(ib_s, &destaddr, &srcaddr); - IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_arp_pr_lookup " + IBTF_DPRINTF_L3(cmlog, "ibcm_arp_get_ibaddr: ibcm_resolver_pr_lookup " "returned: %d", ret); if (ret == 0) { mutex_enter(&ib_s->lock); @@ -393,7 +109,6 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, mutex_exit(&ib_s->lock); } - (void) ibcm_arp_unlink_drivers(ib_s); mutex_enter(&ib_s->lock); wqnp = ib_s->wqnp; if (ib_s->status == 0) { @@ -407,11 +122,11 @@ ibcm_arp_get_ibaddr(ibt_ip_addr_t srcaddr, ibt_ip_addr_t destaddr, ib_s->wqnp->sgid.gid_prefix, ib_s->wqnp->sgid.gid_guid, ib_s->wqnp->dgid.gid_prefix, ib_s->wqnp->dgid.gid_guid); - ibcm_arp_prwqn_delete(wqnp); + ibcm_arp_delete_prwqn(wqnp); } else if (ret == 0) { /* * We come here only when lookup has returned empty (failed) 
- * via callback routine - ibcm_arp_get_ibaddr_cb + * via callback routine. * i.e. ib_s->status is non-zero, while ret is zero. */ if (wqnp) @@ -884,20 +599,3 @@ srcip_plist_end: return (ret); } -/* Routines for warlock */ - -/* ARGSUSED */ -static int -ibcm_arp_dummy_ibaddr_hdl(void *arg, int status) -{ - ibcm_arp_prwqn_t dummy_wqn1; - ibcm_arp_prwqn_t dummy_wqn2; - - dummy_wqn1.func = ibcm_arp_get_ibaddr_cb; - dummy_wqn2.func = ibcm_arp_dummy_ibaddr_hdl; - - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_dummy_ibaddr_hdl: " - "dummy_wqn1.func %p %p", dummy_wqn1.func, dummy_wqn2.func); - - return (0); -} diff --git a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c index 79d420d467..45fbfd7932 100644 --- a/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c +++ b/usr/src/uts/common/io/ib/mgt/ibcm/ibcm_arp_link.c @@ -24,309 +24,32 @@ */ #include <sys/types.h> -#include <sys/stream.h> -#include <sys/dlpi.h> -#include <sys/stropts.h> -#include <sys/strsun.h> -#include <sys/sysmacros.h> -#include <sys/strlog.h> -#include <sys/ddi.h> -#include <sys/cmn_err.h> -#include <sys/socket.h> #include <net/if.h> #include <net/if_types.h> -#include <netinet/in.h> -#include <sys/ethernet.h> -#include <inet/arp.h> #include <inet/ip.h> #include <inet/ip_ire.h> #include <inet/ip_if.h> #include <sys/ib/mgt/ibcm/ibcm_arp.h> -#include <inet/ip_ftable.h> - -static areq_t ibcm_arp_areq_template = { - AR_ENTRY_QUERY, /* cmd */ - sizeof (areq_t) + (2 * IP_ADDR_LEN), /* name offset */ - sizeof (areq_t), /* name len */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (areq_t), /* target addr offset */ - IP_ADDR_LEN, /* target ADDR_length */ - 0, /* flags */ - sizeof (areq_t) + IP_ADDR_LEN, /* sender addr offset */ - IP_ADDR_LEN, /* sender addr length */ - IBCM_ARP_XMIT_COUNT, /* xmit_count */ - IBCM_ARP_XMIT_INTERVAL, /* (re)xmit_interval in milliseconds */ - 4 /* max # of requests to buffer */ - /* - * anything else filled in by the 
code - */ -}; - -static area_t ibcm_arp_area_template = { - AR_ENTRY_ADD, /* cmd */ - sizeof (area_t) + IPOIB_ADDRL + (2 * IP_ADDR_LEN), /* name offset */ - sizeof (area_t), /* name len */ - IP_ARP_PROTO_TYPE, /* protocol, from arps perspective */ - sizeof (area_t), /* proto addr offset */ - IP_ADDR_LEN, /* proto ADDR_length */ - sizeof (area_t) + (IP_ADDR_LEN), /* proto mask offset */ - 0, /* flags */ - sizeof (area_t) + (2 * IP_ADDR_LEN), /* hw addr offset */ - IPOIB_ADDRL /* hw addr length */ -}; extern char cmlog[]; -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", msgb)) -_NOTE(SCHEME_PROTECTS_DATA("Unshared data", area_t)) _NOTE(SCHEME_PROTECTS_DATA("Unshared data", ibcm_arp_streams_t)) -static void ibcm_arp_timeout(void *arg); -static void ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status); -static void ibcm_ipv6_resolver_ack(ip2mac_t *, void *); -static int ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid); - -/* - * issue a AR_ENTRY_QUERY to arp driver and schedule a timeout. 
- */ -static int -ibcm_arp_query_arp(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - int name_offset; - char *cp; - mblk_t *mp; - mblk_t *mp1; - areq_t *areqp; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_query_arp(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - name_offset = ibcm_arp_areq_template.areq_name_offset; - - /* - * allocate mblk for AR_ENTRY_QUERY - */ - name_len = strlen(wqnp->ifname) + 1; - len = name_len + name_offset; - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len; - - /* - * allocate a mblk and set wqnp in the data - */ - if ((mp1 = allocb(sizeof (void *), BPRI_HI)) == NULL) { - freeb(mp); - return (ENOMEM); - } - - mp1->b_wptr += sizeof (void *); - *(uintptr_t *)(void *)mp1->b_rptr = (uintptr_t)wqnp; /* store wqnp */ - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_areq_template, cp, sizeof (areq_t)); - areqp = (void *)cp; - areqp->areq_name_length = name_len; - - cp = (char *)areqp + areqp->areq_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - areqp->areq_proto = wqnp->ifproto; - bcopy(&wqnp->ifproto, areqp->areq_sap, 2); - cp = (char *)areqp + areqp->areq_target_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - cp = (char *)areqp + areqp->areq_sender_addr_offset; - bcopy(&wqnp->src_addr.un.ip4addr, cp, IP_ADDR_LEN); - - mp->b_cont = mp1; - - DB_TYPE(mp) = M_PROTO; - - /* - * issue the request to arp - */ - wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; - wqnp->timeout_id = timeout(ibcm_arp_timeout, wqnp, - drv_usectohz(IBCM_ARP_TIMEOUT * 1000)); - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp); - } else { - (void) putq(ib_s->arpqueue, mp); - qenable(ib_s->arpqueue); - } - - return (0); -} - -/* - * issue AR_ENTRY_SQUERY to arp driver - */ -static int -ibcm_arp_squery_arp(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - char *cp; - mblk_t *mp; - mblk_t *mp1; - area_t *areap; - uint32_t 
proto_mask = 0xffffffff; - struct iocblk *ioc; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_squery_arp(ib_s: %p wqnp: %p)", - ib_s, wqnp); - - /* - * allocate mblk for AR_ENTRY_SQUERY - */ - name_len = strlen(wqnp->ifname) + 1; - len = ibcm_arp_area_template.area_name_offset + name_len + - sizeof (uintptr_t); - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len + sizeof (uintptr_t); - - *(uintptr_t *)(void *)mp->b_rptr = (uintptr_t)wqnp; /* store wqnp */ - mp->b_rptr += sizeof (uintptr_t); - - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_area_template, cp, sizeof (area_t)); - - areap = (void *)cp; - areap->area_cmd = AR_ENTRY_SQUERY; - areap->area_name_length = name_len; - cp = (char *)areap + areap->area_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - cp = (char *)areap + areap->area_proto_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_proto_mask_offset; - bcopy(&proto_mask, cp, IP_ADDR_LEN); - - mp1 = allocb(sizeof (struct iocblk), BPRI_HI); - if (mp1 == NULL) { - freeb(mp); - return (ENOMEM); - } - ioc = (void *)mp1->b_rptr; - ioc->ioc_cmd = AR_ENTRY_SQUERY; - ioc->ioc_error = 0; - ioc->ioc_cr = NULL; - ioc->ioc_count = msgdsize(mp); - mp1->b_wptr += sizeof (struct iocblk); - mp1->b_cont = mp; - - DB_TYPE(mp1) = M_IOCTL; - - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp1); - } else { - (void) putq(ib_s->arpqueue, mp1); - qenable(ib_s->arpqueue); - } - return (0); -} - -/* - * issue a AR_ENTRY_ADD to arp driver - * This is required as arp driver does not maintain a cache. 
- */ -static int -ibcm_arp_add(ibcm_arp_prwqn_t *wqnp) -{ - int len; - int name_len; - char *cp; - mblk_t *mp; - area_t *areap; - uint32_t proto_mask = 0xffffffff; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_add(ib_s: %p wqnp: %p)", ib_s, wqnp); - - /* - * allocate mblk for AR_ENTRY_ADD - */ - - name_len = strlen(wqnp->ifname) + 1; - len = ibcm_arp_area_template.area_name_offset + name_len; - if ((mp = allocb(len, BPRI_HI)) == NULL) { - return (ENOMEM); - } - bzero(mp->b_rptr, len); - mp->b_wptr += len; - - cp = (char *)mp->b_rptr; - bcopy(&ibcm_arp_area_template, cp, sizeof (area_t)); - - areap = (void *)mp->b_rptr; - areap->area_name_length = name_len; - cp = (char *)areap + areap->area_name_offset; - bcopy(wqnp->ifname, cp, name_len); - - cp = (char *)areap + areap->area_proto_addr_offset; - bcopy(&wqnp->dst_addr.un.ip4addr, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_proto_mask_offset; - bcopy(&proto_mask, cp, IP_ADDR_LEN); - - cp = (char *)areap + areap->area_hw_addr_offset; - bcopy(&wqnp->dst_mac, cp, IPOIB_ADDRL); - - DB_TYPE(mp) = M_PROTO; - - if (canputnext(ib_s->arpqueue)) { - putnext(ib_s->arpqueue, mp); - } else { - (void) putq(ib_s->arpqueue, mp); - qenable(ib_s->arpqueue); - } - return (0); -} - - -/* - * timeout routine when there is no response to AR_ENTRY_QUERY - */ -static void -ibcm_arp_timeout(void *arg) -{ - ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; - ibcm_arp_streams_t *ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_timeout(ib_s: %p wqnp: %p)", - ib_s, wqnp); - wqnp->flags &= ~IBCM_ARP_PR_RESOLVE_PENDING; - cv_broadcast(&ib_s->cv); - - /* - * indicate to user - */ - ibcm_arp_pr_callback(wqnp, EHOSTUNREACH); -} +static void ibcm_resolver_ack(ip2mac_t *, void *); +static int ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zid); /* * delete a wait queue node from the list. 
* assumes mutex is acquired */ void -ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp) +ibcm_arp_delete_prwqn(ibcm_arp_prwqn_t *wqnp) { ibcm_arp_streams_t *ib_s; - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_prwqn_delete(%p)", wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_arp_delete_prwqn(%p)", wqnp); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; + ib_s = wqnp->ib_str; ib_s->wqnp = NULL; kmem_free(wqnp, sizeof (ibcm_arp_prwqn_t)); } @@ -336,7 +59,7 @@ ibcm_arp_prwqn_delete(ibcm_arp_prwqn_t *wqnp) */ static ibcm_arp_prwqn_t * ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func) + ibt_ip_addr_t *src_addr) { ibcm_arp_prwqn_t *wqnp; @@ -354,8 +77,7 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, if (src_addr) { wqnp->usrc_addr = *src_addr; } - wqnp->func = func; - wqnp->arg = ib_s; + wqnp->ib_str = ib_s; wqnp->ifproto = (dst_addr->family == AF_INET) ? ETHERTYPE_IP : ETHERTYPE_IPV6; @@ -366,17 +88,6 @@ ibcm_arp_create_prwqn(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, return (wqnp); } -/* - * call the user function - * called with lock held - */ -static void -ibcm_arp_pr_callback(ibcm_arp_prwqn_t *wqnp, int status) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_callback(%p, %d)", wqnp, status); - - wqnp->func((void *)wqnp, status); -} /* * Check if the interface is loopback or IB. 
@@ -391,23 +102,24 @@ ibcm_arp_check_interface(ill_t *ill) } int -ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, - ibt_ip_addr_t *src_addr, ibcm_arp_pr_comp_func_t func) +ibcm_resolver_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, + ibt_ip_addr_t *src_addr) { ibcm_arp_prwqn_t *wqnp; ire_t *ire = NULL; - ire_t *src_ire = NULL; - ipif_t *ipif; - ill_t *ill, *hwaddr_ill = NULL; + ipif_t *ipif = NULL; + ill_t *ill = NULL; + ill_t *hwaddr_ill = NULL; ip_stack_t *ipst; int len; + ipaddr_t setsrcv4; + in6_addr_t setsrcv6; IBCM_PRINT_IP("ibcm_arp_pr_lookup: SRC", src_addr); IBCM_PRINT_IP("ibcm_arp_pr_lookup: DST", dst_addr); - if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr, - src_addr, func)) == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " + if ((wqnp = ibcm_arp_create_prwqn(ib_s, dst_addr, src_addr)) == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " "ibcm_arp_create_prwqn failed"); ib_s->status = ENOMEM; return (1); @@ -416,86 +128,111 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, ipst = netstack_find_by_zoneid(GLOBAL_ZONEID)->netstack_ip; if (dst_addr->family == AF_INET) { /* - * Get the ire for the local address + * A local address is always specified, and it is used + * to find the zoneid. */ - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup"); - src_ire = ire_ctable_lookup(src_addr->un.ip4addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup failed"); + ipif = ipif_lookup_addr(src_addr->un.ip4addr, NULL, ALL_ZONES, + ipst); + if (ipif == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ipif_lookup_addr failed"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ctable_lookup"); /* - * get an ire for the destination address with the matching - * source address + * get an ire for the destination adress. 
+ * Note that we can't use MATCH_IRE_ILL since that would + * require that the first ill we find have ire_ill set. Thus + * we compare ire_ill against ipif_ill after the lookup. */ - ire = ire_ftable_lookup(dst_addr->un.ip4addr, 0, 0, 0, - src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, - MATCH_IRE_SRC, ipst); - if (ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup failed"); + setsrcv4 = INADDR_ANY; + ire = ire_route_recursive_v4(dst_addr->un.ip4addr, 0, NULL, + ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, + &setsrcv4, NULL, NULL); + + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_route_recursive_v4 failed"); + ib_s->status = EFAULT; + goto fail; + } + ill = ire_nexthop_ill(ire); + if (ill == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_nexthop_ill failed"); + ib_s->status = EFAULT; + goto fail; + } + if (ill != ipif->ipif_ill) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "wrong ill"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: ire_ftable_lookup:" - "done"); - - wqnp->gateway.un.ip4addr = - ((ire->ire_gateway_addr == INADDR_ANY) ? - ire->ire_addr : ire->ire_gateway_addr); + wqnp->gateway.un.ip4addr = ire->ire_gateway_addr; wqnp->netmask.un.ip4addr = ire->ire_mask; - wqnp->src_addr.un.ip4addr = ire->ire_src_addr; + wqnp->src_addr.un.ip4addr = src_addr->un.ip4addr; wqnp->src_addr.family = wqnp->gateway.family = wqnp->netmask.family = AF_INET; } else if (dst_addr->family == AF_INET6) { /* - * Get the ire for the local address + * A local address is always specified, and it is used + * to find the zoneid. + * We should really match on scopeid for link locals here. 
*/ - src_ire = ire_ctable_lookup_v6(&src_addr->un.ip6addr, NULL, - IRE_LOCAL, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst); - if (src_ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup_v6 failed"); + ipif = ipif_lookup_addr_v6(&src_addr->un.ip6addr, NULL, + ALL_ZONES, ipst); + if (ipif == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ipif_lookup_addr_v6 failed"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: " - "ire_ctable_lookup_v6: done"); /* - * get an ire for the destination address with the matching - * source address + * get an ire for the destination adress. + * Note that we can't use MATCH_IRE_ILL since that would + * require that the first ill we find have ire_ill set. Thus + * we compare ire_ill against ipif_ill after the lookup. */ - ire = ire_ftable_lookup_v6(&dst_addr->un.ip6addr, 0, 0, 0, - src_ire->ire_ipif, 0, src_ire->ire_zoneid, 0, NULL, - MATCH_IRE_SRC, ipst); - if (ire == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup_v6 failed"); + setsrcv6 = ipv6_all_zeros; + ire = ire_route_recursive_v6(&dst_addr->un.ip6addr, 0, NULL, + ipif->ipif_zoneid, NULL, MATCH_IRE_DSTONLY, B_TRUE, 0, ipst, + &setsrcv6, NULL, NULL); + + ASSERT(ire != NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_route_recursive_v6 failed"); + ib_s->status = EFAULT; + goto fail; + } + ill = ire_nexthop_ill(ire); + if (ill == NULL) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ire_nexthop_ill failed"); + ib_s->status = EFAULT; + goto fail; + } + + if (ill != ipif->ipif_ill) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "wrong ill"); ib_s->status = EFAULT; goto fail; } - IBTF_DPRINTF_L5(cmlog, "ibcm_arp_pr_lookup: " - "ire_ftable_lookup_v6: done"); - wqnp->gateway.un.ip6addr = - (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) ? 
- ire->ire_addr_v6 : ire->ire_gateway_addr_v6); + wqnp->gateway.un.ip6addr = ire->ire_gateway_addr_v6; wqnp->netmask.un.ip6addr = ire->ire_mask_v6; - wqnp->src_addr.un.ip6addr = ire->ire_src_addr_v6; + wqnp->src_addr.un.ip6addr = src_addr->un.ip6addr; wqnp->src_addr.family = wqnp->gateway.family = wqnp->netmask.family = AF_INET6; } - ipif = src_ire->ire_ipif; - ill = ipif->ipif_ill; (void) strlcpy(wqnp->ifname, ill->ill_name, sizeof (wqnp->ifname)); /* @@ -504,18 +241,19 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, */ if (IS_IPMP(ill)) { if ((hwaddr_ill = ipmp_ipif_hold_bound_ill(ipif)) == NULL) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: no bound " - "ill for IPMP interface %s", ill->ill_name); + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "no bound ill for IPMP interface %s", + ill->ill_name); ib_s->status = EFAULT; goto fail; } } else { hwaddr_ill = ill; - ill_refhold(hwaddr_ill); /* for symmetry */ + ill_refhold(hwaddr_ill); /* for symmetry */ } if ((ib_s->status = ibcm_arp_check_interface(hwaddr_ill)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " "ibcm_arp_check_interface failed"); goto fail; } @@ -523,7 +261,7 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, bcopy(hwaddr_ill->ill_phys_addr, &wqnp->src_mac, hwaddr_ill->ill_phys_addr_length); - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: outgoing if:%s", + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: outgoing if:%s", wqnp->ifname); /* @@ -534,8 +272,8 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, len = (wqnp->usrc_addr.family == AF_INET) ? 
IP_ADDR_LEN : sizeof (in6_addr_t); if (bcmp(&wqnp->usrc_addr.un, &wqnp->src_addr.un, len)) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: srcaddr " - "mismatch:%d", ENETUNREACH); + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "srcaddr mismatch:%d", ENETUNREACH); goto fail; } } @@ -545,253 +283,77 @@ ibcm_arp_pr_lookup(ibcm_arp_streams_t *ib_s, ibt_ip_addr_t *dst_addr, * interface, now get the destination mac address from * arp or ipv6 drivers */ - if (wqnp->dst_addr.family == AF_INET) { - if ((ib_s->status = ibcm_arp_squery_arp(wqnp)) != 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ibcm_arp_squery_arp failed: %d", ib_s->status); - goto fail; - } - } else { - if ((ib_s->status = ibcm_ipv6_lookup(wqnp, ill, getzoneid())) != - 0) { - IBTF_DPRINTF_L2(cmlog, "ibcm_arp_pr_lookup: " - "ibcm_ipv6_lookup failed: %d", ib_s->status); - goto fail; - } + ib_s->status = ibcm_nce_lookup(wqnp, ill, getzoneid()); + if (ib_s->status != 0) { + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_pr_lookup: " + "ibcm_nce_lookup failed: %d", ib_s->status); + goto fail; } ill_refrele(hwaddr_ill); - IRE_REFRELE(ire); - IRE_REFRELE(src_ire); + ill_refrele(ill); + ire_refrele(ire); + ipif_refrele(ipif); netstack_rele(ipst->ips_netstack); - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_lookup: Return: 0x%p", wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_pr_lookup: Return: 0x%p", wqnp); return (0); fail: if (hwaddr_ill != NULL) ill_refrele(hwaddr_ill); + if (ill != NULL) + ill_refrele(ill); if (ire != NULL) - IRE_REFRELE(ire); - if (src_ire != NULL) - IRE_REFRELE(src_ire); - ibcm_arp_prwqn_delete(wqnp); + ire_refrele(ire); + if (ipif != NULL) + ipif_refrele(ipif); + ibcm_arp_delete_prwqn(wqnp); netstack_rele(ipst->ips_netstack); return (1); } /* - * called from lrsrv. 
- * process a AR_ENTRY_QUERY reply from arp - * the message should be M_DATA -->> dl_unitdata_req - */ -static void -ibcm_arp_pr_arp_query_ack(mblk_t *mp) -{ - ibcm_arp_prwqn_t *wqnp; - dl_unitdata_req_t *dlreq; - ibcm_arp_streams_t *ib_s; - char *cp; - int rc; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_query_ack(%p)", mp); - - /* - * the first mblk contains the wqnp pointer for the request - */ - if (MBLKL(mp) != sizeof (void *)) { - freemsg(mp); - return; - } - - wqnp = *(ibcm_arp_prwqn_t **)(void *)mp->b_rptr; /* retrieve wqnp */ - ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - mutex_enter(&ib_s->lock); - - /* - * cancel the timeout for this request - */ - (void) untimeout(wqnp->timeout_id); - - /* - * sanity checks on the dl_unitdata_req block - */ - if (!mp->b_cont) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: b_cont = NULL\n"); - rc = EPROTO; - goto user_callback; - } - if (MBLKL(mp->b_cont) < (sizeof (dl_unitdata_req_t) + IPOIB_ADDRL)) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid len in " - "dl_unitdatareq_t block\n"); - rc = EPROTO; - goto user_callback; - } - dlreq = (void *)mp->b_cont->b_rptr; - if (dlreq->dl_primitive != DL_UNITDATA_REQ) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid dl_primitive " - "in dl_unitdatareq_t block\n"); - rc = EPROTO; - goto user_callback; - } - if (dlreq->dl_dest_addr_length != (IPOIB_ADDRL + 2)) { - IBTF_DPRINTF_L2(cmlog, "areq_ack: invalid hw len in " - "dl_unitdatareq_t block %d\n", dlreq->dl_dest_addr_length); - rc = EPROTO; - goto user_callback; - } - cp = (char *)mp->b_cont->b_rptr + dlreq->dl_dest_addr_offset; - bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL); - - /* - * at this point we have src/dst gid's derived from the mac addresses - * now get the hca, port - */ - bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t)); - bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t)); - freemsg(mp); - - IBCM_H2N_GID(wqnp->sgid); - IBCM_H2N_GID(wqnp->dgid); - - (void) ibcm_arp_add(wqnp); - - 
mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, 0); - - return; -user_callback: - freemsg(mp); - mutex_exit(&ib_s->lock); - - /* - * indicate to user - */ - ibcm_arp_pr_callback(wqnp, rc); -} - -/* - * process a AR_ENTRY_SQUERY reply from arp - * the message should be M_IOCACK -->> area_t + * Query the neighbor cache for IPv4/IPv6 to mac address mapping. */ -static void -ibcm_arp_pr_arp_squery_ack(mblk_t *mp) +static int +ibcm_nce_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) { - struct iocblk *ioc; - mblk_t *mp1; - ibcm_arp_prwqn_t *wqnp; - ibcm_arp_streams_t *ib_s; - area_t *areap; - char *cp; - - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_squery_ack(%p)", mp); - - if (MBLKL(mp) < sizeof (struct iocblk)) { - freemsg(mp); - return; - } - - ioc = (void *)mp->b_rptr; - if ((ioc->ioc_cmd != AR_ENTRY_SQUERY) || (mp->b_cont == NULL)) { - freemsg(mp); - return; - } - - mp1 = mp->b_cont; - - wqnp = *(ibcm_arp_prwqn_t **)((uintptr_t)mp1->b_rptr - - sizeof (uintptr_t)); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; - - mutex_enter(&ib_s->lock); - - /* - * cancel the timeout for this request - */ - (void) untimeout(wqnp->timeout_id); - - /* If the entry was not in arp cache, ioc_error is set */ - if (ioc->ioc_error) { - - /* - * send out AR_ENTRY_QUERY which would send - * arp-request on wire - */ - IBTF_DPRINTF_L3(cmlog, "Sending a Query_ARP"); - - (void) ibcm_arp_query_arp(wqnp); - freemsg(mp); - mutex_exit(&ib_s->lock); - return; + ip2mac_t ip2m; + sin_t *sin; + sin6_t *sin6; + ip2mac_id_t ip2mid; + int err; + + if (wqnp->src_addr.family != wqnp->dst_addr.family) { + IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Mis-match SRC_ADDR " + "Family: %d, DST_ADDR Family %d", wqnp->src_addr.family, + wqnp->dst_addr.family); + return (1); } + bzero(&ip2m, sizeof (ip2m)); - areap = (void *)mp1->b_rptr; - cp = (char *)areap + areap->area_hw_addr_offset; - bcopy(cp, &wqnp->dst_mac, IPOIB_ADDRL); - - /* - * at this point we have src/dst gid's derived from the mac addresses - 
* now get the hca, port - */ - bcopy(&wqnp->src_mac.ipoib_gidpref, &wqnp->sgid, sizeof (ib_gid_t)); - bcopy(&wqnp->dst_mac.ipoib_gidpref, &wqnp->dgid, sizeof (ib_gid_t)); - freemsg(mp); - - IBCM_H2N_GID(wqnp->sgid); - IBCM_H2N_GID(wqnp->dgid); - - mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, 0); -} - -/* - * Process arp ack's. - */ -void -ibcm_arp_pr_arp_ack(mblk_t *mp) -{ - IBTF_DPRINTF_L4(cmlog, "ibcm_arp_pr_arp_ack(0x%p, DB_TYPE %lX)", - mp, DB_TYPE(mp)); - - if (DB_TYPE(mp) == M_DATA) { - ibcm_arp_pr_arp_query_ack(mp); - } else if ((DB_TYPE(mp) == M_IOCACK) || - (DB_TYPE(mp) == M_IOCNAK)) { - ibcm_arp_pr_arp_squery_ack(mp); + if (wqnp->dst_addr.family == AF_INET) { + sin = (sin_t *)&ip2m.ip2mac_pa; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = wqnp->dst_addr.un.ip4addr; + } else if (wqnp->dst_addr.family == AF_INET6) { + sin6 = (sin6_t *)&ip2m.ip2mac_pa; + sin6->sin6_family = AF_INET6; + sin6->sin6_addr = wqnp->dst_addr.un.ip6addr; } else { - freemsg(mp); - } -} - -/* - * query the ipv6 driver cache for ipv6 to mac address mapping. - */ -static int -ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) -{ - ip2mac_t ip2m; - sin6_t *sin6; - ip2mac_id_t ip2mid; - int err; - - if (wqnp->src_addr.family != AF_INET6) { - IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_lookup: SRC_ADDR NOT INET6: " - "%d", wqnp->src_addr.family); + IBTF_DPRINTF_L2(cmlog, "ibcm_nce_lookup: Invalid DST_ADDR " + "Family: %d", wqnp->dst_addr.family); return (1); } - bzero(&ip2m, sizeof (ip2m)); - sin6 = (sin6_t *)&ip2m.ip2mac_pa; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = wqnp->dst_addr.un.ip6addr; ip2m.ip2mac_ifindex = ill->ill_phyint->phyint_ifindex; wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; + /* - * XXX XTBD set the scopeid? 
* issue the request to IP for Neighbor Discovery */ - ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_ipv6_resolver_ack, wqnp, + ip2mid = ip2mac(IP2MAC_RESOLVE, &ip2m, ibcm_resolver_ack, wqnp, zoneid); err = ip2m.ip2mac_err; if (err == EINPROGRESS) { @@ -799,7 +361,7 @@ ibcm_ipv6_lookup(ibcm_arp_prwqn_t *wqnp, ill_t *ill, zoneid_t zoneid) wqnp->flags |= IBCM_ARP_PR_RESOLVE_PENDING; err = 0; } else if (err == 0) { - ibcm_ipv6_resolver_ack(&ip2m, wqnp); + ibcm_resolver_ack(&ip2m, wqnp); } return (err); } @@ -822,16 +384,16 @@ ibcm_check_sockdl(struct sockaddr_dl *sdl) * If Address resolution was succesful: return GID info. */ static void -ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) +ibcm_resolver_ack(ip2mac_t *ip2macp, void *arg) { ibcm_arp_prwqn_t *wqnp = (ibcm_arp_prwqn_t *)arg; ibcm_arp_streams_t *ib_s; uchar_t *cp; int err = 0; - IBTF_DPRINTF_L4(cmlog, "ibcm_ipv6_resolver_ack(%p, %p)", ip2macp, wqnp); + IBTF_DPRINTF_L4(cmlog, "ibcm_resolver_ack(%p, %p)", ip2macp, wqnp); - ib_s = (ibcm_arp_streams_t *)wqnp->arg; + ib_s = wqnp->ib_str; mutex_enter(&ib_s->lock); if (ip2macp->ip2mac_err != 0) { @@ -842,7 +404,7 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) } if (!ibcm_check_sockdl(&ip2macp->ip2mac_ha)) { - IBTF_DPRINTF_L2(cmlog, "ibcm_ipv6_resolver_ack: Error: " + IBTF_DPRINTF_L2(cmlog, "ibcm_resolver_ack: Error: " "interface %s is not IB\n", wqnp->ifname); err = EHOSTUNREACH; goto user_callback; @@ -862,6 +424,11 @@ ibcm_ipv6_resolver_ack(ip2mac_t *ip2macp, void *arg) IBCM_H2N_GID(wqnp->dgid); user_callback: + + ib_s->status = err; + ib_s->done = B_TRUE; + + /* lock is held by the caller. 
*/ + cv_signal(&ib_s->cv); mutex_exit(&ib_s->lock); - ibcm_arp_pr_callback(wqnp, err); } diff --git a/usr/src/uts/common/io/mac/mac_util.c b/usr/src/uts/common/io/mac/mac_util.c index 0d342fdd93..88468b353e 100644 --- a/usr/src/uts/common/io/mac/mac_util.c +++ b/usr/src/uts/common/io/mac/mac_util.c @@ -476,7 +476,7 @@ mac_ip_hdr_length_v6(mblk_t *mp, ip6_t *ip6h, uint16_t *hdr_length, endptr = mp->b_wptr; if (((uchar_t *)ip6h + IPV6_HDR_LEN) > endptr) return (B_FALSE); - ASSERT((IPH_HDR_VERSION(ip6h) & ~IP_FORWARD_PROG_BIT) == IPV6_VERSION); + ASSERT(IPH_HDR_VERSION(ip6h) == IPV6_VERSION); length = IPV6_HDR_LEN; whereptr = ((uint8_t *)&ip6h[1]); /* point to next hdr */ diff --git a/usr/src/uts/common/io/softmac/softmac_dev.c b/usr/src/uts/common/io/softmac/softmac_dev.c index 23f43ced0b..eeb09fcb0b 100644 --- a/usr/src/uts/common/io/softmac/softmac_dev.c +++ b/usr/src/uts/common/io/softmac/softmac_dev.c @@ -146,6 +146,9 @@ static struct modlinkage softmac_modlinkage = { NULL }; +static void softmac_dedicated_rx(void *, mac_resource_handle_t, mblk_t *, + mac_header_info_t *); + /*ARGSUSED*/ static int softmac_upper_constructor(void *buf, void *arg, int kmflag) @@ -367,7 +370,8 @@ softmac_mod_rput(queue_t *rq, mblk_t *mp) if (dlp->dl_primitive == DL_UNITDATA_IND) { if ((rxinfo = slp->sl_rxinfo) != NULL) { - rxinfo->slr_rx(rxinfo->slr_arg, NULL, mp, NULL); + softmac_dedicated_rx(slp->sl_sup, NULL, mp, + NULL); break; } diff --git a/usr/src/uts/common/io/softmac/softmac_fp.c b/usr/src/uts/common/io/softmac/softmac_fp.c index 7a10aa68b7..2fc66e9bd3 100644 --- a/usr/src/uts/common/io/softmac/softmac_fp.c +++ b/usr/src/uts/common/io/softmac/softmac_fp.c @@ -674,9 +674,12 @@ softmac_wput_single_nondata(softmac_upper_t *sup, mblk_t *mp) t_uscalar_t prim; dbtype = DB_TYPE(mp); + sup->su_is_arp = 0; switch (dbtype) { - case M_IOCTL: - case M_CTL: { + case M_CTL: + sup->su_is_arp = 1; + /* FALLTHROUGH */ + case M_IOCTL: { uint32_t expected_mode; if (((struct iocblk 
*)(mp->b_rptr))->ioc_cmd != SIOCSLIFNAME) @@ -1132,7 +1135,10 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin) break; req->ssq_expected_mode = expected_mode; - + if (sup->su_is_arp) { + list_insert_tail(&reqlist, req); + continue; + } /* * Allocate the DL_NOTE_REPLUMB message. */ @@ -1174,18 +1180,19 @@ softmac_datapath_switch(softmac_t *softmac, boolean_t disable, boolean_t admin) */ for (sup = list_head(&softmac->smac_sup_list); sup != NULL; sup = list_next(&softmac->smac_sup_list, sup)) { - mp = head->b_next; - head->b_next = NULL; - + if (!sup->su_is_arp) { + mp = head->b_next; + head->b_next = NULL; + softmac_wput_nondata(sup, head); + head = mp; + } /* - * Add the swtich request to the requests list of the stream. + * Add the switch request to the requests list of the stream. */ req = list_head(&reqlist); ASSERT(req != NULL); list_remove(&reqlist, req); list_insert_tail(&sup->su_req_list, req); - softmac_wput_nondata(sup, head); - head = mp; } mutex_exit(&softmac->smac_fp_mutex); diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index b23036e9c5..658735b784 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1605,7 +1605,9 @@ pullupmsg(mblk_t *mp, ssize_t len) ASSERT(bp->b_datap->db_ref > 0); ASSERT(bp->b_wptr >= bp->b_rptr); n = MIN(bp->b_wptr - bp->b_rptr, len); - bcopy(bp->b_rptr, mp->b_wptr, (size_t)n); + ASSERT(n >= 0); /* allow zero-length mblk_t's */ + if (n > 0) + bcopy(bp->b_rptr, mp->b_wptr, (size_t)n); mp->b_wptr += n; bp->b_rptr += n; len -= n; diff --git a/usr/src/uts/common/io/strplumb.c b/usr/src/uts/common/io/strplumb.c index f43648fd7f..473f7bc72e 100644 --- a/usr/src/uts/common/io/strplumb.c +++ b/usr/src/uts/common/io/strplumb.c @@ -53,17 +53,6 @@ #include <sys/esunddi.h> #include <sys/promif.h> -#include <netinet/in.h> -#include <netinet/ip6.h> -#include <netinet/icmp6.h> -#include <netinet/sctp.h> -#include <inet/common.h> -#include <inet/ip.h> 
-#include <inet/ip6.h> -#include <inet/tcp.h> -#include <inet/sctp_ip.h> -#include <inet/udp_impl.h> - #include <sys/strlog.h> #include <sys/log.h> #include <sys/ethernet.h> @@ -222,104 +211,6 @@ strplumb_init(void) return (0); } -static int -strplumb_autopush(void) -{ - major_t maj; - minor_t min; - char *mods[5]; - uint_t anchor = 1; - int err; - - min = (minor_t)-1; - mods[1] = NULL; - - /* - * ARP - */ - DBG0("setting up arp autopush\n"); - - mods[0] = ARP; - - maj = ddi_name_to_major(ARP); - if ((err = kstr_autopush(SET_AUTOPUSH, &maj, &min, NULL, &anchor, - mods)) != 0) { - printf("strplumb: kstr_autopush(SET/ARP) failed: %d\n", err); - return (err); - } - - return (0); -} - -static int -strplumb_sctpq(ldi_ident_t li) -{ - ldi_handle_t lh = NULL; - int err; - int rval; - - DBG0("configuring SCTP default queue\n"); - - if ((err = ldi_open_by_name(SCTP6DEV, FREAD|FWRITE, CRED(), &lh, - li)) != 0) { - printf("strplumb: open of SCTP6DEV failed: %d\n", err); - return (err); - } - - if ((err = ldi_ioctl(lh, SCTP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL, - CRED(), &rval)) != 0) { - printf("strplumb: failed to set SCTP default queue: %d\n", - err); - (void) ldi_close(lh, FREAD|FWRITE, CRED()); - return (err); - } - - return (0); -} - -static int -strplumb_tcpq(ldi_ident_t li) -{ - ldi_handle_t lh = NULL; - ldi_handle_t ip_lh = NULL; - int err; - int rval; - - DBG0("configuring TCP default queue\n"); - - /* - * We open IP6DEV here because we need to have it open to in - * order to open TCP6DEV successfully. - */ - if ((err = ldi_open_by_name(IP6DEV, FREAD|FWRITE, CRED(), &ip_lh, - li)) != 0) { - printf("strplumb: open of IP6DEV failed: %d\n", err); - return (err); - } - - /* - * We set the tcp default queue to IPv6 because IPv4 falls back to - * IPv6 when it can't find a client, but IPv6 does not fall back to - * IPv4. 
- */ - if ((err = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, CRED(), &lh, - li)) != 0) { - printf("strplumb: open of TCP6DEV failed: %d\n", err); - goto done; - } - - if ((err = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, (intptr_t)0, FKIOCTL, - CRED(), &rval)) != 0) { - printf("strplumb: failed to set TCP default queue: %d\n", - err); - goto done; - } - -done: - (void) ldi_close(ip_lh, FREAD|FWRITE, CRED()); - return (err); -} - /* * Can be set in /etc/system in the case of local booting. See comment below. */ @@ -447,11 +338,8 @@ strplumb_dev(ldi_ident_t li) /* * Now set up the links. Ultimately, we should have two streams - * permanently linked underneath UDP (which is actually IP with UDP - * autopushed). One stream consists of the ARP-[ifname] combination, - * while the other consists of ARP-IP-[ifname]. The second combination - * seems a little weird, but is linked underneath UDP just to keep it - * around. + * permanently linked under UDP. One stream consists of the + * ARP-[ifname] combination, while the other consists of IP-[ifname]. 
* * We pin underneath UDP here to match what is done in ifconfig(1m); * otherwise, ifconfig will be unable to unplumb the stream (the major @@ -462,7 +350,7 @@ strplumb_dev(ldi_ident_t li) */ /* - * Plumb UDP-ARP-IP-<dev> + * Plumb UDP-IP-<dev> */ if ((err = ldi_open_by_name(rootfs.bo_devname, FREAD|FWRITE, CRED(), @@ -494,12 +382,6 @@ strplumb_dev(ldi_ident_t li) lifr.lifr_flags &= ~IFF_IPV4; name = UDP6DEV; } - if ((err = ldi_ioctl(lh, I_PUSH, (intptr_t)ARP, FKIOCTL, CRED(), - &rval)) != 0) { - printf("strplumb: push ARP failed: %d\n", err); - goto done; - } - (void) strlcpy(lifr.lifr_name, rootfs.bo_ifname, sizeof (lifr.lifr_name)); lifr.lifr_ppa = rootfs.bo_ppa; @@ -507,29 +389,17 @@ strplumb_dev(ldi_ident_t li) if ((err = setifname(lh, &lifr)) != 0) goto done; - /* Get the flags and check if ARP is needed */ + /* get the flags and check if ARP is needed */ if ((err = getifflags(lh, &lifr)) != 0) { printf("strplumb: getifflags %s IP failed, error %d\n", lifr.lifr_name, err); goto done; } - - /* Pop out ARP if not needed */ - if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6)) { - err = ldi_ioctl(lh, I_POP, (intptr_t)0, FKIOCTL, CRED(), - &rval); - if (err != 0) { - printf("strplumb: pop ARP failed, error %d\n", err); - goto done; - } - } - if ((err = ldi_open_by_name(name, FREAD|FWRITE, CRED(), &mux_lh, li)) != 0) { printf("strplumb: open of %s failed: %d\n", name, err); goto done; } - if ((err = ldi_ioctl(mux_lh, I_PLINK, (intptr_t)lh, FREAD|FWRITE|FNOCTTY|FKIOCTL, CRED(), &(ifr.ifr_ip_muxid))) != 0) { @@ -538,9 +408,9 @@ strplumb_dev(ldi_ident_t li) goto done; } - if (af == AF_INET6) { + /* if ARP is not needed, we are done */ + if (lifr.lifr_flags & (IFF_NOARP | IFF_IPV6)) goto done; - } DBG2("UDP-ARP-IP-%s muxid: %d\n", rootfs.bo_ifname, ifr.ifr_ip_muxid); @@ -610,22 +480,9 @@ strplumb(void) if ((err = strplumb_init()) != 0) return (err); - if ((err = strplumb_autopush()) != 0) - return (err); - if ((err = ldi_ident_from_mod(&modlinkage, &li)) != 0) return (err); 
- /* - * Setup the TCP and SCTP default queues for the global stack. - * tcp/sctp_stack_init will do this for additional stack instances. - */ - if ((err = strplumb_sctpq(li)) != 0) - goto done; - - if ((err = strplumb_tcpq(li)) != 0) - goto done; - if ((err = resolve_boot_path()) != 0) goto done; diff --git a/usr/src/uts/common/io/tl.c b/usr/src/uts/common/io/tl.c index 7ddb24cddb..83f8cf6944 100644 --- a/usr/src/uts/common/io/tl.c +++ b/usr/src/uts/common/io/tl.c @@ -452,7 +452,7 @@ opdes_t tl_opt_arr[] = { OA_R, OA_R, OP_NP, - OP_PASSNEXT, + 0, sizeof (t_scalar_t), 0 }, @@ -462,7 +462,7 @@ opdes_t tl_opt_arr[] = { OA_RW, OA_RW, OP_NP, - OP_PASSNEXT, + 0, sizeof (int), 0 } @@ -867,7 +867,7 @@ static void tl_fill_option(uchar_t *, cred_t *, pid_t, int, cred_t *); static int tl_default_opt(queue_t *, int, int, uchar_t *); static int tl_get_opt(queue_t *, int, int, uchar_t *); static int tl_set_opt(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, - uchar_t *, void *, cred_t *, mblk_t *); + uchar_t *, void *, cred_t *); static void tl_memrecover(queue_t *, mblk_t *, size_t); static void tl_freetip(tl_endpt_t *, tl_icon_t *); static void tl_free(tl_endpt_t *); @@ -904,7 +904,6 @@ optdb_obj_t tl_opt_obj = { tl_default_opt, /* TL default value function pointer */ tl_get_opt, /* TL get function pointer */ tl_set_opt, /* TL set function pointer */ - B_TRUE, /* TL is tpi provider */ TL_OPT_ARR_CNT, /* TL option database count of entries */ tl_opt_arr, /* TL option database */ TL_VALID_LEVELS_CNT, /* TL valid level count of entries */ @@ -2789,12 +2788,10 @@ tl_optmgmt(queue_t *wq, mblk_t *mp) * call common option management routine from drv/ip */ if (prim->type == T_SVR4_OPTMGMT_REQ) { - (void) svr4_optcom_req(wq, mp, cr, &tl_opt_obj, - B_FALSE); + svr4_optcom_req(wq, mp, cr, &tl_opt_obj); } else { ASSERT(prim->type == T_OPTMGMT_REQ); - (void) tpi_optcom_req(wq, mp, cr, &tl_opt_obj, - B_FALSE); + tpi_optcom_req(wq, mp, cr, &tl_opt_obj); } } @@ -6066,8 +6063,7 @@ 
tl_set_opt( uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, - cred_t *cr, - mblk_t *mblk) + cred_t *cr) { int error; tl_endpt_t *tep; diff --git a/usr/src/uts/common/io/warlock/ibcm.wlcmd b/usr/src/uts/common/io/warlock/ibcm.wlcmd index b4ae04a925..e66149c4fd 100644 --- a/usr/src/uts/common/io/warlock/ibcm.wlcmd +++ b/usr/src/uts/common/io/warlock/ibcm.wlcmd @@ -66,11 +66,7 @@ root ibt_get_src_ip root ibt_ofuvcm_get_req_data root ibt_ofuvcm_proceed -root ibcm_arp_timeout root ibcm_arp_get_srcip_plist -root ibcm_arp_lrput -root ibcm_arp_lwsrv -root ibcm_arp_lrsrv root ibcm_arp_get_ibd_insts_cb # callback entry points from ibmf diff --git a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c index 27eaaba86f..c827fb9e82 100644 --- a/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c +++ b/usr/src/uts/common/ipp/dlcosmk/dlcosmk.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/stream.h> #include <sys/dlpi.h> @@ -88,8 +86,8 @@ dlcosmk_process(mblk_t **mpp, dlcosmk_data_t *dlcosmk_data, uint32_t ill_index, } if ((ill_index == 0) || - ((ill = ill_lookup_on_ifindex_global_instance(ill_index, B_FALSE, - NULL, NULL, NULL, NULL)) == NULL)) { + ((ill = ill_lookup_on_ifindex_global_instance(ill_index, + B_FALSE)) == NULL)) { dlcosmk2dbg(("dlcosmk_process:invalid ill index %u\n", ill_index)); atomic_add_64(&dlcosmk_data->ipackets, 1); diff --git a/usr/src/uts/common/ipp/ipgpc/classifierddi.c b/usr/src/uts/common/ipp/ipgpc/classifierddi.c index 4d31da6396..e76c181d92 100644 --- a/usr/src/uts/common/ipp/ipgpc/classifierddi.c +++ b/usr/src/uts/common/ipp/ipgpc/classifierddi.c @@ -445,10 +445,9 @@ ipgpc_invoke_action(ipp_action_id_t aid, ipp_packet_t *packet) pkt.direction = callout_pos; /* set packet direction */ /* The ill_index could be 0 when called from forwarding (read) path */ - if (ill_idx > 0) { - ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE, - NULL, NULL, NULL, NULL); - } + if (ill_idx > 0) + ill = ill_lookup_on_ifindex_global_instance(ill_idx, B_FALSE); + if (ill != NULL) { /* * Since all IPP actions in an IPMP group are performed diff --git a/usr/src/uts/common/ktli/t_kutil.c b/usr/src/uts/common/ktli/t_kutil.c index cfd153d873..ab762403fd 100644 --- a/usr/src/uts/common/ktli/t_kutil.c +++ b/usr/src/uts/common/ktli/t_kutil.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,8 +36,6 @@ * contributors. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Contains the following utility functions: * tli_send: @@ -230,7 +228,7 @@ t_kadvise(TIUSER *tiptr, uchar_t *addr, int addr_len) bzero(ipid, sizeof (*ipid)); ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; - ipid->ipid_ire_type = IRE_CACHE; + ipid->ipid_ire_type = 0; ipid->ipid_addr_offset = sizeof (ipid_t); ipid->ipid_addr_length = addr_len; diff --git a/usr/src/uts/common/net/route.h b/usr/src/uts/common/net/route.h index 3e4307f25e..9c004b74b1 100644 --- a/usr/src/uts/common/net/route.h +++ b/usr/src/uts/common/net/route.h @@ -130,7 +130,8 @@ struct rtentry { #define RTF_PROTO1 0x8000 /* protocol specific routing flag */ #define RTF_MULTIRT 0x10000 /* multiroute */ #define RTF_SETSRC 0x20000 /* set default outgoing src address */ - +#define RTF_INDIRECT 0x40000 /* gateway not directly reachable */ +#define RTF_KERNEL 0x80000 /* created by kernel; can't delete */ /* * OLD statistics not used by the kernel. The kernel uses <inet/mib2.h>. diff --git a/usr/src/uts/common/netinet/in.h b/usr/src/uts/common/netinet/in.h index fc2c750ba7..c1166fc34f 100644 --- a/usr/src/uts/common/netinet/in.h +++ b/usr/src/uts/common/netinet/in.h @@ -888,6 +888,7 @@ struct sockaddr_in6 { */ #define IP_PKTINFO 0x1a /* specify src address and/or index */ #define IP_RECVPKTINFO 0x1a /* recv dest/matched addr and index */ +#define IP_DONTFRAG 0x1b /* don't fragment packets */ #if !defined(_XPG4_2) || defined(__EXTENSIONS__) /* diff --git a/usr/src/uts/common/netinet/ip_mroute.h b/usr/src/uts/common/netinet/ip_mroute.h index 8a658a0fca..b1dde41b1f 100644 --- a/usr/src/uts/common/netinet/ip_mroute.h +++ b/usr/src/uts/common/netinet/ip_mroute.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,17 +18,16 @@ * * CDDL HEADER END */ + /* - * Copyright 1991, 1997-1999, 2001, 2003 Sun Microsystems, Inc. - * All rights reserved. Use is subject to license terms. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ #ifndef _NETINET_IP_MROUTE_H #define _NETINET_IP_MROUTE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -188,6 +186,7 @@ struct vif { uint_t v_refcnt; uchar_t v_marks; kmutex_t v_lock; + ilm_t *v_ilm; /* allmulti join */ }; /* diff --git a/usr/src/uts/common/os/ip_cksum.c b/usr/src/uts/common/os/ip_cksum.c index 722c793b79..1fa1c9425b 100644 --- a/usr/src/uts/common/os/ip_cksum.c +++ b/usr/src/uts/common/os/ip_cksum.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -93,9 +93,6 @@ ip_cksum(mblk_t *mp, int offset, uint_t sum) #endif ASSERT(dp); - TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START, - "ip_cksum_start:%p (%X)", mp, sum); - if (mp->b_cont == NULL) { /* * May be fast-path, only one mblk. 
@@ -277,9 +274,6 @@ slow1: mlen = mp->b_wptr - (uchar_t *)w; } - TRACE_2(TR_FAC_IP, TR_IP_CKSUM_START, - "ip_cksum_start:%p (%X)", mp, sum) - mp = mp->b_cont; if (mlen > 0 && pmlen == -1) { /* diff --git a/usr/src/uts/common/os/strsubr.c b/usr/src/uts/common/os/strsubr.c index 76ce1af025..22bdc86e03 100644 --- a/usr/src/uts/common/os/strsubr.c +++ b/usr/src/uts/common/os/strsubr.c @@ -8474,9 +8474,7 @@ hcksum_retrieve(mblk_t *mp, multidata_t *mmd, pdesc_t *pd, ASSERT(DB_TYPE(mp) == M_DATA || DB_TYPE(mp) == M_MULTIDATA); if (mp->b_datap->db_type == M_DATA) { if (flags != NULL) { - *flags = DB_CKSUMFLAGS(mp) & (HCK_IPV4_HDRCKSUM | - HCK_PARTIALCKSUM | HCK_FULLCKSUM | - HCK_FULLCKSUM_OK); + *flags = DB_CKSUMFLAGS(mp) & HCK_FLAGS; if ((*flags & (HCK_PARTIALCKSUM | HCK_FULLCKSUM)) != 0) { if (value != NULL) diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 9542a15a8e..ed80269fbc 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -395,7 +395,8 @@ typedef struct dld_capab_poll_s { /* * Currently supported flags for LSO. 
*/ -#define DLD_LSO_TX_BASIC_TCP_IPV4 0x01 /* TCP LSO capability */ +#define DLD_LSO_BASIC_TCP_IPV4 0x01 /* TCP LSO over IPv4 capability */ +#define DLD_LSO_BASIC_TCP_IPV6 0x02 /* TCP LSO over IPv6 capability */ typedef struct dld_capab_lso_s { uint_t lso_flags; /* capability flags */ diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 8b0681e2d8..6b3a5801d7 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -593,10 +593,6 @@ union DL_qos_types { /* dl_data is dl_capab_id_t */ #define DL_CAPAB_HCKSUM 0x01 /* Checksum offload */ /* dl_data is dl_capab_hcksum_t */ -#define DL_CAPAB_IPSEC_AH 0x02 /* IPsec AH acceleration */ - /* dl_data is dl_capab_ipsec_t */ -#define DL_CAPAB_IPSEC_ESP 0x03 /* IPsec ESP acceleration */ - /* dl_data is dl_capab_ipsec_t */ #define DL_CAPAB_MDT 0x04 /* Multidata Transmit capability */ /* dl_data is dl_capab_mdt_t */ #define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */ @@ -611,45 +607,8 @@ typedef struct { } dl_capability_sub_t; /* - * Definitions and structures needed for DL_CONTROL_REQ and DL_CONTROL_ACK - * primitives. - * Extensible message to send down control information to the DLS provider. - * The response is a DL_CONTROL_ACK or DL_ERROR_ACK. - * - * Different types of control operations will define different format for the - * key and data fields. ADD requires key and data fields; if the <type, key> - * matches an already existing entry a DL_ERROR_ACK will be returned. DELETE - * requires a key field; if the <type, key> does not exist, a DL_ERROR_ACK - * will be returned. FLUSH requires neither a key nor data; it - * unconditionally removes all entries for the specified type. GET requires a - * key field; the get operation returns the data for the <type, key>. If - * <type, key> doesn't exist a DL_ERROR_ACK is returned. UPDATE requires key - * and data fields; if <type, key> doesn't exist a DL_ERROR_ACK is returned. 
- */ - -/* - * Control operations - */ -#define DL_CO_ADD 0x01 /* Add new entry matching for <type,key> */ -#define DL_CO_DELETE 0x02 /* Delete the entry matching <type,key> */ -#define DL_CO_FLUSH 0x03 /* Purge all entries of <type> */ -#define DL_CO_GET 0x04 /* Get the data for the <type,key> */ -#define DL_CO_UPDATE 0x05 /* Update the data for <type,key> */ -#define DL_CO_SET 0x06 /* Add or update as appropriate */ - -/* - * Control types (dl_type field of dl_control_req_t and dl_control_ack_t) - */ -#define DL_CT_IPSEC_AH 0x01 /* AH; key=spi,dest_addr; */ - /* data=keying material */ -#define DL_CT_IPSEC_ESP 0x02 /* ESP; key=spi,des_taddr; */ - /* data=keying material */ - -/* * Module ID token to be included in new sub-capability structures. - * Existing sub-capabilities lacking an identification token, e.g. IPSEC - * hardware acceleration, need to be encapsulated within the ID sub- - * capability. Access to this structure must be done through + * Access to this structure must be done through * dlcapab{set,check}qid(). 
*/ typedef struct { diff --git a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h index c307ed7575..e0b7e1e1e7 100644 --- a/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h +++ b/usr/src/uts/common/sys/ib/mgt/ibcm/ibcm_arp.h @@ -31,24 +31,11 @@ extern "C" { #endif #include <sys/ib/mgt/ibcm/ibcm_impl.h> -#include <sys/modhash.h> #include <sys/ib/clients/ibd/ibd.h> -#include <sys/strsun.h> -#include <sys/socket.h> -#include <sys/stat.h> /* for S_IFCHR */ #include <inet/ip2mac.h> #include <inet/ip6.h> -/* - * IPoIB addr lookup completion function - */ -typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status); - #define IBCM_ARP_MAX_IFNAME_LEN 24 -#define IBCM_ARP_XMIT_COUNT 6 -#define IBCM_ARP_XMIT_INTERVAL 1000 /* timeout in milliseconds */ -#define IBCM_ARP_TIMEOUT \ - ((IBCM_ARP_XMIT_COUNT + 1) * IBCM_ARP_XMIT_INTERVAL) #define IBCM_H2N_GID(gid) \ { \ @@ -68,9 +55,7 @@ typedef int (*ibcm_arp_pr_comp_func_t) (void *usr_arg, int status); * Path record wait queue node definition */ typedef struct ibcm_arp_prwqn { - ibcm_arp_pr_comp_func_t func; /* user callback function */ - void *arg; /* callback function arg */ - timeout_id_t timeout_id; + struct ibcm_arp_streams_s *ib_str; uint8_t flags; ibt_ip_addr_t usrc_addr; /* user supplied src address */ ibt_ip_addr_t dst_addr; /* user supplied dest address */ @@ -89,15 +74,11 @@ typedef struct ibcm_arp_prwqn { typedef struct ibcm_arp_streams_s { kmutex_t lock; kcondvar_t cv; - queue_t *arpqueue; - vnode_t *arp_vp; int status; boolean_t done; ibcm_arp_prwqn_t *wqnp; } ibcm_arp_streams_t; -/* GID to IP-Addr and Ip-Addr to GID look-up functions. 
*/ - #define IBCM_ARP_IBD_INSTANCES 4 typedef struct ibcm_arp_ip_s { diff --git a/usr/src/uts/common/sys/iphada.h b/usr/src/uts/common/sys/iphada.h deleted file mode 100644 index 9d1a6e28e8..0000000000 --- a/usr/src/uts/common/sys/iphada.h +++ /dev/null @@ -1,144 +0,0 @@ -/* - * CDDL HEADER START - * - * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. - * - * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE - * or http://www.opensolaris.org/os/licensing. - * See the License for the specific language governing permissions - * and limitations under the License. - * - * When distributing Covered Code, include this CDDL HEADER in each - * file and include the License file at usr/src/OPENSOLARIS.LICENSE. - * If applicable, add the following below this CDDL HEADER, with the - * fields enclosed by brackets "[]" replaced with your own identifying - * information: Portions Copyright [yyyy] [name of copyright owner] - * - * CDDL HEADER END - */ -/* - * Copyright 2002-2003 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. - */ - -#ifndef _SYS_IPHADA_H -#define _SYS_IPHADA_H - -#pragma ident "%Z%%M% %I% %E% SMI" - -#ifdef __cplusplus -extern "C" { -#endif - -#define DA_ICV_MAX_LEN 128 /* max ICV length [bytes] */ - -/* - * iphada.h header for IP Hardware Acceleration Data Attributes - * - * This is a contract private interface for use by the Sun - * Hardware Accelerated Ethernet driver ONLY. 
- */ -typedef struct da_ipsec { - int da_type; /* M_CTL message ident */ - int da_flag; - uint32_t da_icv_len; /* da_icv length in bytes */ - uchar_t da_icv[DA_ICV_MAX_LEN]; /* ICV for AH or ESP+auth */ -} da_ipsec_t; - -#define IPHADA_M_CTL 0xA1D53DE5u - -/* - * IPSec algorithms capabilities (cip_data in dl_capab_ipsec_t) - */ -typedef struct { - t_uscalar_t alg_type; - t_uscalar_t alg_prim; /* algorithm primitive */ - t_uscalar_t alg_thruput; /* approx throughput metric in Mb/s */ - t_uscalar_t alg_flag; /* flags */ - t_uscalar_t alg_minbits; /* minimum key len in bits */ - t_uscalar_t alg_maxbits; /* maximum key len in bits */ - t_uscalar_t alg_incrbits; /* key len increment in bits */ -} dl_capab_ipsec_alg_t; - -/* - * IPSec sub-capability (follows dl_capability_sub_t) - */ -typedef struct { - t_uscalar_t cip_version; /* interface version */ - t_uscalar_t cip_nciphers; /* number ciphers supported */ - dl_capab_ipsec_alg_t cip_data[1]; /* data */ -} dl_capab_ipsec_t; - -/* - * Algorithm types (alg_type field of dl_capab_ipsec_alg_t) - */ -#define DL_CAPAB_IPSEC_ALG_AUTH 0x01 /* authentication alg. */ -#define DL_CAPAB_IPSEC_ALG_ENCR 0x02 /* encryption alg. */ - -/* alg_prim ciphers */ -#define DL_CAPAB_IPSEC_ENCR_DES 0x02 -#define DL_CAPAB_IPSEC_ENCR_3DES 0x03 -#define DL_CAPAB_IPSEC_ENCR_BLOWFISH 0x07 -#define DL_CAPAB_IPSEC_ENCR_NULL 0x0b /* no encryption */ -#define DL_CAPAB_IPSEC_ENCR_AES 0x0c - -/* alg_prim authentications */ -#define DL_CAPAB_IPSEC_AUTH_NONE 0x00 /* no authentication */ -#define DL_CAPAB_IPSEC_AUTH_MD5HMAC 0x02 -#define DL_CAPAB_IPSEC_AUTH_SHA1HMAC 0x03 - -/* alg_flag values */ -#define DL_CAPAB_ALG_ENABLE 0x01 /* enable this algorithm */ - -/* - * For DL_CT_IPSEC_AH and DL_CT_IPSEC_ESP, the optional dl_key data - * that follows the dl_control_req_t or dl_control_ack_t will be the IPsec - * SPI (Security Parameters Index) value and the destination address. - * This is defined as being unique per protocol. 
- */ - -#define DL_CTL_IPSEC_ADDR_LEN 16 /* IP addr length in bytes */ - -typedef struct dl_ct_ipsec_key { - uint32_t dl_key_spi; /* Security Parameters Index value */ - uchar_t dl_key_dest_addr[DL_CTL_IPSEC_ADDR_LEN]; /* dest IP address */ - uint32_t dl_key_addr_family; /* family of dest IP address */ - /* (AF_INET or AF_INET6) */ -} dl_ct_ipsec_key_t; - -#define DL_CT_IPSEC_MAX_KEY_LEN 512 /* max key length in bytes */ - -/* - * Possible flags for sadb_sa_flags. - */ -#define DL_CT_IPSEC_INBOUND 0x01 /* SA can be used for inbound pkts */ -#define DL_CT_IPSEC_OUTBOUND 0x02 /* SA can be used for outbound pkts */ - -/* - * minimal SADB entry content - * fields are defined as per RFC 2367 and <net/pfkeyv2.h> - * This defines the content and format of the dl_data portion of - * the dl_control_req_t or dl_control_ack_t. - */ -typedef struct dl_ct_ipsec { - uint8_t sadb_sa_auth; /* Authentication algorithm */ - uint8_t sadb_sa_encrypt; /* Encryption algorithm */ - uint32_t sadb_sa_flags; /* SA flags. */ - uint16_t sadb_key_len_a; /* auth key length in bytes */ - uint16_t sadb_key_bits_a; /* auth key length in bits */ - uint16_t sadb_key_data_a[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */ - uint16_t sadb_key_len_e; /* encr key length in bytes */ - uint16_t sadb_key_bits_e; /* encr key length in bits */ - uint16_t sadb_key_data_e[DL_CT_IPSEC_MAX_KEY_LEN]; /* key data */ -} dl_ct_ipsec_t; - - - -#ifdef __cplusplus -} -#endif - -#endif /* _SYS_IPHADA_H */ diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index cac046d675..f3b8397681 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_PATTR_H #define _SYS_PATTR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -92,6 +90,9 @@ typedef struct pattr_hcksum_s { /* check the attached h/w computed */ /* checksum value to determine if */ /* checksum was bad */ + +#define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ + HCK_FULLCKSUM | HCK_FULLCKSUM_OK) /* * Extended hardware offloading flags that also use hcksum_flags */ diff --git a/usr/src/uts/common/sys/softmac_impl.h b/usr/src/uts/common/sys/softmac_impl.h index eb71063bc7..bd94d4982e 100644 --- a/usr/src/uts/common/sys/softmac_impl.h +++ b/usr/src/uts/common/sys/softmac_impl.h @@ -301,7 +301,9 @@ typedef struct softmac_upper_s { uint32_t su_bound : 1, /* SL */ su_active : 1, /* SL */ - su_direct : 1; /* SL */ + su_direct : 1, /* SL */ + su_is_arp : 1, + su_pad_to_32:28; /* * Used for fastpath data path. diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index a2d808f647..de0f18bd4d 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -44,21 +44,19 @@ typedef struct squeue_s squeue_t; (mp)->b_prev = (mblk_t *)(arg); \ } -#define GET_SQUEUE(mp) ((conn_t *)((mp)->b_prev))->conn_sqp - #define SQ_FILL 0x0001 #define SQ_NODRAIN 0x0002 #define SQ_PROCESS 0x0004 -#define SQUEUE_ENTER(sqp, head, tail, cnt, flag, tag) { \ - sqp->sq_enter(sqp, head, tail, cnt, flag, tag); \ +#define SQUEUE_ENTER(sqp, head, tail, cnt, ira, flag, tag) { \ + sqp->sq_enter(sqp, head, tail, cnt, ira, flag, tag); \ } -#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, flag, tag) { \ +#define SQUEUE_ENTER_ONE(sqp, mp, proc, arg, ira, flag, tag) { \ ASSERT(mp->b_next == NULL); \ ASSERT(mp->b_prev == NULL); \ SET_SQUEUE(mp, proc, arg); \ - SQUEUE_ENTER(sqp, mp, mp, 1, flag, tag); \ + SQUEUE_ENTER(sqp, mp, mp, 1, ira, flag, tag); \ } /* @@ -77,12 +75,13 @@ typedef enum { SQPRIVATE_MAX } sqprivate_t; +struct ip_recv_attr_s; extern void squeue_init(void); extern squeue_t 
*squeue_create(clock_t, pri_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, - uint32_t, int, uint8_t); + uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); struct conn_s; diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index bd934cc0b3..22550886eb 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -79,9 +79,9 @@ typedef struct squeue_set_s { processorid_t sqs_cpuid; } squeue_set_t; -typedef void (*sqproc_t)(void *, mblk_t *, void *); +typedef void (*sqproc_t)(void *, mblk_t *, void *, struct ip_recv_attr_s *); typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t, - int, uint8_t); + struct ip_recv_attr_s *, int, uint8_t); typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t); extern void squeue_worker_wakeup(squeue_t *); diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index b9c96a8345..7a3b4e3448 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -404,9 +404,6 @@ typedef struct bcache { #define STRUIO_IP 0x04 /* IP checksum stored in db_struioun */ #define STRUIO_ZC 0x08 /* mblk eligible for zero-copy */ #define STRUIO_ZCNOTIFY 0x10 /* notify stream head when mblk acked */ -#define STRUIO_EAGER 0x20 /* new eager; db_cksumstart has squeue to use */ -#define STRUIO_POLICY 0x40 /* new eager when IPsec is enabled */ -#define STRUIO_CONNECT 0x80 /* conn did a connect */ /* * Message flags. These are interpreted by the stream head. 
@@ -418,8 +415,7 @@ typedef struct bcache { /* UNUSED 0x08 was MSGNOGET (can be recycled) */ #define MSGMARKNEXT 0x10 /* Private: first byte of next msg marked */ #define MSGNOTMARKNEXT 0x20 /* Private: ... not marked */ -#define MSGHASREF 0x40 /* Private: message has reference to owner */ -#define MSGWAITSYNC 0x80 /* Private: waiting for sync squeue enter */ +#define MSGWAITSYNC 0x40 /* Private: waiting for sync squeue enter */ /* * Streams message types. diff --git a/usr/src/uts/common/sys/tsol/tnet.h b/usr/src/uts/common/sys/tsol/tnet.h index 221f4c775a..0da65ae5ca 100644 --- a/usr/src/uts/common/sys/tsol/tnet.h +++ b/usr/src/uts/common/sys/tsol/tnet.h @@ -46,35 +46,30 @@ extern "C" { extern int tsol_tnrh_chk(tsol_tpent_t *, bslabel_t *, int); extern tsol_tnrhc_t *find_rhc(const void *, uchar_t, boolean_t); -extern int tsol_check_dest(const cred_t *, const void *, uchar_t, uint_t, - cred_t **); -extern int tsol_compute_label(const cred_t *, ipaddr_t, uchar_t *, - ip_stack_t *); -extern int tsol_compute_label_v6(const cred_t *, const in6_addr_t *, uchar_t *, - ip_stack_t *); -extern int tsol_check_label(const cred_t *, mblk_t **, uint_t, - ip_stack_t *, pid_t); -extern int tsol_check_label_v6(const cred_t *, mblk_t **, uint_t, - ip_stack_t *, pid_t); +extern int tsol_check_dest(const ts_label_t *, const void *, uchar_t, + uint_t, boolean_t, ts_label_t **); +extern int tsol_compute_label_v4(const ts_label_t *, zoneid_t, ipaddr_t, + uchar_t *, ip_stack_t *); +extern int tsol_compute_label_v6(const ts_label_t *, zoneid_t, + const in6_addr_t *, uchar_t *, ip_stack_t *); +extern int tsol_check_label_v4(const ts_label_t *, zoneid_t, mblk_t **, + uint_t, boolean_t, ip_stack_t *, ts_label_t **); +extern int tsol_check_label_v6(const ts_label_t *, zoneid_t, mblk_t **, + uint_t, boolean_t, ip_stack_t *, ts_label_t **); extern int tsol_prepend_option(uchar_t *, ipha_t *, int); extern int tsol_prepend_option_v6(uchar_t *, ip6_t *, int); extern int tsol_remove_secopt(ipha_t 
*, int); extern int tsol_remove_secopt_v6(ip6_t *, int); -extern int tsol_update_sticky(ip6_pkt_t *, uint_t *, const uchar_t *); -extern int tsol_update_options(uchar_t **, uint_t *, uint_t *, - const uchar_t *); -extern boolean_t tsol_option_set(uchar_t **, uint_t *, uint_t, const uchar_t *, - uint_t); extern tsol_ire_gw_secattr_t *ire_gw_secattr_alloc(int); extern void ire_gw_secattr_free(tsol_ire_gw_secattr_t *); -extern boolean_t tsol_can_reply_error(const mblk_t *); +extern boolean_t tsol_can_reply_error(const mblk_t *, ip_recv_attr_t *); extern boolean_t tsol_receive_local(const mblk_t *, const void *, uchar_t, - boolean_t, const conn_t *); -extern boolean_t tsol_can_accept_raw(mblk_t *, boolean_t); -extern boolean_t tsol_get_pkt_label(mblk_t *, int); -extern zoneid_t tsol_packet_to_zoneid(const mblk_t *); + ip_recv_attr_t *, const conn_t *); +extern boolean_t tsol_can_accept_raw(mblk_t *, ip_recv_attr_t *, boolean_t); +extern boolean_t tsol_get_pkt_label(mblk_t *, int, ip_recv_attr_t *); +extern zoneid_t tsol_attr_to_zoneid(const ip_recv_attr_t *); extern boolean_t tsol_get_option_v4(mblk_t *, tsol_ip_label_t *, uint8_t **); extern boolean_t tsol_get_option_v6(mblk_t *, tsol_ip_label_t *, uint8_t **); @@ -83,8 +78,8 @@ extern boolean_t tsol_find_secopt_v6(const uchar_t *, uint_t, uchar_t **, extern int tsol_ire_match_gwattr(ire_t *, const ts_label_t *); extern int tsol_rtsa_init(rt_msghdr_t *, tsol_rtsecattr_t *, caddr_t); -extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *, tsol_gcgrp_t *); -extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *); +extern int tsol_ire_init_gwattr(ire_t *, uchar_t, tsol_gc_t *); +extern mblk_t *tsol_ip_forward(ire_t *, mblk_t *, const ip_recv_attr_t *); extern uint32_t tsol_pmtu_adjust(mblk_t *, uint32_t, int, int); extern mlp_type_t tsol_mlp_addr_type(zoneid_t, uchar_t, const void *, diff --git a/usr/src/uts/intel/Makefile.intel.shared b/usr/src/uts/intel/Makefile.intel.shared index f1ceb0257e..6b20559ef4 100644 --- 
a/usr/src/uts/intel/Makefile.intel.shared +++ b/usr/src/uts/intel/Makefile.intel.shared @@ -371,7 +371,6 @@ DRV_KMODS += pppt DRV_KMODS += ncall nsctl sdbc nskern sv DRV_KMODS += ii rdc rdcsrv rdcstub DRV_KMODS += iptun -DRV_KMODS += iptunq # # Don't build some of these for OpenSolaris, since they will be diff --git a/usr/src/uts/intel/arp/Makefile b/usr/src/uts/intel/arp/Makefile index aff11806da..9b91950434 100644 --- a/usr/src/uts/intel/arp/Makefile +++ b/usr/src/uts/intel/arp/Makefile @@ -21,11 +21,9 @@ # # uts/intel/arp/Makefile # -# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -# ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the arp driver kernel module. # # intel implementation architecture dependent @@ -68,7 +66,7 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOTLINK) $(ROOT_CONFFILE) # # depends on ip # -LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti +LDFLAGS += -dy -Ndrv/ip # # For now, disable these lint checks; maintainers should endeavor diff --git a/usr/src/uts/intel/arp/arp.global-objs.debug64 b/usr/src/uts/intel/arp/arp.global-objs.debug64 index 7f826ea213..f936276753 100644 --- a/usr/src/uts/intel/arp/arp.global-objs.debug64 +++ b/usr/src/uts/intel/arp/arp.global-objs.debug64 @@ -23,15 +23,6 @@ # Use is subject to license terms. 
# -ar_cmd_tbl -ar_m_tbl -arp_mod_info -arp_no_defense -arpinfo -arprinit -arpwinit -arp_param_arr -arp_netinfo cb_inet_devops fsw inet_dev_info diff --git a/usr/src/uts/intel/ia32/ml/modstubs.s b/usr/src/uts/intel/ia32/ml/modstubs.s index 6cd415a78f..3837728d4c 100644 --- a/usr/src/uts/intel/ia32/ml/modstubs.s +++ b/usr/src/uts/intel/ia32/ml/modstubs.s @@ -509,7 +509,6 @@ fcnname/**/_info: \ MODULE(ipsecah,drv); WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero); WSTUB(ipsecah, sadb_acquire, nomod_zero); - WSTUB(ipsecah, sadb_ill_download, nomod_zero); WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero); WSTUB(ipsecah, sadb_alg_update, nomod_zero); WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero); @@ -1294,8 +1293,6 @@ fcnname/**/_info: \ STUB(iptun, iptun_create, nomod_einval); STUB(iptun, iptun_delete, nomod_einval); STUB(iptun, iptun_set_policy, nomod_void) ; - STUB(iptun, iptun_set_g_q, nomod_einval); - STUB(iptun, iptun_clear_g_q, nomod_void); END_MODULE(iptun); #endif diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 6009f5b006..07e9aaedde 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -23,19 +23,24 @@ # Use is subject to license terms. 
# +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,74 +103,45 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor ipcl_conn_hash_size -ipcl_debug_level ipcl_iptun_fanout_size ipcl_raw_fanout_size 
ipcl_udp_fanout_size @@ -174,24 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -ipsechw_debug -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -199,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -230,6 +192,8 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache @@ -238,6 +202,7 @@ rawip_conn_cache recvq_call recvq_loop_cnt req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -260,25 +225,23 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit -sendq_collision -sendq_empty -sendq_loop_cnt sin6_null sin_null skip_sctp_cksum -sock_tcp_downcalls -sock_rts_downcalls sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls sock_udp_downcalls sqset_global_list sqset_global_size @@ -300,12 +263,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj 
+tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -321,13 +282,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -352,4 +311,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 1706a82aa7..526e907ab5 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -23,19 +23,24 @@ # Use is subject to license terms. # +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,69 +103,41 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus 
ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor @@ -173,23 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -197,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -228,12 +192,15 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache radix_node_cache rawip_conn_cache req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -256,21 +223,22 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size 
sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit sin6_null sin_null -sock_tcp_downcalls -sock_rts_downcalls sock_rawip_downcalls +sock_rts_downcalls +sock_tcp_downcalls sock_udp_downcalls sqset_global_list sqset_global_size @@ -292,12 +260,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -313,13 +279,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -344,4 +308,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/Makefile.sparc.shared b/usr/src/uts/sparc/Makefile.sparc.shared index 7aa463978d..873557cbd6 100644 --- a/usr/src/uts/sparc/Makefile.sparc.shared +++ b/usr/src/uts/sparc/Makefile.sparc.shared @@ -205,7 +205,7 @@ DRV_KMODS += aggr arp audio bl bofi clone cn conskbd consms cpuid DRV_KMODS += crypto cryptoadm devinfo dump DRV_KMODS += dtrace fasttrap fbt lockstat profile sdt systrace dcpc DRV_KMODS += fssnap icmp icmp6 ip ip6 ipnet ipsecah -DRV_KMODS += ipsecesp iptun iptunq iwscn keysock kmdb kstat ksyms llc1 +DRV_KMODS += ipsecesp iptun iwscn keysock kmdb kstat ksyms llc1 DRV_KMODS += lofi DRV_KMODS += log logindmux kssl mm nca physmem pm poll pool DRV_KMODS += pseudo ptc ptm pts ptsl ramdisk random rsm rts sad diff --git a/usr/src/uts/sparc/arp/Makefile b/usr/src/uts/sparc/arp/Makefile index 21c26c762e..6d1610da66 100644 --- a/usr/src/uts/sparc/arp/Makefile +++ b/usr/src/uts/sparc/arp/Makefile @@ -20,11 +20,9 @@ # # # uts/sparc/arp/Makefile -# Copyright 2008 Sun Microsystems, 
Inc. All rights reserved. +# Copyright 2009 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # -#ident "%Z%%M% %I% %E% SMI" -# # This makefile drives the production of the arp driver kernel module. # # sparc architecture dependent @@ -72,7 +70,7 @@ CFLAGS += $(CCVERBOSE) # # depends on ip # -LDFLAGS += -dy -Ndrv/ip -Ndrv/hook -Nmisc/neti +LDFLAGS += -dy -Ndrv/ip # # For now, disable these lint checks; maintainers should endeavor diff --git a/usr/src/uts/sparc/arp/arp.global-objs.debug64 b/usr/src/uts/sparc/arp/arp.global-objs.debug64 index 7f826ea213..f936276753 100644 --- a/usr/src/uts/sparc/arp/arp.global-objs.debug64 +++ b/usr/src/uts/sparc/arp/arp.global-objs.debug64 @@ -23,15 +23,6 @@ # Use is subject to license terms. # -ar_cmd_tbl -ar_m_tbl -arp_mod_info -arp_no_defense -arpinfo -arprinit -arpwinit -arp_param_arr -arp_netinfo cb_inet_devops fsw inet_dev_info diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 8df87d813d..07e9aaedde 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -23,19 +23,24 @@ # Use is subject to license terms. 
# +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,74 +103,45 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor ipcl_conn_hash_size -ipcl_debug_level ipcl_iptun_fanout_size ipcl_raw_fanout_size 
ipcl_udp_fanout_size @@ -174,24 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -ipsechw_debug -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -199,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -230,6 +192,8 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache @@ -238,6 +202,7 @@ rawip_conn_cache recvq_call recvq_loop_cnt req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -260,19 +225,17 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit sctpwinit -sendq_collision -sendq_empty -sendq_loop_cnt sin6_null sin_null skip_sctp_cksum @@ -300,12 +263,10 @@ tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -321,13 +282,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput 
tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -352,4 +311,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 3df973b8f9..526e907ab5 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -23,19 +23,24 @@ # Use is subject to license terms. # +arp_m_tbl +arp_mod_info +arp_netinfo +arp_no_defense +arpinfo cb_inet_devops cl_inet_bind +cl_inet_checkspi cl_inet_connect2 +cl_inet_deletespi cl_inet_disconnect +cl_inet_getspi +cl_inet_idlesa cl_inet_ipident cl_inet_isclusterwide cl_inet_listen cl_inet_unbind cl_inet_unlisten -cl_inet_getspi -cl_inet_checkspi -cl_inet_deletespi -cl_inet_idlesa cl_sctp_assoc_change cl_sctp_check_addrs cl_sctp_connect @@ -43,6 +48,7 @@ cl_sctp_disconnect cl_sctp_listen cl_sctp_unlisten conn_drain_nthreads +dce_cache default_ip6_asp_table do_tcp_fusion do_tcpzcopy @@ -97,69 +103,41 @@ ill_no_arena ill_null inet_dev_info inet_devops -ip6_area_template -ip6_ared_template -ip6_cache_table_size ip6_ftable_hash_size -ip6_ire_max_bucket_cnt -ip6_ire_min_bucket_cnt -ip6_max_cache_table_size ip6opt_ls -ip_ard_template -ip_area_template -ip_ared_template -ip_areq_template -ip_arma_multi_template -ip_aroff_template -ip_aron_template -ip_aru_template -ip_cache_table_size ip_cgtp_filter_rev ip_conn_cache ip_debug ip_g_all_ones -ip_helper_stream_cache ip_helper_stream_info ip_helper_stream_rinit ip_helper_stream_winit ip_ioctl_ftbl -ip_ire_cleanup_cnt -ip_ire_cpu_ratio -ip_ire_max_bucket_cnt -ip_ire_mem_ratio -ip_ire_min_bucket_cnt -ip_loopback_mtu ip_loopback_mtu_v6plus ip_loopback_mtuplus ip_m_tbl -ip_max_cache_table_size ip_max_frag_dups ip_min_frag_prune_time -ip_minor_arena_sa ip_minor_arena_la +ip_minor_arena_sa 
ip_misc_ioctl_count ip_misc_ioctl_table ip_mod_info ip_modclose_ackwait_ms ip_ndx_ioctl_count ip_ndx_ioctl_table -ip_opt_arr -ip_opt_obj ip_poll_normal_ms ip_poll_normal_ticks ip_rput_pullups ip_six_byte_all_ones ip_squeue_create_callback ip_squeue_enter -ip_squeue_enter_unbound ip_squeue_fanout ip_squeue_flag ip_squeue_worker_wait ip_thread_data ip_thread_list ip_thread_rwlock -ip_use_helper_cache -ip_wput_frag_mdt_min ipcl_bind_fanout_size ipcl_conn_hash_maxsize ipcl_conn_hash_memfactor @@ -173,23 +151,16 @@ ipinfov4 ipinfov6 iplrinit iplwinit -ipmp_aract_template -ipmp_ardeact_template ipmp_kstats iprinitv4 iprinitv6 ipsec_action_cache ipsec_hdr_pullup_needed -ipsec_info_cache ipsec_pol_cache ipsec_policy_failure_msgs ipsec_sel_cache ipsec_spd_hashsize ipsec_weird_null_inbound_policy -iptunq_info -iptunq_modinfo -iptunq_rinit -iptunq_winit ipv4_forward_suffix ipv4info ipv6_all_hosts_mcast @@ -197,29 +168,22 @@ ipv6_all_ones ipv6_all_rtrs_mcast ipv6_all_v2rtrs_mcast ipv6_all_zeros -ipv6_areq_template ipv6_forward_suffix ipv6_ll_template ipv6_loopback ipv6_solicited_node_mcast ipv6_unspecified_group ipv6info -ipwinitv4 -ipwinitv6 +ipwinit ire_cache ire_gw_secattr_cache -ire_idle_cutoff_interval ire_null ire_nv_arr ire_nv_tbl -ire_uinfo_null lcl_ndp_arr lcl_param_arr lcl_sctp_param_arr lcl_sctp_wroff_xtra_param -lcl_tcp_mdt_head_param -lcl_tcp_mdt_max_pbufs_param -lcl_tcp_mdt_tail_param lcl_tcp_param_arr lcl_tcp_wroff_xtra_param mask_rnhead @@ -228,12 +192,15 @@ modldrv modlinkage modlstrmod multicast_encap_iphdr +nce_cache +ncec_cache netdev_privs prov_update_handle radix_mask_cache radix_node_cache rawip_conn_cache req_arr +rinit_arp rn_mkfreelist rn_ones rn_zeros @@ -256,12 +223,13 @@ sctp_kmem_faddr_cache sctp_kmem_ftsn_set_cache sctp_kmem_set_cache sctp_mod_info +sctp_opt_arr +sctp_opt_arr_size sctp_recvq_tq_task_max sctp_recvq_tq_task_min sctp_recvq_tq_thr_max sctp_recvq_tq_thr_min sctp_sin6_null -sctp_taskq sctpdebug sctpinfo sctprinit @@ -292,12 +260,10 @@ 
tcp_g_statistics tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench -tcp_iphc_cache tcp_max_optsize -tcp_mdt_chain -tcp_mdt_smss_threshold tcp_opt_arr tcp_opt_obj +tcp_outbound_squeue_switch tcp_random_anon_port tcp_random_end_ptr tcp_random_fptr @@ -313,13 +279,11 @@ tcp_sock_winit tcp_squeue_flag tcp_squeue_wput tcp_static_maxpsz -tcp_taskq tcp_timercache tcp_tx_pull_len tcp_valid_levels_arr tcp_winfo tcp_winit -tcp_outbound_squeue_switch tcpinfov4 tcpinfov6 tli_errs @@ -344,4 +308,6 @@ udp_valid_levels_arr udp_winit udpinfov4 udpinfov6 -zero_info +winit_arp +eri_cksum_workaround +nxge_cksum_workaround diff --git a/usr/src/uts/sparc/ml/modstubs.s b/usr/src/uts/sparc/ml/modstubs.s index 18eba0bdfa..24058b72e4 100644 --- a/usr/src/uts/sparc/ml/modstubs.s +++ b/usr/src/uts/sparc/ml/modstubs.s @@ -397,7 +397,6 @@ stubs_base: MODULE(ipsecah,drv); WSTUB(ipsecah, ipsec_construct_inverse_acquire, nomod_zero); WSTUB(ipsecah, sadb_acquire, nomod_zero); - WSTUB(ipsecah, sadb_ill_download, nomod_zero); WSTUB(ipsecah, ipsecah_algs_changed, nomod_zero); WSTUB(ipsecah, sadb_alg_update, nomod_zero); WSTUB(ipsecah, sadb_unlinkassoc, nomod_zero); @@ -1218,8 +1217,6 @@ stubs_base: STUB(iptun, iptun_create, nomod_einval); STUB(iptun, iptun_delete, nomod_einval); STUB(iptun, iptun_set_policy, nomod_einval); - STUB(iptun, iptun_set_g_q, nomod_einval); - STUB(iptun, iptun_clear_g_q, nomod_void); END_MODULE(iptun); #endif |