diff options
author | Erik Nordmark <Erik.Nordmark@Sun.COM> | 2009-11-11 11:49:49 -0800 |
---|---|---|
committer | Erik Nordmark <Erik.Nordmark@Sun.COM> | 2009-11-11 11:49:49 -0800 |
commit | bd670b35a010421b6e1a5536c34453a827007c81 (patch) | |
tree | 97c2057b6771dd40411a12eb89d2db2e2b2cce31 /usr/src/uts/common/inet/ip/ip_rts.c | |
parent | b3388e4fc5f5c24c8a39fbe132a00b02dae5b717 (diff) | |
download | illumos-joyent-bd670b35a010421b6e1a5536c34453a827007c81.tar.gz |
PSARC/2009/331 IP Datapath Refactoring
PSARC/2008/522 EOF of 2001/070 IPsec HW Acceleration support
PSARC/2009/495 netstat -r flags for blackhole and reject routes
PSARC/2009/496 EOF of XRESOLV
PSARC/2009/494 IP_DONTFRAG socket option
PSARC/2009/515 fragmentation controls for ping and traceroute
6798716 ip_newroute delenda est
6798739 ARP and IP are too separate
6807265 IPv4 ip2mac() support
6756382 Please remove Venus IPsec HWACCEL code
6880632 sendto/sendmsg never returns EHOSTUNREACH in Solaris
6748582 sendmsg() return OK, but doesn't send message using IPv4-mapped x IPv6 addr
1119790 TCP and path mtu discovery
4637227 should support equal-cost multi-path (ECMP)
5078568 getsockopt() for IPV6_PATHMTU on a non-connected socket should not succeed
6419648 "AR* contract private note" should be removed as part of ATM SW EOL
6274715 Arp could keep the old entry in the cache while it waits for an arp response
6605615 Remove duplicated TCP/IP opt_set/opt_get code; use conn_t
6874677 IP_TTL can be used to send with ttl zero
4034090 arp should not let you delete your own entry
6882140 Implement IP_DONTFRAG socket option
6883858 Implement ping -D option; traceroute -F should work for IPv6 and shared-IP zones
1119792 TCP/IP black hole detection is broken on receiver
4078796 Directed broadcast forwarding code has problems
4104337 restrict the IPPROTO_IP and IPPROTO_IPV6 options based on the socket family
4203747 Source address selection for source routed packets
4230259 pmtu is increased every ip_ire_pathmtu_interval timer value.
4300533 When sticky option ipv6_pktinfo set to bogus address subsequent connect time out
4471035 ire_delete_cache_gw is called through ire_walk unnecessarily
4514572 SO_DONTROUTE socket option doesn't work with IPv6
4524980 tcp_lookup_ipv4() should compare the ifindex against tcpb->tcpb_bound_if
4532714 machine fails to switch quickly among failed default routes
4634219 IPv6 path mtu discovery is broken when using routing header
4691581 udp broadcast handling causes too many replicas
4708405 mcast is broken on machines when all interfaces are IFF_POINTOPOINT
4770457 netstat/route: source address of interface routes pretends to be gateway address
4786974 use routing table to determine routes/interface for multicast
4792619 An ip_fanout_udp_ipc_v6() routine might lead to some simpler code
4816115 Nuke ipsec_out_use_global_policy
4862844 ipsec offload corner case
4867533 tcp_rq and tcp_wq are redundant
4868589 NCEs should be shared across an IPMP group
4872093 unplumbing an improper virtual interface panics in ip_newroute_get_dst_ill()
4901671 FireEngine needs some cleanup
4907617 IPsec identity latching should be done before sending SYN-ACK
4941461 scopeid and IPV6_PKTINFO with UDP/ICMP connect() does not work properly
4944981 ip does nothing with IP6I_NEXTHOP
4963353 IPv4 and IPv6 proto fanout codes could be brought closer
4963360 consider passing zoneid using ip6i_t instead of ipsec_out_t in NDP
4963734 new ip6_asp locking is used incorrectly in ip_newroute_v6()
5008315 IPv6 code passes ip6i_t to IPsec code instead of ip6_t
5009636 memory leak in ip_fanout_proto_v6()
5092337 tcp/udp option handling can use some cleanup
5035841 Solaris can fail to create a valid broadcast ire
5043747 ar_query_xmit: Could not find the ace
5051574 tcp_check_policy is missing some checks
6305037 full hardware checksum is discarded when there're more than 2 mblks in the chain
6311149 ip.c needs to be put through a woodchipper
4708860 Unable to reassemble CGTP fragmented multicast packets
6224628 Large IPv6 packets with IPsec protection sometimes have length mismatch.
6213243 Solaris does not currently support Dead Gateway Detection
5029091 duplicate code in IP's input path for TCP/UDP/SCTP
4674643 through IPv6 CGTP routes, the very first packet is sent only after a while
6207318 Multiple default routes do not round robin connections to routers.
4823410 IP has an inconsistent view of link mtu
5105520 adding interface route to down interface causes ifconfig hang
5105707 advanced sockets API introduced some dead code
6318399 IP option handling for icmp and udp is too complicated
6321434 Every dropped packet in IP should use ip_drop_packet()
6341693 ifconfig mtu should operate on the physical interface, not individual ipif's
6352430 The credentials attached to an mblk are not particularly useful
6357894 uninitialised ipp_hoplimit needs to be cleaned up.
6363568 ip_xmit_v6() may be missing IRE releases in error cases
6364828 ip_rput_forward needs a makeover
6384416 System panics when running as multicast forwarder using multicast tunnels
6402382 TX: UDP v6 slowpath is not modified to handle mac_exempt conns
6418413 assertion failed ipha->ipha_ident == 0||ipha->ipha_ident == 0xFFFF
6420916 assertion failures in ipv6 wput path
6430851 use of b_prev to store ifindex is not 100% safe
6446106 IPv6 packets stored in nce->nce_qd_mp will be sent with incorrect tcp/udp checksums
6453711 SCTP OOTB sent as if generated by global zone
6465212 ARP/IP merge should remove ire_freemblk.esballoc
6490163 ip_input() could misbehave if the first mblk's size is not big enough
6496664 missing ipif_refrele leads to reference leak and deferred crash in ip_wput_ipsec_out_v6
6504856 memory leak in ip_fanout_proto_v6() when using link local outer tunnel addresses
6507765 IRE cache hash function performs badly
6510186 IP_FORWARD_PROG bit is easily overlooked
6514727 cgtp ipv6 failure on snv54
6528286 MULTIRT (CGTP) should offload checksum to hardware
6533904 SCTP: doesn't support traffic class for IPv6
6539415 TX: ipif source selection is flawed for unlabeled gateways
6539851 plumbed unworking nic blocks sending broadcast packets
6564468 non-solaris SCTP stack over rawip socket: netstat command counts rawipInData not rawipOutDatagrams
6568511 ipIfStatsOutDiscards not bumped when discarding an ipsec packet on the wrong NIC
6584162 tcp_g_q_inactive() makes incorrect use of taskq_dispatch()
6603974 round-robin default with many interfaces causes infinite temporary IRE thrashing
6611750 ilm_lookup_ill_index_v4 was born an orphan
6618423 ip_wput_frag_mdt sends out packets that void pfhooks
6620964 IRE max bucket count calculations performed in ip_ire_init() are flawed
6626266 various _broadcasts seem redundant
6638182 IP_PKTINFO + SO_DONTROUTE + CIPSO IP option == panic
6647710 IPv6 possible DoS vulnerability
6657357 nce should be kmem_cache alloc'ed from an nce_cache.
6685131 ilg_add -> conn_ilg_alloc interacting with conn_ilg[] walkers can cause panic.
6730298 adding 0.0.0.0 key with mask != 0 causes 'route delete default' to fail
6730976 vni and ipv6 doesn't quite work.
6740956 assertion failed: mp->b_next == 0L && mp->b_prev == 0L in nce_queue_mp_common()
6748515 BUMP_MIB() is occasionally done on the wrong ill
6753250 ip_output_v6() `notv6' error path has an errant ill_refrele()
6756411 NULL-pointer dereference in ip_wput_local()
6769582 IP must forward packet returned from FW-HOOK
6781525 bogus usesrc usage leads directly to panic
6422839 System panicked in ip_multicast_loopback due to NULL pointer dereference
6785521 initial IPv6 DAD solicitation is dropped in ip_newroute_ipif_v6()
6787370 ipnet devices not seeing forwarded IP packets on outgoing interface
6791187 ip*dbg() calls in ip_output_options() claim to originate from ip_wput()
6794047 nce_fp_mp prevents sharing of NCEs across an IPMP group
6797926 many unnecessary ip0dbg() in ip_rput_data_v6
6846919 Packet queued for ND gets sent in the clear.
6856591 ping doesn't send packets with DF set
6861113 arp module has incorrect dependency path for hook module
6865664 IPV6_NEXTHOP does not work with TCP socket
6874681 No ICMP time exceeded when a router receives packet with ttl = 0
6880977 ip_wput_ire() uses over 1k of stack
6595433 IPsec performance could be significantly better when calling hw crypto provider synchronously
6848397 ifconfig down of an interface can hang.
6849602 IPV6_PATHMTU size issue for UDP
6885359 Add compile-time option for testing pure IPsec overhead
6889268 Odd loopback source address selection with IPMP
6895420 assertion failed: connp->conn_helper_info == NULL
6851189 Routing-related panic occurred during reboot on T2000 system running snv_117
6896174 Post-async-encryption, AH+ESP packets may have misinitialized ipha/ip6
6896687 iptun presents IPv6 with an MTU < 1280
6897006 assertion failed: ipif->ipif_id != 0 in ip_sioctl_slifzone_restart
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_rts.c')
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_rts.c | 923 |
1 files changed, 520 insertions, 403 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_rts.c b/usr/src/uts/common/inet/ip/ip_rts.c index 70c8bd2ea1..228c7581a3 100644 --- a/usr/src/uts/common/inet/ip/ip_rts.c +++ b/usr/src/uts/common/inet/ip/ip_rts.c @@ -81,24 +81,33 @@ static size_t rts_copyfromsockaddr(struct sockaddr *sa, in6_addr_t *addrp); static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, - ipaddr_t author, const ipif_t *ipif, mblk_t *mp, uint_t, const tsol_gc_t *); + ipaddr_t author, ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *); static int rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, in6_addr_t *net_maskp, in6_addr_t *authorp, in6_addr_t *if_addrp, in6_addr_t *src_addrp, ushort_t *indexp, sa_family_t *afp, tsol_rtsecattr_t *rtsecattr, int *error); static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif); static int rts_getmetrics(ire_t *ire, rt_metrics_t *metrics); -static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, - sa_family_t af); +static mblk_t *rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, + const in6_addr_t *setsrc, tsol_ire_gw_secattr_t *attrp, sa_family_t af); static void rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics); -static void ip_rts_request_retry(ipsq_t *, queue_t *q, mblk_t *mp, void *); +static ire_t *ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, + ipaddr_t gw_addr, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags, ip_stack_t *ipst, ire_t **pifire, + ipaddr_t *v4setsrcp, tsol_ire_gw_secattr_t **gwattrp); +static ire_t *ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp); /* * Send `mp' to all eligible routing queues. A queue is ineligible if: * * 1. 
SO_USELOOPBACK is off and it is not the originating queue. - * 2. RTAW_UNDER_IPMP is on and RTSQ_UNDER_IPMP is clear in `flags'. - * 3. RTAW_UNDER_IPMP is off and RTSQ_NORMAL is clear in `flags'. + * 2. RTA_UNDER_IPMP is on and RTSQ_UNDER_IPMP is not set in `flags'. + * 3. RTA_UNDER_IPMP is off and RTSQ_NORMAL is not set in `flags'. * 4. It is not the same address family as `af', and `af' isn't AF_UNSPEC. */ void @@ -110,7 +119,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, /* * Since we don't have an ill_t here, RTSQ_DEFAULT must already be - * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP by now. + * resolved to one or more of RTSQ_NORMAL|RTSQ_UNDER_IPMP at this point. */ ASSERT(!(flags & RTSQ_DEFAULT)); @@ -119,7 +128,6 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, for (; connp != NULL; connp = next_connp) { next_connp = connp->conn_next; - /* * If there was a family specified when this routing socket was * created and it doesn't match the family of the message to @@ -139,28 +147,27 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, if (!(flags & RTSQ_NORMAL)) continue; } - /* * For the originating queue, we only copy the message upstream * if loopback is set. For others reading on the routing * socket, we check if there is room upstream for a copy of the * message. */ - if ((o_connp == connp) && connp->conn_loopback == 0) { + if ((o_connp == connp) && connp->conn_useloopback == 0) { connp = connp->conn_next; continue; } CONN_INC_REF(connp); mutex_exit(&ipst->ips_rts_clients->connf_lock); /* Pass to rts_input */ - if ((IPCL_IS_NONSTR(connp) && !PROTO_FLOW_CNTRLD(connp))|| - (!IPCL_IS_NONSTR(connp) && - canputnext(CONNP_TO_RQ(connp)))) { + if (IPCL_IS_NONSTR(connp) ? 
!connp->conn_flow_cntrld : + canputnext(connp->conn_rq)) { mp1 = dupmsg(mp); if (mp1 == NULL) mp1 = copymsg(mp); + /* Note that we pass a NULL ira to rts_input */ if (mp1 != NULL) - (connp->conn_recv)(connp, mp1, NULL); + (connp->conn_recv)(connp, mp1, NULL, NULL); } mutex_enter(&ipst->ips_rts_clients->connf_lock); @@ -176,7 +183,7 @@ rts_queue_input(mblk_t *mp, conn_t *o_connp, sa_family_t af, uint_t flags, * Takes an ire and sends an ack to all the routing sockets. This * routine is used * - when a route is created/deleted through the ioctl interface. - * - when ire_expire deletes a stale redirect + * - when a stale redirect is deleted */ void ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) @@ -192,6 +199,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) ASSERT(ire->ire_ipversion == IPV4_VERSION || ire->ire_ipversion == IPV6_VERSION); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + if (ire->ire_flags & RTF_SETSRC) rtm_addrs |= RTA_SRC; @@ -202,8 +211,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, ire->ire_addr, ire->ire_mask, - ire->ire_gateway_addr, ire->ire_src_addr, 0, 0, NULL, mp, - 0, NULL); + ire->ire_gateway_addr, ire->ire_setsrc_addr, 0, 0, 0, NULL, + mp, NULL); break; case IPV6_VERSION: af = AF_INET6; @@ -215,8 +224,8 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) mutex_exit(&ire->ire_lock); rts_fill_msg_v6(type, rtm_addrs, &ire->ire_addr_v6, &ire->ire_mask_v6, &gw_addr_v6, - &ire->ire_src_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros, - NULL, mp, 0, NULL); + &ire->ire_setsrc_addr_v6, &ipv6_all_zeros, &ipv6_all_zeros, + &ipv6_all_zeros, NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; @@ -230,13 +239,6 @@ ip_rts_rtmsg(int type, ire_t *ire, int error, ip_stack_t *ipst) rts_queue_input(mp, NULL, af, RTSQ_ALL, ipst); } -/* ARGSUSED */ -static void -ip_rts_request_retry(ipsq_t *dummy_sq, queue_t *q, mblk_t *mp, void *dummy) -{ 
- (void) ip_rts_request(q, mp, msg_getcred(mp, NULL)); -} - /* * This is a call from the RTS module * indicating that this is a Routing Socket @@ -248,7 +250,7 @@ ip_rts_register(conn_t *connp) { ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - connp->conn_loopback = 1; + connp->conn_useloopback = 1; ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); } @@ -269,18 +271,9 @@ ip_rts_unregister(conn_t *connp) * * In general, this function does not consume the message supplied but rather * sends the message upstream with an appropriate UNIX errno. - * - * We may need to restart this operation if the ipif cannot be looked up - * due to an exclusive operation that is currently in progress. The restart - * entry point is ip_rts_request_retry. While the request is enqueud in the - * ipsq the ioctl could be aborted and the conn close. To ensure that we don't - * have stale conn pointers, ip_wput_ioctl does a conn refhold. This is - * released at the completion of the rts ioctl at the end of this function - * by calling CONN_OPER_PENDING_DONE or when the ioctl is aborted and - * conn close occurs in conn_ioctl_cleanup. 
*/ int -ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) +ip_rts_request_common(mblk_t *mp, conn_t *connp, cred_t *ioc_cr) { rt_msghdr_t *rtm = NULL; in6_addr_t dst_addr_v6; @@ -289,9 +282,12 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) in6_addr_t net_mask_v6; in6_addr_t author_v6; in6_addr_t if_addr_v6; - mblk_t *mp1, *ioc_mp = mp; + mblk_t *mp1; ire_t *ire = NULL; - ire_t *sire = NULL; + ire_t *ifire = NULL; + ipaddr_t v4setsrc; + in6_addr_t v6setsrc = ipv6_all_zeros; + tsol_ire_gw_secattr_t *gwattr = NULL; int error = 0; int match_flags = MATCH_IRE_DSTONLY; int match_flags_local = MATCH_IRE_TYPE | MATCH_IRE_GW; @@ -302,9 +298,6 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ipaddr_t src_addr; ipaddr_t net_mask; ushort_t index; - ipif_t *ipif = NULL; - ipif_t *tmp_ipif = NULL; - IOCP iocp = (IOCP)mp->b_rptr; boolean_t gcgrp_xtraref = B_FALSE; tsol_gcgrp_addr_t ga; tsol_rtsecattr_t rtsecattr; @@ -314,42 +307,11 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) ts_label_t *tsl = NULL; zoneid_t zoneid; ip_stack_t *ipst; - - ip1dbg(("ip_rts_request: mp is %x\n", DB_TYPE(mp))); + ill_t *ill = NULL; zoneid = connp->conn_zoneid; ipst = connp->conn_netstack->netstack_ip; - ASSERT(mp->b_cont != NULL); - /* ioc_mp holds mp */ - mp = mp->b_cont; - - /* - * The Routing Socket data starts on - * next block. If there is no next block - * this is an indication from routing module - * that it is a routing socket stream queue. - * We need to support that for compatibility with SDP since - * it has a contract private interface to use IP_IOC_RTS_REQUEST. - */ - if (mp->b_cont == NULL) { - /* - * This is a message from SDP - * indicating that this is a Routing Socket - * Stream. Insert this conn_t in routing - * socket client list. 
- */ - connp->conn_loopback = 1; - ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); - goto done; - } - mp1 = dupmsg(mp->b_cont); - if (mp1 == NULL) { - error = ENOBUFS; - goto done; - } - mp = mp1; - if (mp->b_cont != NULL && !pullupmsg(mp, -1)) { freemsg(mp); error = EINVAL; @@ -446,20 +408,13 @@ ip_rts_request_common(queue_t *q, mblk_t *mp, conn_t *connp, cred_t *ioc_cr) */ ASSERT(af == AF_INET || af == AF_INET6); + /* Handle RTA_IFP */ if (index != 0) { - ill_t *ill; + ipif_t *ipif; lookup: - /* - * IPC must be refheld somewhere in ip_wput_nondata or - * ip_wput_ioctl etc... and cleaned up if ioctl is killed. - * If ILL_CHANGING the request is queued in the ipsq. - */ - ill = ill_lookup_on_ifindex(index, af == AF_INET6, - CONNP_TO_WQ(connp), ioc_mp, ip_rts_request_retry, &error, - ipst); + ill = ill_lookup_on_ifindex(index, af == AF_INET6, ipst); if (ill == NULL) { - if (error != EINPROGRESS) - error = EINVAL; + error = EINVAL; goto done; } @@ -474,13 +429,13 @@ lookup: switch (rtm->rtm_type) { case RTM_CHANGE: case RTM_DELETE: - ill_refrele(ill); error = EINVAL; goto done; case RTM_ADD: index = ipmp_ill_get_ipmp_ifindex(ill); ill_refrele(ill); if (index == 0) { + ill = NULL; /* already refrele'd */ error = EINVAL; goto done; } @@ -488,9 +443,18 @@ lookup: } } - ipif = ipif_get_next_ipif(NULL, ill); - ill_refrele(ill); match_flags |= MATCH_IRE_ILL; + /* + * This provides the same zoneid as in Solaris 10 + * that -ifp picks the zoneid from the first ipif on the ill. + * But it might not be useful since the first ipif will always + * have the same zoneid as the ill. 
+ */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + zoneid = ipif->ipif_zoneid; + ipif_refrele(ipif); + } } /* @@ -545,6 +509,8 @@ lookup: switch (af) { case AF_INET: if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -556,20 +522,11 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow IFF_UP ones */ - tmp_ipif = ipif_lookup_addr(src_addr, NULL, - ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } } else { @@ -584,14 +541,15 @@ lookup: } error = ip_rt_add(dst_addr, net_mask, gw_addr, src_addr, - rtm->rtm_flags, ipif, &ire, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, - rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + rtm->rtm_flags, ill, &ire, B_FALSE, + rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; case AF_INET6: if (!IN6_IS_ADDR_UNSPECIFIED(&src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is present, check that * the supplied src address is not the loopback @@ -603,28 +561,17 @@ lookup: } /* * Also check that the supplied address is a - * valid, local one. + * valid, local one. Only allow UP ones. 
*/ - tmp_ipif = ipif_lookup_addr_v6(&src_addr_v6, - NULL, ALL_ZONES, CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, &error, ipst); - if (tmp_ipif == NULL) { - if (error != EINPROGRESS) - error = EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v6(&src_addr_v6, ipst); + if (!(type & (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, &src_addr_v6, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); + ill, &ire, rtsap, ipst, zoneid); break; } /* @@ -637,10 +584,9 @@ lookup: } error = ip_rt_add_v6(&dst_addr_v6, &net_mask_v6, &gw_addr_v6, NULL, rtm->rtm_flags, - ipif, &ire, WR(q), ioc_mp, - ip_rts_request_retry, rtsap, ipst); - if (ipif != NULL) - ASSERT(!MUTEX_HELD(&ipif->ipif_ill->ill_lock)); + ill, &ire, rtsap, ipst, zoneid); + if (ill != NULL) + ASSERT(!MUTEX_HELD(&ill->ill_lock)); break; } if (error != 0) @@ -666,13 +612,13 @@ lookup: switch (af) { case AF_INET: error = ip_rt_delete(dst_addr, net_mask, gw_addr, - found_addrs, rtm->rtm_flags, ipif, B_FALSE, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + found_addrs, rtm->rtm_flags, ill, B_FALSE, + ipst, zoneid); break; case AF_INET6: error = ip_rt_delete_v6(&dst_addr_v6, &net_mask_v6, - &gw_addr_v6, found_addrs, rtm->rtm_flags, ipif, - WR(q), ioc_mp, ip_rts_request_retry, ipst); + &gw_addr_v6, found_addrs, rtm->rtm_flags, ill, + ipst, zoneid); break; } break; @@ -680,8 +626,7 @@ lookup: case RTM_CHANGE: /* * In the case of RTM_GET, the forwarding table should be - * searched recursively with default being matched if the - * specific route doesn't exist. Also, if a gateway was + * searched recursively. Also, if a gateway was * specified then the gateway address must also be matched. 
* * In the case of RTM_CHANGE, the gateway address (if supplied) @@ -706,9 +651,7 @@ lookup: } if (rtm->rtm_type == RTM_GET) { - match_flags |= - (MATCH_IRE_DEFAULT | MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR); + match_flags |= MATCH_IRE_SECATTR; match_flags_local |= MATCH_IRE_SECATTR; if ((found_addrs & RTA_GATEWAY) != 0) match_flags |= MATCH_IRE_GW; @@ -749,57 +692,34 @@ lookup: * IRE_LOCAL entry. * * If we didn't check for or find an IRE_LOOPBACK or IRE_LOCAL - * entry, then look in the forwarding table. + * entry, then look for any other type of IRE. */ switch (af) { case AF_INET: if (net_mask == IP_HOST_MASK) { - ire = ire_ctable_lookup(dst_addr, gw_addr, + ire = ire_ftable_lookup_v4(dst_addr, 0, gw_addr, IRE_LOCAL | IRE_LOOPBACK, NULL, zoneid, - tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. - */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, &dst_addr, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + tsl, match_flags_local, 0, ipst, NULL); } if (ire == NULL) { - ire = ire_ftable_lookup(dst_addr, net_mask, - gw_addr, 0, ipif, &sire, zoneid, 0, - tsl, match_flags, ipst); + ire = ire_lookup_v4(dst_addr, net_mask, + gw_addr, ill, zoneid, tsl, match_flags, + ipst, &ifire, &v4setsrc, &gwattr); + IN6_IPADDR_TO_V4MAPPED(v4setsrc, &v6setsrc); } break; case AF_INET6: if (IN6_ARE_ADDR_EQUAL(&net_mask_v6, &ipv6_all_ones)) { - ire = ire_ctable_lookup_v6(&dst_addr_v6, + ire = ire_ftable_lookup_v6(&dst_addr_v6, NULL, &gw_addr_v6, IRE_LOCAL | IRE_LOOPBACK, NULL, - zoneid, tsl, match_flags_local, ipst); - /* - * If we found an IRE_LOCAL, make sure - * it is one that would be used by this - * zone to send packets. 
- */ - if (ire != NULL && - ire->ire_type == IRE_LOCAL && - ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, - zoneid, (void *)&dst_addr_v6, tsl, ipst)) { - ire_refrele(ire); - ire = NULL; - } + zoneid, tsl, match_flags_local, 0, ipst, + NULL); } if (ire == NULL) { - ire = ire_ftable_lookup_v6(&dst_addr_v6, - &net_mask_v6, &gw_addr_v6, 0, ipif, &sire, - zoneid, 0, tsl, match_flags, ipst); + ire = ire_lookup_v6(&dst_addr_v6, + &net_mask_v6, &gw_addr_v6, ill, zoneid, + tsl, match_flags, ipst, &ifire, &v6setsrc, + &gwattr); } break; } @@ -810,10 +730,21 @@ lookup: error = ESRCH; goto done; } + /* + * Want to return failure if we get an IRE_NOROUTE from + * ire_route_recursive + */ + if (ire->ire_type & IRE_NOROUTE) { + ire_refrele(ire); + ire = NULL; + error = ESRCH; + goto done; + } + /* we know the IRE before we come here */ switch (rtm->rtm_type) { case RTM_GET: - mp1 = rts_rtmget(mp, ire, sire, af); + mp1 = rts_rtmget(mp, ire, ifire, &v6setsrc, gwattr, af); if (mp1 == NULL) { error = ENOBUFS; goto done; @@ -843,7 +774,6 @@ lookup: */ switch (af) { case AF_INET: - ire_flush_cache_v4(ire, IRE_FLUSH_DELETE); if ((found_addrs & RTA_GATEWAY) != 0 && (ire->ire_gateway_addr != gw_addr)) { ire->ire_gateway_addr = gw_addr; @@ -863,9 +793,10 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && - (ire->ire_src_addr != src_addr)) { - + (ire->ire_setsrc_addr != src_addr)) { if (src_addr != INADDR_ANY) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -880,50 +811,47 @@ lookup: goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr( - src_addr, NULL, ALL_ZONES, - WR(q), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - error = EINVAL; + type = ip_type_v4(src_addr, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr = + src_addr; } else { ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr = + INADDR_ANY; } - ire->ire_src_addr = src_addr; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } + ire_flush_cache_v4(ire, IRE_FLUSH_GWCHANGE); break; case AF_INET6: - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); mutex_enter(&ire->ire_lock); if ((found_addrs & RTA_GATEWAY) != 0 && !IN6_ARE_ADDR_EQUAL( &ire->ire_gateway_addr_v6, &gw_addr_v6)) { ire->ire_gateway_addr_v6 = gw_addr_v6; } + mutex_exit(&ire->ire_lock); if (rtsap != NULL) { ga.ga_af = AF_INET6; + mutex_enter(&ire->ire_lock); ga.ga_addr = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); gcgrp = gcgrp_lookup(&ga, B_TRUE); if (gcgrp == NULL) { @@ -935,10 +863,11 @@ lookup: if ((found_addrs & RTA_SRC) != 0 && (rtm->rtm_flags & RTF_SETSRC) != 0 && !IN6_ARE_ADDR_EQUAL( - &ire->ire_src_addr_v6, &src_addr_v6)) { - + &ire->ire_setsrc_addr_v6, &src_addr_v6)) { if (!IN6_IS_ADDR_UNSPECIFIED( &src_addr_v6)) { + uint_t type; + /* * The RTF_SETSRC flag is * present, check that the @@ -949,54 +878,44 @@ lookup: */ if (IN6_IS_ADDR_LOOPBACK( &src_addr_v6)) { - mutex_exit( - &ire->ire_lock); error = EINVAL; goto done; } /* - * Also check that the the + * Also check that the * supplied addr is a valid * local address. */ - tmp_ipif = ipif_lookup_addr_v6( - &src_addr_v6, NULL, - ALL_ZONES, - CONNP_TO_WQ(connp), ioc_mp, - ip_rts_request_retry, - &error, ipst); - if (tmp_ipif == NULL) { - mutex_exit( - &ire->ire_lock); - error = (error == - EINPROGRESS) ? 
- error : - EADDRNOTAVAIL; - goto done; - } - if (!(tmp_ipif->ipif_flags & - IPIF_UP) || - (tmp_ipif->ipif_flags & - (IPIF_NOLOCAL | - IPIF_ANYCAST))) { - mutex_exit( - &ire->ire_lock); - error = EINVAL; + type = ip_type_v6(&src_addr_v6, + ipst); + if (!(type & + (IRE_LOCAL|IRE_LOOPBACK))) { + error = EADDRNOTAVAIL; goto done; } + mutex_enter(&ire->ire_lock); ire->ire_flags |= RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + src_addr_v6; + mutex_exit(&ire->ire_lock); } else { + mutex_enter(&ire->ire_lock); ire->ire_flags &= ~RTF_SETSRC; + ire->ire_setsrc_addr_v6 = + ipv6_all_zeros; + mutex_exit(&ire->ire_lock); } - ire->ire_src_addr_v6 = src_addr_v6; + /* + * Let conn_ixa caching know that + * source address selection changed + */ + ip_update_source_selection(ipst); } - mutex_exit(&ire->ire_lock); + ire_flush_cache_v6(ire, IRE_FLUSH_GWCHANGE); break; } if (rtsap != NULL) { - in_addr_t ga_addr4; - ASSERT(gcgrp != NULL); /* @@ -1010,7 +929,7 @@ lookup: gc = gc_create(rtsap, gcgrp, &gcgrp_xtraref); if (gc == NULL || (error = tsol_ire_init_gwattr(ire, - ire->ire_ipversion, gc, NULL)) != 0) { + ire->ire_ipversion, gc)) != 0) { if (gc != NULL) { GC_REFRELE(gc); } else { @@ -1019,21 +938,6 @@ lookup: } goto done; } - - /* - * Now delete any existing gateway IRE caches - * as well as all caches using the gateway, - * and allow them to be created on demand - * through ip_newroute{_v6}. 
- */ - IN6_V4MAPPED_TO_IPADDR(&ga.ga_addr, ga_addr4); - if (af == AF_INET) { - ire_clookup_delete_cache_gw( - ga_addr4, ALL_ZONES, ipst); - } else { - ire_clookup_delete_cache_gw_v6( - &ga.ga_addr, ALL_ZONES, ipst); - } } rts_setmetrics(ire, rtm->rtm_inits, &rtm->rtm_rmx); break; @@ -1046,21 +950,14 @@ lookup: done: if (ire != NULL) ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - if (ipif != NULL) - ipif_refrele(ipif); - if (tmp_ipif != NULL) - ipif_refrele(tmp_ipif); + if (ifire != NULL) + ire_refrele(ifire); + if (ill != NULL) + ill_refrele(ill); if (gcgrp_xtraref) GCGRP_REFRELE(gcgrp); - if (error == EINPROGRESS) { - if (rtm != NULL) - freemsg(mp); - return (error); - } if (rtm != NULL) { ASSERT(mp->b_wptr <= mp->b_datap->db_lim); if (error != 0) { @@ -1074,12 +971,190 @@ done: } rts_queue_input(mp, connp, af, RTSQ_ALL, ipst); } + return (error); +} + +/* + * Helper function that can do recursive lookups including when + * MATCH_IRE_GW and/or MATCH_IRE_MASK is set. + */ +static ire_t * +ire_lookup_v4(ipaddr_t dst_addr, ipaddr_t net_mask, ipaddr_t gw_addr, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, + int match_flags, ip_stack_t *ipst, ire_t **pifire, ipaddr_t *v4setsrcp, + tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v4setsrcp = INADDR_ANY; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + ire = ire_ftable_lookup_v4(dst_addr, net_mask, gw_addr, + ire_type, ill, zoneid, tsl, match_flags, 0, ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(ire->ire_setsrc_addr 
!= INADDR_ANY); + *v4setsrcp = ire->ire_setsrc_addr; + v4setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = NULL; + } + + /* Look for an interface ire recursively based on the gateway */ + dst_addr = ire->ire_gateway_addr; + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } else { + ire = ire_route_recursive_v4(dst_addr, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v4setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + +static ire_t * +ire_lookup_v6(const in6_addr_t *dst_addr_v6, + const in6_addr_t *net_mask_v6, const in6_addr_t *gw_addr_v6, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags, + ip_stack_t *ipst, ire_t **pifire, + in6_addr_t *v6setsrcp, tsol_ire_gw_secattr_t **gwattrp) +{ + ire_t *ire; + ire_t *ifire = NULL; + uint_t ire_type; + + *pifire = NULL; + *v6setsrcp = ipv6_all_zeros; + *gwattrp = NULL; + + /* Skip IRE_IF_CLONE */ + match_flags |= MATCH_IRE_TYPE; + ire_type = (IRE_ONLINK|IRE_OFFLINK) & ~IRE_IF_CLONE; + + /* + * ire_route_recursive can't match gateway or mask thus if they are + * set we have to do two steps of lookups + */ + if (match_flags & (MATCH_IRE_GW|MATCH_IRE_MASK)) { + in6_addr_t dst; + + ire = ire_ftable_lookup_v6(dst_addr_v6, net_mask_v6, + gw_addr_v6, ire_type, ill, zoneid, tsl, match_flags, 0, + ipst, NULL); + + if (ire == NULL ||(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) + return (ire); + + if (ire->ire_type & IRE_ONLINK) + return (ire); + + if (ire->ire_flags & RTF_SETSRC) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *v6setsrcp = ire->ire_setsrc_addr_v6; + v6setsrcp = NULL; + } + + /* The first ire_gw_secattr is passed back */ + if (ire->ire_gw_secattr != NULL) { + *gwattrp = ire->ire_gw_secattr; + gwattrp = 
NULL; + } + + mutex_enter(&ire->ire_lock); + dst = ire->ire_gateway_addr_v6; + mutex_exit(&ire->ire_lock); + match_flags &= ~(MATCH_IRE_GW|MATCH_IRE_MASK); + ifire = ire_route_recursive_v6(&dst, ire_type, ill, zoneid, tsl, + match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, NULL); + } else { + ire = ire_route_recursive_v6(dst_addr_v6, ire_type, ill, zoneid, + tsl, match_flags, B_FALSE, 0, ipst, v6setsrcp, gwattrp, + NULL); + } + *pifire = ifire; + return (ire); +} + + +/* + * Handle IP_IOC_RTS_REQUEST ioctls + */ +int +ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) +{ + conn_t *connp = Q_TO_CONN(q); + IOCP iocp = (IOCP)mp->b_rptr; + mblk_t *mp1, *ioc_mp = mp; + int error = 0; + ip_stack_t *ipst; + ipst = connp->conn_netstack->netstack_ip; + + ASSERT(mp->b_cont != NULL); + /* ioc_mp holds mp */ + mp = mp->b_cont; + + /* + * The Routing Socket data starts on + * next block. If there is no next block + * this is an indication from routing module + * that it is a routing socket stream queue. + * We need to support that for compatibility with SDP since + * it has a contract private interface to use IP_IOC_RTS_REQUEST. + * Note: SDP no longer uses IP_IOC_RTS_REQUEST - we can remove this. + */ + if (mp->b_cont == NULL) { + /* + * This is a message from SDP + * indicating that this is a Routing Socket + * Stream. Insert this conn_t in routing + * socket client list. + */ + connp->conn_useloopback = 1; + ipcl_hash_insert_wildcard(ipst->ips_rts_clients, connp); + goto done; + } + mp1 = dupmsg(mp->b_cont); + if (mp1 == NULL) { + error = ENOBUFS; + goto done; + } + mp = mp1; + + error = ip_rts_request_common(mp, connp, ioc_cr); +done: iocp->ioc_error = error; ioc_mp->b_datap->db_type = M_IOCACK; if (iocp->ioc_error != 0) iocp->ioc_count = 0; - (connp->conn_recv)(connp, ioc_mp, NULL); + /* Note that we pass a NULL ira to rts_input */ + (connp->conn_recv)(connp, ioc_mp, NULL, NULL); /* conn was refheld in ip_wput_ioctl. 
*/ CONN_OPER_PENDING_DONE(connp); @@ -1087,12 +1162,6 @@ done: return (error); } -int -ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) -{ - return (ip_rts_request_common(q, mp, Q_TO_CONN(q), ioc_cr)); -} - /* * Build a reply to the RTM_GET request contained in the given message block * using the retrieved IRE of the destination address, the parent IRE (if it @@ -1102,26 +1171,34 @@ ip_rts_request(queue_t *q, mblk_t *mp, cred_t *ioc_cr) * otherwise NULL is returned. */ static mblk_t * -rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) +rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *ifire, const in6_addr_t *setsrc, + tsol_ire_gw_secattr_t *attrp, sa_family_t af) { rt_msghdr_t *rtm; rt_msghdr_t *new_rtm; mblk_t *new_mp; int rtm_addrs; int rtm_flags; - in6_addr_t gw_addr_v6; - tsol_ire_gw_secattr_t *attrp = NULL; tsol_gc_t *gc = NULL; tsol_gcgrp_t *gcgrp = NULL; - int sacnt = 0; + ill_t *ill; + ipif_t *ipif = NULL; + ipaddr_t brdaddr; /* IFF_POINTOPOINT destination */ + ipaddr_t ifaddr; + in6_addr_t brdaddr6; /* IFF_POINTOPOINT destination */ + in6_addr_t ifaddr6; + ipaddr_t v4setsrc; - ASSERT(ire->ire_ipif != NULL); rtm = (rt_msghdr_t *)mp->b_rptr; - if (sire != NULL && sire->ire_gw_secattr != NULL) - attrp = sire->ire_gw_secattr; - else if (ire->ire_gw_secattr != NULL) - attrp = ire->ire_gw_secattr; + /* + * Find the ill used to send packets. This will be NULL in case + * of a reject or blackhole. 
+ */ + if (ifire != NULL) + ill = ire_nexthop_ill(ifire); + else + ill = ire_nexthop_ill(ire); if (attrp != NULL) { mutex_enter(&attrp->igsa_lock); @@ -1129,29 +1206,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) gcgrp = gc->gc_grp; ASSERT(gcgrp != NULL); rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - sacnt = 1; - } else if ((gcgrp = attrp->igsa_gcgrp) != NULL) { - rw_enter(&gcgrp->gcgrp_rwlock, RW_READER); - gc = gcgrp->gcgrp_head; - sacnt = gcgrp->gcgrp_count; } mutex_exit(&attrp->igsa_lock); - - /* do nothing if there's no gc to report */ - if (gc == NULL) { - ASSERT(sacnt == 0); - if (gcgrp != NULL) { - /* we might as well drop the lock now */ - rw_exit(&gcgrp->gcgrp_rwlock); - gcgrp = NULL; - } - attrp = NULL; - } - - ASSERT(gc == NULL || (gcgrp != NULL && - RW_LOCK_HELD(&gcgrp->gcgrp_rwlock))); } - ASSERT(sacnt == 0 || gc != NULL); /* * Always return RTA_DST, RTA_GATEWAY and RTA_NETMASK. @@ -1162,16 +1219,36 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) * point-to-point. */ rtm_addrs = (RTA_DST | RTA_GATEWAY | RTA_NETMASK); - if (rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) { + if ((rtm->rtm_addrs & (RTA_IFP | RTA_IFA)) && ill != NULL) { rtm_addrs |= (RTA_IFP | RTA_IFA); - if (ire->ire_ipif->ipif_flags & IPIF_POINTOPOINT) - rtm_addrs |= RTA_BRD; + /* + * We associate an IRE with an ILL, hence we don't exactly + * know what might make sense for RTA_IFA and RTA_BRD. We + * pick the first ipif on the ill. + */ + ipif = ipif_get_next_ipif(NULL, ill); + if (ipif != NULL) { + if (ipif->ipif_isv6) + ifaddr6 = ipif->ipif_v6lcl_addr; + else + ifaddr = ipif->ipif_lcl_addr; + if (ipif->ipif_flags & IPIF_POINTOPOINT) { + rtm_addrs |= RTA_BRD; + if (ipif->ipif_isv6) + brdaddr6 = ipif->ipif_v6pp_dst_addr; + else + brdaddr = ipif->ipif_pp_dst_addr; + } + ipif_refrele(ipif); + } } - new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, sacnt); + new_mp = rts_alloc_msg(RTM_GET, rtm_addrs, af, gc != NULL ? 
1 : 0); if (new_mp == NULL) { if (gcgrp != NULL) rw_exit(&gcgrp->gcgrp_rwlock); + if (ill != NULL) + ill_refrele(ill); return (NULL); } @@ -1187,49 +1264,24 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) ASSERT(af == AF_INET || af == AF_INET6); switch (af) { case AF_INET: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, - ire->ire_mask, ire->ire_src_addr, ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, 0, ire->ire_ipif, - new_mp, sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - rts_fill_msg(RTM_GET, rtm_addrs, sire->ire_addr, - sire->ire_mask, sire->ire_gateway_addr, - (sire->ire_flags & RTF_SETSRC) ? - sire->ire_src_addr : ire->ire_src_addr, - ire->ire_ipif->ipif_pp_dst_addr, - 0, ire->ire_ipif, new_mp, sacnt, gc); - } + IN6_V4MAPPED_TO_IPADDR(setsrc, v4setsrc); + if (v4setsrc != INADDR_ANY) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg(RTM_GET, rtm_addrs, ire->ire_addr, + ire->ire_mask, ire->ire_gateway_addr, v4setsrc, + brdaddr, 0, ifaddr, ill, new_mp, gc); break; case AF_INET6: - if (sire == NULL) { - rtm_flags = ire->ire_flags; - rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, - &ire->ire_mask_v6, &ire->ire_src_addr_v6, - &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, - &ipv6_all_zeros, ire->ire_ipif, new_mp, - sacnt, gc); - } else { - if (sire->ire_flags & RTF_SETSRC) - rtm_addrs |= RTA_SRC; - - rtm_flags = sire->ire_flags; - mutex_enter(&sire->ire_lock); - gw_addr_v6 = sire->ire_gateway_addr_v6; - mutex_exit(&sire->ire_lock); - rts_fill_msg_v6(RTM_GET, rtm_addrs, &sire->ire_addr_v6, - &sire->ire_mask_v6, &gw_addr_v6, - (sire->ire_flags & RTF_SETSRC) ? 
- &sire->ire_src_addr_v6 : &ire->ire_src_addr_v6, - &ire->ire_ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ire->ire_ipif, new_mp, sacnt, gc); - } + if (!IN6_IS_ADDR_UNSPECIFIED(setsrc)) + rtm_addrs |= RTA_SRC; + + rtm_flags = ire->ire_flags; + rts_fill_msg_v6(RTM_GET, rtm_addrs, &ire->ire_addr_v6, + &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, + setsrc, &brdaddr6, &ipv6_all_zeros, + &ifaddr6, ill, new_mp, gc); break; } @@ -1259,11 +1311,9 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) new_rtm->rtm_use = rtm->rtm_use; new_rtm->rtm_addrs = rtm_addrs; new_rtm->rtm_flags = rtm_flags; - if (sire == NULL) - new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); - else - new_rtm->rtm_inits = rts_getmetrics(sire, &new_rtm->rtm_rmx); - + new_rtm->rtm_inits = rts_getmetrics(ire, &new_rtm->rtm_rmx); + if (ill != NULL) + ill_refrele(ill); return (new_mp); } @@ -1273,10 +1323,11 @@ rts_rtmget(mblk_t *mp, ire_t *ire, ire_t *sire, sa_family_t af) static void rts_getifdata(if_data_t *if_data, const ipif_t *ipif) { - if_data->ifi_type = ipif->ipif_type; /* ethernet, tokenring, etc */ + if_data->ifi_type = ipif->ipif_ill->ill_type; + /* ethernet, tokenring, etc */ if_data->ifi_addrlen = 0; /* media address length */ if_data->ifi_hdrlen = 0; /* media header length */ - if_data->ifi_mtu = ipif->ipif_mtu; /* maximum transmission unit */ + if_data->ifi_mtu = ipif->ipif_ill->ill_mtu; /* mtu */ if_data->ifi_metric = ipif->ipif_metric; /* metric (external only) */ if_data->ifi_baudrate = 0; /* linespeed */ @@ -1302,18 +1353,19 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) { clock_t rtt; clock_t rtt_sd; - ipif_t *ipif; + ill_t *ill; ifrt_t *ifrt; mblk_t *mp; in6_addr_t gw_addr_v6; + /* Need to add back some metrics to the IRE? */ /* - * Bypass obtaining the lock and searching ipif_saved_ire_mp in the + * Bypass obtaining the lock and searching ill_saved_ire_mp in the * common case of no metrics. 
*/ if (which == 0) return; - ire->ire_uinfo.iulp_set = B_TRUE; + ire->ire_metrics.iulp_set = B_TRUE; /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's @@ -1330,42 +1382,41 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) */ mutex_enter(&ire->ire_lock); if (which & RTV_MTU) - ire->ire_max_frag = metrics->rmx_mtu; + ire->ire_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ire->ire_uinfo.iulp_rtt = rtt; + ire->ire_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) - ire->ire_uinfo.iulp_ssthresh = metrics->rmx_ssthresh; + ire->ire_metrics.iulp_ssthresh = metrics->rmx_ssthresh; if (which & RTV_RTTVAR) - ire->ire_uinfo.iulp_rtt_sd = rtt_sd; + ire->ire_metrics.iulp_rtt_sd = rtt_sd; if (which & RTV_SPIPE) - ire->ire_uinfo.iulp_spipe = metrics->rmx_sendpipe; + ire->ire_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ire->ire_uinfo.iulp_rpipe = metrics->rmx_recvpipe; + ire->ire_metrics.iulp_rpipe = metrics->rmx_recvpipe; mutex_exit(&ire->ire_lock); /* - * Search through the ifrt_t chain hanging off the IPIF in order to + * Search through the ifrt_t chain hanging off the ILL in order to * reflect the metric change there. 
*/ - ipif = ire->ire_ipif; - if (ipif == NULL) + ill = ire->ire_ill; + if (ill == NULL) return; - ASSERT((ipif->ipif_isv6 && ire->ire_ipversion == IPV6_VERSION) || - ((!ipif->ipif_isv6 && ire->ire_ipversion == IPV4_VERSION))); - if (ipif->ipif_isv6) { + ASSERT((ill->ill_isv6 && ire->ire_ipversion == IPV6_VERSION) || + ((!ill->ill_isv6 && ire->ire_ipversion == IPV4_VERSION))); + if (ill->ill_isv6) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - mutex_enter(&ipif->ipif_saved_ire_lock); - for (mp = ipif->ipif_saved_ire_mp; mp != NULL; mp = mp->b_cont) { + mutex_enter(&ill->ill_saved_ire_lock); + for (mp = ill->ill_saved_ire_mp; mp != NULL; mp = mp->b_cont) { /* - * On a given ipif, the triple of address, gateway and mask is - * unique for each saved IRE (in the case of ordinary interface - * routes, the gateway address is all-zeroes). + * On a given ill, the tuple of address, gateway, mask, + * ire_type and zoneid unique for each saved IRE. 
*/ ifrt = (ifrt_t *)mp->b_rptr; - if (ipif->ipif_isv6) { + if (ill->ill_isv6) { if (!IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6addr, &ire->ire_addr_v6) || !IN6_ARE_ADDR_EQUAL(&ifrt->ifrt_v6gateway_addr, @@ -1379,23 +1430,36 @@ rts_setmetrics(ire_t *ire, uint_t which, rt_metrics_t *metrics) ifrt->ifrt_mask != ire->ire_mask) continue; } + if (ifrt->ifrt_zoneid != ire->ire_zoneid || + ifrt->ifrt_type != ire->ire_type) + continue; + if (which & RTV_MTU) - ifrt->ifrt_max_frag = metrics->rmx_mtu; + ifrt->ifrt_metrics.iulp_mtu = metrics->rmx_mtu; if (which & RTV_RTT) - ifrt->ifrt_iulp_info.iulp_rtt = rtt; + ifrt->ifrt_metrics.iulp_rtt = rtt; if (which & RTV_SSTHRESH) { - ifrt->ifrt_iulp_info.iulp_ssthresh = + ifrt->ifrt_metrics.iulp_ssthresh = metrics->rmx_ssthresh; } if (which & RTV_RTTVAR) - ifrt->ifrt_iulp_info.iulp_rtt_sd = metrics->rmx_rttvar; + ifrt->ifrt_metrics.iulp_rtt_sd = metrics->rmx_rttvar; if (which & RTV_SPIPE) - ifrt->ifrt_iulp_info.iulp_spipe = metrics->rmx_sendpipe; + ifrt->ifrt_metrics.iulp_spipe = metrics->rmx_sendpipe; if (which & RTV_RPIPE) - ifrt->ifrt_iulp_info.iulp_rpipe = metrics->rmx_recvpipe; + ifrt->ifrt_metrics.iulp_rpipe = metrics->rmx_recvpipe; break; } - mutex_exit(&ipif->ipif_saved_ire_lock); + mutex_exit(&ill->ill_saved_ire_lock); + + /* + * Update any IRE_IF_CLONE hanging created from this IRE_IF so they + * get any new iulp_mtu. + * We do that by deleting them; ire_create_if_clone will pick + * up the new metrics. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); } /* @@ -1407,27 +1471,69 @@ rts_getmetrics(ire_t *ire, rt_metrics_t *metrics) int metrics_set = 0; bzero(metrics, sizeof (rt_metrics_t)); + /* * iulp_rtt and iulp_rtt_sd are in milliseconds, but 4.4BSD-Lite2's * <net/route.h> says: rmx_rtt and rmx_rttvar are stored as * microseconds. 
*/ - metrics->rmx_rtt = ire->ire_uinfo.iulp_rtt * 1000; + metrics->rmx_rtt = ire->ire_metrics.iulp_rtt * 1000; metrics_set |= RTV_RTT; - metrics->rmx_mtu = ire->ire_max_frag; + metrics->rmx_mtu = ire->ire_metrics.iulp_mtu; metrics_set |= RTV_MTU; - metrics->rmx_ssthresh = ire->ire_uinfo.iulp_ssthresh; + metrics->rmx_ssthresh = ire->ire_metrics.iulp_ssthresh; metrics_set |= RTV_SSTHRESH; - metrics->rmx_rttvar = ire->ire_uinfo.iulp_rtt_sd * 1000; + metrics->rmx_rttvar = ire->ire_metrics.iulp_rtt_sd * 1000; metrics_set |= RTV_RTTVAR; - metrics->rmx_sendpipe = ire->ire_uinfo.iulp_spipe; + metrics->rmx_sendpipe = ire->ire_metrics.iulp_spipe; metrics_set |= RTV_SPIPE; - metrics->rmx_recvpipe = ire->ire_uinfo.iulp_rpipe; + metrics->rmx_recvpipe = ire->ire_metrics.iulp_rpipe; metrics_set |= RTV_RPIPE; return (metrics_set); } /* + * Given two sets of metrics (src and dst), use the dst values if they are + * set. If a dst value is not set but the src value is set, then we use + * the src value. + * dst is updated with the new values. + * This is used to merge information from a dce_t and ire_metrics, where the + * dce values takes precedence. 
+ */ +void +rts_merge_metrics(iulp_t *dst, const iulp_t *src) +{ + if (!src->iulp_set) + return; + + if (dst->iulp_ssthresh == 0) + dst->iulp_ssthresh = src->iulp_ssthresh; + if (dst->iulp_rtt == 0) + dst->iulp_rtt = src->iulp_rtt; + if (dst->iulp_rtt_sd == 0) + dst->iulp_rtt_sd = src->iulp_rtt_sd; + if (dst->iulp_spipe == 0) + dst->iulp_spipe = src->iulp_spipe; + if (dst->iulp_rpipe == 0) + dst->iulp_rpipe = src->iulp_rpipe; + if (dst->iulp_rtomax == 0) + dst->iulp_rtomax = src->iulp_rtomax; + if (dst->iulp_sack == 0) + dst->iulp_sack = src->iulp_sack; + if (dst->iulp_tstamp_ok == 0) + dst->iulp_tstamp_ok = src->iulp_tstamp_ok; + if (dst->iulp_wscale_ok == 0) + dst->iulp_wscale_ok = src->iulp_wscale_ok; + if (dst->iulp_ecn_ok == 0) + dst->iulp_ecn_ok = src->iulp_ecn_ok; + if (dst->iulp_pmtud_ok == 0) + dst->iulp_pmtud_ok = src->iulp_pmtud_ok; + if (dst->iulp_mtu == 0) + dst->iulp_mtu = src->iulp_mtu; +} + + +/* * Takes a pointer to a routing message and extracts necessary info by looking * at the rtm->rtm_addrs bits and store the requested sockaddrs in the pointers * passed (all of which must be valid). @@ -1552,7 +1658,8 @@ rts_getaddrs(rt_msghdr_t *rtm, in6_addr_t *dst_addrp, in6_addr_t *gw_addrp, static void rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, ipaddr_t gateway, ipaddr_t src_addr, ipaddr_t brd_addr, ipaddr_t author, - const ipif_t *ipif, mblk_t *mp, uint_t sacnt, const tsol_gc_t *gc) + ipaddr_t ifaddr, const ill_t *ill, mblk_t *mp, + const tsol_gc_t *gc) { rt_msghdr_t *rtm; sin_t *sin; @@ -1561,7 +1668,6 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, int i; ASSERT(mp != NULL); - ASSERT(sacnt == 0 || gc != NULL); /* * First find the type of the message * and its length. @@ -1571,7 +1677,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, * Now find the size of the data * that follows the message header. 
*/ - data_size = rts_data_msg_size(rtm_addrs, AF_INET, sacnt); + data_size = rts_data_msg_size(rtm_addrs, AF_INET, gc != NULL ? 1 : 0); rtm = (rt_msghdr_t *)mp->b_rptr; mp->b_wptr = &mp->b_rptr[header_size]; @@ -1596,9 +1702,13 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, cp += sizeof (sin_t); break; case RTA_IFP: - cp += ill_dls_info((struct sockaddr_dl *)cp, ipif); + cp += ill_dls_info((struct sockaddr_dl *)cp, ill); break; case RTA_IFA: + sin->sin_addr.s_addr = ifaddr; + sin->sin_family = AF_INET; + cp += sizeof (sin_t); + break; case RTA_SRC: sin->sin_addr.s_addr = src_addr; sin->sin_family = AF_INET; @@ -1625,24 +1735,20 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, rtm_ext_t *rtm_ext; struct rtsa_s *rp_dst; tsol_rtsecattr_t *rsap; - int i; ASSERT(gc->gc_grp != NULL); ASSERT(RW_LOCK_HELD(&gc->gc_grp->gcgrp_rwlock)); - ASSERT(sacnt > 0); rtm_ext = (rtm_ext_t *)cp; rtm_ext->rtmex_type = RTMEX_GATEWAY_SECATTR; - rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(sacnt); + rtm_ext->rtmex_len = TSOL_RTSECATTR_SIZE(1); rsap = (tsol_rtsecattr_t *)(rtm_ext + 1); - rsap->rtsa_cnt = sacnt; + rsap->rtsa_cnt = 1; rp_dst = rsap->rtsa_attr; - for (i = 0; i < sacnt; i++, gc = gc->gc_next, rp_dst++) { - ASSERT(gc->gc_db != NULL); - bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); - } + ASSERT(gc->gc_db != NULL); + bcopy(&gc->gc_db->gcdb_attr, rp_dst, sizeof (*rp_dst)); cp = (uchar_t *)rp_dst; } @@ -1659,6 +1765,7 @@ rts_fill_msg(int type, int rtm_addrs, ipaddr_t dst, ipaddr_t mask, /* * Allocates and initializes a routing socket message. + * Note that sacnt is either zero or one. 
*/ mblk_t * rts_alloc_msg(int type, int rtm_addrs, sa_family_t af, uint_t sacnt) @@ -1755,7 +1862,7 @@ ip_rts_change(int type, ipaddr_t dst_addr, ipaddr_t gw_addr, ipaddr_t net_mask, if (mp == NULL) return; rts_fill_msg(type, rtm_addrs, dst_addr, net_mask, gw_addr, source, 0, - author, NULL, mp, 0, NULL); + author, 0, NULL, mp, NULL); rtm = (rt_msghdr_t *)mp->b_rptr; rtm->rtm_flags = flags; rtm->rtm_errno = error; @@ -1784,12 +1891,12 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; /* - * This message should be generated only when the physical interface - * is changing state. + * This message should be generated only + * when the physical device is changing + * state. */ if (ipif->ipif_id != 0) return; - if (ipif->ipif_isv6) { af = AF_INET6; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); @@ -1797,14 +1904,15 @@ ip_rts_xifmsg(const ipif_t *ipif, uint64_t set, uint64_t clear, uint_t flags) return; rts_fill_msg_v6(RTM_IFINFO, RTA_IFP, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, &ipv6_all_zeros, ipif, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, + ipif->ipif_ill, mp, NULL); } else { af = AF_INET; mp = rts_alloc_msg(RTM_IFINFO, RTA_IFP, af, 0); if (mp == NULL) return; - rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, ipif, mp, - 0, NULL); + rts_fill_msg(RTM_IFINFO, RTA_IFP, 0, 0, 0, 0, 0, 0, 0, + ipif->ipif_ill, mp, NULL); } ifm = (if_msghdr_t *)mp->b_rptr; ifm->ifm_index = ipif->ipif_ill->ill_phyint->phyint_ifindex; @@ -1843,6 +1951,12 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) sa_family_t af; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; + /* + * Let conn_ixa caching know that source address selection + * changed + */ + ip_update_source_selection(ipst); + if (ipif->ipif_isv6) af = AF_INET6; else @@ -1875,15 +1989,17 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) 
case AF_INET: rts_fill_msg(ncmd, rtm_addrs, 0, ipif->ipif_net_mask, 0, ipif->ipif_lcl_addr, - ipif->ipif_pp_dst_addr, 0, ipif, mp, - 0, NULL); + ipif->ipif_pp_dst_addr, 0, + ipif->ipif_lcl_addr, ipif->ipif_ill, + mp, NULL); break; case AF_INET6: rts_fill_msg_v6(ncmd, rtm_addrs, &ipv6_all_zeros, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6pp_dst_addr, &ipv6_all_zeros, - ipif, mp, 0, NULL); + &ipif->ipif_v6lcl_addr, ipif->ipif_ill, + mp, NULL); break; } ifam = (ifa_msghdr_t *)mp->b_rptr; @@ -1904,14 +2020,15 @@ ip_rts_newaddrmsg(int cmd, int error, const ipif_t *ipif, uint_t flags) case AF_INET: rts_fill_msg(cmd, rtm_addrs, ipif->ipif_lcl_addr, ipif->ipif_net_mask, 0, - 0, 0, 0, NULL, mp, 0, NULL); + 0, 0, 0, 0, NULL, mp, NULL); break; case AF_INET6: rts_fill_msg_v6(cmd, rtm_addrs, &ipif->ipif_v6lcl_addr, &ipif->ipif_v6net_mask, &ipv6_all_zeros, &ipv6_all_zeros, &ipv6_all_zeros, - &ipv6_all_zeros, NULL, mp, 0, NULL); + &ipv6_all_zeros, &ipv6_all_zeros, + NULL, mp, NULL); break; } rtm = (rt_msghdr_t *)mp->b_rptr; |