diff options
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip6_ire.c')
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip6_ire.c | 3123 |
1 files changed, 871 insertions, 2252 deletions
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c index c13a66fcc2..7697ca20c7 100644 --- a/usr/src/uts/common/inet/ip/ip6_ire.c +++ b/usr/src/uts/common/inet/ip/ip6_ire.c @@ -60,122 +60,122 @@ #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> +#define IS_DEFAULT_ROUTE_V6(ire) \ + (((ire)->ire_type & IRE_DEFAULT) || \ + (((ire)->ire_type & IRE_INTERFACE) && \ + (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) + static ire_t ire_null; -static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire); -static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, - const in6_addr_t *mask, const in6_addr_t *gateway, int type, - const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle, - const ts_label_t *tsl, int match_flags); -static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *, - const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *, - ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t, - const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *); -static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *); +static ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst); /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. + * Returns zero or errno. */ -static ire_t * +int ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle, - uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc, - tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, + zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { + int error; /* - * Reject IRE security attribute creation/initialization + * Reject IRE security attmakeribute creation/initialization * if system is not running in Trusted mode. */ - if ((gc != NULL || gcgrp != NULL) && !is_system_labeled()) - return (NULL); - + if (gc != NULL && !is_system_labeled()) + return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); - ire->ire_addr_v6 = *v6addr; - - if (v6src_addr != NULL) - ire->ire_src_addr_v6 = *v6src_addr; - if (v6mask != NULL) { - ire->ire_mask_v6 = *v6mask; - ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); - } + if (v6addr != NULL) + ire->ire_addr_v6 = *v6addr; if (v6gateway != NULL) ire->ire_gateway_addr_v6 = *v6gateway; - if (type == IRE_CACHE && v6cmask != NULL) - ire->ire_cmask_v6 = *v6cmask; - - /* - * Multirouted packets need to have a fragment header added so that - * the receiver is able to discard duplicates according to their - * fragment identifier. - */ - if (type == IRE_CACHE && (flags & RTF_MULTIRT)) { - ire->ire_frag_flag = IPH_FRAG_HDR; + /* Make sure we don't have stray values in some fields */ + switch (type) { + case IRE_LOOPBACK: + ire->ire_gateway_addr_v6 = ire->ire_addr_v6; + /* FALLTHRU */ + case IRE_HOST: + case IRE_LOCAL: + case IRE_IF_CLONE: + ire->ire_mask_v6 = ipv6_all_ones; + ire->ire_masklen = IPV6_ABITS; + break; + case IRE_PREFIX: + case IRE_DEFAULT: + case IRE_IF_RESOLVER: + case IRE_IF_NORESOLVER: + if (v6mask != NULL) { + ire->ire_mask_v6 = *v6mask; + ire->ire_masklen = + ip_mask_to_plen_v6(&ire->ire_mask_v6); + } + break; + case IRE_MULTICAST: + case IRE_NOROUTE: + ASSERT(v6mask == NULL); + break; + default: + ASSERT(0); + return (EINVAL); } - /* ire_init_common will free the mblks upon encountering any failure */ - if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif, - phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst)) - return (NULL); - - return (ire); -} - -/* - * Similar to ire_create_v6 except that it is called only when - * we want to allocate ire as an mblk e.g. we have a external - * resolver. Do we need this in IPv6 ? - * - * IPv6 initializes the ire_nce in ire_add_v6, which expects to - * find the ire_nce to be null when it is called. So, although - * we have a src_nce parameter (in the interest of matching up with - * the argument list of the v4 version), we ignore the src_nce - * argument here. - */ -/* ARGSUSED */ -ire_t * -ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type, - ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) -{ - ire_t *ire; - ire_t *ret_ire; - mblk_t *mp; + error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, + gc, ipst); + if (error != NULL) + return (error); - ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); + /* Determine which function pointers to use */ + ire->ire_postfragfn = ip_xmit; /* Common case */ - /* Allocate the new IRE. */ - mp = allocb(sizeof (ire_t), BPRI_MED); - if (mp == NULL) { - ip1dbg(("ire_create_mp_v6: alloc failed\n")); - return (NULL); + switch (ire->ire_type) { + case IRE_LOCAL: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_local_v6; +#ifdef SO_VRRP + ASSERT(ire->ire_ill != NULL); + if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { + ire->ire_noaccept = B_TRUE; + ire->ire_recvfn = ire_recv_noaccept_v6; + } +#endif + break; + case IRE_LOOPBACK: + ire->ire_sendfn = ire_send_local_v6; + ire->ire_recvfn = ire_recv_loopback_v6; + break; + case IRE_MULTICAST: + ire->ire_postfragfn = ip_postfrag_loopcheck; + ire->ire_sendfn = ire_send_multicast_v6; + ire->ire_recvfn = ire_recv_multicast_v6; + break; + default: + /* + * For IRE_IF_ALL and IRE_OFFLINK we forward received + * packets by default. + */ + ire->ire_sendfn = ire_send_wire_v6; + ire->ire_recvfn = ire_recv_forward_v6; + break; } - - ire = (ire_t *)mp->b_rptr; - mp->b_wptr = (uchar_t *)&ire[1]; - - /* Start clean. */ - *ire = ire_null; - ire->ire_mp = mp; - mp->b_datap->db_type = IRE_DB_TYPE; - - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - NULL, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); - - if (ret_ire == NULL) { - freeb(ire->ire_mp); - return (NULL); + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { + ire->ire_sendfn = ire_send_noroute_v6; + ire->ire_recvfn = ire_recv_noroute_v6; + } else if (ire->ire_flags & RTF_MULTIRT) { + ire->ire_postfragfn = ip_postfrag_multirt_v6; + ire->ire_sendfn = ire_send_multirt_v6; + ire->ire_recvfn = ire_recv_multirt_v6; } - return (ire); + ire->ire_nce_capable = ire_determine_nce_capable(ire); + return (0); } /* @@ -183,153 +183,76 @@ ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, * * NOTE : This is called as writer sometimes though not required * by this function. - * - * See comments above ire_create_mp_v6() for the rationale behind the - * unused src_nce argument. */ /* ARGSUSED */ ire_t * ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, - const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway, - uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq, - ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask, - uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, - tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst) + const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, + uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { ire_t *ire; - ire_t *ret_ire; + int error; ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { - ip1dbg(("ire_create_v6: alloc failed\n")); + DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; - ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway, - max_fragp, rfq, stq, type, ipif, v6cmask, phandle, - ihandle, flags, ulp_info, gc, gcgrp, ipst); + error = ire_init_v6(ire, v6addr, v6mask, v6gateway, + type, ill, zoneid, flags, gc, ipst); - if (ret_ire == NULL) { + if (error != 0) { + DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } - ASSERT(ret_ire == ire); return (ire); } /* - * Find an IRE_INTERFACE for the multicast group. + * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to FF::0/8 but could be more specific) * which point at different interfaces. This is used when IPV6_MULTICAST_IF * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't * specify the interface to join on. * - * Supports link-local addresses by following the ipif/ill when recursing. + * Supports link-local addresses by using ire_route_recursive which follows + * the ill when recursing. + * + * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group + * and the MULTIRT property can be different for different groups, we + * extract RTF_MULTIRT from the special unicast route added for a group + * with CGTP and pass that back in the multirtp argument. + * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. + * We have a setsrcp argument for the same reason. */ -ire_t * -ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst) +ill_t * +ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, + ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) { ire_t *ire; - ipif_t *ipif = NULL; - int match_flags = MATCH_IRE_TYPE; - in6_addr_t gw_addr_v6; - - ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL, - zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst); + ill_t *ill; - /* We search a resolvable ire in case of multirouting. */ - if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) { - ire_t *cire = NULL; - /* - * If the route is not resolvable, the looked up ire - * may be changed here. In that case, ire_multirt_lookup_v6() - * IRE_REFRELE the original ire and change it. - */ - (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW, - NULL, ipst); - if (cire != NULL) - ire_refrele(cire); - } - if (ire == NULL) - return (NULL); - /* - * Make sure we follow ire_ipif. - * - * We need to determine the interface route through - * which the gateway will be reached. - */ - if (ire->ire_ipif != NULL) { - ipif = ire->ire_ipif; - match_flags |= MATCH_IRE_ILL; - } + ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, + MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); + ASSERT(ire != NULL); - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - ire_refrele(ire); - ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0, - IRE_INTERFACE, ipif, NULL, zoneid, 0, - NULL, match_flags, ipst); - return (ire); - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - return (ire); - default: + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } -} -/* - * Return any local address. We use this to target ourselves - * when the src address was specified as 'default'. - * Preference for IRE_LOCAL entries. - */ -ire_t * -ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst) -{ - ire_t *ire; - irb_t *irb; - ire_t *maybe = NULL; - int i; + if (multirtp != NULL) + *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if (irb->irb_ire == NULL) - continue; - rw_enter(&irb->irb_lock, RW_READER); - for (ire = irb->irb_ire; ire; ire = ire->ire_next) { - if ((ire->ire_marks & IRE_MARK_CONDEMNED) || - ire->ire_zoneid != zoneid && - ire->ire_zoneid != ALL_ZONES) - continue; - switch (ire->ire_type) { - case IRE_LOOPBACK: - if (maybe == NULL) { - IRE_REFHOLD(ire); - maybe = ire; - } - break; - case IRE_LOCAL: - if (maybe != NULL) { - ire_refrele(maybe); - } - IRE_REFHOLD(ire); - rw_exit(&irb->irb_lock); - return (ire); - } - } - rw_exit(&irb->irb_lock); - } - return (maybe); + ill = ire_nexthop_ill(ire); + ire_refrele(ire); + return (ill); } /* @@ -369,6 +292,8 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) if (plen < 0 || plen > IPV6_ABITS) return (NULL); *bitmask = ipv6_all_zeros; + if (plen == 0) + return (bitmask); ptr = (uint32_t *)bitmask; while (plen > 32) { @@ -380,196 +305,78 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) } /* - * Add a fully initialized IRE to an appropriate - * table based on ire_type. - * - * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and - * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT. - * - * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK - * and IRE_CACHE. - * - * NOTE : This function is called as writer though not required - * by this function. + * Add a fully initialized IPv6 IRE to the forwarding table. + * This returns NULL on failure, or a held IRE on success. + * Normally the returned IRE is the same as the argument. But a different + * IRE will be returned if the added IRE is deemed identical to an existing + * one. In that case ire_identical_ref will be increased. + * The caller always needs to do an ire_refrele() on the returned IRE. */ -int -ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) +ire_t * +ire_add_v6(ire_t *ire) { ire_t *ire1; int mask_table_index; irb_t *irb_ptr; ire_t **irep; - int flags; - ire_t *pire = NULL; - ill_t *stq_ill; - boolean_t ndp_g_lock_held = B_FALSE; - ire_t *ire = *ire_p; + int match_flags; int error; ip_stack_t *ipst = ire->ire_ipst; - uint_t marks = 0; ASSERT(ire->ire_ipversion == IPV6_VERSION); - ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */ - ASSERT(ire->ire_nce == NULL); - - /* - * IREs with source addresses hosted on interfaces that are under IPMP - * should be hidden so that applications don't accidentally end up - * sending packets with test addresses as their source addresses, or - * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here. - * (We let IREs with unspecified source addresses slip through since - * ire_send_v6() will delete them automatically.) - */ - if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) && - !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) { - DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); - marks |= IRE_MARK_TESTHIDDEN; - } - - /* Find the appropriate list head. */ - switch (ire->ire_type) { - case IRE_HOST: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_CACHE: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - ire->ire_marks |= marks; - break; - case IRE_LOCAL: - case IRE_LOOPBACK: - ire->ire_mask_v6 = ipv6_all_ones; - ire->ire_masklen = IPV6_ABITS; - break; - case IRE_PREFIX: - case IRE_DEFAULT: - ire->ire_marks |= marks; - if ((ire->ire_flags & RTF_SETSRC) == 0) - ire->ire_src_addr_v6 = ipv6_all_zeros; - break; - case IRE_IF_RESOLVER: - case IRE_IF_NORESOLVER: - ire->ire_marks |= marks; - break; - default: - printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n", - (void *)ire, ire->ire_type); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); - } /* Make sure the address is properly masked. */ V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); - if ((ire->ire_type & IRE_CACHETABLE) == 0) { - /* IRE goes into Forward Table */ - mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); - if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == - NULL) { - irb_t *ptr; - int i; - - ptr = (irb_t *)mi_zalloc(( - ipst->ips_ip6_ftable_hash_size * sizeof (irb_t))); - if (ptr == NULL) { - ire_delete(ire); - *ire_p = NULL; - return (ENOMEM); - } - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { - rw_init(&ptr[i].irb_lock, NULL, - RW_DEFAULT, NULL); - } - mutex_enter(&ipst->ips_ire_ft_init_lock); - if (ipst->ips_ip_forwarding_table_v6[ - mask_table_index] == NULL) { - ipst->ips_ip_forwarding_table_v6[ - mask_table_index] = ptr; - mutex_exit(&ipst->ips_ire_ft_init_lock); - } else { - /* - * Some other thread won the race in - * initializing the forwarding table at the - * same index. - */ - mutex_exit(&ipst->ips_ire_ft_init_lock); - for (i = 0; i < ipst->ips_ip6_ftable_hash_size; - i++) { - rw_destroy(&ptr[i].irb_lock); - } - mi_free(ptr); - } - } - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ - IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, - ipst->ips_ip6_ftable_hash_size)]); - } else { - irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]); - } - /* - * For xresolv interfaces (v6 interfaces with an external - * address resolver), ip_newroute_v6/ip_newroute_ipif_v6 - * are unable to prevent the deletion of the interface route - * while adding an IRE_CACHE for an on-link destination - * in the IRE_IF_RESOLVER case, since the ire has to go to - * the external resolver and return. We can't do a REFHOLD on the - * associated interface ire for fear of the message being freed - * if the external resolver can't resolve the address. - * Here we look up the interface ire in the forwarding table - * and make sure that the interface route has not been deleted. - */ - if (ire->ire_type == IRE_CACHE && - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) && - (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) { + mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); + if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { + irb_t *ptr; + int i; - pire = ire_ihandle_lookup_onlink_v6(ire); - if (pire == NULL) { + ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * + sizeof (irb_t))); + if (ptr == NULL) { ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + return (NULL); } - /* Prevent pire from getting deleted */ - IRB_REFHOLD(pire->ire_bucket); - /* Has it been removed already? */ - if (pire->ire_marks & IRE_MARK_CONDEMNED) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - ire_delete(ire); - *ire_p = NULL; - return (EINVAL); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); + } + mutex_enter(&ipst->ips_ire_ft_init_lock); + if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == + NULL) { + ipst->ips_ip_forwarding_table_v6[mask_table_index] = + ptr; + mutex_exit(&ipst->ips_ire_ft_init_lock); + } else { + /* + * Some other thread won the race in + * initializing the forwarding table at the + * same index. + */ + mutex_exit(&ipst->ips_ire_ft_init_lock); + for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { + rw_destroy(&ptr[i].irb_lock); + } + mi_free(ptr); } } + irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ + IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, + ipst->ips_ip6_ftable_hash_size)]); - flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); + if (ire->ire_ill != NULL) + match_flags |= MATCH_IRE_ILL; /* - * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check - * for duplicates because : - * - * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be - * pointing at different ills. A real duplicate is - * a match on both ire_ipif and ire_stq. - * - * 2) We could have multiple packets trying to create - * an IRE_CACHE for the same ill. - * - * Rather than looking at the packet, we depend on the above for - * MATCH_IRE_ILL here. - * - * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have - * multiple IRE_CACHES for an ill for the same destination - * with various scoped addresses i.e represented by ipifs. - * - * MATCH_IRE_ILL is done implicitly below for IRE_CACHES. + * Start the atomic add of the ire. Grab the bucket lock and the + * ill lock. Check for condemned. */ - if (ire->ire_ipif != NULL) - flags |= MATCH_IRE_IPIF; + error = ire_atomic_start(irb_ptr, ire); + if (error != 0) { + ire_delete(ire); + return (NULL); + } /* * If we are creating a hidden IRE, make sure we search for @@ -577,103 +384,36 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) * Otherwise, we might find an IRE on some other interface * that's not marked hidden. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) - flags |= MATCH_IRE_MARK_TESTHIDDEN; - - /* - * Start the atomic add of the ire. Grab the ill locks, - * ill_g_usesrc_lock and the bucket lock. Check for condemned. - * To avoid lock order problems, get the ndp6.ndp_g_lock now itself. - */ - if (ire->ire_type == IRE_CACHE) { - mutex_enter(&ipst->ips_ndp6->ndp_g_lock); - ndp_g_lock_held = B_TRUE; - } - - /* - * If ipif or ill is changing ire_atomic_start() may queue the - * request and return EINPROGRESS. - */ - - error = ire_atomic_start(irb_ptr, ire, q, mp, func); - if (error != 0) { - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - /* - * We don't know whether it is a valid ipif or not. - * So, set it to NULL. This assumes that the ire has not added - * a reference to the ipif. - */ - ire->ire_ipif = NULL; - ire_delete(ire); - if (pire != NULL) { - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = NULL; - return (error); - } - /* - * To avoid creating ires having stale values for the ire_max_frag - * we get the latest value atomically here. For more details - * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE - * in ip_rput_dlpi_writer - */ - if (ire->ire_max_fragp == NULL) { - if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) - ire->ire_max_frag = ire->ire_ipif->ipif_mtu; - else - ire->ire_max_frag = pire->ire_max_frag; - } else { - uint_t max_frag; - - max_frag = *ire->ire_max_fragp; - ire->ire_max_fragp = NULL; - ire->ire_max_frag = max_frag; - } + if (ire->ire_testhidden) + match_flags |= MATCH_IRE_TESTHIDDEN; /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { - if (ire1->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire1)) continue; - - if (ire->ire_type == IRE_CACHE) { - /* - * We do MATCH_IRE_ILL implicitly here for IRE_CACHES. - * As ire_ipif and ire_stq could point to two - * different ills, we can't pass just ire_ipif to - * ire_match_args and get a match on both ills. - * This is just needed for duplicate checks here and - * so we don't add an extra argument to - * ire_match_args for this. Do it locally. - * - * NOTE : Currently there is no part of the code - * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL - * match for IRE_CACHEs. Thus we don't want to - * extend the arguments to ire_match_args_v6. - */ - if (ire1->ire_stq != ire->ire_stq) - continue; - /* - * Multiroute IRE_CACHEs for a given destination can - * have the same ire_ipif, typically if their source - * address is forced using RTF_SETSRC, and the same - * send-to queue. We differentiate them using the parent - * handle. - */ - if ((ire1->ire_flags & RTF_MULTIRT) && - (ire->ire_flags & RTF_MULTIRT) && - (ire1->ire_phandle != ire->ire_phandle)) - continue; - } + /* + * Here we need an exact match on zoneid, i.e., + * ire_match_args doesn't fit. + */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; + + if (ire1->ire_type != ire->ire_type) + continue; + + /* + * Note: We do not allow multiple routes that differ only + * in the gateway security attributes; such routes are + * considered duplicates. + * To change that we explicitly have to treat them as + * different here. + */ if (ire_match_args_v6(ire1, &ire->ire_addr_v6, &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, - ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL, - flags)) { + ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, + match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE @@ -683,141 +423,25 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func) */ ip1dbg(("found dup ire existing %p new %p", (void *)ire1, (void *)ire)); - IRE_REFHOLD(ire1); - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); + ire_refhold(ire1); + atomic_add_32(&ire1->ire_identical_ref, 1); ire_atomic_end(irb_ptr, ire); ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - *ire_p = ire1; - return (0); + return (ire1); } } - if (ire->ire_type == IRE_CACHE) { - const in6_addr_t *addr_v6; - ill_t *ill = ire_to_ill(ire); - char buf[INET6_ADDRSTRLEN]; - nce_t *nce; - /* - * All IRE_CACHE types must have a nce. If this is - * not the case the entry will not be added. We need - * to make sure that if somebody deletes the nce - * after we looked up, they will find this ire and - * delete the ire. To delete this ire one needs the - * bucket lock which we are still holding here. So, - * even if the nce gets deleted after we looked up, - * this ire will get deleted. - * - * NOTE : Don't need the ire_lock for accessing - * ire_gateway_addr_v6 as it is appearing first - * time on the list and rts_setgwr_v6 could not - * be changing this. - */ - addr_v6 = &ire->ire_gateway_addr_v6; - if (IN6_IS_ADDR_UNSPECIFIED(addr_v6)) - addr_v6 = &ire->ire_addr_v6; - - /* nce fastpath is per-ill; don't match across illgrp */ - nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE); - if (nce == NULL) - goto failed; - - /* Pair of refhold, refrele just to get the tracing right */ - NCE_REFHOLD_TO_REFHOLD_NOTR(nce); - /* - * Atomically make sure that new IREs don't point - * to an NCE that is logically deleted (CONDEMNED). - * ndp_delete() first marks the NCE CONDEMNED. - * This ensures that the nce_refcnt won't increase - * due to new nce_lookups or due to addition of new IREs - * pointing to this NCE. Then ndp_delete() cleans up - * existing references. If we don't do it atomically here, - * ndp_delete() -> nce_ire_delete() will not be able to - * clean up the IRE list completely, and the nce_refcnt - * won't go down to zero. - */ - mutex_enter(&nce->nce_lock); - if (ill->ill_flags & ILLF_XRESOLV) { - /* - * If we used an external resolver, we may not - * have gone through neighbor discovery to get here. - * Must update the nce_state before the next check. - */ - if (nce->nce_state == ND_INCOMPLETE) - nce->nce_state = ND_REACHABLE; - } - if (nce->nce_state == ND_INCOMPLETE || - (nce->nce_flags & NCE_F_CONDEMNED) || - (nce->nce_state == ND_UNREACHABLE)) { -failed: - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); - if (nce != NULL) - mutex_exit(&nce->nce_lock); - ire_atomic_end(irb_ptr, ire); - ip1dbg(("ire_add_v6: No nce for dst %s \n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - buf, sizeof (buf)))); - ire_delete(ire); - if (pire != NULL) { - /* - * Assert that it is - * not yet removed from the list. - */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - if (nce != NULL) - NCE_REFRELE_NOTR(nce); - *ire_p = NULL; - return (EINVAL); - } else { - ire->ire_nce = nce; - } - mutex_exit(&nce->nce_lock); - } /* - * Find the first entry that matches ire_addr - provides - * tail insertion. *irep will be null if no match. + * Normally we do head insertion since most things do not care about + * the order of the IREs in the bucket. + * However, due to shared-IP zones (and restrict_interzone_loopback) + * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same + * address. For that reason we do tail insertion for IRE_IF_CLONE. */ irep = (ire_t **)irb_ptr; - while ((ire1 = *irep) != NULL && - !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6)) - irep = &ire1->ire_next; - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - - if (*irep != NULL) { - /* - * Find the last ire which matches ire_addr_v6. - * Needed to do tail insertion among entries with the same - * ire_addr_v6. - */ - while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, - &ire1->ire_addr_v6)) { + if (ire->ire_type & IRE_IF_CLONE) { + while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; - ire1 = *irep; - if (ire1 == NULL) - break; - } - } - - if (ire->ire_type == IRE_DEFAULT) { - /* - * We keep a count of default gateways which is used when - * assigning them as routes. - */ - ipst->ips_ipv6_ire_default_count++; - ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */ } /* Insert at *irep */ ire1 = *irep; @@ -852,62 +476,22 @@ failed: * in the list for the first time and no one else can bump * up the reference count on this yet. */ - IRE_REFHOLD_LOCKED(ire); + ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); irb_ptr->irb_ire_cnt++; - if (ire->ire_marks & IRE_MARK_TEMPORARY) - irb_ptr->irb_tmp_ire_cnt++; - if (ire->ire_ipif != NULL) { - DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif, + if (ire->ire_ill != NULL) { + DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, (char *), "ire", (void *), ire); - ire->ire_ipif->ipif_ire_cnt++; - if (ire->ire_stq != NULL) { - stq_ill = (ill_t *)ire->ire_stq->q_ptr; - DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill, - (char *), "ire", (void *), ire); - stq_ill->ill_ire_cnt++; - } - } else { - ASSERT(ire->ire_stq == NULL); + ire->ire_ill->ill_ire_cnt++; + ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } - - if (ndp_g_lock_held) - mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ire_atomic_end(irb_ptr, ire); - if (pire != NULL) { - /* Assert that it is not removed from the list yet */ - ASSERT(pire->ire_ptpn != NULL); - IRB_REFRELE(pire->ire_bucket); - ire_refrele(pire); - } - - if (ire->ire_type != IRE_CACHE) { - /* - * For ire's with with host mask see if there is an entry - * in the cache. If there is one flush the whole cache as - * there might be multiple entries due to RTF_MULTIRT (CGTP). - * If no entry is found than there is no need to flush the - * cache. - */ - - if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) { - ire_t *lire; - lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL, - IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE, - ipst); - if (lire != NULL) { - ire_refrele(lire); - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } else { - ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - } - } + /* Make any caching of the IREs be notified or updated */ + ire_flush_cache_v6(ire, IRE_FLUSH_ADD); - *ire_p = ire; - return (0); + return (ire); } /* @@ -931,7 +515,7 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) return; for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { irb = &irb_ptr[i]; - IRB_REFHOLD(irb); + irb_refhold(irb); for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { if (!(ire->ire_flags & RTF_DYNAMIC)) continue; @@ -941,50 +525,11 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) ire_delete(ire); } - IRB_REFRELE(irb); + irb_refrele(irb); } } /* - * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart - * of ip_ire_clookup_and_delete. The difference being this function does not - * return any value. IPv6 processing of a gratuitous ARP, as it stands, is - * different than IPv4 in that, regardless of the presence of a cache entry - * for this address, an ire_walk_v6 is done. Another difference is that unlike - * in the case of IPv4 this does not take an ipif_t argument, since it is only - * called by ip_arp_news and the match is always only on the address. - */ -void -ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *cire; - boolean_t found = B_FALSE; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) { - if (cire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) { - - /* This signifies start of a match */ - if (!found) - found = B_TRUE; - if (cire->ire_type == IRE_CACHE) { - if (cire->ire_nce != NULL) - ndp_delete(cire->ire_nce); - ire_delete_v6(cire); - } - /* End of the match */ - } else if (found) - break; - } - IRB_REFRELE(irb); -} - -/* * Delete the specified IRE. * All calls should use ire_delete(). * Sometimes called as writer though not required by this function. @@ -998,11 +543,20 @@ ire_delete_v6(ire_t *ire) in6_addr_t gw_addr_v6; ip_stack_t *ipst = ire->ire_ipst; + /* + * Make sure ire_generation increases from ire_flush_cache happen + * after any lookup/reader has read ire_generation. + * Since the rw_enter makes us wait until any lookup/reader has + * completed we can exit the lock immediately. + */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + rw_exit(&ipst->ips_ip6_ire_head_lock); + ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV6_VERSION); - if (ire->ire_type != IRE_CACHE) - ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); + if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away @@ -1014,368 +568,284 @@ ire_delete_v6(ire_t *ire) mutex_exit(&ire->ire_lock); ire_delete_host_redirects_v6(&gw_addr_v6, ipst); } -} - -/* - * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect - * entries. - */ -/*ARGSUSED1*/ -void -ire_delete_cache_v6(ire_t *ire, char *arg) -{ - char addrstr1[INET6_ADDRSTRLEN]; - char addrstr2[INET6_ADDRSTRLEN]; - - if ((ire->ire_type & IRE_CACHE) || - (ire->ire_flags & RTF_DYNAMIC)) { - ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n", - inet_ntop(AF_INET6, &ire->ire_addr_v6, - addrstr1, sizeof (addrstr1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6, - addrstr2, sizeof (addrstr2)))); - ire_delete(ire); - } - -} -/* - * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries - * that have a given gateway address. - */ -void -ire_delete_cache_gw_v6(ire_t *ire, char *addr) -{ - in6_addr_t *gw_addr = (in6_addr_t *)addr; - char buf1[INET6_ADDRSTRLEN]; - char buf2[INET6_ADDRSTRLEN]; - in6_addr_t ire_gw_addr_v6; - - if (!(ire->ire_type & IRE_CACHE) && - !(ire->ire_flags & RTF_DYNAMIC)) - return; - - mutex_enter(&ire->ire_lock); - ire_gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); + /* + * If we are deleting an IRE_INTERFACE then we make sure we also + * delete any IRE_IF_CLONE that has been created from it. + * Those are always in ire_dep_children. + */ + if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) + ire_dep_delete_if_clone(ire); - if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) { - ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n", - inet_ntop(AF_INET6, &ire->ire_src_addr_v6, - buf1, sizeof (buf1)), - ire->ire_type, - inet_ntop(AF_INET6, &ire_gw_addr_v6, - buf2, sizeof (buf2)))); - ire_delete(ire); + /* Remove from parent dependencies and child */ + rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); + if (ire->ire_dep_parent != NULL) { + ire_dep_remove(ire); } + while (ire->ire_dep_children != NULL) + ire_dep_remove(ire->ire_dep_children); + rw_exit(&ipst->ips_ire_dep_lock); } /* - * Remove all IRE_CACHE entries that match - * the ire specified. (Sometimes called - * as writer though not required by this function.) - * - * The flag argument indicates if the - * flush request is due to addition - * of new route (IRE_FLUSH_ADD) or deletion of old - * route (IRE_FLUSH_DELETE). + * When an IRE is added or deleted this routine is called to make sure + * any caching of IRE information is notified or updated. * - * This routine takes only the IREs from the forwarding - * table and flushes the corresponding entries from - * the cache table. - * - * When flushing due to the deletion of an old route, it - * just checks the cache handles (ire_phandle and ire_ihandle) and - * deletes the ones that match. - * - * When flushing due to the creation of a new route, it checks - * if a cache entry's address matches the one in the IRE and - * that the cache entry's parent has a less specific mask than the - * one in IRE. The destination of such a cache entry could be the - * gateway for other cache entries, so we need to flush those as - * well by looking for gateway addresses matching the IRE's address. + * The flag argument indicates if the flush request is due to addition + * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), + * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v6(ire_t *ire, int flag) { - int i; - ire_t *cire; - irb_t *irb; - ip_stack_t *ipst = ire->ire_ipst; + ip_stack_t *ipst = ire->ire_ipst; - if (ire->ire_type & IRE_CACHE) + /* + * IRE_IF_CLONE ire's don't provide any new information + * than the parent from which they are cloned, so don't + * perturb the generation numbers. + */ + if (ire->ire_type & IRE_IF_CLONE) return; /* - * If a default is just created, there is no point - * in going through the cache, as there will not be any - * cached ires. + * Ensure that an ire_add during a lookup serializes the updates of + * the generation numbers under ire_head_lock so that the lookup gets + * either the old ire and old generation number, or a new ire and new + * generation number. + */ + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); + + /* + * If a route was just added, we need to notify everybody that + * has cached an IRE_NOROUTE since there might now be a better + * route for them. */ - if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) - return; if (flag == IRE_FLUSH_ADD) { + ire_increment_generation(ipst->ips_ire_reject_v6); + ire_increment_generation(ipst->ips_ire_blackhole_v6); + } + + /* Adding a default can't otherwise provide a better route */ + if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return; + } + + switch (flag) { + case IRE_FLUSH_DELETE: + case IRE_FLUSH_GWCHANGE: /* - * This selective flush is - * due to the addition of - * new IRE. + * Update ire_generation for all ire_dep_children chains + * starting with this IRE */ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - /* - * If 'cire' belongs to the same subnet - * as the new ire being added, and 'cire' - * is derived from a prefix that is less - * specific than the new ire being added, - * we need to flush 'cire'; for instance, - * when a new interface comes up. - */ - if ((V6_MASK_EQ_2(cire->ire_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6) && - (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <= - ire->ire_masklen))) { - ire_delete(cire); - continue; - } - /* - * This is the case when the ire_gateway_addr - * of 'cire' belongs to the same subnet as - * the new ire being added. - * Flushing such ires is sometimes required to - * avoid misrouting: say we have a machine with - * two interfaces (I1 and I2), a default router - * R on the I1 subnet, and a host route to an - * off-link destination D with a gateway G on - * the I2 subnet. - * Under normal operation, we will have an - * on-link cache entry for G and an off-link - * cache entry for D with G as ire_gateway_addr, - * traffic to D will reach its destination - * through gateway G. - * If the administrator does 'ifconfig I2 down', - * the cache entries for D and G will be - * flushed. However, G will now be resolved as - * an off-link destination using R (the default - * router) as gateway. Then D will also be - * resolved as an off-link destination using G - * as gateway - this behavior is due to - * compatibility reasons, see comment in - * ire_ihandle_lookup_offlink(). Traffic to D - * will go to the router R and probably won't - * reach the destination. - * The administrator then does 'ifconfig I2 up'. - * Since G is on the I2 subnet, this routine - * will flush its cache entry. It must also - * flush the cache entry for D, otherwise - * traffic will stay misrouted until the IRE - * times out. - */ - if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6, - ire->ire_mask_v6, ire->ire_addr_v6)) { - ire_delete(cire); - continue; - } - } - IRB_REFRELE(irb); - } - } else { + ire_dep_incr_generation(ire); + break; + case IRE_FLUSH_ADD: { + in6_addr_t addr; + in6_addr_t mask; + ip_stack_t *ipst = ire->ire_ipst; + uint_t masklen; + /* - * delete the cache entries based on - * handle in the IRE as this IRE is - * being deleted/changed. + * Find an IRE which is a shorter match than the ire to be added + * For any such IRE (which we repeat) we update the + * ire_generation the same way as in the delete case. */ - for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) { - irb = &ipst->ips_ip_cache_table_v6[i]; - if ((cire = irb->irb_ire) == NULL) - continue; - IRB_REFHOLD(irb); - for (cire = irb->irb_ire; cire != NULL; - cire = cire->ire_next) { - if (cire->ire_type != IRE_CACHE) - continue; - if ((cire->ire_phandle == 0 || - cire->ire_phandle != ire->ire_phandle) && - (cire->ire_ihandle == 0 || - cire->ire_ihandle != ire->ire_ihandle)) - continue; - ire_delete(cire); - } - IRB_REFRELE(irb); + addr = ire->ire_addr_v6; + mask = ire->ire_mask_v6; + masklen = ip_mask_to_plen_v6(&mask); + + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, + ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + while (ire != NULL) { + /* We need to handle all in the same bucket */ + irb_increment_generation(ire->ire_bucket); + + mask = ire->ire_mask_v6; + ASSERT(masklen > ip_mask_to_plen_v6(&mask)); + masklen = ip_mask_to_plen_v6(&mask); + ire_refrele(ire); + ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, + NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); + } } + break; } + rw_exit(&ipst->ips_ip6_ire_head_lock); } /* * Matches the arguments passed with the values in the ire. * - * Note: for match types that match using "ipif" passed in, ipif + * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. */ -static boolean_t +boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid, - uint32_t ihandle, const ts_label_t *tsl, int match_flags) + const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, + const ts_label_t *tsl, int match_flags) { in6_addr_t masked_addr; in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; - ill_t *ipif_ill = NULL; - ipif_t *src_ipif; + ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); ASSERT((!(match_flags & MATCH_IRE_ILL)) || - (ipif != NULL && ipif->ipif_isv6)); + (ill != NULL && ill->ill_isv6)); /* - * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it - * is in fact hidden, to ensure the caller gets the right one. One - * exception: if the caller passed MATCH_IRE_IHANDLE, then they - * already know the identity of the given IRE_INTERFACE entry and - * there's no point trying to hide it from them. + * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it + * is in fact hidden, to ensure the caller gets the right one. */ - if (ire->ire_marks & IRE_MARK_TESTHIDDEN) { - if (match_flags & MATCH_IRE_IHANDLE) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) + if (ire->ire_testhidden) { + if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* - * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is - * valid and does not match that of ire_zoneid, a failure to + * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid + * does not match that of ire_zoneid, a failure to * match is reported at this point. Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * - * IRE_CACHE and IRE_LOOPBACK entries should - * never be matched in this situation. + * IRE_LOOPBACK + * entries should never be matched in this situation. + * Each zone has its own IRE_LOOPBACK. * - * IRE entries that have an interface associated with them - * should in general not match unless they are an IRE_LOCAL - * or in the case when MATCH_IRE_DEFAULT has been set in - * the caller. In the case of the former, checking of the - * other fields supplied should take place. + * IRE_LOCAL + * We allow them for any zoneid. ire_route_recursive + * does additional checks when + * ip_restrict_interzone_loopback is set. * - * In the case where MATCH_IRE_DEFAULT has been set, - * all of the ipif's associated with the IRE's ill are - * checked to see if there is a matching zoneid. If any - * one ipif has a matching zoneid, this IRE is a - * potential candidate so checking of the other fields - * takes place. + * If ill_usesrc_ifindex is set + * Then we check if the zone has a valid source address + * on the usesrc ill. * - * In the case where the IRE_INTERFACE has a usable source - * address (indicated by ill_usesrc_ifindex) in the - * correct zone then it's permitted to return this IRE + * If ire_ill is set, then check that the zone has an ipif + * on that ill. + * + * Outside of this function (in ire_round_robin) we check + * that any IRE_OFFLINK has a gateway that reachable from the + * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); - if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK)) + if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); + + if (ire->ire_type & IRE_LOCAL) + goto matchit; + /* - * Note, IRE_INTERFACE can have the stq as NULL. For - * example, if the default multicast route is tied to - * the loopback address. + * The normal case of IRE_ONLINK has a matching zoneid. + * Here we handle the case when shared-IP zones have been + * configured with IP addresses on vniN. In that case it + * is ok for traffic from a zone to use IRE_ONLINK routes + * if the ill has a usesrc pointing at vniN + * Applies to IRE_INTERFACE. */ - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_stq != NULL)) { - dst_ill = (ill_t *)ire->ire_stq->q_ptr; + dst_ill = ire->ire_ill; + if (ire->ire_type & IRE_ONLINK) { + uint_t ifindex; + + /* + * Note there is no IRE_INTERFACE on vniN thus + * can't do an IRE lookup for a matching route. + */ + ifindex = dst_ill->ill_usesrc_ifindex; + if (ifindex == 0) + return (B_FALSE); + /* * If there is a usable source address in the - * zone, then it's ok to return an - * IRE_INTERFACE + * zone, then it's ok to return this IRE_INTERFACE */ - if ((dst_ill->ill_usesrc_ifindex != 0) && - (src_ipif = ipif_select_source_v6(dst_ill, addr, - B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid)) - != NULL) { - ip3dbg(("ire_match_args: src_ipif %p" - " dst_ill %p", (void *)src_ipif, - (void *)dst_ill)); - ipif_refrele(src_ipif); - } else { - ip3dbg(("ire_match_args: src_ipif NULL" + if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, + zoneid, ipst)) { + ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } - if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL && - !(ire->ire_type & IRE_INTERFACE)) { + /* + * For exampe, with + * route add 11.0.0.0 gw1 -ifp bge0 + * route add 11.0.0.0 gw2 -ifp bge1 + * this code would differentiate based on + * where the sending zone has addresses. + * Only if the zone has an address on bge0 can it use the first + * route. It isn't clear if this behavior is documented + * anywhere. + */ + if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; - if ((match_flags & MATCH_IRE_DEFAULT) == 0) - return (B_FALSE); - mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock); - for (tipif = ire->ire_ipif->ipif_ill->ill_ipif; + mutex_enter(&dst_ill->ill_lock); + for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { - if (IPIF_CAN_LOOKUP(tipif) && + if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } - mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock); + mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) return (B_FALSE); } } +matchit: if (match_flags & MATCH_IRE_GW) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } - - /* - * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to - * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means - * of getting a source address -- i.e., ire_src_addr_v6 == - * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this. - * - * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group. - * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for - * IPMP test traffic), then the ill must match exactly. - */ if (match_flags & MATCH_IRE_ILL) { - ire_ill = ire_to_ill(ire); - ipif_ill = ipif->ipif_ill; - } + ire_ill = ire->ire_ill; + /* + * If asked to match an ill, we *must* match + * on the ire_ill for ipmp test addresses, or + * any of the ill in the group for data addresses. + * If we don't, we may as well fail. + * However, we need an exception for IRE_LOCALs to ensure + * we loopback packets even sent to test addresses on different + * interfaces in the group. + */ + if ((match_flags & MATCH_IRE_TESTHIDDEN) && + !(ire->ire_type & IRE_LOCAL)) { + if (ire->ire_ill != ill) + return (B_FALSE); + } else { + match_flags &= ~MATCH_IRE_TESTHIDDEN; + /* + * We know that ill is not NULL, but ire_ill could be + * NULL + */ + if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) + return (B_FALSE); + } + } /* No ire_addr_v6 bits set past the mask */ ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6)); V6_MASK_COPY(*addr, *mask, masked_addr); - if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && ((!(match_flags & MATCH_IRE_GW)) || IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && - ((!(match_flags & MATCH_IRE_TYPE)) || - (ire->ire_type & type)) && - ((!(match_flags & MATCH_IRE_SRC)) || - IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6, - &ipif->ipif_v6src_addr)) && - ((!(match_flags & MATCH_IRE_IPIF)) || - (ire->ire_ipif == ipif)) && - ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) || - (ire->ire_marks & IRE_MARK_TESTHIDDEN)) && - ((!(match_flags & MATCH_IRE_ILL)) || - (ire_ill == ipif_ill || - (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) && - ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) && - ((!(match_flags & MATCH_IRE_IHANDLE)) || - (ire->ire_ihandle == ihandle)) && + ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && + ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && + ((!(match_flags & MATCH_IRE_MASK)) || + (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { @@ -1386,41 +856,38 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, } /* - * Lookup for a route in all the tables + * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified + * gateway address. If ill is non-NULL we also match on it. + * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. */ -ire_t * -ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) +boolean_t +ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, + const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { - ire_t *ire = NULL; + ire_t *ire; + uint_t match_flags; - /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. - */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (lock_held) + ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); + else + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); - /* - * might be asking for a cache lookup, - * This is not best way to lookup cache, - * user should call ire_cache_lookup directly. - * - * If MATCH_IRE_TYPE was set, first lookup in the cache table and then - * in the forwarding table, if the applicable type flags were set. - */ - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) { - ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid, - tsl, flags, ipst); - if (ire != NULL) - return (ire); - } - if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) { - ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif, - pire, zoneid, 0, tsl, flags, ipst); + match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; + if (ill != NULL) + match_flags |= MATCH_IRE_ILL; + + ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, + &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, + ipst); + + if (!lock_held) + rw_exit(&ipst->ips_ip6_ire_head_lock); + if (ire != NULL) { + ire_refrele(ire); + return (B_TRUE); + } else { + return (B_FALSE); } - return (ire); } /* @@ -1429,63 +896,121 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, * required parameters and indicating the * match required in flag field. * - * Looking for default route can be done in three ways - * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field - * along with other matches. - * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags - * field along with other matches. - * 3) if the destination and mask are passed as zeros. - * - * A request to return a default route if no route - * is found, can be specified by setting MATCH_IRE_DEFAULT - * in flags. - * - * It does not support recursion more than one level. It - * will do recursive lookup only when the lookup maps to - * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed. - * - * If the routing table is setup to allow more than one level - * of recursion, the cleaning up cache table will not work resulting - * in invalid routing. - * * Supports link-local addresses by following the ipif/ill when recursing. - * - * NOTE : When this function returns NULL, pire has already been released. - * pire is valid only when this function successfully returns an - * ire. */ ire_t * ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, - const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire, - zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags, - ip_stack_t *ipst) + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { - irb_t *irb_ptr; - ire_t *rire; ire_t *ire = NULL; - ire_t *saved_ire; - nce_t *nce; - int i; - in6_addr_t gw_addr_v6; ASSERT(addr != NULL); ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); - ASSERT(ipif == NULL || ipif->ipif_isv6); + ASSERT(ill == NULL || ill->ill_isv6); + + ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); /* - * When we return NULL from this function, we should make - * sure that *pire is NULL so that the callers will not - * wrongly REFRELE the pire. + * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL + * is set. */ - if (pire != NULL) - *pire = NULL; + if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) + return (NULL); + + rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); + ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, + tsl, flags, ipst); + if (ire == NULL) { + rw_exit(&ipst->ips_ip6_ire_head_lock); + return (NULL); + } + /* - * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or - * MATCH_IRE_ILL is set. + * round-robin only if we have more than one route in the bucket. + * ips_ip_ecmp_behavior controls when we do ECMP + * 2: always + * 1: for IRE_DEFAULT and /0 IRE_INTERFACE + * 0: never + * + * Note: if we found an IRE_IF_CLONE we won't look at the bucket with + * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match + * and the IRE_INTERFACESs are likely to be shorter matches. */ - if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL)) - return (NULL); + if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { + if (ipst->ips_ip_ecmp_behavior == 2 || + (ipst->ips_ip_ecmp_behavior == 1 && + IS_DEFAULT_ROUTE_V6(ire))) { + ire_t *next_ire; + ire_ftable_args_t margs; + + (void) memset(&margs, 0, sizeof (margs)); + margs.ift_addr_v6 = *addr; + if (mask != NULL) + margs.ift_mask_v6 = *mask; + if (gateway != NULL) + margs.ift_gateway_v6 = *gateway; + margs.ift_type = type; + margs.ift_ill = ill; + margs.ift_zoneid = zoneid; + margs.ift_tsl = tsl; + margs.ift_flags = flags; + + next_ire = ire_round_robin(ire->ire_bucket, &margs, + xmit_hint, ire, ipst); + if (next_ire == NULL) { + /* keep ire if next_ire is null */ + goto done; + } + ire_refrele(ire); + ire = next_ire; + } + } + +done: + /* Return generation before dropping lock */ + if (generationp != NULL) + *generationp = ire->ire_generation; + + rw_exit(&ipst->ips_ip6_ire_head_lock); + + /* + * For shared-IP zones we need additional checks to what was + * done in ire_match_args to make sure IRE_LOCALs are handled. + * + * When ip_restrict_interzone_loopback is set, then + * we ensure that IRE_LOCAL are only used for loopback + * between zones when the logical "Ethernet" would + * have looped them back. That is, if in the absense of + * the IRE_LOCAL we would have sent to packet out the + * same ill. + */ + if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && + ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && + ipst->ips_ip_restrict_interzone_loopback) { + ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); + ASSERT(ire != NULL); + } + + return (ire); +} + +/* + * Look up a single ire. The caller holds either the read or write lock. + */ +ire_t * +ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, + const in6_addr_t *gateway, int type, const ill_t *ill, + zoneid_t zoneid, const ts_label_t *tsl, int flags, + ip_stack_t *ipst) +{ + irb_t *irb_ptr; + ire_t *ire = NULL; + int i; + + ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); /* * If the mask is known, the lookup @@ -1496,28 +1021,41 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, uint_t masklen; masklen = ip_mask_to_plen_v6(mask); - if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) + if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { return (NULL); + } irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ IRE_ADDR_MASK_HASH_V6(*addr, *mask, ipst->ips_ip6_ftable_hash_size)]); rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, mask, gateway, type, - ipif, zoneid, ihandle, tsl, flags)) + ill, zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } else { + uint_t masklen; + /* * In this case we don't know the mask, we need to * search the table assuming different mask sizes. - * we start with 128 bit mask, we don't allow default here. */ - for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) { + if (flags & MATCH_IRE_SHORTERMASK) { + masklen = ip_mask_to_plen_v6(mask); + if (masklen == 0) { + /* Nothing shorter than zero */ + return (NULL); + } + masklen--; + } else { + masklen = IP6_MASK_TABLE_SIZE - 1; + } + + for (i = masklen; i >= 0; i--) { in6_addr_t tmpmask; if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) @@ -1529,1334 +1067,415 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) + if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, - &ire->ire_mask_v6, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) + &ire->ire_mask_v6, gateway, type, ill, + zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } } - - /* - * We come here if no route has yet been found. - * - * Handle the case where default route is - * requested by specifying type as one of the possible - * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE). - * - * If MATCH_IRE_MASK is specified, then the appropriate default route - * would have been found above if it exists so it isn't looked up here. - * If MATCH_IRE_DEFAULT was also specified, then a default route will be - * searched for later. - */ - if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE && - (type & (IRE_DEFAULT | IRE_INTERFACE))) { - if (ipst->ips_ip_forwarding_table_v6[0] != NULL) { - /* addr & mask is zero for defaults */ - irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][ - IRE_ADDR_HASH_V6(ipv6_all_zeros, - ipst->ips_ip6_ftable_hash_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) - goto found_ire; - } - rw_exit(&irb_ptr->irb_lock); - } - } - /* - * We come here only if no route is found. - * see if the default route can be used which is allowed - * only if the default matching criteria is specified. - * The ipv6_ire_default_count tracks the number of IRE_DEFAULT - * entries. However, the ip_forwarding_table_v6[0] also contains - * interface routes thus the count can be zero. - */ - saved_ire = NULL; - if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) == - MATCH_IRE_DEFAULT) { - ire_t *ire_origin; - uint_t g_index; - uint_t index; - - if (ipst->ips_ip_forwarding_table_v6[0] == NULL) - return (NULL); - irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0]; - - /* - * Keep a tab on the bucket while looking the IRE_DEFAULT - * entries. We need to keep track of a particular IRE - * (ire_origin) so this ensures that it will not be unlinked - * from the hash list during the recursive lookup below. - */ - IRB_REFHOLD(irb_ptr); - ire = irb_ptr->irb_ire; - if (ire == NULL) { - IRB_REFRELE(irb_ptr); - return (NULL); - } - - /* - * Get the index first, since it can be changed by other - * threads. Then get to the right default route skipping - * default interface routes if any. As we hold a reference on - * the IRE bucket, ipv6_ire_default_count can only increase so - * we can't reach the end of the hash list unexpectedly. - */ - if (ipst->ips_ipv6_ire_default_count != 0) { - g_index = ipst->ips_ipv6_ire_default_index++; - index = g_index % ipst->ips_ipv6_ire_default_count; - while (index != 0) { - if (!(ire->ire_type & IRE_INTERFACE)) - index--; - ire = ire->ire_next; - } - ASSERT(ire != NULL); - } else { - /* - * No default route, so we only have default interface - * routes: don't enter the first loop. - */ - ire = NULL; - } - - /* - * Round-robin the default routers list looking for a neighbor - * that matches the passed in parameters and is reachable. If - * none found, just return a route from the default router list - * if it exists. If we can't find a default route (IRE_DEFAULT), - * look for interface default routes. - * We start with the ire we found above and we walk the hash - * list until we're back where we started, see - * ire_get_next_default_ire(). It doesn't matter if default - * routes are added or deleted by other threads - we know this - * ire will stay in the list because we hold a reference on the - * ire bucket. - * NB: if we only have interface default routes, ire is NULL so - * we don't even enter this loop (see above). - */ - ire_origin = ire; - for (; ire != NULL; - ire = ire_get_next_default_ire(ire, ire_origin)) { - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - int match_flags; - - /* - * We have something to work with. - * If we can find a resolved/reachable - * entry, we will use this. Otherwise - * we'll try to find an entry that has - * a resolved cache entry. We will fallback - * on this if we don't find anything else. - */ - if (saved_ire == NULL) - saved_ire = ire; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR; - rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL, - 0, ire->ire_ipif, zoneid, tsl, match_flags, - ipst); - if (rire != NULL) { - nce = rire->ire_nce; - if (nce != NULL && - NCE_ISREACHABLE(nce) && - nce->nce_flags & NCE_F_ISROUTER) { - ire_refrele(rire); - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else if (nce != NULL && - !(nce->nce_flags & - NCE_F_ISROUTER)) { - /* - * Make sure we don't use - * this ire - */ - if (saved_ire == ire) - saved_ire = NULL; - } - ire_refrele(rire); - } else if (ipst-> - ips_ipv6_ire_default_count > 1 && - zoneid != GLOBAL_ZONEID) { - /* - * When we're in a local zone, we're - * only interested in default routers - * that are reachable through ipifs - * within our zone. - * The potentially expensive call to - * ire_route_lookup_v6() is avoided when - * we have only one default route. - */ - int ire_match_flags = MATCH_IRE_TYPE | - MATCH_IRE_SECATTR; - - if (ire->ire_ipif != NULL) { - ire_match_flags |= - MATCH_IRE_ILL; - } - rire = ire_route_lookup_v6(&gw_addr_v6, - NULL, NULL, IRE_INTERFACE, - ire->ire_ipif, NULL, - zoneid, tsl, ire_match_flags, ipst); - if (rire != NULL) { - ire_refrele(rire); - saved_ire = ire; - } else if (saved_ire == ire) { - /* - * Make sure we don't use - * this ire - */ - saved_ire = NULL; - } - } - } - } - if (saved_ire != NULL) { - ire = saved_ire; - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } else { - /* - * Look for a interface default route matching the - * args passed in. No round robin here. Just pick - * the right one. - */ - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - - if (!(ire->ire_type & IRE_INTERFACE)) - continue; - - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - if (ire_match_args_v6(ire, addr, - &ipv6_all_zeros, gateway, type, ipif, - zoneid, ihandle, tsl, flags)) { - IRE_REFHOLD(ire); - IRB_REFRELE(irb_ptr); - goto found_ire_held; - } - } - IRB_REFRELE(irb_ptr); - } - } ASSERT(ire == NULL); ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); return (NULL); + found_ire: - ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0); - IRE_REFHOLD(ire); + ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); - -found_ire_held: - if ((flags & MATCH_IRE_RJ_BHOLE) && - (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) { - return (ire); - } - /* - * At this point, IRE that was found must be an IRE_FORWARDTABLE - * or IRE_CACHETABLE type. If this is a recursive lookup and an - * IRE_INTERFACE type was found, return that. If it was some other - * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it - * is necessary to fill in the parent IRE pointed to by pire, and - * then lookup the gateway address of the parent. For backwards - * compatiblity, if this lookup returns an - * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level - * of lookup is done. - */ - if (flags & MATCH_IRE_RECURSIVE) { - const ipif_t *gw_ipif; - int match_flags = MATCH_IRE_DSTONLY; - - if (ire->ire_type & IRE_INTERFACE) - return (ire); - if (pire != NULL) - *pire = ire; - /* - * If we can't find an IRE_INTERFACE or the caller has not - * asked for pire, we need to REFRELE the saved_ire. - */ - saved_ire = ire; - - if (ire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0, - ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). - */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } - if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) { - /* - * If the caller did not ask for pire, release - * it now. - */ - if (pire == NULL) { - ire_refrele(saved_ire); - } - return (ire); - } - match_flags |= MATCH_IRE_TYPE; - mutex_enter(&ire->ire_lock); - gw_addr_v6 = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, - (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid, - NULL, match_flags, ipst); - if (ire == NULL) { - /* - * In this case we have to deal with the - * MATCH_IRE_PARENT flag, which means the - * parent has to be returned if ire is NULL. - * The aim of this is to have (at least) a starting - * ire when we want to look at all of the ires in a - * bucket aimed at a single destination (as is the - * case in ip_newroute_v6 for the RTF_MULTIRT - * flagged routes). - */ - if (flags & MATCH_IRE_PARENT) { - if (pire != NULL) { - /* - * Need an extra REFHOLD, if the - * parent ire is returned via both - * ire and pire. - */ - IRE_REFHOLD(saved_ire); - } - ire = saved_ire; - } else { - ire_refrele(saved_ire); - if (pire != NULL) - *pire = NULL; - } - return (ire); - } else if (pire == NULL) { - /* - * If the caller did not ask for pire, release - * it now. - */ - ire_refrele(saved_ire); - } - return (ire); - } - - ASSERT(pire == NULL || *pire == NULL); return (ire); } -/* - * Delete the IRE cache for the gateway and all IRE caches whose - * ire_gateway_addr_v6 points to this gateway, and allow them to - * be created on demand by ip_newroute_v6. - */ -void -ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid, - ip_stack_t *ipst) -{ - irb_t *irb; - ire_t *ire; - - irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - IRB_REFHOLD(irb); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0, - IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) { - ire_delete(ire); - } - } - IRB_REFRELE(irb); - - ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst); -} - -/* - * Looks up cache table for a route. - * specific lookup can be indicated by - * passing the MATCH_* flags and the - * necessary parameters. - */ -ire_t * -ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway, - int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl, - int flags, ip_stack_t *ipst) -{ - ire_ctable_args_t margs; - - margs.ict_addr = (void *)addr; - margs.ict_gateway = (void *)gateway; - margs.ict_type = type; - margs.ict_ipif = ipif; - margs.ict_zoneid = zoneid; - margs.ict_tsl = tsl; - margs.ict_flags = flags; - margs.ict_ipst = ipst; - margs.ict_wq = NULL; - - return (ip6_ctable_lookup_impl(&margs)); -} /* - * Lookup cache. + * This function is called by + * ip_input/ire_route_recursive when doing a route lookup on only the + * destination address. * - * In general the zoneid has to match (where ALL_ZONES match all of them). - * But for IRE_LOCAL we also need to handle the case where L2 should - * conceptually loop back the packet. This is necessary since neither - * Ethernet drivers nor Ethernet hardware loops back packets sent to their - * own MAC address. This loopback is needed when the normal - * routes (ignoring IREs with different zoneids) would send out the packet on - * the same ill as the ill with which this IRE_LOCAL is associated. + * The optimizations of this function over ire_ftable_lookup are: + * o removing unnecessary flag matching + * o doing longest prefix match instead of overloading it further + * with the unnecessary "best_prefix_match" * - * Earlier versions of this code always matched an IRE_LOCAL independently of - * the zoneid. We preserve that earlier behavior when - * ip_restrict_interzone_loopback is turned off. + * If no route is found we return IRE_NOROUTE. */ ire_t * -ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - irb_t *irb_ptr; - ire_t *ire; - - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr, - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) { - if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) { - /* - * Finally, check if the security policy has any - * restriction on using this route for the specified - * message. - */ - if (tsl != NULL && - ire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(ire, tsl) != 0) { - continue; - } - - if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid || - ire->ire_zoneid == ALL_ZONES) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - - if (ire->ire_type == IRE_LOCAL) { - if (ipst->ips_ip_restrict_interzone_loopback && - !ire_local_ok_across_zones(ire, zoneid, - (void *)addr, tsl, ipst)) - continue; - - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - } - rw_exit(&irb_ptr->irb_lock); - return (NULL); -} - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an onlink destn. or - * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER - * case for xresolv interfaces, after the ire has come back from - * an external resolver. - */ -static ire_t * -ire_ihandle_lookup_onlink_v6(ire_t *cire) +ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, + ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; - int match_flags; - int i; - int j; - irb_t *irb_ptr; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute_v6() created 'cire' for an on-link destn. - * it set its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, - NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * In the resolver/noresolver case, ip_newroute_v6() thinks - * it is creating the cache ire for an onlink destination in 'cire'. - * But 'cire' is not actually onlink, because ire_ftable_lookup_v6() - * cheated it, by doing ire_route_lookup_v6() twice and returning an - * interface ire. - * - * Eg. default - gw1 (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * In the above example, ip_newroute_v6() tried to create the cache ire - * 'cire' for gw1, based on the interface route in line 3. The - * ire_ftable_lookup_v6() above fails, because there is - * no interface route to reach gw1. (it is gw2). We fall thru below. - * - * Do a brute force search based on the ihandle in a subset of the - * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise - * things become very complex, since we don't have 'pire' in this - * case. (Also note that this method is not possible in the offlink - * case because we don't know the mask) - */ - i = ip_mask_to_plen_v6(&cire->ire_cmask_v6); - if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) - return (NULL); - for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) { - irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; - ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - if ((ire->ire_type & IRE_INTERFACE) && - (ire->ire_ihandle == cire->ire_ihandle)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } - } - rw_exit(&irb_ptr->irb_lock); + ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, + MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); + if (ire == NULL) { + ire = ire_reject(ipst, B_TRUE); + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; } - return (NULL); + /* ftable_lookup did round robin */ + return (ire); } - -/* - * Locate the interface ire that is tied to the cache ire 'cire' via - * cire->ire_ihandle. - * - * We are trying to create the cache ire for an offlink destn based - * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire - * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in - * the IRE_CACHE case. - */ ire_t * -ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire) +ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, + uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) { - ire_t *ire; - int match_flags; - in6_addr_t gw_addr; - ipif_t *gw_ipif; - ip_stack_t *ipst = cire->ire_ipst; - - ASSERT(cire != NULL && pire != NULL); + ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); - match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - /* - * We know that the mask of the interface ire equals cire->ire_cmask. - * (When ip_newroute_v6() created 'cire' for an on-link destn. it set - * its cmask from the interface ire's mask) - */ - ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0, - IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - if (ire != NULL) - return (ire); - /* - * If we didn't find an interface ire above, we can't declare failure. - * For backwards compatibility, we need to support prefix routes - * pointing to next hop gateways that are not on-link. - * - * Assume we are trying to ping some offlink destn, and we have the - * routing table below. - * - * Eg. default - gw1 <--- pire (line 1) - * gw1 - gw2 (line 2) - * gw2 - hme0 (line 3) - * - * If we already have a cache ire for gw1 in 'cire', the - * ire_ftable_lookup_v6 above would have failed, since there is no - * interface ire to reach gw1. We will fallthru below. - * - * Here we duplicate the steps that ire_ftable_lookup_v6() did in - * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case. - * The differences are the following - * i. We want the interface ire only, so we call - * ire_ftable_lookup_v6() instead of ire_route_lookup_v6() - * ii. We look for only prefix routes in the 1st call below. - * ii. We want to match on the ihandle in the 2nd call below. - */ - match_flags = MATCH_IRE_TYPE; - if (pire->ire_ipif != NULL) - match_flags |= MATCH_IRE_ILL; - - mutex_enter(&pire->ire_lock); - gw_addr = pire->ire_gateway_addr_v6; - mutex_exit(&pire->ire_lock); - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET, - pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst); - if (ire == NULL) - return (NULL); - /* - * At this point 'ire' corresponds to the entry shown in line 2. - * gw_addr is 'gw2' in the example above. - */ - mutex_enter(&ire->ire_lock); - gw_addr = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ipif = ire->ire_ipif; - ire_refrele(ire); - - match_flags |= MATCH_IRE_IHANDLE; - ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE, - gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle, - NULL, match_flags, ipst); - return (ire); + return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, + multirtp)); } /* - * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER - * ire associated with the specified ipif. + * Recursively look for a route to the destination. Can also match on + * the zoneid, ill, and label. Used for the data paths. See also + * ire_route_recursive_dstonly. * - * This might occasionally be called when IPIF_UP is not set since - * the IPV6_MULTICAST_IF as well as creating interface routes - * allows specifying a down ipif (ipif_lookup* match ipifs that are down). + * If ill is set this means we will match it by adding MATCH_IRE_ILL. * - * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on - * the ipif this routine might return NULL. - * (Sometimes called as writer though not required by this function.) + * If allocate is not set then we will only inspect the existing IREs; never + * create an IRE_IF_CLONE. This is used on the receive side when we are not + * forwarding. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. */ ire_t * -ipif_to_ire_v6(const ipif_t *ipif) +ire_route_recursive_impl_v6(ire_t *ire, + const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, + zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF; + int i, j; + in6_addr_t v6nexthop = *nexthop; + ire_t *ires[MAX_IRE_RECURSION]; + uint_t generation; + uint_t generations[MAX_IRE_RECURSION]; + boolean_t need_refrele = B_FALSE; + boolean_t invalidate = B_FALSE; + int prefs[MAX_IRE_RECURSION]; + ill_t *ill = NULL; + + if (setsrcp != NULL) + ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); + if (gwattrp != NULL) + ASSERT(*gwattrp == NULL); + + if (ill_arg != NULL) + match_args |= MATCH_IRE_ILL; /* - * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN - * so that they aren't accidentally returned. However, if the - * caller's ipif is on an ill under IPMP, there's no need to hide 'em. + * We iterate up to three times to resolve a route, even though + * we have four slots in the array. The extra slot is for an + * IRE_IF_CLONE we might need to create. */ - if (IS_UNDER_IPMP(ipif->ipif_ill)) - match_flags |= MATCH_IRE_MARK_TESTHIDDEN; - - ASSERT(ipif->ipif_isv6); - if (ipif->ipif_ire_type == IRE_LOOPBACK) { - ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL, - IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst); - } else if (ipif->ipif_flags & IPIF_POINTOPOINT) { - /* In this case we need to lookup destination address. */ - ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr, - &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES, - 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } else { - ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet, - &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL, - ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst); - } - return (ire); -} - -/* - * Return B_TRUE if a multirt route is resolvable - * (or if no route is resolved yet), B_FALSE otherwise. - * This only works in the global zone. - */ -boolean_t -ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl, - ip_stack_t *ipst) -{ - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb; - irb_t *cirb; - int unres_cnt = 0; - boolean_t resolvable = B_FALSE; - - /* Retrieve the first IRE_HOST that matches the destination */ - first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST, - NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No route at all */ - if (first_fire == NULL) { - return (B_TRUE); - } - - firb = first_fire->ire_bucket; - ASSERT(firb); - - /* Retrieve the first IRE_CACHE ire for that destination. */ - first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst); - - /* No resolved route. */ - if (first_cire == NULL) { - ire_refrele(first_fire); - return (B_TRUE); - } - - /* At least one route is resolved. */ - - cirb = first_cire->ire_bucket; - ASSERT(cirb); - - /* Count the number of routes to that dest that are declared. */ - IRB_REFHOLD(firb); - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp)) - continue; - unres_cnt++; - } - IRB_REFRELE(firb); - - - /* Then subtract the number of routes to that dst that are resolved */ - IRB_REFHOLD(cirb); - for (cire = first_cire; cire != NULL; cire = cire->ire_next) { - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp)) - continue; - if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN)) - continue; - unres_cnt--; - } - IRB_REFRELE(cirb); - - /* At least one route is unresolved; search for a resolvable route. */ - if (unres_cnt > 0) - resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire, - MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst); - - if (first_fire) - ire_refrele(first_fire); - - if (first_cire) - ire_refrele(first_cire); - - return (resolvable); -} - - -/* - * Return B_TRUE and update *ire_arg and *fire_arg - * if at least one resolvable route is found. - * Return B_FALSE otherwise (all routes are resolved or - * the remaining unresolved routes are all unresolvable). - * This only works in the global zone. - */ -boolean_t -ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags, - const ts_label_t *tsl, ip_stack_t *ipst) -{ - clock_t delta; - ire_t *best_fire = NULL; - ire_t *best_cire = NULL; - ire_t *first_fire; - ire_t *first_cire; - ire_t *fire; - ire_t *cire; - irb_t *firb = NULL; - irb_t *cirb = NULL; - ire_t *gw_ire; - boolean_t already_resolved; - boolean_t res; - in6_addr_t v6dst; - in6_addr_t v6gw; - - ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, " - "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags)); - - ASSERT(ire_arg); - ASSERT(fire_arg); - - /* Not an IRE_HOST ire; give up. */ - if ((*fire_arg == NULL) || - ((*fire_arg)->ire_type != IRE_HOST)) { - return (B_FALSE); - } + i = 0; + while (i < MAX_IRE_RECURSION - 1) { + /* ire_ftable_lookup handles round-robin/ECMP */ + if (ire == NULL) { + ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, + (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, + match_args, xmit_hint, ipst, &generation); + } else { + /* Caller passed it; extra hold since we will rele */ + ire_refhold(ire); + if (generationp != NULL) + generation = *generationp; + else + generation = IRE_GENERATION_VERIFY; + } - /* This is the first IRE_HOST ire for that destination. */ - first_fire = *fire_arg; - firb = first_fire->ire_bucket; - ASSERT(firb); + if (ire == NULL) + ire = ire_reject(ipst, B_TRUE); - mutex_enter(&first_fire->ire_lock); - v6dst = first_fire->ire_addr_v6; - mutex_exit(&first_fire->ire_lock); + /* Need to return the ire with RTF_REJECT|BLACKHOLE */ + if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) + goto error; - ip2dbg(("ire_multirt_lookup_v6: dst %08x\n", - ntohl(V4_PART_OF_V6(v6dst)))); + ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ - /* - * Retrieve the first IRE_CACHE ire for that destination; - * if we don't find one, no route for that dest is - * resolved yet. - */ - first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst); - if (first_cire) { - cirb = first_cire->ire_bucket; - } - - ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire)); + prefs[i] = ire_pref(ire); + if (i != 0) { + /* + * Don't allow anything unusual past the first + * iteration. + */ + if ((ire->ire_type & + (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || + prefs[i] <= prefs[i-1]) { + ire_refrele(ire); + ire = ire_reject(ipst, B_TRUE); + goto error; + } + } + /* We have a usable IRE */ + ires[i] = ire; + generations[i] = generation; + i++; + + /* The first RTF_SETSRC address is passed back if setsrcp */ + if ((ire->ire_flags & RTF_SETSRC) && + setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { + ASSERT(!IN6_IS_ADDR_UNSPECIFIED( + &ire->ire_setsrc_addr_v6)); + *setsrcp = ire->ire_setsrc_addr_v6; + } - /* - * Search for a resolvable route, giving the top priority - * to routes that can be resolved without any call to the resolver. - */ - IRB_REFHOLD(firb); + /* The first ire_gw_secattr is passed back if gwattrp */ + if (ire->ire_gw_secattr != NULL && + gwattrp != NULL && *gwattrp == NULL) + *gwattrp = ire->ire_gw_secattr; - if (!IN6_IS_ADDR_MULTICAST(&v6dst)) { /* - * For all multiroute IRE_HOST ires for that destination, - * check if the route via the IRE_HOST's gateway is - * resolved yet. + * Check if we have a short-cut pointer to an IRE for this + * destination, and that the cached dependency isn't stale. + * In that case we've rejoined an existing tree towards a + * parent, thus we don't need to continue the loop to + * discover the rest of the tree. */ - for (fire = first_fire; fire != NULL; fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - ip2dbg(("ire_multirt_lookup_v6: fire %p, " - "ire_addr %08x, ire_gateway_addr %08x\n", - (void *)fire, - ntohl(V4_PART_OF_V6(fire->ire_addr_v6)), - ntohl(V4_PART_OF_V6(v6gw)))); + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + ire = NULL; + goto done; + } + mutex_exit(&ire->ire_lock); - already_resolved = B_FALSE; + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) { + ire = NULL; + goto done; + } - if (first_cire) { - ASSERT(cirb); + ASSERT(!(ire->ire_type & IRE_IF_CLONE)); + /* + * For an IRE_INTERFACE we create an IRE_IF_CLONE for this + * particular destination + */ + if (ire->ire_type & IRE_INTERFACE) { + ire_t *clone; - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - (IRE_MARK_CONDEMNED| - IRE_MARK_TESTHIDDEN)) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Check if the IRE_CACHE's gateway - * matches the IRE_HOST's gateway. - */ - if (IN6_ARE_ADDR_EQUAL( - &cire->ire_gateway_addr_v6, - &v6gw)) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } + ASSERT(ire->ire_masklen != IPV6_ABITS); /* - * This route is already resolved; - * proceed with next one. + * In the case of ip_input and ILLF_FORWARDING not + * being set, and in the case of RTM_GET, + * there is no point in allocating + * an IRE_IF_CLONE. We return the IRE_INTERFACE. + * Note that !allocate can result in a ire_dep_parent + * which is IRE_IF_* without an IRE_IF_CLONE. + * We recover from that when we need to send packets + * by ensuring that the generations become + * IRE_GENERATION_VERIFY in this case. */ - if (already_resolved) { - ip2dbg(("ire_multirt_lookup_v6: found cire %p, " - "already resolved\n", (void *)cire)); - continue; + if (!allocate) { + invalidate = B_TRUE; + ire = NULL; + goto done; } - /* - * The route is unresolved; is it actually - * resolvable, i.e. is there a cache or a resolver - * for the gateway? - */ - gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL, - ALL_ZONES, tsl, MATCH_IRE_RECURSIVE | - MATCH_IRE_SECATTR, ipst); - - ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n", - (void *)gw_ire)); - - /* - * This route can be resolved without any call to the - * resolver; if the MULTIRT_CACHEGW flag is set, - * give the top priority to this ire and exit the - * loop. - * This occurs when an resolver reply is processed - * through ip_wput_nondata() - */ - if ((flags & MULTIRT_CACHEGW) && - (gw_ire != NULL) && - (gw_ire->ire_type & IRE_CACHETABLE)) { + clone = ire_create_if_clone(ire, &v6nexthop, + &generation); + if (clone == NULL) { /* - * Release the resolver associated to the - * previous candidate best ire, if any. + * Temporary failure - no memory. + * Don't want caller to cache IRE_NOROUTE. */ - if (best_cire) { - ire_refrele(best_cire); - ASSERT(best_fire); - } - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6: found top prio " - "best_fire %p, best_cire %p\n", - (void *)best_fire, (void *)best_cire)); - break; + invalidate = B_TRUE; + ire = ire_blackhole(ipst, B_TRUE); + goto error; } - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take that - * route into account only if this time interval - * exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. + * Make clone next to last entry and the + * IRE_INTERFACE the last in the dependency + * chain since the clone depends on the + * IRE_INTERFACE. */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); + ASSERT(i >= 1); + ASSERT(i < MAX_IRE_RECURSION); - ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, " - "res %d\n", - (void *)fire, delta, res)); - - if (res) { - /* - * A resolver exists for the gateway: save - * the current IRE_HOST ire as a candidate - * best ire. If we later discover that a - * top priority ire exists (i.e. no need to - * call the resolver), then this new ire - * will be preferred to the current one. - */ - if (gw_ire != NULL) { - if (best_fire == NULL) { - ASSERT(best_cire == NULL); - - best_fire = fire; - best_cire = gw_ire; - - ip2dbg(("ire_multirt_lookup_v6:" - "found candidate " - "best_fire %p, " - "best_cire %p\n", - (void *)best_fire, - (void *)best_cire)); - - /* - * If MULTIRT_CACHEGW is not - * set, we ignore the top - * priority ires that can - * be resolved without any - * call to the resolver; - * In that case, there is - * actually no need - * to continue the loop. - */ - if (!(flags & - MULTIRT_CACHEGW)) { - break; - } - continue; - } - } else { - /* - * No resolver for the gateway: the - * route is not resolvable. - * If the MULTIRT_SETSTAMP flag is - * set, we stamp the IRE_HOST ire, - * so we will not select it again - * during this resolution interval. - */ - if (flags & MULTIRT_SETSTAMP) - fire->ire_last_used_time = - lbolt; - } - } + ires[i] = ires[i-1]; + generations[i] = generations[i-1]; + ires[i-1] = clone; + generations[i-1] = generation; + i++; - if (gw_ire != NULL) - ire_refrele(gw_ire); + ire = NULL; + goto done; } - } else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */ - for (fire = first_fire; - fire != NULL; - fire = fire->ire_next) { - - if (!(fire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst)) - continue; - - if (fire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(fire, tsl) != 0) { - continue; - } - - already_resolved = B_FALSE; - - mutex_enter(&fire->ire_lock); - v6gw = fire->ire_gateway_addr_v6; - mutex_exit(&fire->ire_lock); - - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl, - MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE | - MATCH_IRE_SECATTR, ipst); - - /* No resolver for the gateway; we skip this ire. */ - if (gw_ire == NULL) { - continue; - } + /* + * We only match on the type and optionally ILL when + * recursing. The type match is used by some callers + * to exclude certain types (such as IRE_IF_CLONE or + * IRE_LOCAL|IRE_LOOPBACK). + */ + match_args &= MATCH_IRE_TYPE; + v6nexthop = ire->ire_gateway_addr_v6; + if (ill == NULL && ire->ire_ill != NULL) { + ill = ire->ire_ill; + need_refrele = B_TRUE; + ill_refhold(ill); + match_args |= MATCH_IRE_ILL; + } - if (first_cire) { + ire = NULL; + } + ASSERT(ire == NULL); + ire = ire_reject(ipst, B_TRUE); - IRB_REFHOLD(cirb); - /* - * For all IRE_CACHE ires for that - * destination. - */ - for (cire = first_cire; - cire != NULL; - cire = cire->ire_next) { - - if (!(cire->ire_flags & RTF_MULTIRT)) - continue; - if (!IN6_ARE_ADDR_EQUAL( - &cire->ire_addr_v6, &v6dst)) - continue; - if (cire->ire_marks & - IRE_MARK_CONDEMNED) - continue; - - if (cire->ire_gw_secattr != NULL && - tsol_ire_match_gwattr(cire, - tsl) != 0) { - continue; - } - - /* - * Cache entries are linked to the - * parent routes using the parent handle - * (ire_phandle). If no cache entry has - * the same handle as fire, fire is - * still unresolved. - */ - ASSERT(cire->ire_phandle != 0); - if (cire->ire_phandle == - fire->ire_phandle) { - already_resolved = B_TRUE; - break; - } - } - IRB_REFRELE(cirb); - } +error: + ASSERT(ire != NULL); + if (need_refrele) + ill_refrele(ill); - /* - * This route is already resolved; proceed with - * next one. - */ - if (already_resolved) { - ire_refrele(gw_ire); - continue; - } + /* + * In the case of MULTIRT we want to try a different IRE the next + * time. We let the next packet retry in that case. + */ + if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) + (void) ire_no_good(ires[0]); - /* - * Compute the time elapsed since our preceding - * attempt to resolve that route. - * If the MULTIRT_USESTAMP flag is set, we take - * that route into account only if this time - * interval exceeds ip_multirt_resolution_interval; - * this prevents us from attempting to resolve a - * broken route upon each sending of a packet. - */ - delta = lbolt - fire->ire_last_used_time; - delta = TICK_TO_MSEC(delta); - - res = (boolean_t) - ((delta > ipst-> - ips_ip_multirt_resolution_interval) || - (!(flags & MULTIRT_USESTAMP))); - - ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, " - "flags %04x, res %d\n", - (void *)fire, delta, flags, res)); - - if (res) { - if (best_cire) { - /* - * Release the resolver associated - * to the preceding candidate best - * ire, if any. - */ - ire_refrele(best_cire); - ASSERT(best_fire); - } - best_fire = fire; - best_cire = gw_ire; - continue; - } +cleanup: + /* cleanup ires[i] */ + ire_dep_unbuild(ires, i); + for (j = 0; j < i; j++) + ire_refrele(ires[j]); - ire_refrele(gw_ire); - } - } + ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); + /* + * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the + * ip_select_route since the reject or lack of memory might be gone. + */ + if (generationp != NULL) + *generationp = IRE_GENERATION_VERIFY; + return (ire); - if (best_fire) { - IRE_REFHOLD(best_fire); +done: + ASSERT(ire == NULL); + if (need_refrele) + ill_refrele(ill); + + /* Build dependencies */ + if (!ire_dep_build(ires, generations, i)) { + /* Something in chain was condemned; tear it apart */ + ire = ire_blackhole(ipst, B_TRUE); + goto cleanup; } - IRB_REFRELE(firb); - /* Release the first IRE_CACHE we initially looked up, if any. */ - if (first_cire) - ire_refrele(first_cire); - - /* Found a resolvable route. */ - if (best_fire) { - ASSERT(best_cire); - - if (*fire_arg) - ire_refrele(*fire_arg); - if (*ire_arg) - ire_refrele(*ire_arg); + /* + * Release all refholds except the one for ires[0] that we + * will return to the caller. + */ + for (j = 1; j < i; j++) + ire_refrele(ires[j]); + if (invalidate) { /* - * Update the passed arguments with the - * resolvable multirt route we found + * Since we needed to allocate but couldn't we need to make + * sure that the dependency chain is rebuilt the next time. */ - *fire_arg = best_fire; - *ire_arg = best_cire; - - ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, " - "*fire_arg %p, *ire_arg %p\n", - (void *)best_fire, (void *)best_cire)); - - return (B_TRUE); + ire_dep_invalidate_generations(ires[0]); + generation = IRE_GENERATION_VERIFY; + } else { + /* + * IREs can have been added or deleted while we did the + * recursive lookup and we can't catch those until we've built + * the dependencies. We verify the stored + * ire_dep_parent_generation to catch any such changes and + * return IRE_GENERATION_VERIFY (which will cause + * ip_select_route to be called again so we can redo the + * recursive lookup next time we send a packet. + */ + generation = ire_dep_validate_generations(ires[0]); + if (generations[0] != ires[0]->ire_generation) { + /* Something changed at the top */ + generation = IRE_GENERATION_VERIFY; + } } + if (generationp != NULL) + *generationp = generation; - ASSERT(best_cire == NULL); - - ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, " - "*ire_arg %p\n", - (void *)*fire_arg, (void *)*ire_arg)); - - /* No resolvable route. */ - return (B_FALSE); + return (ires[0]); } - -/* - * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp' - * that goes through 'ipif'. As a fallback, a route that goes through - * ipif->ipif_ill can be returned. - */ ire_t * -ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp) +ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, + const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, + boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, + in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { - ire_t *ire; - ire_t *save_ire = NULL; - ire_t *gw_ire; - irb_t *irb; - in6_addr_t v6gw; - int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL; - ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; - - ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0, - NULL, MATCH_IRE_DEFAULT, ipst); - - if (ire == NULL) - return (NULL); - - irb = ire->ire_bucket; - ASSERT(irb); - - IRB_REFHOLD(irb); - ire_refrele(ire); - for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { - if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) || - (ipif->ipif_zoneid != ire->ire_zoneid && - ire->ire_zoneid != ALL_ZONES)) { - continue; - } - - switch (ire->ire_type) { - case IRE_DEFAULT: - case IRE_PREFIX: - case IRE_HOST: - mutex_enter(&ire->ire_lock); - v6gw = ire->ire_gateway_addr_v6; - mutex_exit(&ire->ire_lock); - gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0, - IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0, - NULL, match_flags, ipst); - - if (gw_ire != NULL) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - if (gw_ire->ire_ipif == ipif) { - ire_refrele(gw_ire); - - IRB_REFRELE(irb); - return (ire); - } - ire_refrele(gw_ire); - save_ire = ire; - } - break; - case IRE_IF_NORESOLVER: - case IRE_IF_RESOLVER: - if (ire->ire_ipif == ipif) { - if (save_ire != NULL) { - ire_refrele(save_ire); - } - IRE_REFHOLD(ire); - - IRB_REFRELE(irb); - return (ire); - } - break; - } - } - IRB_REFRELE(irb); - - return (save_ire); + return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, + zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, + gwattrp, generationp)); } /* - * This is the implementation of the IPv6 IRE cache lookup procedure. - * Separating the interface from the implementation allows additional - * flexibility when specifying search criteria. + * Recursively look for a route to the destination. + * We only handle a destination match here, yet we have the same arguments + * as the full match to allow function pointers to select between the two. + * + * Note that this function never returns NULL. It returns an IRE_NOROUTE + * instead. + * + * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it + * is an error. + * Allow at most one RTF_INDIRECT. */ -static ire_t * -ip6_ctable_lookup_impl(ire_ctable_args_t *margs) +ire_t * +ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate, + uint32_t xmit_hint, ip_stack_t *ipst) { - irb_t *irb_ptr; - ire_t *ire; - ip_stack_t *ipst = margs->ict_ipst; + ire_t *ire; + ire_t *ire1; + uint_t generation; - if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && - (margs->ict_ipif == NULL)) { - return (NULL); - } + /* ire_ftable_lookup handles round-robin/ECMP */ + ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, + &generation); + ASSERT(ire != NULL); - irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6( - *((in6_addr_t *)(margs->ict_addr)), - ipst->ips_ip6_cache_table_size)]; - rw_enter(&irb_ptr->irb_lock, RW_READER); - for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { - if (ire->ire_marks & IRE_MARK_CONDEMNED) - continue; - ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones)); - if (ire_match_args_v6(ire, (in6_addr_t *)margs->ict_addr, - &ire->ire_mask_v6, (in6_addr_t *)margs->ict_gateway, - margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0, - margs->ict_tsl, margs->ict_flags)) { - IRE_REFHOLD(ire); - rw_exit(&irb_ptr->irb_lock); - return (ire); - } + /* + * If this type should have an ire_nce_cache (even if it + * doesn't yet have one) then we are done. Includes + * IRE_INTERFACE with a full 128 bit mask. + */ + if (ire->ire_nce_capable) + return (ire); + + /* + * If the IRE has a current cached parent we know that the whole + * parent chain is current, hence we don't need to discover and + * build any dependencies by doing a recursive lookup. + */ + mutex_enter(&ire->ire_lock); + if (ire->ire_dep_parent != NULL && + ire->ire_dep_parent->ire_generation == + ire->ire_dep_parent_generation) { + mutex_exit(&ire->ire_lock); + return (ire); } + mutex_exit(&ire->ire_lock); - rw_exit(&irb_ptr->irb_lock); - return (NULL); + /* + * Fallback to loop in the normal code starting with the ire + * we found. Normally this would return the same ire. + */ + ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, + NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, + &generation); + ire_refrele(ire); + return (ire1); } |
