summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ip/ip6_ire.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip6_ire.c')
-rw-r--r--usr/src/uts/common/inet/ip/ip6_ire.c3123
1 files changed, 871 insertions, 2252 deletions
diff --git a/usr/src/uts/common/inet/ip/ip6_ire.c b/usr/src/uts/common/inet/ip/ip6_ire.c
index c13a66fcc2..7697ca20c7 100644
--- a/usr/src/uts/common/inet/ip/ip6_ire.c
+++ b/usr/src/uts/common/inet/ip/ip6_ire.c
@@ -60,122 +60,122 @@
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
+#define IS_DEFAULT_ROUTE_V6(ire) \
+ (((ire)->ire_type & IRE_DEFAULT) || \
+ (((ire)->ire_type & IRE_INTERFACE) && \
+ (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6))))
+
static ire_t ire_null;
-static ire_t *ire_ihandle_lookup_onlink_v6(ire_t *cire);
-static boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr,
- const in6_addr_t *mask, const in6_addr_t *gateway, int type,
- const ipif_t *ipif, zoneid_t zoneid, uint32_t ihandle,
- const ts_label_t *tsl, int match_flags);
-static ire_t *ire_init_v6(ire_t *, const in6_addr_t *, const in6_addr_t *,
- const in6_addr_t *, const in6_addr_t *, uint_t *, queue_t *, queue_t *,
- ushort_t, ipif_t *, const in6_addr_t *, uint32_t, uint32_t, uint_t,
- const iulp_t *, tsol_gc_t *, tsol_gcgrp_t *, ip_stack_t *);
-static ire_t *ip6_ctable_lookup_impl(ire_ctable_args_t *);
+static ire_t *
+ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ ip_stack_t *ipst);
/*
* Initialize the ire that is specific to IPv6 part and call
* ire_init_common to finish it.
+ * Returns zero or errno.
*/
-static ire_t *
+int
ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- uint_t *max_fragp, queue_t *rfq, queue_t *stq, ushort_t type,
- ipif_t *ipif, const in6_addr_t *v6cmask, uint32_t phandle,
- uint32_t ihandle, uint_t flags, const iulp_t *ulp_info, tsol_gc_t *gc,
- tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+ const in6_addr_t *v6gateway, ushort_t type, ill_t *ill,
+ zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
{
+ int error;
/*
- * Reject IRE security attribute creation/initialization
+ * Reject IRE security attribute creation/initialization
* if system is not running in Trusted mode.
*/
- if ((gc != NULL || gcgrp != NULL) && !is_system_labeled())
- return (NULL);
-
+ if (gc != NULL && !is_system_labeled())
+ return (EINVAL);
BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced);
- ire->ire_addr_v6 = *v6addr;
-
- if (v6src_addr != NULL)
- ire->ire_src_addr_v6 = *v6src_addr;
- if (v6mask != NULL) {
- ire->ire_mask_v6 = *v6mask;
- ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6);
- }
+ if (v6addr != NULL)
+ ire->ire_addr_v6 = *v6addr;
if (v6gateway != NULL)
ire->ire_gateway_addr_v6 = *v6gateway;
- if (type == IRE_CACHE && v6cmask != NULL)
- ire->ire_cmask_v6 = *v6cmask;
-
- /*
- * Multirouted packets need to have a fragment header added so that
- * the receiver is able to discard duplicates according to their
- * fragment identifier.
- */
- if (type == IRE_CACHE && (flags & RTF_MULTIRT)) {
- ire->ire_frag_flag = IPH_FRAG_HDR;
+ /* Make sure we don't have stray values in some fields */
+ switch (type) {
+ case IRE_LOOPBACK:
+ ire->ire_gateway_addr_v6 = ire->ire_addr_v6;
+ /* FALLTHRU */
+ case IRE_HOST:
+ case IRE_LOCAL:
+ case IRE_IF_CLONE:
+ ire->ire_mask_v6 = ipv6_all_ones;
+ ire->ire_masklen = IPV6_ABITS;
+ break;
+ case IRE_PREFIX:
+ case IRE_DEFAULT:
+ case IRE_IF_RESOLVER:
+ case IRE_IF_NORESOLVER:
+ if (v6mask != NULL) {
+ ire->ire_mask_v6 = *v6mask;
+ ire->ire_masklen =
+ ip_mask_to_plen_v6(&ire->ire_mask_v6);
+ }
+ break;
+ case IRE_MULTICAST:
+ case IRE_NOROUTE:
+ ASSERT(v6mask == NULL);
+ break;
+ default:
+ ASSERT(0);
+ return (EINVAL);
}
- /* ire_init_common will free the mblks upon encountering any failure */
- if (!ire_init_common(ire, max_fragp, NULL, rfq, stq, type, ipif,
- phandle, ihandle, flags, IPV6_VERSION, ulp_info, gc, gcgrp, ipst))
- return (NULL);
-
- return (ire);
-}
-
-/*
- * Similar to ire_create_v6 except that it is called only when
- * we want to allocate ire as an mblk e.g. we have a external
- * resolver. Do we need this in IPv6 ?
- *
- * IPv6 initializes the ire_nce in ire_add_v6, which expects to
- * find the ire_nce to be null when it is called. So, although
- * we have a src_nce parameter (in the interest of matching up with
- * the argument list of the v4 version), we ignore the src_nce
- * argument here.
- */
-/* ARGSUSED */
-ire_t *
-ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- nce_t *src_nce, queue_t *rfq, queue_t *stq, ushort_t type,
- ipif_t *ipif, const in6_addr_t *v6cmask,
- uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
- tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
-{
- ire_t *ire;
- ire_t *ret_ire;
- mblk_t *mp;
+ error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION,
+ gc, ipst);
+ if (error != 0)
+ return (error);
- ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
+ /* Determine which function pointers to use */
+ ire->ire_postfragfn = ip_xmit; /* Common case */
- /* Allocate the new IRE. */
- mp = allocb(sizeof (ire_t), BPRI_MED);
- if (mp == NULL) {
- ip1dbg(("ire_create_mp_v6: alloc failed\n"));
- return (NULL);
+ switch (ire->ire_type) {
+ case IRE_LOCAL:
+ ire->ire_sendfn = ire_send_local_v6;
+ ire->ire_recvfn = ire_recv_local_v6;
+#ifdef SO_VRRP
+ ASSERT(ire->ire_ill != NULL);
+ if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) {
+ ire->ire_noaccept = B_TRUE;
+ ire->ire_recvfn = ire_recv_noaccept_v6;
+ }
+#endif
+ break;
+ case IRE_LOOPBACK:
+ ire->ire_sendfn = ire_send_local_v6;
+ ire->ire_recvfn = ire_recv_loopback_v6;
+ break;
+ case IRE_MULTICAST:
+ ire->ire_postfragfn = ip_postfrag_loopcheck;
+ ire->ire_sendfn = ire_send_multicast_v6;
+ ire->ire_recvfn = ire_recv_multicast_v6;
+ break;
+ default:
+ /*
+ * For IRE_IF_ALL and IRE_OFFLINK we forward received
+ * packets by default.
+ */
+ ire->ire_sendfn = ire_send_wire_v6;
+ ire->ire_recvfn = ire_recv_forward_v6;
+ break;
}
-
- ire = (ire_t *)mp->b_rptr;
- mp->b_wptr = (uchar_t *)&ire[1];
-
- /* Start clean. */
- *ire = ire_null;
- ire->ire_mp = mp;
- mp->b_datap->db_type = IRE_DB_TYPE;
-
- ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
- NULL, rfq, stq, type, ipif, v6cmask, phandle,
- ihandle, flags, ulp_info, gc, gcgrp, ipst);
-
- if (ret_ire == NULL) {
- freeb(ire->ire_mp);
- return (NULL);
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
+ ire->ire_sendfn = ire_send_noroute_v6;
+ ire->ire_recvfn = ire_recv_noroute_v6;
+ } else if (ire->ire_flags & RTF_MULTIRT) {
+ ire->ire_postfragfn = ip_postfrag_multirt_v6;
+ ire->ire_sendfn = ire_send_multirt_v6;
+ ire->ire_recvfn = ire_recv_multirt_v6;
}
- return (ire);
+ ire->ire_nce_capable = ire_determine_nce_capable(ire);
+ return (0);
}
/*
@@ -183,153 +183,76 @@ ire_create_mp_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
*
* NOTE : This is called as writer sometimes though not required
* by this function.
- *
- * See comments above ire_create_mp_v6() for the rationale behind the
- * unused src_nce argument.
*/
/* ARGSUSED */
ire_t *
ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask,
- const in6_addr_t *v6src_addr, const in6_addr_t *v6gateway,
- uint_t *max_fragp, nce_t *src_nce, queue_t *rfq, queue_t *stq,
- ushort_t type, ipif_t *ipif, const in6_addr_t *v6cmask,
- uint32_t phandle, uint32_t ihandle, uint_t flags, const iulp_t *ulp_info,
- tsol_gc_t *gc, tsol_gcgrp_t *gcgrp, ip_stack_t *ipst)
+ const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid,
+ uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst)
{
ire_t *ire;
- ire_t *ret_ire;
+ int error;
ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr));
ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP);
if (ire == NULL) {
- ip1dbg(("ire_create_v6: alloc failed\n"));
+ DTRACE_PROBE(kmem__cache__alloc);
return (NULL);
}
*ire = ire_null;
- ret_ire = ire_init_v6(ire, v6addr, v6mask, v6src_addr, v6gateway,
- max_fragp, rfq, stq, type, ipif, v6cmask, phandle,
- ihandle, flags, ulp_info, gc, gcgrp, ipst);
+ error = ire_init_v6(ire, v6addr, v6mask, v6gateway,
+ type, ill, zoneid, flags, gc, ipst);
- if (ret_ire == NULL) {
+ if (error != 0) {
+ DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error);
kmem_cache_free(ire_cache, ire);
return (NULL);
}
- ASSERT(ret_ire == ire);
return (ire);
}
/*
- * Find an IRE_INTERFACE for the multicast group.
+ * Find the ill matching a multicast group.
* Allows different routes for multicast addresses
* in the unicast routing table (akin to FF::0/8 but could be more specific)
* which point at different interfaces. This is used when IPV6_MULTICAST_IF
* isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't
* specify the interface to join on.
*
- * Supports link-local addresses by following the ipif/ill when recursing.
+ * Supports link-local addresses by using ire_route_recursive which follows
+ * the ill when recursing.
+ *
+ * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
+ * and the MULTIRT property can be different for different groups, we
+ * extract RTF_MULTIRT from the special unicast route added for a group
+ * with CGTP and pass that back in the multirtp argument.
+ * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
+ * We have a setsrcp argument for the same reason.
*/
-ire_t *
-ire_lookup_multi_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst)
+ill_t *
+ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid,
+ ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp)
{
ire_t *ire;
- ipif_t *ipif = NULL;
- int match_flags = MATCH_IRE_TYPE;
- in6_addr_t gw_addr_v6;
-
- ire = ire_ftable_lookup_v6(group, 0, 0, 0, NULL, NULL,
- zoneid, 0, NULL, MATCH_IRE_DEFAULT, ipst);
+ ill_t *ill;
- /* We search a resolvable ire in case of multirouting. */
- if ((ire != NULL) && (ire->ire_flags & RTF_MULTIRT)) {
- ire_t *cire = NULL;
- /*
- * If the route is not resolvable, the looked up ire
- * may be changed here. In that case, ire_multirt_lookup_v6()
- * IRE_REFRELE the original ire and change it.
- */
- (void) ire_multirt_lookup_v6(&cire, &ire, MULTIRT_CACHEGW,
- NULL, ipst);
- if (cire != NULL)
- ire_refrele(cire);
- }
- if (ire == NULL)
- return (NULL);
- /*
- * Make sure we follow ire_ipif.
- *
- * We need to determine the interface route through
- * which the gateway will be reached.
- */
- if (ire->ire_ipif != NULL) {
- ipif = ire->ire_ipif;
- match_flags |= MATCH_IRE_ILL;
- }
+ ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL,
+ MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
+ ASSERT(ire != NULL);
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- ire_refrele(ire);
- ire = ire_ftable_lookup_v6(&gw_addr_v6, 0, 0,
- IRE_INTERFACE, ipif, NULL, zoneid, 0,
- NULL, match_flags, ipst);
- return (ire);
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- return (ire);
- default:
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
ire_refrele(ire);
return (NULL);
}
-}
-/*
- * Return any local address. We use this to target ourselves
- * when the src address was specified as 'default'.
- * Preference for IRE_LOCAL entries.
- */
-ire_t *
-ire_lookup_local_v6(zoneid_t zoneid, ip_stack_t *ipst)
-{
- ire_t *ire;
- irb_t *irb;
- ire_t *maybe = NULL;
- int i;
+ if (multirtp != NULL)
+ *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if (irb->irb_ire == NULL)
- continue;
- rw_enter(&irb->irb_lock, RW_READER);
- for (ire = irb->irb_ire; ire; ire = ire->ire_next) {
- if ((ire->ire_marks & IRE_MARK_CONDEMNED) ||
- ire->ire_zoneid != zoneid &&
- ire->ire_zoneid != ALL_ZONES)
- continue;
- switch (ire->ire_type) {
- case IRE_LOOPBACK:
- if (maybe == NULL) {
- IRE_REFHOLD(ire);
- maybe = ire;
- }
- break;
- case IRE_LOCAL:
- if (maybe != NULL) {
- ire_refrele(maybe);
- }
- IRE_REFHOLD(ire);
- rw_exit(&irb->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb->irb_lock);
- }
- return (maybe);
+ ill = ire_nexthop_ill(ire);
+ ire_refrele(ire);
+ return (ill);
}
/*
@@ -369,6 +292,8 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
if (plen < 0 || plen > IPV6_ABITS)
return (NULL);
*bitmask = ipv6_all_zeros;
+ if (plen == 0)
+ return (bitmask);
ptr = (uint32_t *)bitmask;
while (plen > 32) {
@@ -380,196 +305,78 @@ ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask)
}
/*
- * Add a fully initialized IRE to an appropriate
- * table based on ire_type.
- *
- * The forward table contains IRE_PREFIX/IRE_HOST/IRE_HOST and
- * IRE_IF_RESOLVER/IRE_IF_NORESOLVER and IRE_DEFAULT.
- *
- * The cache table contains IRE_BROADCAST/IRE_LOCAL/IRE_LOOPBACK
- * and IRE_CACHE.
- *
- * NOTE : This function is called as writer though not required
- * by this function.
+ * Add a fully initialized IPv6 IRE to the forwarding table.
+ * This returns NULL on failure, or a held IRE on success.
+ * Normally the returned IRE is the same as the argument. But a different
+ * IRE will be returned if the added IRE is deemed identical to an existing
+ * one. In that case ire_identical_ref will be increased.
+ * The caller always needs to do an ire_refrele() on the returned IRE.
*/
-int
-ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
+ire_t *
+ire_add_v6(ire_t *ire)
{
ire_t *ire1;
int mask_table_index;
irb_t *irb_ptr;
ire_t **irep;
- int flags;
- ire_t *pire = NULL;
- ill_t *stq_ill;
- boolean_t ndp_g_lock_held = B_FALSE;
- ire_t *ire = *ire_p;
+ int match_flags;
int error;
ip_stack_t *ipst = ire->ire_ipst;
- uint_t marks = 0;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- ASSERT(ire->ire_mp == NULL); /* Calls should go through ire_add */
- ASSERT(ire->ire_nce == NULL);
-
- /*
- * IREs with source addresses hosted on interfaces that are under IPMP
- * should be hidden so that applications don't accidentally end up
- * sending packets with test addresses as their source addresses, or
- * sending out interfaces that are e.g. IFF_INACTIVE. Hide them here.
- * (We let IREs with unspecified source addresses slip through since
- * ire_send_v6() will delete them automatically.)
- */
- if (ire->ire_ipif != NULL && IS_UNDER_IPMP(ire->ire_ipif->ipif_ill) &&
- !IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6)) {
- DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
- marks |= IRE_MARK_TESTHIDDEN;
- }
-
- /* Find the appropriate list head. */
- switch (ire->ire_type) {
- case IRE_HOST:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
- case IRE_CACHE:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- ire->ire_marks |= marks;
- break;
- case IRE_LOCAL:
- case IRE_LOOPBACK:
- ire->ire_mask_v6 = ipv6_all_ones;
- ire->ire_masklen = IPV6_ABITS;
- break;
- case IRE_PREFIX:
- case IRE_DEFAULT:
- ire->ire_marks |= marks;
- if ((ire->ire_flags & RTF_SETSRC) == 0)
- ire->ire_src_addr_v6 = ipv6_all_zeros;
- break;
- case IRE_IF_RESOLVER:
- case IRE_IF_NORESOLVER:
- ire->ire_marks |= marks;
- break;
- default:
- printf("ire_add_v6: ire %p has unrecognized IRE type (%d)\n",
- (void *)ire, ire->ire_type);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
- }
/* Make sure the address is properly masked. */
V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6);
- if ((ire->ire_type & IRE_CACHETABLE) == 0) {
- /* IRE goes into Forward Table */
- mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
- if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) ==
- NULL) {
- irb_t *ptr;
- int i;
-
- ptr = (irb_t *)mi_zalloc((
- ipst->ips_ip6_ftable_hash_size * sizeof (irb_t)));
- if (ptr == NULL) {
- ire_delete(ire);
- *ire_p = NULL;
- return (ENOMEM);
- }
- for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
- rw_init(&ptr[i].irb_lock, NULL,
- RW_DEFAULT, NULL);
- }
- mutex_enter(&ipst->ips_ire_ft_init_lock);
- if (ipst->ips_ip_forwarding_table_v6[
- mask_table_index] == NULL) {
- ipst->ips_ip_forwarding_table_v6[
- mask_table_index] = ptr;
- mutex_exit(&ipst->ips_ire_ft_init_lock);
- } else {
- /*
- * Some other thread won the race in
- * initializing the forwarding table at the
- * same index.
- */
- mutex_exit(&ipst->ips_ire_ft_init_lock);
- for (i = 0; i < ipst->ips_ip6_ftable_hash_size;
- i++) {
- rw_destroy(&ptr[i].irb_lock);
- }
- mi_free(ptr);
- }
- }
- irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
- IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
- ipst->ips_ip6_ftable_hash_size)]);
- } else {
- irb_ptr = &(ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
- ire->ire_addr_v6, ipst->ips_ip6_cache_table_size)]);
- }
- /*
- * For xresolv interfaces (v6 interfaces with an external
- * address resolver), ip_newroute_v6/ip_newroute_ipif_v6
- * are unable to prevent the deletion of the interface route
- * while adding an IRE_CACHE for an on-link destination
- * in the IRE_IF_RESOLVER case, since the ire has to go to
- * the external resolver and return. We can't do a REFHOLD on the
- * associated interface ire for fear of the message being freed
- * if the external resolver can't resolve the address.
- * Here we look up the interface ire in the forwarding table
- * and make sure that the interface route has not been deleted.
- */
- if (ire->ire_type == IRE_CACHE &&
- IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6) &&
- (((ill_t *)ire->ire_stq->q_ptr)->ill_net_type == IRE_IF_RESOLVER) &&
- (((ill_t *)ire->ire_stq->q_ptr)->ill_flags & ILLF_XRESOLV)) {
+ mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6);
+ if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) {
+ irb_t *ptr;
+ int i;
- pire = ire_ihandle_lookup_onlink_v6(ire);
- if (pire == NULL) {
+ ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size *
+ sizeof (irb_t)));
+ if (ptr == NULL) {
ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
+ return (NULL);
}
- /* Prevent pire from getting deleted */
- IRB_REFHOLD(pire->ire_bucket);
- /* Has it been removed already? */
- if (pire->ire_marks & IRE_MARK_CONDEMNED) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- ire_delete(ire);
- *ire_p = NULL;
- return (EINVAL);
+ for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
+ rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL);
+ }
+ mutex_enter(&ipst->ips_ire_ft_init_lock);
+ if (ipst->ips_ip_forwarding_table_v6[mask_table_index] ==
+ NULL) {
+ ipst->ips_ip_forwarding_table_v6[mask_table_index] =
+ ptr;
+ mutex_exit(&ipst->ips_ire_ft_init_lock);
+ } else {
+ /*
+ * Some other thread won the race in
+ * initializing the forwarding table at the
+ * same index.
+ */
+ mutex_exit(&ipst->ips_ire_ft_init_lock);
+ for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) {
+ rw_destroy(&ptr[i].irb_lock);
+ }
+ mi_free(ptr);
}
}
+ irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][
+ IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6,
+ ipst->ips_ip6_ftable_hash_size)]);
- flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
+ match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW);
+ if (ire->ire_ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
/*
- * For IRE_CACHES, MATCH_IRE_IPIF is not enough to check
- * for duplicates because :
- *
- * 1) ire_ipif->ipif_ill and ire_stq->q_ptr could be
- * pointing at different ills. A real duplicate is
- * a match on both ire_ipif and ire_stq.
- *
- * 2) We could have multiple packets trying to create
- * an IRE_CACHE for the same ill.
- *
- * Rather than looking at the packet, we depend on the above for
- * MATCH_IRE_ILL here.
- *
- * Unlike IPv4, MATCH_IRE_IPIF is needed here as we could have
- * multiple IRE_CACHES for an ill for the same destination
- * with various scoped addresses i.e represented by ipifs.
- *
- * MATCH_IRE_ILL is done implicitly below for IRE_CACHES.
+ * Start the atomic add of the ire. Grab the bucket lock and the
+ * ill lock. Check for condemned.
*/
- if (ire->ire_ipif != NULL)
- flags |= MATCH_IRE_IPIF;
+ error = ire_atomic_start(irb_ptr, ire);
+ if (error != 0) {
+ ire_delete(ire);
+ return (NULL);
+ }
/*
* If we are creating a hidden IRE, make sure we search for
@@ -577,103 +384,36 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
* Otherwise, we might find an IRE on some other interface
* that's not marked hidden.
*/
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN)
- flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- /*
- * Start the atomic add of the ire. Grab the ill locks,
- * ill_g_usesrc_lock and the bucket lock. Check for condemned.
- * To avoid lock order problems, get the ndp6.ndp_g_lock now itself.
- */
- if (ire->ire_type == IRE_CACHE) {
- mutex_enter(&ipst->ips_ndp6->ndp_g_lock);
- ndp_g_lock_held = B_TRUE;
- }
-
- /*
- * If ipif or ill is changing ire_atomic_start() may queue the
- * request and return EINPROGRESS.
- */
-
- error = ire_atomic_start(irb_ptr, ire, q, mp, func);
- if (error != 0) {
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- /*
- * We don't know whether it is a valid ipif or not.
- * So, set it to NULL. This assumes that the ire has not added
- * a reference to the ipif.
- */
- ire->ire_ipif = NULL;
- ire_delete(ire);
- if (pire != NULL) {
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = NULL;
- return (error);
- }
- /*
- * To avoid creating ires having stale values for the ire_max_frag
- * we get the latest value atomically here. For more details
- * see the block comment in ip_sioctl_mtu and in DL_NOTE_SDU_CHANGE
- * in ip_rput_dlpi_writer
- */
- if (ire->ire_max_fragp == NULL) {
- if (IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6))
- ire->ire_max_frag = ire->ire_ipif->ipif_mtu;
- else
- ire->ire_max_frag = pire->ire_max_frag;
- } else {
- uint_t max_frag;
-
- max_frag = *ire->ire_max_fragp;
- ire->ire_max_fragp = NULL;
- ire->ire_max_frag = max_frag;
- }
+ if (ire->ire_testhidden)
+ match_flags |= MATCH_IRE_TESTHIDDEN;
/*
* Atomically check for duplicate and insert in the table.
*/
for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) {
- if (ire1->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire1))
continue;
-
- if (ire->ire_type == IRE_CACHE) {
- /*
- * We do MATCH_IRE_ILL implicitly here for IRE_CACHES.
- * As ire_ipif and ire_stq could point to two
- * different ills, we can't pass just ire_ipif to
- * ire_match_args and get a match on both ills.
- * This is just needed for duplicate checks here and
- * so we don't add an extra argument to
- * ire_match_args for this. Do it locally.
- *
- * NOTE : Currently there is no part of the code
- * that asks for both MATH_IRE_IPIF and MATCH_IRE_ILL
- * match for IRE_CACHEs. Thus we don't want to
- * extend the arguments to ire_match_args_v6.
- */
- if (ire1->ire_stq != ire->ire_stq)
- continue;
- /*
- * Multiroute IRE_CACHEs for a given destination can
- * have the same ire_ipif, typically if their source
- * address is forced using RTF_SETSRC, and the same
- * send-to queue. We differentiate them using the parent
- * handle.
- */
- if ((ire1->ire_flags & RTF_MULTIRT) &&
- (ire->ire_flags & RTF_MULTIRT) &&
- (ire1->ire_phandle != ire->ire_phandle))
- continue;
- }
+ /*
+ * Here we need an exact match on zoneid, i.e.,
+ * ire_match_args doesn't fit.
+ */
if (ire1->ire_zoneid != ire->ire_zoneid)
continue;
+
+ if (ire1->ire_type != ire->ire_type)
+ continue;
+
+ /*
+ * Note: We do not allow multiple routes that differ only
+ * in the gateway security attributes; such routes are
+ * considered duplicates.
+ * To change that we explicitly have to treat them as
+ * different here.
+ */
if (ire_match_args_v6(ire1, &ire->ire_addr_v6,
&ire->ire_mask_v6, &ire->ire_gateway_addr_v6,
- ire->ire_type, ire->ire_ipif, ire->ire_zoneid, 0, NULL,
- flags)) {
+ ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL,
+ match_flags)) {
/*
* Return the old ire after doing a REFHOLD.
* As most of the callers continue to use the IRE
@@ -683,141 +423,25 @@ ire_add_v6(ire_t **ire_p, queue_t *q, mblk_t *mp, ipsq_func_t func)
*/
ip1dbg(("found dup ire existing %p new %p",
(void *)ire1, (void *)ire));
- IRE_REFHOLD(ire1);
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
+ ire_refhold(ire1);
+ atomic_add_32(&ire1->ire_identical_ref, 1);
ire_atomic_end(irb_ptr, ire);
ire_delete(ire);
- if (pire != NULL) {
- /*
- * Assert that it is
- * not yet removed from the list.
- */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- *ire_p = ire1;
- return (0);
+ return (ire1);
}
}
- if (ire->ire_type == IRE_CACHE) {
- const in6_addr_t *addr_v6;
- ill_t *ill = ire_to_ill(ire);
- char buf[INET6_ADDRSTRLEN];
- nce_t *nce;
- /*
- * All IRE_CACHE types must have a nce. If this is
- * not the case the entry will not be added. We need
- * to make sure that if somebody deletes the nce
- * after we looked up, they will find this ire and
- * delete the ire. To delete this ire one needs the
- * bucket lock which we are still holding here. So,
- * even if the nce gets deleted after we looked up,
- * this ire will get deleted.
- *
- * NOTE : Don't need the ire_lock for accessing
- * ire_gateway_addr_v6 as it is appearing first
- * time on the list and rts_setgwr_v6 could not
- * be changing this.
- */
- addr_v6 = &ire->ire_gateway_addr_v6;
- if (IN6_IS_ADDR_UNSPECIFIED(addr_v6))
- addr_v6 = &ire->ire_addr_v6;
-
- /* nce fastpath is per-ill; don't match across illgrp */
- nce = ndp_lookup_v6(ill, B_FALSE, addr_v6, B_TRUE);
- if (nce == NULL)
- goto failed;
-
- /* Pair of refhold, refrele just to get the tracing right */
- NCE_REFHOLD_TO_REFHOLD_NOTR(nce);
- /*
- * Atomically make sure that new IREs don't point
- * to an NCE that is logically deleted (CONDEMNED).
- * ndp_delete() first marks the NCE CONDEMNED.
- * This ensures that the nce_refcnt won't increase
- * due to new nce_lookups or due to addition of new IREs
- * pointing to this NCE. Then ndp_delete() cleans up
- * existing references. If we don't do it atomically here,
- * ndp_delete() -> nce_ire_delete() will not be able to
- * clean up the IRE list completely, and the nce_refcnt
- * won't go down to zero.
- */
- mutex_enter(&nce->nce_lock);
- if (ill->ill_flags & ILLF_XRESOLV) {
- /*
- * If we used an external resolver, we may not
- * have gone through neighbor discovery to get here.
- * Must update the nce_state before the next check.
- */
- if (nce->nce_state == ND_INCOMPLETE)
- nce->nce_state = ND_REACHABLE;
- }
- if (nce->nce_state == ND_INCOMPLETE ||
- (nce->nce_flags & NCE_F_CONDEMNED) ||
- (nce->nce_state == ND_UNREACHABLE)) {
-failed:
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
- if (nce != NULL)
- mutex_exit(&nce->nce_lock);
- ire_atomic_end(irb_ptr, ire);
- ip1dbg(("ire_add_v6: No nce for dst %s \n",
- inet_ntop(AF_INET6, &ire->ire_addr_v6,
- buf, sizeof (buf))));
- ire_delete(ire);
- if (pire != NULL) {
- /*
- * Assert that it is
- * not yet removed from the list.
- */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
- if (nce != NULL)
- NCE_REFRELE_NOTR(nce);
- *ire_p = NULL;
- return (EINVAL);
- } else {
- ire->ire_nce = nce;
- }
- mutex_exit(&nce->nce_lock);
- }
/*
- * Find the first entry that matches ire_addr - provides
- * tail insertion. *irep will be null if no match.
+ * Normally we do head insertion since most things do not care about
+ * the order of the IREs in the bucket.
+ * However, due to shared-IP zones (and restrict_interzone_loopback)
+ * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same
+ * address. For that reason we do tail insertion for IRE_IF_CLONE.
*/
irep = (ire_t **)irb_ptr;
- while ((ire1 = *irep) != NULL &&
- !IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &ire1->ire_addr_v6))
- irep = &ire1->ire_next;
- ASSERT(!(ire->ire_type & IRE_BROADCAST));
-
- if (*irep != NULL) {
- /*
- * Find the last ire which matches ire_addr_v6.
- * Needed to do tail insertion among entries with the same
- * ire_addr_v6.
- */
- while (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6,
- &ire1->ire_addr_v6)) {
+ if (ire->ire_type & IRE_IF_CLONE) {
+ while ((ire1 = *irep) != NULL)
irep = &ire1->ire_next;
- ire1 = *irep;
- if (ire1 == NULL)
- break;
- }
- }
-
- if (ire->ire_type == IRE_DEFAULT) {
- /*
- * We keep a count of default gateways which is used when
- * assigning them as routes.
- */
- ipst->ips_ipv6_ire_default_count++;
- ASSERT(ipst->ips_ipv6_ire_default_count != 0); /* Wraparound */
}
/* Insert at *irep */
ire1 = *irep;
@@ -852,62 +476,22 @@ failed:
* in the list for the first time and no one else can bump
* up the reference count on this yet.
*/
- IRE_REFHOLD_LOCKED(ire);
+ ire_refhold_locked(ire);
BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted);
irb_ptr->irb_ire_cnt++;
- if (ire->ire_marks & IRE_MARK_TEMPORARY)
- irb_ptr->irb_tmp_ire_cnt++;
- if (ire->ire_ipif != NULL) {
- DTRACE_PROBE3(ipif__incr__cnt, (ipif_t *), ire->ire_ipif,
+ if (ire->ire_ill != NULL) {
+ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill,
(char *), "ire", (void *), ire);
- ire->ire_ipif->ipif_ire_cnt++;
- if (ire->ire_stq != NULL) {
- stq_ill = (ill_t *)ire->ire_stq->q_ptr;
- DTRACE_PROBE3(ill__incr__cnt, (ill_t *), stq_ill,
- (char *), "ire", (void *), ire);
- stq_ill->ill_ire_cnt++;
- }
- } else {
- ASSERT(ire->ire_stq == NULL);
+ ire->ire_ill->ill_ire_cnt++;
+ ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */
}
-
- if (ndp_g_lock_held)
- mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
ire_atomic_end(irb_ptr, ire);
- if (pire != NULL) {
- /* Assert that it is not removed from the list yet */
- ASSERT(pire->ire_ptpn != NULL);
- IRB_REFRELE(pire->ire_bucket);
- ire_refrele(pire);
- }
-
- if (ire->ire_type != IRE_CACHE) {
- /*
- * For ire's with with host mask see if there is an entry
- * in the cache. If there is one flush the whole cache as
- * there might be multiple entries due to RTF_MULTIRT (CGTP).
- * If no entry is found than there is no need to flush the
- * cache.
- */
-
- if (ip_mask_to_plen_v6(&ire->ire_mask_v6) == IPV6_ABITS) {
- ire_t *lire;
- lire = ire_ctable_lookup_v6(&ire->ire_addr_v6, NULL,
- IRE_CACHE, NULL, ALL_ZONES, NULL, MATCH_IRE_TYPE,
- ipst);
- if (lire != NULL) {
- ire_refrele(lire);
- ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- }
- } else {
- ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- }
- }
+ /* Make any caching of the IREs be notified or updated */
+ ire_flush_cache_v6(ire, IRE_FLUSH_ADD);
- *ire_p = ire;
- return (0);
+ return (ire);
}
/*
@@ -931,7 +515,7 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
return;
for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) {
irb = &irb_ptr[i];
- IRB_REFHOLD(irb);
+ irb_refhold(irb);
for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
if (!(ire->ire_flags & RTF_DYNAMIC))
continue;
@@ -941,50 +525,11 @@ ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst)
if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway))
ire_delete(ire);
}
- IRB_REFRELE(irb);
+ irb_refrele(irb);
}
}
/*
- * Delete all the cache entries with this 'addr'. This is the IPv6 counterpart
- * of ip_ire_clookup_and_delete. The difference being this function does not
- * return any value. IPv6 processing of a gratuitous ARP, as it stands, is
- * different than IPv4 in that, regardless of the presence of a cache entry
- * for this address, an ire_walk_v6 is done. Another difference is that unlike
- * in the case of IPv4 this does not take an ipif_t argument, since it is only
- * called by ip_arp_news and the match is always only on the address.
- */
-void
-ip_ire_clookup_and_delete_v6(const in6_addr_t *addr, ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *cire;
- boolean_t found = B_FALSE;
-
- irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL; cire = cire->ire_next) {
- if (cire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if (IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, addr)) {
-
- /* This signifies start of a match */
- if (!found)
- found = B_TRUE;
- if (cire->ire_type == IRE_CACHE) {
- if (cire->ire_nce != NULL)
- ndp_delete(cire->ire_nce);
- ire_delete_v6(cire);
- }
- /* End of the match */
- } else if (found)
- break;
- }
- IRB_REFRELE(irb);
-}
-
-/*
* Delete the specified IRE.
* All calls should use ire_delete().
* Sometimes called as writer though not required by this function.
@@ -998,11 +543,20 @@ ire_delete_v6(ire_t *ire)
in6_addr_t gw_addr_v6;
ip_stack_t *ipst = ire->ire_ipst;
+ /*
+ * Make sure ire_generation increases from ire_flush_cache happen
+ * after any lookup/reader has read ire_generation.
+ * Since the rw_enter makes us wait until any lookup/reader has
+ * completed we can exit the lock immediately.
+ */
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+
ASSERT(ire->ire_refcnt >= 1);
ASSERT(ire->ire_ipversion == IPV6_VERSION);
- if (ire->ire_type != IRE_CACHE)
- ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
+ ire_flush_cache_v6(ire, IRE_FLUSH_DELETE);
+
if (ire->ire_type == IRE_DEFAULT) {
/*
* when a default gateway is going away
@@ -1014,368 +568,284 @@ ire_delete_v6(ire_t *ire)
mutex_exit(&ire->ire_lock);
ire_delete_host_redirects_v6(&gw_addr_v6, ipst);
}
-}
-
-/*
- * ire_walk routine to delete all IRE_CACHE and IRE_HOST type redirect
- * entries.
- */
-/*ARGSUSED1*/
-void
-ire_delete_cache_v6(ire_t *ire, char *arg)
-{
- char addrstr1[INET6_ADDRSTRLEN];
- char addrstr2[INET6_ADDRSTRLEN];
-
- if ((ire->ire_type & IRE_CACHE) ||
- (ire->ire_flags & RTF_DYNAMIC)) {
- ip1dbg(("ire_delete_cache_v6: deleted %s type %d through %s\n",
- inet_ntop(AF_INET6, &ire->ire_addr_v6,
- addrstr1, sizeof (addrstr1)),
- ire->ire_type,
- inet_ntop(AF_INET6, &ire->ire_gateway_addr_v6,
- addrstr2, sizeof (addrstr2))));
- ire_delete(ire);
- }
-
-}
-/*
- * ire_walk routine to delete all IRE_CACHE/IRE_HOST type redirect entries
- * that have a given gateway address.
- */
-void
-ire_delete_cache_gw_v6(ire_t *ire, char *addr)
-{
- in6_addr_t *gw_addr = (in6_addr_t *)addr;
- char buf1[INET6_ADDRSTRLEN];
- char buf2[INET6_ADDRSTRLEN];
- in6_addr_t ire_gw_addr_v6;
-
- if (!(ire->ire_type & IRE_CACHE) &&
- !(ire->ire_flags & RTF_DYNAMIC))
- return;
-
- mutex_enter(&ire->ire_lock);
- ire_gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
+ /*
+ * If we are deleting an IRE_INTERFACE then we make sure we also
+ * delete any IRE_IF_CLONE that has been created from it.
+ * Those are always in ire_dep_children.
+ */
+ if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0)
+ ire_dep_delete_if_clone(ire);
- if (IN6_ARE_ADDR_EQUAL(&ire_gw_addr_v6, gw_addr)) {
- ip1dbg(("ire_delete_cache_gw_v6: deleted %s type %d to %s\n",
- inet_ntop(AF_INET6, &ire->ire_src_addr_v6,
- buf1, sizeof (buf1)),
- ire->ire_type,
- inet_ntop(AF_INET6, &ire_gw_addr_v6,
- buf2, sizeof (buf2))));
- ire_delete(ire);
+ /* Remove from parent dependencies and child */
+ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER);
+ if (ire->ire_dep_parent != NULL) {
+ ire_dep_remove(ire);
}
+ while (ire->ire_dep_children != NULL)
+ ire_dep_remove(ire->ire_dep_children);
+ rw_exit(&ipst->ips_ire_dep_lock);
}
/*
- * Remove all IRE_CACHE entries that match
- * the ire specified. (Sometimes called
- * as writer though not required by this function.)
- *
- * The flag argument indicates if the
- * flush request is due to addition
- * of new route (IRE_FLUSH_ADD) or deletion of old
- * route (IRE_FLUSH_DELETE).
+ * When an IRE is added or deleted this routine is called to make sure
+ * any caching of IRE information is notified or updated.
*
- * This routine takes only the IREs from the forwarding
- * table and flushes the corresponding entries from
- * the cache table.
- *
- * When flushing due to the deletion of an old route, it
- * just checks the cache handles (ire_phandle and ire_ihandle) and
- * deletes the ones that match.
- *
- * When flushing due to the creation of a new route, it checks
- * if a cache entry's address matches the one in the IRE and
- * that the cache entry's parent has a less specific mask than the
- * one in IRE. The destination of such a cache entry could be the
- * gateway for other cache entries, so we need to flush those as
- * well by looking for gateway addresses matching the IRE's address.
+ * The flag argument indicates if the flush request is due to addition
+ * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE),
+ * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE).
*/
void
ire_flush_cache_v6(ire_t *ire, int flag)
{
- int i;
- ire_t *cire;
- irb_t *irb;
- ip_stack_t *ipst = ire->ire_ipst;
+ ip_stack_t *ipst = ire->ire_ipst;
- if (ire->ire_type & IRE_CACHE)
+ /*
+ * IRE_IF_CLONE ire's don't provide any more information
+ * than the parent from which they are cloned, so don't
+ * perturb the generation numbers.
+ */
+ if (ire->ire_type & IRE_IF_CLONE)
return;
/*
- * If a default is just created, there is no point
- * in going through the cache, as there will not be any
- * cached ires.
+ * Ensure that an ire_add during a lookup serializes the updates of
+ * the generation numbers under ire_head_lock so that the lookup gets
+ * either the old ire and old generation number, or a new ire and new
+ * generation number.
+ */
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER);
+
+ /*
+ * If a route was just added, we need to notify everybody that
+ * has cached an IRE_NOROUTE since there might now be a better
+ * route for them.
*/
- if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD)
- return;
if (flag == IRE_FLUSH_ADD) {
+ ire_increment_generation(ipst->ips_ire_reject_v6);
+ ire_increment_generation(ipst->ips_ire_blackhole_v6);
+ }
+
+ /* Adding a default can't otherwise provide a better route */
+ if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ return;
+ }
+
+ switch (flag) {
+ case IRE_FLUSH_DELETE:
+ case IRE_FLUSH_GWCHANGE:
/*
- * This selective flush is
- * due to the addition of
- * new IRE.
+ * Update ire_generation for all ire_dep_children chains
+ * starting with this IRE
*/
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- /*
- * If 'cire' belongs to the same subnet
- * as the new ire being added, and 'cire'
- * is derived from a prefix that is less
- * specific than the new ire being added,
- * we need to flush 'cire'; for instance,
- * when a new interface comes up.
- */
- if ((V6_MASK_EQ_2(cire->ire_addr_v6,
- ire->ire_mask_v6, ire->ire_addr_v6) &&
- (ip_mask_to_plen_v6(&cire->ire_cmask_v6) <=
- ire->ire_masklen))) {
- ire_delete(cire);
- continue;
- }
- /*
- * This is the case when the ire_gateway_addr
- * of 'cire' belongs to the same subnet as
- * the new ire being added.
- * Flushing such ires is sometimes required to
- * avoid misrouting: say we have a machine with
- * two interfaces (I1 and I2), a default router
- * R on the I1 subnet, and a host route to an
- * off-link destination D with a gateway G on
- * the I2 subnet.
- * Under normal operation, we will have an
- * on-link cache entry for G and an off-link
- * cache entry for D with G as ire_gateway_addr,
- * traffic to D will reach its destination
- * through gateway G.
- * If the administrator does 'ifconfig I2 down',
- * the cache entries for D and G will be
- * flushed. However, G will now be resolved as
- * an off-link destination using R (the default
- * router) as gateway. Then D will also be
- * resolved as an off-link destination using G
- * as gateway - this behavior is due to
- * compatibility reasons, see comment in
- * ire_ihandle_lookup_offlink(). Traffic to D
- * will go to the router R and probably won't
- * reach the destination.
- * The administrator then does 'ifconfig I2 up'.
- * Since G is on the I2 subnet, this routine
- * will flush its cache entry. It must also
- * flush the cache entry for D, otherwise
- * traffic will stay misrouted until the IRE
- * times out.
- */
- if (V6_MASK_EQ_2(cire->ire_gateway_addr_v6,
- ire->ire_mask_v6, ire->ire_addr_v6)) {
- ire_delete(cire);
- continue;
- }
- }
- IRB_REFRELE(irb);
- }
- } else {
+ ire_dep_incr_generation(ire);
+ break;
+ case IRE_FLUSH_ADD: {
+ in6_addr_t addr;
+ in6_addr_t mask;
+ ip_stack_t *ipst = ire->ire_ipst;
+ uint_t masklen;
+
/*
- * delete the cache entries based on
- * handle in the IRE as this IRE is
- * being deleted/changed.
+ * Find an IRE which is a shorter match than the ire to be added.
+ * For any such IRE (we repeat the lookup for each) we update the
+ * ire_generation the same way as in the delete case.
*/
- for (i = 0; i < ipst->ips_ip6_cache_table_size; i++) {
- irb = &ipst->ips_ip_cache_table_v6[i];
- if ((cire = irb->irb_ire) == NULL)
- continue;
- IRB_REFHOLD(irb);
- for (cire = irb->irb_ire; cire != NULL;
- cire = cire->ire_next) {
- if (cire->ire_type != IRE_CACHE)
- continue;
- if ((cire->ire_phandle == 0 ||
- cire->ire_phandle != ire->ire_phandle) &&
- (cire->ire_ihandle == 0 ||
- cire->ire_ihandle != ire->ire_ihandle))
- continue;
- ire_delete(cire);
- }
- IRB_REFRELE(irb);
+ addr = ire->ire_addr_v6;
+ mask = ire->ire_mask_v6;
+ masklen = ip_mask_to_plen_v6(&mask);
+
+ ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL,
+ ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
+ while (ire != NULL) {
+ /* We need to handle all in the same bucket */
+ irb_increment_generation(ire->ire_bucket);
+
+ mask = ire->ire_mask_v6;
+ ASSERT(masklen > ip_mask_to_plen_v6(&mask));
+ masklen = ip_mask_to_plen_v6(&mask);
+ ire_refrele(ire);
+ ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0,
+ NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst);
+ }
}
+ break;
}
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
}
/*
* Matches the arguments passed with the values in the ire.
*
- * Note: for match types that match using "ipif" passed in, ipif
+ * Note: for match types that match using "ill" passed in, ill
* must be checked for non-NULL before calling this routine.
*/
-static boolean_t
+boolean_t
ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, zoneid_t zoneid,
- uint32_t ihandle, const ts_label_t *tsl, int match_flags)
+ const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid,
+ const ts_label_t *tsl, int match_flags)
{
in6_addr_t masked_addr;
in6_addr_t gw_addr_v6;
ill_t *ire_ill = NULL, *dst_ill;
- ill_t *ipif_ill = NULL;
- ipif_t *src_ipif;
+ ip_stack_t *ipst = ire->ire_ipst;
ASSERT(ire->ire_ipversion == IPV6_VERSION);
ASSERT(addr != NULL);
ASSERT(mask != NULL);
ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL);
ASSERT((!(match_flags & MATCH_IRE_ILL)) ||
- (ipif != NULL && ipif->ipif_isv6));
+ (ill != NULL && ill->ill_isv6));
/*
- * If MATCH_IRE_MARK_TESTHIDDEN is set, then only return the IRE if it
- * is in fact hidden, to ensure the caller gets the right one. One
- * exception: if the caller passed MATCH_IRE_IHANDLE, then they
- * already know the identity of the given IRE_INTERFACE entry and
- * there's no point trying to hide it from them.
+ * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it
+ * is in fact hidden, to ensure the caller gets the right one.
*/
- if (ire->ire_marks & IRE_MARK_TESTHIDDEN) {
- if (match_flags & MATCH_IRE_IHANDLE)
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- if (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN))
+ if (ire->ire_testhidden) {
+ if (!(match_flags & MATCH_IRE_TESTHIDDEN))
return (B_FALSE);
}
if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid &&
ire->ire_zoneid != ALL_ZONES) {
/*
- * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid is
- * valid and does not match that of ire_zoneid, a failure to
+ * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid
+ * does not match that of ire_zoneid, a failure to
* match is reported at this point. Otherwise, since some IREs
* that are available in the global zone can be used in local
* zones, additional checks need to be performed:
*
- * IRE_CACHE and IRE_LOOPBACK entries should
- * never be matched in this situation.
+ * IRE_LOOPBACK
+ * entries should never be matched in this situation.
+ * Each zone has its own IRE_LOOPBACK.
*
- * IRE entries that have an interface associated with them
- * should in general not match unless they are an IRE_LOCAL
- * or in the case when MATCH_IRE_DEFAULT has been set in
- * the caller. In the case of the former, checking of the
- * other fields supplied should take place.
+ * IRE_LOCAL
+ * We allow them for any zoneid. ire_route_recursive
+ * does additional checks when
+ * ip_restrict_interzone_loopback is set.
*
- * In the case where MATCH_IRE_DEFAULT has been set,
- * all of the ipif's associated with the IRE's ill are
- * checked to see if there is a matching zoneid. If any
- * one ipif has a matching zoneid, this IRE is a
- * potential candidate so checking of the other fields
- * takes place.
+ * If ill_usesrc_ifindex is set
+ * Then we check if the zone has a valid source address
+ * on the usesrc ill.
*
- * In the case where the IRE_INTERFACE has a usable source
- * address (indicated by ill_usesrc_ifindex) in the
- * correct zone then it's permitted to return this IRE
+ * If ire_ill is set, then check that the zone has an ipif
+ * on that ill.
+ *
+ * Outside of this function (in ire_round_robin) we check
+ * that any IRE_OFFLINK has a gateway that is reachable from the
+ * zone when we have multiple choices (ECMP).
*/
if (match_flags & MATCH_IRE_ZONEONLY)
return (B_FALSE);
- if (ire->ire_type & (IRE_CACHE | IRE_LOOPBACK))
+ if (ire->ire_type & IRE_LOOPBACK)
return (B_FALSE);
+
+ if (ire->ire_type & IRE_LOCAL)
+ goto matchit;
+
/*
- * Note, IRE_INTERFACE can have the stq as NULL. For
- * example, if the default multicast route is tied to
- * the loopback address.
+ * The normal case of IRE_ONLINK has a matching zoneid.
+ * Here we handle the case when shared-IP zones have been
+ * configured with IP addresses on vniN. In that case it
+ * is ok for traffic from a zone to use IRE_ONLINK routes
+ * if the ill has a usesrc pointing at vniN
+ * Applies to IRE_INTERFACE.
*/
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_stq != NULL)) {
- dst_ill = (ill_t *)ire->ire_stq->q_ptr;
+ dst_ill = ire->ire_ill;
+ if (ire->ire_type & IRE_ONLINK) {
+ uint_t ifindex;
+
+ /*
+ * Note there is no IRE_INTERFACE on vniN thus
+ * can't do an IRE lookup for a matching route.
+ */
+ ifindex = dst_ill->ill_usesrc_ifindex;
+ if (ifindex == 0)
+ return (B_FALSE);
+
/*
* If there is a usable source address in the
- * zone, then it's ok to return an
- * IRE_INTERFACE
+ * zone, then it's ok to return this IRE_INTERFACE
*/
- if ((dst_ill->ill_usesrc_ifindex != 0) &&
- (src_ipif = ipif_select_source_v6(dst_ill, addr,
- B_FALSE, IPV6_PREFER_SRC_DEFAULT, zoneid))
- != NULL) {
- ip3dbg(("ire_match_args: src_ipif %p"
- " dst_ill %p", (void *)src_ipif,
- (void *)dst_ill));
- ipif_refrele(src_ipif);
- } else {
- ip3dbg(("ire_match_args: src_ipif NULL"
+ if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6,
+ zoneid, ipst)) {
+ ip3dbg(("ire_match_args: no usrsrc for zone"
" dst_ill %p\n", (void *)dst_ill));
return (B_FALSE);
}
}
- if (ire->ire_ipif != NULL && ire->ire_type != IRE_LOCAL &&
- !(ire->ire_type & IRE_INTERFACE)) {
+ /*
+ * For example, with
+ * route add 11.0.0.0 gw1 -ifp bge0
+ * route add 11.0.0.0 gw2 -ifp bge1
+ * this code would differentiate based on
+ * where the sending zone has addresses.
+ * Only if the zone has an address on bge0 can it use the first
+ * route. It isn't clear if this behavior is documented
+ * anywhere.
+ */
+ if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) {
ipif_t *tipif;
- if ((match_flags & MATCH_IRE_DEFAULT) == 0)
- return (B_FALSE);
- mutex_enter(&ire->ire_ipif->ipif_ill->ill_lock);
- for (tipif = ire->ire_ipif->ipif_ill->ill_ipif;
+ mutex_enter(&dst_ill->ill_lock);
+ for (tipif = dst_ill->ill_ipif;
tipif != NULL; tipif = tipif->ipif_next) {
- if (IPIF_CAN_LOOKUP(tipif) &&
+ if (!IPIF_IS_CONDEMNED(tipif) &&
(tipif->ipif_flags & IPIF_UP) &&
(tipif->ipif_zoneid == zoneid ||
tipif->ipif_zoneid == ALL_ZONES))
break;
}
- mutex_exit(&ire->ire_ipif->ipif_ill->ill_lock);
+ mutex_exit(&dst_ill->ill_lock);
if (tipif == NULL)
return (B_FALSE);
}
}
+matchit:
if (match_flags & MATCH_IRE_GW) {
mutex_enter(&ire->ire_lock);
gw_addr_v6 = ire->ire_gateway_addr_v6;
mutex_exit(&ire->ire_lock);
}
-
- /*
- * For IRE_CACHE entries, MATCH_IRE_ILL means that somebody wants to
- * send out ire_stq (ire_ipif for IRE_CACHE entries is just the means
- * of getting a source address -- i.e., ire_src_addr_v6 ==
- * ire->ire_ipif->ipif_v6src_addr). ire_to_ill() handles this.
- *
- * NOTE: For IPMP, MATCH_IRE_ILL usually matches any ill in the group.
- * However, if MATCH_IRE_MARK_TESTHIDDEN is set (i.e., the IRE is for
- * IPMP test traffic), then the ill must match exactly.
- */
if (match_flags & MATCH_IRE_ILL) {
- ire_ill = ire_to_ill(ire);
- ipif_ill = ipif->ipif_ill;
- }
+ ire_ill = ire->ire_ill;
+ /*
+ * If asked to match an ill, we *must* match
+ * on the ire_ill for ipmp test addresses, or
+ * any of the ill in the group for data addresses.
+ * If we don't, we may as well fail.
+ * However, we need an exception for IRE_LOCALs to ensure
+ * we loopback packets even sent to test addresses on different
+ * interfaces in the group.
+ */
+ if ((match_flags & MATCH_IRE_TESTHIDDEN) &&
+ !(ire->ire_type & IRE_LOCAL)) {
+ if (ire->ire_ill != ill)
+ return (B_FALSE);
+ } else {
+ match_flags &= ~MATCH_IRE_TESTHIDDEN;
+ /*
+ * We know that ill is not NULL, but ire_ill could be
+ * NULL
+ */
+ if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill))
+ return (B_FALSE);
+ }
+ }
/* No ire_addr_v6 bits set past the mask */
ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6,
ire->ire_addr_v6));
V6_MASK_COPY(*addr, *mask, masked_addr);
-
if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) &&
((!(match_flags & MATCH_IRE_GW)) ||
IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) &&
- ((!(match_flags & MATCH_IRE_TYPE)) ||
- (ire->ire_type & type)) &&
- ((!(match_flags & MATCH_IRE_SRC)) ||
- IN6_ARE_ADDR_EQUAL(&ire->ire_src_addr_v6,
- &ipif->ipif_v6src_addr)) &&
- ((!(match_flags & MATCH_IRE_IPIF)) ||
- (ire->ire_ipif == ipif)) &&
- ((!(match_flags & MATCH_IRE_MARK_TESTHIDDEN)) ||
- (ire->ire_marks & IRE_MARK_TESTHIDDEN)) &&
- ((!(match_flags & MATCH_IRE_ILL)) ||
- (ire_ill == ipif_ill ||
- (!(match_flags & MATCH_IRE_MARK_TESTHIDDEN) &&
- ire_ill != NULL && IS_IN_SAME_ILLGRP(ipif_ill, ire_ill)))) &&
- ((!(match_flags & MATCH_IRE_IHANDLE)) ||
- (ire->ire_ihandle == ihandle)) &&
+ ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) &&
+ ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) &&
+ ((!(match_flags & MATCH_IRE_MASK)) ||
+ (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) &&
((!(match_flags & MATCH_IRE_SECATTR)) ||
(!is_system_labeled()) ||
(tsol_ire_match_gwattr(ire, tsl) == 0))) {
@@ -1386,41 +856,38 @@ ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask,
}
/*
- * Lookup for a route in all the tables
+ * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified
+ * gateway address. If ill is non-NULL we also match on it.
+ * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set.
*/
-ire_t *
-ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
- zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst)
+boolean_t
+ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill,
+ const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held)
{
- ire_t *ire = NULL;
+ ire_t *ire;
+ uint_t match_flags;
- /*
- * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
- */
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
- return (NULL);
+ if (lock_held)
+ ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock));
+ else
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
- /*
- * might be asking for a cache lookup,
- * This is not best way to lookup cache,
- * user should call ire_cache_lookup directly.
- *
- * If MATCH_IRE_TYPE was set, first lookup in the cache table and then
- * in the forwarding table, if the applicable type flags were set.
- */
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_CACHETABLE) != 0) {
- ire = ire_ctable_lookup_v6(addr, gateway, type, ipif, zoneid,
- tsl, flags, ipst);
- if (ire != NULL)
- return (ire);
- }
- if ((flags & MATCH_IRE_TYPE) == 0 || (type & IRE_FORWARDTABLE) != 0) {
- ire = ire_ftable_lookup_v6(addr, mask, gateway, type, ipif,
- pire, zoneid, 0, tsl, flags, ipst);
+ match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR;
+ if (ill != NULL)
+ match_flags |= MATCH_IRE_ILL;
+
+ ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros,
+ &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags,
+ ipst);
+
+ if (!lock_held)
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ if (ire != NULL) {
+ ire_refrele(ire);
+ return (B_TRUE);
+ } else {
+ return (B_FALSE);
}
- return (ire);
}
/*
@@ -1429,63 +896,121 @@ ire_route_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
* required parameters and indicating the
* match required in flag field.
*
- * Looking for default route can be done in three ways
- * 1) pass mask as ipv6_all_zeros and set MATCH_IRE_MASK in flags field
- * along with other matches.
- * 2) pass type as IRE_DEFAULT and set MATCH_IRE_TYPE in flags
- * field along with other matches.
- * 3) if the destination and mask are passed as zeros.
- *
- * A request to return a default route if no route
- * is found, can be specified by setting MATCH_IRE_DEFAULT
- * in flags.
- *
- * It does not support recursion more than one level. It
- * will do recursive lookup only when the lookup maps to
- * a prefix or default route and MATCH_IRE_RECURSIVE flag is passed.
- *
- * If the routing table is setup to allow more than one level
- * of recursion, the cleaning up cache table will not work resulting
- * in invalid routing.
- *
* Supports link-local addresses by following the ipif/ill when recursing.
- *
- * NOTE : When this function returns NULL, pire has already been released.
- * pire is valid only when this function successfully returns an
- * ire.
*/
ire_t *
ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
- const in6_addr_t *gateway, int type, const ipif_t *ipif, ire_t **pire,
- zoneid_t zoneid, uint32_t ihandle, const ts_label_t *tsl, int flags,
- ip_stack_t *ipst)
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
- irb_t *irb_ptr;
- ire_t *rire;
ire_t *ire = NULL;
- ire_t *saved_ire;
- nce_t *nce;
- int i;
- in6_addr_t gw_addr_v6;
ASSERT(addr != NULL);
ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL);
ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL);
- ASSERT(ipif == NULL || ipif->ipif_isv6);
+ ASSERT(ill == NULL || ill->ill_isv6);
+
+ ASSERT(!IN6_IS_ADDR_V4MAPPED(addr));
/*
- * When we return NULL from this function, we should make
- * sure that *pire is NULL so that the callers will not
- * wrongly REFRELE the pire.
+ * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL
+ * is set.
*/
- if (pire != NULL)
- *pire = NULL;
+ if ((flags & (MATCH_IRE_ILL)) && (ill == NULL))
+ return (NULL);
+
+ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
+ ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid,
+ tsl, flags, ipst);
+ if (ire == NULL) {
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+ return (NULL);
+ }
+
/*
- * ire_match_args_v6() will dereference ipif MATCH_IRE_SRC or
- * MATCH_IRE_ILL is set.
+ * round-robin only if we have more than one route in the bucket.
+ * ips_ip_ecmp_behavior controls when we do ECMP
+ * 2: always
+ * 1: for IRE_DEFAULT and /0 IRE_INTERFACE
+ * 0: never
+ *
+ * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
+ * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match
+ * and the IRE_INTERFACEs are likely to be shorter matches.
*/
- if ((flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) && (ipif == NULL))
- return (NULL);
+ if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
+ if (ipst->ips_ip_ecmp_behavior == 2 ||
+ (ipst->ips_ip_ecmp_behavior == 1 &&
+ IS_DEFAULT_ROUTE_V6(ire))) {
+ ire_t *next_ire;
+ ire_ftable_args_t margs;
+
+ (void) memset(&margs, 0, sizeof (margs));
+ margs.ift_addr_v6 = *addr;
+ if (mask != NULL)
+ margs.ift_mask_v6 = *mask;
+ if (gateway != NULL)
+ margs.ift_gateway_v6 = *gateway;
+ margs.ift_type = type;
+ margs.ift_ill = ill;
+ margs.ift_zoneid = zoneid;
+ margs.ift_tsl = tsl;
+ margs.ift_flags = flags;
+
+ next_ire = ire_round_robin(ire->ire_bucket, &margs,
+ xmit_hint, ire, ipst);
+ if (next_ire == NULL) {
+ /* keep ire if next_ire is null */
+ goto done;
+ }
+ ire_refrele(ire);
+ ire = next_ire;
+ }
+ }
+
+done:
+ /* Return generation before dropping lock */
+ if (generationp != NULL)
+ *generationp = ire->ire_generation;
+
+ rw_exit(&ipst->ips_ip6_ire_head_lock);
+
+ /*
+ * For shared-IP zones we need additional checks to what was
+ * done in ire_match_args to make sure IRE_LOCALs are handled.
+ *
+ * When ip_restrict_interzone_loopback is set, then
+ * we ensure that IRE_LOCAL are only used for loopback
+ * between zones when the logical "Ethernet" would
+ * have looped them back. That is, if in the absence of
+ * the IRE_LOCAL we would have sent the packet out the
+ * same ill.
+ */
+ if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
+ ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
+ ipst->ips_ip_restrict_interzone_loopback) {
+ ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
+ ASSERT(ire != NULL);
+ }
+
+ return (ire);
+}
+
+/*
+ * Look up a single ire. The caller holds either the read or write lock.
+ */
+ire_t *
+ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask,
+ const in6_addr_t *gateway, int type, const ill_t *ill,
+ zoneid_t zoneid, const ts_label_t *tsl, int flags,
+ ip_stack_t *ipst)
+{
+ irb_t *irb_ptr;
+ ire_t *ire = NULL;
+ int i;
+
+ ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock));
/*
* If the mask is known, the lookup
@@ -1496,28 +1021,41 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
uint_t masklen;
masklen = ip_mask_to_plen_v6(mask);
- if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL)
+ if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) {
return (NULL);
+ }
irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][
IRE_ADDR_MASK_HASH_V6(*addr, *mask,
ipst->ips_ip6_ftable_hash_size)]);
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL;
ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire))
continue;
if (ire_match_args_v6(ire, addr, mask, gateway, type,
- ipif, zoneid, ihandle, tsl, flags))
+ ill, zoneid, tsl, flags))
goto found_ire;
}
rw_exit(&irb_ptr->irb_lock);
} else {
+ uint_t masklen;
+
/*
* In this case we don't know the mask, we need to
* search the table assuming different mask sizes.
- * we start with 128 bit mask, we don't allow default here.
*/
- for (i = (IP6_MASK_TABLE_SIZE - 1); i > 0; i--) {
+ if (flags & MATCH_IRE_SHORTERMASK) {
+ masklen = ip_mask_to_plen_v6(mask);
+ if (masklen == 0) {
+ /* Nothing shorter than zero */
+ return (NULL);
+ }
+ masklen--;
+ } else {
+ masklen = IP6_MASK_TABLE_SIZE - 1;
+ }
+
+ for (i = masklen; i >= 0; i--) {
in6_addr_t tmpmask;
if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
@@ -1529,1334 +1067,415 @@ ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask,
rw_enter(&irb_ptr->irb_lock, RW_READER);
for (ire = irb_ptr->irb_ire; ire != NULL;
ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
+ if (IRE_IS_CONDEMNED(ire))
continue;
if (ire_match_args_v6(ire, addr,
- &ire->ire_mask_v6, gateway, type, ipif,
- zoneid, ihandle, tsl, flags))
+ &ire->ire_mask_v6, gateway, type, ill,
+ zoneid, tsl, flags))
goto found_ire;
}
rw_exit(&irb_ptr->irb_lock);
}
}
-
- /*
- * We come here if no route has yet been found.
- *
- * Handle the case where default route is
- * requested by specifying type as one of the possible
- * types for that can have a zero mask (IRE_DEFAULT and IRE_INTERFACE).
- *
- * If MATCH_IRE_MASK is specified, then the appropriate default route
- * would have been found above if it exists so it isn't looked up here.
- * If MATCH_IRE_DEFAULT was also specified, then a default route will be
- * searched for later.
- */
- if ((flags & (MATCH_IRE_TYPE | MATCH_IRE_MASK)) == MATCH_IRE_TYPE &&
- (type & (IRE_DEFAULT | IRE_INTERFACE))) {
- if (ipst->ips_ip_forwarding_table_v6[0] != NULL) {
- /* addr & mask is zero for defaults */
- irb_ptr = &ipst->ips_ip_forwarding_table_v6[0][
- IRE_ADDR_HASH_V6(ipv6_all_zeros,
- ipst->ips_ip6_ftable_hash_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
-
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags))
- goto found_ire;
- }
- rw_exit(&irb_ptr->irb_lock);
- }
- }
- /*
- * We come here only if no route is found.
- * see if the default route can be used which is allowed
- * only if the default matching criteria is specified.
- * The ipv6_ire_default_count tracks the number of IRE_DEFAULT
- * entries. However, the ip_forwarding_table_v6[0] also contains
- * interface routes thus the count can be zero.
- */
- saved_ire = NULL;
- if ((flags & (MATCH_IRE_DEFAULT | MATCH_IRE_MASK)) ==
- MATCH_IRE_DEFAULT) {
- ire_t *ire_origin;
- uint_t g_index;
- uint_t index;
-
- if (ipst->ips_ip_forwarding_table_v6[0] == NULL)
- return (NULL);
- irb_ptr = &(ipst->ips_ip_forwarding_table_v6[0])[0];
-
- /*
- * Keep a tab on the bucket while looking the IRE_DEFAULT
- * entries. We need to keep track of a particular IRE
- * (ire_origin) so this ensures that it will not be unlinked
- * from the hash list during the recursive lookup below.
- */
- IRB_REFHOLD(irb_ptr);
- ire = irb_ptr->irb_ire;
- if (ire == NULL) {
- IRB_REFRELE(irb_ptr);
- return (NULL);
- }
-
- /*
- * Get the index first, since it can be changed by other
- * threads. Then get to the right default route skipping
- * default interface routes if any. As we hold a reference on
- * the IRE bucket, ipv6_ire_default_count can only increase so
- * we can't reach the end of the hash list unexpectedly.
- */
- if (ipst->ips_ipv6_ire_default_count != 0) {
- g_index = ipst->ips_ipv6_ire_default_index++;
- index = g_index % ipst->ips_ipv6_ire_default_count;
- while (index != 0) {
- if (!(ire->ire_type & IRE_INTERFACE))
- index--;
- ire = ire->ire_next;
- }
- ASSERT(ire != NULL);
- } else {
- /*
- * No default route, so we only have default interface
- * routes: don't enter the first loop.
- */
- ire = NULL;
- }
-
- /*
- * Round-robin the default routers list looking for a neighbor
- * that matches the passed in parameters and is reachable. If
- * none found, just return a route from the default router list
- * if it exists. If we can't find a default route (IRE_DEFAULT),
- * look for interface default routes.
- * We start with the ire we found above and we walk the hash
- * list until we're back where we started, see
- * ire_get_next_default_ire(). It doesn't matter if default
- * routes are added or deleted by other threads - we know this
- * ire will stay in the list because we hold a reference on the
- * ire bucket.
- * NB: if we only have interface default routes, ire is NULL so
- * we don't even enter this loop (see above).
- */
- ire_origin = ire;
- for (; ire != NULL;
- ire = ire_get_next_default_ire(ire, ire_origin)) {
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags)) {
- int match_flags;
-
- /*
- * We have something to work with.
- * If we can find a resolved/reachable
- * entry, we will use this. Otherwise
- * we'll try to find an entry that has
- * a resolved cache entry. We will fallback
- * on this if we don't find anything else.
- */
- if (saved_ire == NULL)
- saved_ire = ire;
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- match_flags = MATCH_IRE_ILL | MATCH_IRE_SECATTR;
- rire = ire_ctable_lookup_v6(&gw_addr_v6, NULL,
- 0, ire->ire_ipif, zoneid, tsl, match_flags,
- ipst);
- if (rire != NULL) {
- nce = rire->ire_nce;
- if (nce != NULL &&
- NCE_ISREACHABLE(nce) &&
- nce->nce_flags & NCE_F_ISROUTER) {
- ire_refrele(rire);
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- } else if (nce != NULL &&
- !(nce->nce_flags &
- NCE_F_ISROUTER)) {
- /*
- * Make sure we don't use
- * this ire
- */
- if (saved_ire == ire)
- saved_ire = NULL;
- }
- ire_refrele(rire);
- } else if (ipst->
- ips_ipv6_ire_default_count > 1 &&
- zoneid != GLOBAL_ZONEID) {
- /*
- * When we're in a local zone, we're
- * only interested in default routers
- * that are reachable through ipifs
- * within our zone.
- * The potentially expensive call to
- * ire_route_lookup_v6() is avoided when
- * we have only one default route.
- */
- int ire_match_flags = MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR;
-
- if (ire->ire_ipif != NULL) {
- ire_match_flags |=
- MATCH_IRE_ILL;
- }
- rire = ire_route_lookup_v6(&gw_addr_v6,
- NULL, NULL, IRE_INTERFACE,
- ire->ire_ipif, NULL,
- zoneid, tsl, ire_match_flags, ipst);
- if (rire != NULL) {
- ire_refrele(rire);
- saved_ire = ire;
- } else if (saved_ire == ire) {
- /*
- * Make sure we don't use
- * this ire
- */
- saved_ire = NULL;
- }
- }
- }
- }
- if (saved_ire != NULL) {
- ire = saved_ire;
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- } else {
- /*
- * Look for a interface default route matching the
- * args passed in. No round robin here. Just pick
- * the right one.
- */
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
-
- if (!(ire->ire_type & IRE_INTERFACE))
- continue;
-
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- if (ire_match_args_v6(ire, addr,
- &ipv6_all_zeros, gateway, type, ipif,
- zoneid, ihandle, tsl, flags)) {
- IRE_REFHOLD(ire);
- IRB_REFRELE(irb_ptr);
- goto found_ire_held;
- }
- }
- IRB_REFRELE(irb_ptr);
- }
- }
ASSERT(ire == NULL);
ip1dbg(("ire_ftable_lookup_v6: returning NULL ire"));
return (NULL);
+
found_ire:
- ASSERT((ire->ire_marks & IRE_MARK_CONDEMNED) == 0);
- IRE_REFHOLD(ire);
+ ire_refhold(ire);
rw_exit(&irb_ptr->irb_lock);
-
-found_ire_held:
- if ((flags & MATCH_IRE_RJ_BHOLE) &&
- (ire->ire_flags & (RTF_BLACKHOLE | RTF_REJECT))) {
- return (ire);
- }
- /*
- * At this point, IRE that was found must be an IRE_FORWARDTABLE
- * or IRE_CACHETABLE type. If this is a recursive lookup and an
- * IRE_INTERFACE type was found, return that. If it was some other
- * IRE_FORWARDTABLE type of IRE (one of the prefix types), then it
- * is necessary to fill in the parent IRE pointed to by pire, and
- * then lookup the gateway address of the parent. For backwards
- * compatiblity, if this lookup returns an
- * IRE other than a IRE_CACHETABLE or IRE_INTERFACE, then one more level
- * of lookup is done.
- */
- if (flags & MATCH_IRE_RECURSIVE) {
- const ipif_t *gw_ipif;
- int match_flags = MATCH_IRE_DSTONLY;
-
- if (ire->ire_type & IRE_INTERFACE)
- return (ire);
- if (pire != NULL)
- *pire = ire;
- /*
- * If we can't find an IRE_INTERFACE or the caller has not
- * asked for pire, we need to REFRELE the saved_ire.
- */
- saved_ire = ire;
-
- if (ire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
-
- ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL, 0,
- ire->ire_ipif, NULL, zoneid, tsl, match_flags, ipst);
- if (ire == NULL) {
- /*
- * In this case we have to deal with the
- * MATCH_IRE_PARENT flag, which means the
- * parent has to be returned if ire is NULL.
- * The aim of this is to have (at least) a starting
- * ire when we want to look at all of the ires in a
- * bucket aimed at a single destination (as is the
- * case in ip_newroute_v6 for the RTF_MULTIRT
- * flagged routes).
- */
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the
- * parent ire is returned via both
- * ire and pire.
- */
- IRE_REFHOLD(saved_ire);
- }
- ire = saved_ire;
- } else {
- ire_refrele(saved_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- return (ire);
- }
- if (ire->ire_type & (IRE_CACHETABLE | IRE_INTERFACE)) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- if (pire == NULL) {
- ire_refrele(saved_ire);
- }
- return (ire);
- }
- match_flags |= MATCH_IRE_TYPE;
- mutex_enter(&ire->ire_lock);
- gw_addr_v6 = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
- ire = ire_route_lookup_v6(&gw_addr_v6, NULL, NULL,
- (IRE_CACHETABLE | IRE_INTERFACE), gw_ipif, NULL, zoneid,
- NULL, match_flags, ipst);
- if (ire == NULL) {
- /*
- * In this case we have to deal with the
- * MATCH_IRE_PARENT flag, which means the
- * parent has to be returned if ire is NULL.
- * The aim of this is to have (at least) a starting
- * ire when we want to look at all of the ires in a
- * bucket aimed at a single destination (as is the
- * case in ip_newroute_v6 for the RTF_MULTIRT
- * flagged routes).
- */
- if (flags & MATCH_IRE_PARENT) {
- if (pire != NULL) {
- /*
- * Need an extra REFHOLD, if the
- * parent ire is returned via both
- * ire and pire.
- */
- IRE_REFHOLD(saved_ire);
- }
- ire = saved_ire;
- } else {
- ire_refrele(saved_ire);
- if (pire != NULL)
- *pire = NULL;
- }
- return (ire);
- } else if (pire == NULL) {
- /*
- * If the caller did not ask for pire, release
- * it now.
- */
- ire_refrele(saved_ire);
- }
- return (ire);
- }
-
- ASSERT(pire == NULL || *pire == NULL);
return (ire);
}
-/*
- * Delete the IRE cache for the gateway and all IRE caches whose
- * ire_gateway_addr_v6 points to this gateway, and allow them to
- * be created on demand by ip_newroute_v6.
- */
-void
-ire_clookup_delete_cache_gw_v6(const in6_addr_t *addr, zoneid_t zoneid,
- ip_stack_t *ipst)
-{
- irb_t *irb;
- ire_t *ire;
-
- irb = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- IRB_REFHOLD(irb);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
-
- ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
- if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, 0,
- IRE_CACHE, NULL, zoneid, 0, NULL, MATCH_IRE_TYPE)) {
- ire_delete(ire);
- }
- }
- IRB_REFRELE(irb);
-
- ire_walk_v6(ire_delete_cache_gw_v6, (char *)addr, zoneid, ipst);
-}
-
-/*
- * Looks up cache table for a route.
- * specific lookup can be indicated by
- * passing the MATCH_* flags and the
- * necessary parameters.
- */
-ire_t *
-ire_ctable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *gateway,
- int type, const ipif_t *ipif, zoneid_t zoneid, const ts_label_t *tsl,
- int flags, ip_stack_t *ipst)
-{
- ire_ctable_args_t margs;
-
- margs.ict_addr = (void *)addr;
- margs.ict_gateway = (void *)gateway;
- margs.ict_type = type;
- margs.ict_ipif = ipif;
- margs.ict_zoneid = zoneid;
- margs.ict_tsl = tsl;
- margs.ict_flags = flags;
- margs.ict_ipst = ipst;
- margs.ict_wq = NULL;
-
- return (ip6_ctable_lookup_impl(&margs));
-}
/*
- * Lookup cache.
+ * This function is called by
+ * ip_input/ire_route_recursive when doing a route lookup on only the
+ * destination address.
*
- * In general the zoneid has to match (where ALL_ZONES match all of them).
- * But for IRE_LOCAL we also need to handle the case where L2 should
- * conceptually loop back the packet. This is necessary since neither
- * Ethernet drivers nor Ethernet hardware loops back packets sent to their
- * own MAC address. This loopback is needed when the normal
- * routes (ignoring IREs with different zoneids) would send out the packet on
- * the same ill as the ill with which this IRE_LOCAL is associated.
+ * The optimizations of this function over ire_ftable_lookup are:
+ * o removing unnecessary flag matching
+ * o doing longest prefix match instead of overloading it further
+ * with the unnecessary "best_prefix_match"
*
- * Earlier versions of this code always matched an IRE_LOCAL independently of
- * the zoneid. We preserve that earlier behavior when
- * ip_restrict_interzone_loopback is turned off.
+ * If no route is found we return IRE_NOROUTE.
*/
ire_t *
-ire_cache_lookup_v6(const in6_addr_t *addr, zoneid_t zoneid,
- const ts_label_t *tsl, ip_stack_t *ipst)
-{
- irb_t *irb_ptr;
- ire_t *ire;
-
- irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(*addr,
- ipst->ips_ip6_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire; ire = ire->ire_next) {
- if (ire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
- continue;
- if (IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, addr)) {
- /*
- * Finally, check if the security policy has any
- * restriction on using this route for the specified
- * message.
- */
- if (tsl != NULL &&
- ire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(ire, tsl) != 0) {
- continue;
- }
-
- if (zoneid == ALL_ZONES || ire->ire_zoneid == zoneid ||
- ire->ire_zoneid == ALL_ZONES) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
-
- if (ire->ire_type == IRE_LOCAL) {
- if (ipst->ips_ip_restrict_interzone_loopback &&
- !ire_local_ok_across_zones(ire, zoneid,
- (void *)addr, tsl, ipst))
- continue;
-
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
- }
- }
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
-}
-
-/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
- *
- * We are trying to create the cache ire for an onlink destn. or
- * gateway in 'cire'. We are called from ire_add_v6() in the IRE_IF_RESOLVER
- * case for xresolv interfaces, after the ire has come back from
- * an external resolver.
- */
-static ire_t *
-ire_ihandle_lookup_onlink_v6(ire_t *cire)
+ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint,
+ ip_stack_t *ipst, uint_t *generationp)
{
ire_t *ire;
- int match_flags;
- int i;
- int j;
- irb_t *irb_ptr;
- ip_stack_t *ipst = cire->ire_ipst;
-
- ASSERT(cire != NULL);
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute_v6() created 'cire' for an on-link destn.
- * it set its cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6,
- NULL, IRE_INTERFACE, NULL, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * In the resolver/noresolver case, ip_newroute_v6() thinks
- * it is creating the cache ire for an onlink destination in 'cire'.
- * But 'cire' is not actually onlink, because ire_ftable_lookup_v6()
- * cheated it, by doing ire_route_lookup_v6() twice and returning an
- * interface ire.
- *
- * Eg. default - gw1 (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * In the above example, ip_newroute_v6() tried to create the cache ire
- * 'cire' for gw1, based on the interface route in line 3. The
- * ire_ftable_lookup_v6() above fails, because there is
- * no interface route to reach gw1. (it is gw2). We fall thru below.
- *
- * Do a brute force search based on the ihandle in a subset of the
- * forwarding tables, corresponding to cire->ire_cmask_v6. Otherwise
- * things become very complex, since we don't have 'pire' in this
- * case. (Also note that this method is not possible in the offlink
- * case because we don't know the mask)
- */
- i = ip_mask_to_plen_v6(&cire->ire_cmask_v6);
- if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL)
- return (NULL);
- for (j = 0; j < ipst->ips_ip6_ftable_hash_size; j++) {
- irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][j];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL;
- ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- if ((ire->ire_type & IRE_INTERFACE) &&
- (ire->ire_ihandle == cire->ire_ihandle)) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
- }
- rw_exit(&irb_ptr->irb_lock);
+ ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL,
+ MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp);
+ if (ire == NULL) {
+ ire = ire_reject(ipst, B_TRUE);
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
}
- return (NULL);
+ /* ftable_lookup did round robin */
+ return (ire);
}
-
-/*
- * Locate the interface ire that is tied to the cache ire 'cire' via
- * cire->ire_ihandle.
- *
- * We are trying to create the cache ire for an offlink destn based
- * on the cache ire of the gateway in 'cire'. 'pire' is the prefix ire
- * as found by ip_newroute_v6(). We are called from ip_newroute_v6() in
- * the IRE_CACHE case.
- */
ire_t *
-ire_ihandle_lookup_offlink_v6(ire_t *cire, ire_t *pire)
+ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa,
+ uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
{
- ire_t *ire;
- int match_flags;
- in6_addr_t gw_addr;
- ipif_t *gw_ipif;
- ip_stack_t *ipst = cire->ire_ipst;
-
- ASSERT(cire != NULL && pire != NULL);
+ ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4));
- match_flags = MATCH_IRE_TYPE | MATCH_IRE_IHANDLE | MATCH_IRE_MASK;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
- /*
- * We know that the mask of the interface ire equals cire->ire_cmask.
- * (When ip_newroute_v6() created 'cire' for an on-link destn. it set
- * its cmask from the interface ire's mask)
- */
- ire = ire_ftable_lookup_v6(&cire->ire_addr_v6, &cire->ire_cmask_v6, 0,
- IRE_INTERFACE, pire->ire_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- if (ire != NULL)
- return (ire);
- /*
- * If we didn't find an interface ire above, we can't declare failure.
- * For backwards compatibility, we need to support prefix routes
- * pointing to next hop gateways that are not on-link.
- *
- * Assume we are trying to ping some offlink destn, and we have the
- * routing table below.
- *
- * Eg. default - gw1 <--- pire (line 1)
- * gw1 - gw2 (line 2)
- * gw2 - hme0 (line 3)
- *
- * If we already have a cache ire for gw1 in 'cire', the
- * ire_ftable_lookup_v6 above would have failed, since there is no
- * interface ire to reach gw1. We will fallthru below.
- *
- * Here we duplicate the steps that ire_ftable_lookup_v6() did in
- * getting 'cire' from 'pire', in the MATCH_IRE_RECURSIVE case.
- * The differences are the following
- * i. We want the interface ire only, so we call
- * ire_ftable_lookup_v6() instead of ire_route_lookup_v6()
- * ii. We look for only prefix routes in the 1st call below.
- * ii. We want to match on the ihandle in the 2nd call below.
- */
- match_flags = MATCH_IRE_TYPE;
- if (pire->ire_ipif != NULL)
- match_flags |= MATCH_IRE_ILL;
-
- mutex_enter(&pire->ire_lock);
- gw_addr = pire->ire_gateway_addr_v6;
- mutex_exit(&pire->ire_lock);
- ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_OFFSUBNET,
- pire->ire_ipif, NULL, ALL_ZONES, 0, NULL, match_flags, ipst);
- if (ire == NULL)
- return (NULL);
- /*
- * At this point 'ire' corresponds to the entry shown in line 2.
- * gw_addr is 'gw2' in the example above.
- */
- mutex_enter(&ire->ire_lock);
- gw_addr = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ipif = ire->ire_ipif;
- ire_refrele(ire);
-
- match_flags |= MATCH_IRE_IHANDLE;
- ire = ire_ftable_lookup_v6(&gw_addr, 0, 0, IRE_INTERFACE,
- gw_ipif, NULL, ALL_ZONES, cire->ire_ihandle,
- NULL, match_flags, ipst);
- return (ire);
+ return (ip_select_route(dst, ixa, generationp, setsrcp, errorp,
+ multirtp));
}
/*
- * Return the IRE_LOOPBACK, IRE_IF_RESOLVER or IRE_IF_NORESOLVER
- * ire associated with the specified ipif.
+ * Recursively look for a route to the destination. Can also match on
+ * the zoneid, ill, and label. Used for the data paths. See also
+ * ire_route_recursive_dstonly.
*
- * This might occasionally be called when IPIF_UP is not set since
- * the IPV6_MULTICAST_IF as well as creating interface routes
- * allows specifying a down ipif (ipif_lookup* match ipifs that are down).
+ * If ill_arg is set this means we will match it by adding MATCH_IRE_ILL.
*
- * Note that if IPIF_NOLOCAL, IPIF_NOXMIT, or IPIF_DEPRECATED is set on
- * the ipif this routine might return NULL.
- * (Sometimes called as writer though not required by this function.)
+ * If allocate is not set then we will only inspect the existing IREs; never
+ * create an IRE_IF_CLONE. This is used on the receive side when we are not
+ * forwarding.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
*/
ire_t *
-ipif_to_ire_v6(const ipif_t *ipif)
+ire_route_recursive_impl_v6(ire_t *ire,
+ const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg,
+ zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
+ in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
- ire_t *ire;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
- uint_t match_flags = MATCH_IRE_TYPE | MATCH_IRE_IPIF;
+ int i, j;
+ in6_addr_t v6nexthop = *nexthop;
+ ire_t *ires[MAX_IRE_RECURSION];
+ uint_t generation;
+ uint_t generations[MAX_IRE_RECURSION];
+ boolean_t need_refrele = B_FALSE;
+ boolean_t invalidate = B_FALSE;
+ int prefs[MAX_IRE_RECURSION];
+ ill_t *ill = NULL;
+
+ if (setsrcp != NULL)
+ ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
+ if (gwattrp != NULL)
+ ASSERT(*gwattrp == NULL);
+
+ if (ill_arg != NULL)
+ match_args |= MATCH_IRE_ILL;
/*
- * IRE_INTERFACE entries for ills under IPMP are IRE_MARK_TESTHIDDEN
- * so that they aren't accidentally returned. However, if the
- * caller's ipif is on an ill under IPMP, there's no need to hide 'em.
+ * We iterate up to three times to resolve a route, even though
+ * we have four slots in the array. The extra slot is for an
+ * IRE_IF_CLONE we might need to create.
*/
- if (IS_UNDER_IPMP(ipif->ipif_ill))
- match_flags |= MATCH_IRE_MARK_TESTHIDDEN;
-
- ASSERT(ipif->ipif_isv6);
- if (ipif->ipif_ire_type == IRE_LOOPBACK) {
- ire = ire_ctable_lookup_v6(&ipif->ipif_v6lcl_addr, NULL,
- IRE_LOOPBACK, ipif, ALL_ZONES, NULL, match_flags, ipst);
- } else if (ipif->ipif_flags & IPIF_POINTOPOINT) {
- /* In this case we need to lookup destination address. */
- ire = ire_ftable_lookup_v6(&ipif->ipif_v6pp_dst_addr,
- &ipv6_all_ones, NULL, IRE_INTERFACE, ipif, NULL, ALL_ZONES,
- 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
- } else {
- ire = ire_ftable_lookup_v6(&ipif->ipif_v6subnet,
- &ipif->ipif_v6net_mask, NULL, IRE_INTERFACE, ipif, NULL,
- ALL_ZONES, 0, NULL, (match_flags | MATCH_IRE_MASK), ipst);
- }
- return (ire);
-}
-
-/*
- * Return B_TRUE if a multirt route is resolvable
- * (or if no route is resolved yet), B_FALSE otherwise.
- * This only works in the global zone.
- */
-boolean_t
-ire_multirt_need_resolve_v6(const in6_addr_t *v6dstp, const ts_label_t *tsl,
- ip_stack_t *ipst)
-{
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb;
- irb_t *cirb;
- int unres_cnt = 0;
- boolean_t resolvable = B_FALSE;
-
- /* Retrieve the first IRE_HOST that matches the destination */
- first_fire = ire_ftable_lookup_v6(v6dstp, &ipv6_all_ones, 0, IRE_HOST,
- NULL, NULL, ALL_ZONES, 0, tsl, MATCH_IRE_MASK | MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR, ipst);
-
- /* No route at all */
- if (first_fire == NULL) {
- return (B_TRUE);
- }
-
- firb = first_fire->ire_bucket;
- ASSERT(firb);
-
- /* Retrieve the first IRE_CACHE ire for that destination. */
- first_cire = ire_cache_lookup_v6(v6dstp, GLOBAL_ZONEID, tsl, ipst);
-
- /* No resolved route. */
- if (first_cire == NULL) {
- ire_refrele(first_fire);
- return (B_TRUE);
- }
-
- /* At least one route is resolved. */
-
- cirb = first_cire->ire_bucket;
- ASSERT(cirb);
-
- /* Count the number of routes to that dest that are declared. */
- IRB_REFHOLD(firb);
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, v6dstp))
- continue;
- unres_cnt++;
- }
- IRB_REFRELE(firb);
-
-
- /* Then subtract the number of routes to that dst that are resolved */
- IRB_REFHOLD(cirb);
- for (cire = first_cire; cire != NULL; cire = cire->ire_next) {
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&cire->ire_addr_v6, v6dstp))
- continue;
- if (cire->ire_marks & (IRE_MARK_CONDEMNED|IRE_MARK_TESTHIDDEN))
- continue;
- unres_cnt--;
- }
- IRB_REFRELE(cirb);
-
- /* At least one route is unresolved; search for a resolvable route. */
- if (unres_cnt > 0)
- resolvable = ire_multirt_lookup_v6(&first_cire, &first_fire,
- MULTIRT_USESTAMP|MULTIRT_CACHEGW, tsl, ipst);
-
- if (first_fire)
- ire_refrele(first_fire);
-
- if (first_cire)
- ire_refrele(first_cire);
-
- return (resolvable);
-}
-
-
-/*
- * Return B_TRUE and update *ire_arg and *fire_arg
- * if at least one resolvable route is found.
- * Return B_FALSE otherwise (all routes are resolved or
- * the remaining unresolved routes are all unresolvable).
- * This only works in the global zone.
- */
-boolean_t
-ire_multirt_lookup_v6(ire_t **ire_arg, ire_t **fire_arg, uint32_t flags,
- const ts_label_t *tsl, ip_stack_t *ipst)
-{
- clock_t delta;
- ire_t *best_fire = NULL;
- ire_t *best_cire = NULL;
- ire_t *first_fire;
- ire_t *first_cire;
- ire_t *fire;
- ire_t *cire;
- irb_t *firb = NULL;
- irb_t *cirb = NULL;
- ire_t *gw_ire;
- boolean_t already_resolved;
- boolean_t res;
- in6_addr_t v6dst;
- in6_addr_t v6gw;
-
- ip2dbg(("ire_multirt_lookup_v6: *ire_arg %p, *fire_arg %p, "
- "flags %04x\n", (void *)*ire_arg, (void *)*fire_arg, flags));
-
- ASSERT(ire_arg);
- ASSERT(fire_arg);
-
- /* Not an IRE_HOST ire; give up. */
- if ((*fire_arg == NULL) ||
- ((*fire_arg)->ire_type != IRE_HOST)) {
- return (B_FALSE);
- }
+ i = 0;
+ while (i < MAX_IRE_RECURSION - 1) {
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ if (ire == NULL) {
+ ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type,
+ (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
+ match_args, xmit_hint, ipst, &generation);
+ } else {
+ /* Caller passed it; extra hold since we will rele */
+ ire_refhold(ire);
+ if (generationp != NULL)
+ generation = *generationp;
+ else
+ generation = IRE_GENERATION_VERIFY;
+ }
- /* This is the first IRE_HOST ire for that destination. */
- first_fire = *fire_arg;
- firb = first_fire->ire_bucket;
- ASSERT(firb);
+ if (ire == NULL)
+ ire = ire_reject(ipst, B_TRUE);
- mutex_enter(&first_fire->ire_lock);
- v6dst = first_fire->ire_addr_v6;
- mutex_exit(&first_fire->ire_lock);
+ /* Need to return the ire with RTF_REJECT|BLACKHOLE */
+ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
+ goto error;
- ip2dbg(("ire_multirt_lookup_v6: dst %08x\n",
- ntohl(V4_PART_OF_V6(v6dst))));
+ ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
- /*
- * Retrieve the first IRE_CACHE ire for that destination;
- * if we don't find one, no route for that dest is
- * resolved yet.
- */
- first_cire = ire_cache_lookup_v6(&v6dst, GLOBAL_ZONEID, tsl, ipst);
- if (first_cire) {
- cirb = first_cire->ire_bucket;
- }
-
- ip2dbg(("ire_multirt_lookup_v6: first_cire %p\n", (void *)first_cire));
+ prefs[i] = ire_pref(ire);
+ if (i != 0) {
+ /*
+ * Don't allow anything unusual past the first
+ * iteration.
+ */
+ if ((ire->ire_type &
+ (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
+ prefs[i] <= prefs[i-1]) {
+ ire_refrele(ire);
+ ire = ire_reject(ipst, B_TRUE);
+ goto error;
+ }
+ }
+ /* We have a usable IRE */
+ ires[i] = ire;
+ generations[i] = generation;
+ i++;
+
+ /* The first RTF_SETSRC address is passed back if setsrcp */
+ if ((ire->ire_flags & RTF_SETSRC) &&
+ setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) {
+ ASSERT(!IN6_IS_ADDR_UNSPECIFIED(
+ &ire->ire_setsrc_addr_v6));
+ *setsrcp = ire->ire_setsrc_addr_v6;
+ }
- /*
- * Search for a resolvable route, giving the top priority
- * to routes that can be resolved without any call to the resolver.
- */
- IRB_REFHOLD(firb);
+ /* The first ire_gw_secattr is passed back if gwattrp */
+ if (ire->ire_gw_secattr != NULL &&
+ gwattrp != NULL && *gwattrp == NULL)
+ *gwattrp = ire->ire_gw_secattr;
- if (!IN6_IS_ADDR_MULTICAST(&v6dst)) {
/*
- * For all multiroute IRE_HOST ires for that destination,
- * check if the route via the IRE_HOST's gateway is
- * resolved yet.
+ * Check if we have a short-cut pointer to an IRE for this
+ * destination, and that the cached dependency isn't stale.
+ * In that case we've rejoined an existing tree towards a
+ * parent, thus we don't need to continue the loop to
+ * discover the rest of the tree.
*/
- for (fire = first_fire; fire != NULL; fire = fire->ire_next) {
-
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
- continue;
-
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
-
- mutex_enter(&fire->ire_lock);
- v6gw = fire->ire_gateway_addr_v6;
- mutex_exit(&fire->ire_lock);
-
- ip2dbg(("ire_multirt_lookup_v6: fire %p, "
- "ire_addr %08x, ire_gateway_addr %08x\n",
- (void *)fire,
- ntohl(V4_PART_OF_V6(fire->ire_addr_v6)),
- ntohl(V4_PART_OF_V6(v6gw))));
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ ire = NULL;
+ goto done;
+ }
+ mutex_exit(&ire->ire_lock);
- already_resolved = B_FALSE;
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 128 bit mask.
+ */
+ if (ire->ire_nce_capable) {
+ ire = NULL;
+ goto done;
+ }
- if (first_cire) {
- ASSERT(cirb);
+ ASSERT(!(ire->ire_type & IRE_IF_CLONE));
+ /*
+ * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
+ * particular destination
+ */
+ if (ire->ire_type & IRE_INTERFACE) {
+ ire_t *clone;
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(
- &cire->ire_addr_v6, &v6dst))
- continue;
- if (cire->ire_marks &
- (IRE_MARK_CONDEMNED|
- IRE_MARK_TESTHIDDEN))
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
-
- /*
- * Check if the IRE_CACHE's gateway
- * matches the IRE_HOST's gateway.
- */
- if (IN6_ARE_ADDR_EQUAL(
- &cire->ire_gateway_addr_v6,
- &v6gw)) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+ ASSERT(ire->ire_masklen != IPV6_ABITS);
/*
- * This route is already resolved;
- * proceed with next one.
+ * In the case of ip_input and ILLF_FORWARDING not
+ * being set, and in the case of RTM_GET,
+ * there is no point in allocating
+ * an IRE_IF_CLONE. We return the IRE_INTERFACE.
+ * Note that !allocate can result in a ire_dep_parent
+ * which is IRE_IF_* without an IRE_IF_CLONE.
+ * We recover from that when we need to send packets
+ * by ensuring that the generations become
+ * IRE_GENERATION_VERIFY in this case.
*/
- if (already_resolved) {
- ip2dbg(("ire_multirt_lookup_v6: found cire %p, "
- "already resolved\n", (void *)cire));
- continue;
+ if (!allocate) {
+ invalidate = B_TRUE;
+ ire = NULL;
+ goto done;
}
- /*
- * The route is unresolved; is it actually
- * resolvable, i.e. is there a cache or a resolver
- * for the gateway?
- */
- gw_ire = ire_route_lookup_v6(&v6gw, 0, 0, 0, NULL, NULL,
- ALL_ZONES, tsl, MATCH_IRE_RECURSIVE |
- MATCH_IRE_SECATTR, ipst);
-
- ip2dbg(("ire_multirt_lookup_v6: looked up gw_ire %p\n",
- (void *)gw_ire));
-
- /*
- * This route can be resolved without any call to the
- * resolver; if the MULTIRT_CACHEGW flag is set,
- * give the top priority to this ire and exit the
- * loop.
- * This occurs when an resolver reply is processed
- * through ip_wput_nondata()
- */
- if ((flags & MULTIRT_CACHEGW) &&
- (gw_ire != NULL) &&
- (gw_ire->ire_type & IRE_CACHETABLE)) {
+ clone = ire_create_if_clone(ire, &v6nexthop,
+ &generation);
+ if (clone == NULL) {
/*
- * Release the resolver associated to the
- * previous candidate best ire, if any.
+ * Temporary failure - no memory.
+ * Don't want caller to cache IRE_NOROUTE.
*/
- if (best_cire) {
- ire_refrele(best_cire);
- ASSERT(best_fire);
- }
-
- best_fire = fire;
- best_cire = gw_ire;
-
- ip2dbg(("ire_multirt_lookup_v6: found top prio "
- "best_fire %p, best_cire %p\n",
- (void *)best_fire, (void *)best_cire));
- break;
+ invalidate = B_TRUE;
+ ire = ire_blackhole(ipst, B_TRUE);
+ goto error;
}
-
/*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take that
- * route into account only if this time interval
- * exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
+ * Make clone next to last entry and the
+ * IRE_INTERFACE the last in the dependency
+ * chain since the clone depends on the
+ * IRE_INTERFACE.
*/
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)
- ((delta > ipst->
- ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
+ ASSERT(i >= 1);
+ ASSERT(i < MAX_IRE_RECURSION);
- ip2dbg(("ire_multirt_lookup_v6: fire %p, delta %lu, "
- "res %d\n",
- (void *)fire, delta, res));
-
- if (res) {
- /*
- * A resolver exists for the gateway: save
- * the current IRE_HOST ire as a candidate
- * best ire. If we later discover that a
- * top priority ire exists (i.e. no need to
- * call the resolver), then this new ire
- * will be preferred to the current one.
- */
- if (gw_ire != NULL) {
- if (best_fire == NULL) {
- ASSERT(best_cire == NULL);
-
- best_fire = fire;
- best_cire = gw_ire;
-
- ip2dbg(("ire_multirt_lookup_v6:"
- "found candidate "
- "best_fire %p, "
- "best_cire %p\n",
- (void *)best_fire,
- (void *)best_cire));
-
- /*
- * If MULTIRT_CACHEGW is not
- * set, we ignore the top
- * priority ires that can
- * be resolved without any
- * call to the resolver;
- * In that case, there is
- * actually no need
- * to continue the loop.
- */
- if (!(flags &
- MULTIRT_CACHEGW)) {
- break;
- }
- continue;
- }
- } else {
- /*
- * No resolver for the gateway: the
- * route is not resolvable.
- * If the MULTIRT_SETSTAMP flag is
- * set, we stamp the IRE_HOST ire,
- * so we will not select it again
- * during this resolution interval.
- */
- if (flags & MULTIRT_SETSTAMP)
- fire->ire_last_used_time =
- lbolt;
- }
- }
+ ires[i] = ires[i-1];
+ generations[i] = generations[i-1];
+ ires[i-1] = clone;
+ generations[i-1] = generation;
+ i++;
- if (gw_ire != NULL)
- ire_refrele(gw_ire);
+ ire = NULL;
+ goto done;
}
- } else { /* IN6_IS_ADDR_MULTICAST(&v6dst) */
- for (fire = first_fire;
- fire != NULL;
- fire = fire->ire_next) {
-
- if (!(fire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(&fire->ire_addr_v6, &v6dst))
- continue;
-
- if (fire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(fire, tsl) != 0) {
- continue;
- }
-
- already_resolved = B_FALSE;
-
- mutex_enter(&fire->ire_lock);
- v6gw = fire->ire_gateway_addr_v6;
- mutex_exit(&fire->ire_lock);
-
- gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
- IRE_INTERFACE, NULL, NULL, ALL_ZONES, 0, tsl,
- MATCH_IRE_RECURSIVE | MATCH_IRE_TYPE |
- MATCH_IRE_SECATTR, ipst);
-
- /* No resolver for the gateway; we skip this ire. */
- if (gw_ire == NULL) {
- continue;
- }
+ /*
+ * We only match on the type and optionally ILL when
+ * recursing. The type match is used by some callers
+ * to exclude certain types (such as IRE_IF_CLONE or
+ * IRE_LOCAL|IRE_LOOPBACK).
+ */
+ match_args &= MATCH_IRE_TYPE;
+ v6nexthop = ire->ire_gateway_addr_v6;
+ if (ill == NULL && ire->ire_ill != NULL) {
+ ill = ire->ire_ill;
+ need_refrele = B_TRUE;
+ ill_refhold(ill);
+ match_args |= MATCH_IRE_ILL;
+ }
- if (first_cire) {
+ ire = NULL;
+ }
+ ASSERT(ire == NULL);
+ ire = ire_reject(ipst, B_TRUE);
- IRB_REFHOLD(cirb);
- /*
- * For all IRE_CACHE ires for that
- * destination.
- */
- for (cire = first_cire;
- cire != NULL;
- cire = cire->ire_next) {
-
- if (!(cire->ire_flags & RTF_MULTIRT))
- continue;
- if (!IN6_ARE_ADDR_EQUAL(
- &cire->ire_addr_v6, &v6dst))
- continue;
- if (cire->ire_marks &
- IRE_MARK_CONDEMNED)
- continue;
-
- if (cire->ire_gw_secattr != NULL &&
- tsol_ire_match_gwattr(cire,
- tsl) != 0) {
- continue;
- }
-
- /*
- * Cache entries are linked to the
- * parent routes using the parent handle
- * (ire_phandle). If no cache entry has
- * the same handle as fire, fire is
- * still unresolved.
- */
- ASSERT(cire->ire_phandle != 0);
- if (cire->ire_phandle ==
- fire->ire_phandle) {
- already_resolved = B_TRUE;
- break;
- }
- }
- IRB_REFRELE(cirb);
- }
+error:
+ ASSERT(ire != NULL);
+ if (need_refrele)
+ ill_refrele(ill);
- /*
- * This route is already resolved; proceed with
- * next one.
- */
- if (already_resolved) {
- ire_refrele(gw_ire);
- continue;
- }
+ /*
+ * In the case of MULTIRT we want to try a different IRE the next
+ * time. We let the next packet retry in that case.
+ */
+ if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
+ (void) ire_no_good(ires[0]);
- /*
- * Compute the time elapsed since our preceding
- * attempt to resolve that route.
- * If the MULTIRT_USESTAMP flag is set, we take
- * that route into account only if this time
- * interval exceeds ip_multirt_resolution_interval;
- * this prevents us from attempting to resolve a
- * broken route upon each sending of a packet.
- */
- delta = lbolt - fire->ire_last_used_time;
- delta = TICK_TO_MSEC(delta);
-
- res = (boolean_t)
- ((delta > ipst->
- ips_ip_multirt_resolution_interval) ||
- (!(flags & MULTIRT_USESTAMP)));
-
- ip3dbg(("ire_multirt_lookup_v6: fire %p, delta %lx, "
- "flags %04x, res %d\n",
- (void *)fire, delta, flags, res));
-
- if (res) {
- if (best_cire) {
- /*
- * Release the resolver associated
- * to the preceding candidate best
- * ire, if any.
- */
- ire_refrele(best_cire);
- ASSERT(best_fire);
- }
- best_fire = fire;
- best_cire = gw_ire;
- continue;
- }
+cleanup:
+ /* cleanup ires[i] */
+ ire_dep_unbuild(ires, i);
+ for (j = 0; j < i; j++)
+ ire_refrele(ires[j]);
- ire_refrele(gw_ire);
- }
- }
+ ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
+ /*
+ * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
+ * ip_select_route since the reject or lack of memory might be gone.
+ */
+ if (generationp != NULL)
+ *generationp = IRE_GENERATION_VERIFY;
+ return (ire);
- if (best_fire) {
- IRE_REFHOLD(best_fire);
+done:
+ ASSERT(ire == NULL);
+ if (need_refrele)
+ ill_refrele(ill);
+
+ /* Build dependencies */
+ if (!ire_dep_build(ires, generations, i)) {
+ /* Something in chain was condemned; tear it apart */
+ ire = ire_blackhole(ipst, B_TRUE);
+ goto cleanup;
}
- IRB_REFRELE(firb);
- /* Release the first IRE_CACHE we initially looked up, if any. */
- if (first_cire)
- ire_refrele(first_cire);
-
- /* Found a resolvable route. */
- if (best_fire) {
- ASSERT(best_cire);
-
- if (*fire_arg)
- ire_refrele(*fire_arg);
- if (*ire_arg)
- ire_refrele(*ire_arg);
+ /*
+ * Release all refholds except the one for ires[0] that we
+ * will return to the caller.
+ */
+ for (j = 1; j < i; j++)
+ ire_refrele(ires[j]);
+ if (invalidate) {
/*
- * Update the passed arguments with the
- * resolvable multirt route we found
+ * Since we needed to allocate but couldn't we need to make
+ * sure that the dependency chain is rebuilt the next time.
*/
- *fire_arg = best_fire;
- *ire_arg = best_cire;
-
- ip2dbg(("ire_multirt_lookup_v6: returning B_TRUE, "
- "*fire_arg %p, *ire_arg %p\n",
- (void *)best_fire, (void *)best_cire));
-
- return (B_TRUE);
+ ire_dep_invalidate_generations(ires[0]);
+ generation = IRE_GENERATION_VERIFY;
+ } else {
+ /*
+ * IREs can have been added or deleted while we did the
+ * recursive lookup and we can't catch those until we've built
+ * the dependencies. We verify the stored
+ * ire_dep_parent_generation to catch any such changes and
+ * return IRE_GENERATION_VERIFY (which will cause
+ * ip_select_route to be called again so we can redo the
+ * recursive lookup next time we send a packet).
+ */
+ generation = ire_dep_validate_generations(ires[0]);
+ if (generations[0] != ires[0]->ire_generation) {
+ /* Something changed at the top */
+ generation = IRE_GENERATION_VERIFY;
+ }
}
+ if (generationp != NULL)
+ *generationp = generation;
- ASSERT(best_cire == NULL);
-
- ip2dbg(("ire_multirt_lookup_v6: returning B_FALSE, *fire_arg %p, "
- "*ire_arg %p\n",
- (void *)*fire_arg, (void *)*ire_arg));
-
- /* No resolvable route. */
- return (B_FALSE);
+ return (ires[0]);
}
-
-/*
- * Find an IRE_OFFSUBNET IRE entry for the multicast address 'v6dstp'
- * that goes through 'ipif'. As a fallback, a route that goes through
- * ipif->ipif_ill can be returned.
- */
+/*
+ * Full-argument entry point for the recursive IPv6 route lookup.
+ * Every argument is handed through unchanged to
+ * ire_route_recursive_impl_v6() and that function's result is returned.
+ * The leading NULL occupies the slot in which
+ * ire_route_recursive_dstonly_v6() passes an ire it has already looked up.
+ * NOTE(review): presumably NULL tells the impl to do the initial lookup
+ * itself - confirm against ire_route_recursive_impl_v6().
+ */
ire_t *
-ipif_lookup_multi_ire_v6(ipif_t *ipif, const in6_addr_t *v6dstp)
+ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type,
+ const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
+ boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
+ in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
- ire_t *ire;
- ire_t *save_ire = NULL;
- ire_t *gw_ire;
- irb_t *irb;
- in6_addr_t v6gw;
- int match_flags = MATCH_IRE_TYPE | MATCH_IRE_ILL;
- ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
-
- ire = ire_ftable_lookup_v6(v6dstp, 0, 0, 0, NULL, NULL, ALL_ZONES, 0,
- NULL, MATCH_IRE_DEFAULT, ipst);
-
- if (ire == NULL)
- return (NULL);
-
- irb = ire->ire_bucket;
- ASSERT(irb);
-
- IRB_REFHOLD(irb);
- ire_refrele(ire);
- for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (!IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, v6dstp) ||
- (ipif->ipif_zoneid != ire->ire_zoneid &&
- ire->ire_zoneid != ALL_ZONES)) {
- continue;
- }
-
- switch (ire->ire_type) {
- case IRE_DEFAULT:
- case IRE_PREFIX:
- case IRE_HOST:
- mutex_enter(&ire->ire_lock);
- v6gw = ire->ire_gateway_addr_v6;
- mutex_exit(&ire->ire_lock);
- gw_ire = ire_ftable_lookup_v6(&v6gw, 0, 0,
- IRE_INTERFACE, ipif, NULL, ALL_ZONES, 0,
- NULL, match_flags, ipst);
-
- if (gw_ire != NULL) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
- if (gw_ire->ire_ipif == ipif) {
- ire_refrele(gw_ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- ire_refrele(gw_ire);
- save_ire = ire;
- }
- break;
- case IRE_IF_NORESOLVER:
- case IRE_IF_RESOLVER:
- if (ire->ire_ipif == ipif) {
- if (save_ire != NULL) {
- ire_refrele(save_ire);
- }
- IRE_REFHOLD(ire);
-
- IRB_REFRELE(irb);
- return (ire);
- }
- break;
- }
- }
- IRB_REFRELE(irb);
-
- return (save_ire);
+ return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill,
+ zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
+ gwattrp, generationp));
}
/*
- * This is the implementation of the IPv6 IRE cache lookup procedure.
- * Separating the interface from the implementation allows additional
- * flexibility when specifying search criteria.
+ * Recursively look for a route to the destination.
+ * We only handle a destination match here, yet we have the same arguments
+ * as the full match to allow function pointers to select between the two.
+ * NOTE(review): the parameter list below is shorter than that of
+ * ire_route_recursive_v6(); confirm what "same arguments" refers to.
+ *
+ * Note that this function never returns NULL. It returns an IRE_NOROUTE
+ * instead.
+ *
+ * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
+ * is an error.
+ * Allow at most one RTF_INDIRECT.
+ *
+ * The ire obtained from ire_ftable_lookup_simple_v6() is returned as-is
+ * when it is ire_nce_capable, or when it has an ire_dep_parent whose
+ * ire_generation equals the stored ire_dep_parent_generation. Otherwise
+ * the result of ire_route_recursive_impl_v6(), called with
+ * MATCH_IRE_DSTONLY and ALL_ZONES, is returned instead.
+ * The local generation value filled in by those calls is not handed back
+ * to the caller.
*/
-static ire_t *
-ip6_ctable_lookup_impl(ire_ctable_args_t *margs)
+ire_t *
+ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate,
+ uint32_t xmit_hint, ip_stack_t *ipst)
{
- irb_t *irb_ptr;
- ire_t *ire;
- ip_stack_t *ipst = margs->ict_ipst;
+ ire_t *ire;
+ ire_t *ire1;
+ uint_t generation;
- if ((margs->ict_flags & (MATCH_IRE_SRC | MATCH_IRE_ILL)) &&
- (margs->ict_ipif == NULL)) {
- return (NULL);
- }
+ /* ire_ftable_lookup handles round-robin/ECMP */
+ ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst,
+ &generation);
+ ASSERT(ire != NULL);
- irb_ptr = &ipst->ips_ip_cache_table_v6[IRE_ADDR_HASH_V6(
- *((in6_addr_t *)(margs->ict_addr)),
- ipst->ips_ip6_cache_table_size)];
- rw_enter(&irb_ptr->irb_lock, RW_READER);
- for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
- if (ire->ire_marks & IRE_MARK_CONDEMNED)
- continue;
- ASSERT(IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, &ipv6_all_ones));
- if (ire_match_args_v6(ire, (in6_addr_t *)margs->ict_addr,
- &ire->ire_mask_v6, (in6_addr_t *)margs->ict_gateway,
- margs->ict_type, margs->ict_ipif, margs->ict_zoneid, 0,
- margs->ict_tsl, margs->ict_flags)) {
- IRE_REFHOLD(ire);
- rw_exit(&irb_ptr->irb_lock);
- return (ire);
- }
+ /*
+ * If this type should have an ire_nce_cache (even if it
+ * doesn't yet have one) then we are done. Includes
+ * IRE_INTERFACE with a full 128 bit mask.
+ */
+ if (ire->ire_nce_capable)
+ return (ire);
+
+ /*
+ * If the IRE has a current cached parent we know that the whole
+ * parent chain is current, hence we don't need to discover and
+ * build any dependencies by doing a recursive lookup.
+ */
+ mutex_enter(&ire->ire_lock);
+ if (ire->ire_dep_parent != NULL &&
+ ire->ire_dep_parent->ire_generation ==
+ ire->ire_dep_parent_generation) {
+ mutex_exit(&ire->ire_lock);
+ return (ire);
}
+ mutex_exit(&ire->ire_lock);
- rw_exit(&irb_ptr->irb_lock);
- return (NULL);
+ /*
+ * Fallback to loop in the normal code starting with the ire
+ * we found. Normally this would return the same ire.
+ */
+ ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES,
+ NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL,
+ &generation);
+ /*
+ * Release the initial ire on this path only; the early returns above
+ * hand it to the caller unreleased. ire1 is what the caller receives.
+ */
+ ire_refrele(ire);
+ return (ire1);
}