summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ip/ip_dce.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_dce.c')
-rw-r--r--usr/src/uts/common/inet/ip/ip_dce.c873
1 files changed, 873 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c
new file mode 100644
index 0000000000..839c5ae0d0
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_dce.c
@@ -0,0 +1,873 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/zone.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/atomic.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+
+#include <inet/common.h>
+#include <inet/mi.h>
+#include <inet/mib2.h>
+#include <inet/snmpcom.h>
+
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+
+#include <inet/ip.h>
+#include <inet/ip_impl.h>
+#include <inet/ip6.h>
+#include <inet/ip6_asp.h>
+#include <inet/ip_multi.h>
+#include <inet/ip_if.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_ftable.h>
+#include <inet/ip_rts.h>
+#include <inet/ip_ndp.h>
+#include <inet/ipclassifier.h>
+#include <inet/ip_listutils.h>
+
+#include <sys/sunddi.h>
+
+/*
+ * Routines for handling destination cache entries.
+ * There is always one DCEF_DEFAULT for each ip_stack_t created at init time.
+ * That entry holds both the IP ident value and the dce generation number.
+ *
+ * Any time a DCE is changed significantly (different path MTU, but NOT
+ * different ULP info!), the dce_generation number is increased.
+ * Also, when a new DCE is created, the dce_generation number in the default
+ * DCE is bumped. That allows the dce_t information to be cached efficiently
+ * as long as the entity caching the dce_t also caches the dce_generation,
+ * and compares the cached generation to detect any changes.
+ * Furthermore, when a DCE is deleted, if there are any outstanding references
+ * to the DCE it will be marked as condemned. The condemned mark is
+ * a designated generation number which is never otherwise used, hence
+ * the single comparison with the generation number captures that as well.
+ *
+ * An example of code which caches is as follows:
+ *
+ *	if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
+ *		The DCE has changed
+ *		mystruct->my_dce = dce_lookup_pkt(mp, ixa,
+ *		    &mystruct->my_dce_generation);
+ *		Not needed in practice, since we have the default DCE:
+ *		if (DCE_IS_CONDEMNED(mystruct->my_dce))
+ *			return failure;
+ *	}
+ *
+ * Note that for IPv6 link-local addresses we record the ifindex since the
+ * link-locals are not globally unique.
+ */
+
+/*
+ * Hash bucket structure for DCEs.
+ *
+ * Each ip_stack_t has one array of these for IPv4 and one for IPv6
+ * (ips_dce_hash_v4 / ips_dce_hash_v6); a bucket is the head of a
+ * linked list of dce_t chained through dce_next / dce_ptpn.
+ */
+typedef struct dcb_s {
+ /* Taken as reader to walk the list and as writer to link/unlink */
+ krwlock_t dcb_lock;
+ /* Entry counter for this bucket; updated with atomic_add_32() */
+ uint32_t dcb_cnt;
+ /* First DCE on the bucket's list, or NULL when the bucket is empty */
+ dce_t *dcb_dce;
+} dcb_t;
+
+/* Forward declarations for helpers defined later in this file */
+static void dce_delete_locked(dcb_t *, dce_t *);
+static void dce_make_condemned(dce_t *);
+
+/*
+ * kmem cache from which every dce_t (including the per-stack default DCE)
+ * is allocated; created in dce_g_init() and destroyed in dce_g_destroy().
+ */
+static kmem_cache_t *dce_cache;
+
+
+/*
+ * Operates on a uint64_t.
+ * XORs the value with itself shifted right by 16, 32 and 48 bits, so the
+ * low 16 bits of the result mix all four 16-bit quarters of p.
+ * Note that the argument is evaluated four times.
+ */
+#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))
+
+/*
+ * Thin out one hash bucket by deleting a fraction of its DCEs.
+ *
+ * For every entry we first expire stale path MTU information: if DCEF_PMTU
+ * is set and the entry has not changed for more than
+ * ips_ip_pathmtu_interval, the flag is cleared and the generation bumped.
+ * An entry is then picked for deletion when a hash of its address is a
+ * multiple of `fraction'; entries that still carry DCEF_PMTU use
+ * `fraction * 4' instead and are therefore deleted less often.
+ */
+static void
+dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
+{
+	uint_t	pmtu_fraction = fraction * 4;
+	dce_t	*cur, *next;
+
+	rw_enter(&dcb->dcb_lock, RW_WRITER);
+	for (cur = dcb->dcb_dce; cur != NULL; cur = next) {
+		boolean_t	expired = B_FALSE;
+		uint_t		divisor;
+		uint_t		hash;
+
+		/* Remember the successor; cur may get unlinked below */
+		next = cur->dce_next;
+
+		/* Drop DCEF_PMTU when the path MTU information is too old */
+		mutex_enter(&cur->dce_lock);
+		if ((cur->dce_flags & DCEF_PMTU) &&
+		    TICK_TO_SEC(lbolt64) - cur->dce_last_change_time >
+		    ipst->ips_ip_pathmtu_interval) {
+			cur->dce_flags &= ~DCEF_PMTU;
+			expired = B_TRUE;
+		}
+		mutex_exit(&cur->dce_lock);
+
+		/*
+		 * dce_increment_generation() takes dce_lock itself, so it
+		 * has to be called after the lock has been dropped.
+		 */
+		if (expired)
+			dce_increment_generation(cur);
+
+		hash = RANDOM_HASH((uint64_t)(uintptr_t)cur);
+		divisor = (cur->dce_flags & DCEF_PMTU) ?
+		    pmtu_fraction : fraction;
+		if (hash % divisor != 0)
+			continue;
+
+		IP_STAT(ipst, ip_dce_reclaim_deleted);
+		dce_delete_locked(dcb, cur);
+		/* Drop the reference that was held for the hash list */
+		dce_refrele(cur);
+	}
+	rw_exit(&dcb->dcb_lock);
+}
+
+/*
+ * Reclaim DCE memory for a single IP stack instance.
+ *
+ * Runs dcb_reclaim() over every IPv4 and IPv6 hash bucket using the
+ * stack's ips_ip_dce_reclaim_fraction, then walks the conns so that
+ * cached references get refreshed.
+ */
+static void
+ip_dce_reclaim_stack(ip_stack_t *ipst)
+{
+	int	bucket;
+
+	IP_STAT(ipst, ip_dce_reclaim_calls);
+
+	bucket = 0;
+	while (bucket < ipst->ips_dce_hashsize) {
+		dcb_reclaim(&ipst->ips_dce_hash_v4[bucket], ipst,
+		    ipst->ips_ip_dce_reclaim_fraction);
+
+		dcb_reclaim(&ipst->ips_dce_hash_v6[bucket], ipst,
+		    ipst->ips_ip_dce_reclaim_fraction);
+
+		bucket++;
+	}
+
+	/*
+	 * Walk all CONNs that can have a reference on an ire, nce or dce.
+	 * Get them to update any stale references to drop any refholds they
+	 * have.
+	 */
+	ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
+}
+
+/*
+ * Reclaim callback registered with dce_cache in dce_g_init().
+ * Called by the memory allocator subsystem directly, when the system
+ * is running low on memory. Iterates over every netstack and reclaims
+ * DCEs from each one's IP instance; the argument is unused.
+ */
+/* ARGSUSED */
+void
+ip_dce_reclaim(void *args)
+{
+	netstack_handle_t	handle;
+	netstack_t		*stack;
+
+	netstack_next_init(&handle);
+	for (stack = netstack_next(&handle); stack != NULL;
+	    stack = netstack_next(&handle)) {
+		ip_dce_reclaim_stack(stack->netstack_ip);
+		/* Release the netstack handed to us by netstack_next() */
+		netstack_rele(stack);
+	}
+	netstack_next_fini(&handle);
+}
+
+/*
+ * Global (not per-stack) initialization: create the kmem cache that all
+ * dce_t structures are allocated from. No constructor or destructor is
+ * registered; ip_dce_reclaim() is installed as the reclaim callback.
+ */
+void
+dce_g_init(void)
+{
+	dce_cache = kmem_cache_create(
+	    "dce_cache",		/* cache name */
+	    sizeof (dce_t),		/* object size */
+	    0,				/* default alignment */
+	    NULL,			/* no constructor */
+	    NULL,			/* no destructor */
+	    ip_dce_reclaim,		/* reclaim callback */
+	    NULL, NULL, 0);
+}
+
+/*
+ * Global teardown counterpart of dce_g_init(): destroy the dce_t cache.
+ */
+void
+dce_g_destroy(void)
+{
+	kmem_cache_destroy(dce_cache);
+}
+
+
+/*
+ * Per-stack initialization.
+ *
+ * Allocates the stack's default DCE (flagged DCEF_DEFAULT, holding one
+ * permanent reference) and the IPv4 and IPv6 hash tables used for the
+ * per-destination DCEs, and initializes every bucket lock.
+ * All allocations use KM_SLEEP.
+ */
+void
+dce_stack_init(ip_stack_t *ipst)
+{
+	dce_t	*def;
+	int	idx;
+
+	def = ipst->ips_dce_default = kmem_cache_alloc(dce_cache, KM_SLEEP);
+	bzero(def, sizeof (dce_t));
+	def->dce_flags = DCEF_DEFAULT;
+	def->dce_generation = DCE_GENERATION_INITIAL;
+	def->dce_last_change_time = TICK_TO_SEC(lbolt64);
+	def->dce_refcnt = 1;	/* Should never go away */
+	def->dce_ipst = ipst;
+
+	/* This must be a power of two since we are using IRE_ADDR_HASH macro */
+	ipst->ips_dce_hashsize = 256;
+
+	ipst->ips_dce_hash_v4 = kmem_zalloc(
+	    ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP);
+	ipst->ips_dce_hash_v6 = kmem_zalloc(
+	    ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP);
+
+	idx = 0;
+	while (idx < ipst->ips_dce_hashsize) {
+		rw_init(&ipst->ips_dce_hash_v4[idx].dcb_lock, NULL,
+		    RW_DEFAULT, NULL);
+		rw_init(&ipst->ips_dce_hash_v6[idx].dcb_lock, NULL,
+		    RW_DEFAULT, NULL);
+		idx++;
+	}
+}
+
+/*
+ * Per-stack teardown, undoing dce_stack_init().
+ *
+ * Destroys the bucket locks, frees both hash tables and finally frees the
+ * default DCE, which is expected to be down to its single permanent
+ * reference. The DCE lists hanging off the buckets are not walked here.
+ */
+void
+dce_stack_destroy(ip_stack_t *ipst)
+{
+	int	idx;
+
+	for (idx = 0; idx < ipst->ips_dce_hashsize; idx++) {
+		rw_destroy(&ipst->ips_dce_hash_v4[idx].dcb_lock);
+		rw_destroy(&ipst->ips_dce_hash_v6[idx].dcb_lock);
+	}
+
+	/* IPv4 table */
+	kmem_free(ipst->ips_dce_hash_v4,
+	    ipst->ips_dce_hashsize * sizeof (dcb_t));
+	ipst->ips_dce_hash_v4 = NULL;
+
+	/* IPv6 table */
+	kmem_free(ipst->ips_dce_hash_v6,
+	    ipst->ips_dce_hashsize * sizeof (dcb_t));
+	ipst->ips_dce_hash_v6 = NULL;
+
+	ipst->ips_dce_hashsize = 0;
+
+	/* Only the reference taken in dce_stack_init() may remain */
+	ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
+	kmem_cache_free(dce_cache, ipst->ips_dce_default);
+	ipst->ips_dce_default = NULL;
+}
+
+/*
+ * When any DCE is good enough: return the stack's default DCE with an
+ * extra reference held. The caller must dce_refrele() it.
+ */
+dce_t *
+dce_get_default(ip_stack_t *ipst)
+{
+	dce_t	*def = ipst->ips_dce_default;
+
+	dce_refhold(def);
+	return (def);
+}
+
+/*
+ * Generic for IPv4 and IPv6.
+ *
+ * Looks up the DCE for the final destination of the packet in mp, i.e.
+ * taking a source route option (IPv4) or routing header (IPv6) into
+ * account. For IPv6 link-scope destinations the ifindex of the ill
+ * behind ixa_nce is used as part of the key when an nce is attached.
+ *
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument.
+ */
+dce_t *
+dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
+{
+	in6_addr_t	dst6;
+	uint_t		ifindex = 0;
+
+	if (ixa->ixa_flags & IXAF_IS_IPV4) {
+		ipaddr_t	dst4;
+
+		dst4 = ip_get_dst((ipha_t *)mp->b_rptr);
+		return (dce_lookup_v4(dst4, ixa->ixa_ipst, generationp));
+	}
+
+	dst6 = ip_get_dst_v6((ip6_t *)mp->b_rptr, mp, NULL);
+	if (IN6_IS_ADDR_LINKSCOPE(&dst6) && ixa->ixa_nce != NULL) {
+		/* Link-locals are only unique per interface */
+		ifindex = ixa->ixa_nce->nce_common->ncec_ill->
+		    ill_phyint->phyint_ifindex;
+	}
+	return (dce_lookup_v6(&dst6, ifindex, ixa->ixa_ipst, generationp));
+}
+
+/*
+ * Find the DCE for an IPv4 destination.
+ *
+ * Returns the matching, non-condemned DCE if the hash table has one and
+ * the stack's default DCE otherwise; in both cases a reference is held
+ * for the caller. Never returns NULL.
+ *
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument (when non-NULL).
+ */
+dce_t *
+dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
+{
+	dcb_t	*bucket;
+	dce_t	*cur;
+
+	/* Set *generationp before dropping the lock(s) that allow additions */
+	if (generationp != NULL)
+		*generationp = ipst->ips_dce_default->dce_generation;
+
+	bucket = &ipst->ips_dce_hash_v4[
+	    IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize)];
+
+	rw_enter(&bucket->dcb_lock, RW_READER);
+	for (cur = bucket->dcb_dce; cur != NULL; cur = cur->dce_next) {
+		if (cur->dce_v4addr != dst)
+			continue;
+
+		mutex_enter(&cur->dce_lock);
+		if (DCE_IS_CONDEMNED(cur)) {
+			mutex_exit(&cur->dce_lock);
+			continue;
+		}
+
+		/* Found it; report this entry's own generation */
+		dce_refhold(cur);
+		if (generationp != NULL)
+			*generationp = cur->dce_generation;
+		mutex_exit(&cur->dce_lock);
+		rw_exit(&bucket->dcb_lock);
+		return (cur);
+	}
+	rw_exit(&bucket->dcb_lock);
+
+	/* Not found */
+	cur = ipst->ips_dce_default;
+	dce_refhold(cur);
+	return (cur);
+}
+
+/*
+ * Find the DCE for an IPv6 destination.
+ *
+ * An entry matches when both its address and its ifindex equal the
+ * arguments. Returns the matching, non-condemned DCE if there is one and
+ * the stack's default DCE otherwise; in both cases a reference is held
+ * for the caller. Never returns NULL.
+ *
+ * Used by callers that need to cache e.g., the datapath
+ * Returns the generation number in the last argument (when non-NULL).
+ * ifindex should only be set for link-locals
+ */
+dce_t *
+dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
+    uint_t *generationp)
+{
+	dcb_t	*bucket;
+	dce_t	*cur;
+
+	/* Set *generationp before dropping the lock(s) that allow additions */
+	if (generationp != NULL)
+		*generationp = ipst->ips_dce_default->dce_generation;
+
+	bucket = &ipst->ips_dce_hash_v6[
+	    IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize)];
+
+	rw_enter(&bucket->dcb_lock, RW_READER);
+	for (cur = bucket->dcb_dce; cur != NULL; cur = cur->dce_next) {
+		if (!IN6_ARE_ADDR_EQUAL(&cur->dce_v6addr, dst) ||
+		    cur->dce_ifindex != ifindex)
+			continue;
+
+		mutex_enter(&cur->dce_lock);
+		if (DCE_IS_CONDEMNED(cur)) {
+			mutex_exit(&cur->dce_lock);
+			continue;
+		}
+
+		/* Found it; report this entry's own generation */
+		dce_refhold(cur);
+		if (generationp != NULL)
+			*generationp = cur->dce_generation;
+		mutex_exit(&cur->dce_lock);
+		rw_exit(&bucket->dcb_lock);
+		return (cur);
+	}
+	rw_exit(&bucket->dcb_lock);
+
+	/* Not found */
+	cur = ipst->ips_dce_default;
+	dce_refhold(cur);
+	return (cur);
+}
+
+/*
+ * Atomically looks for a non-default DCE, and if not found tries to create one.
+ * If there is no memory it returns NULL.
+ * When an entry is created we increase the generation number on
+ * the default DCE so that conn_ip_output will detect there is a new DCE.
+ *
+ * The bucket lock is held as writer across the lookup and the insertion,
+ * so two threads cannot both add an entry for the same destination.
+ * The returned DCE carries a reference for the caller, who must
+ * dce_refrele() it.
+ */
+dce_t *
+dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
+{
+	uint_t		hash;
+	dcb_t		*dcb;
+	dce_t		*dce;
+
+	hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
+	dcb = &ipst->ips_dce_hash_v4[hash];
+	rw_enter(&dcb->dcb_lock, RW_WRITER);
+	for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+		if (dce->dce_v4addr == dst) {
+			mutex_enter(&dce->dce_lock);
+			if (!DCE_IS_CONDEMNED(dce)) {
+				dce_refhold(dce);
+				mutex_exit(&dce->dce_lock);
+				rw_exit(&dcb->dcb_lock);
+				return (dce);
+			}
+			mutex_exit(&dce->dce_lock);
+		}
+	}
+
+	/* KM_NOSLEEP: we hold the bucket lock, so do not block for memory */
+	dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
+	if (dce == NULL) {
+		rw_exit(&dcb->dcb_lock);
+		return (NULL);
+	}
+	bzero(dce, sizeof (dce_t));
+	dce->dce_ipst = ipst;	/* No netstack_hold */
+	dce->dce_v4addr = dst;
+	dce->dce_generation = DCE_GENERATION_INITIAL;
+	dce->dce_ipversion = IPV4_VERSION;
+	dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+	dce_refhold(dce);	/* For the hash list */
+
+	/* Link into list */
+	if (dcb->dcb_dce != NULL)
+		dcb->dcb_dce->dce_ptpn = &dce->dce_next;
+	dce->dce_next = dcb->dcb_dce;
+	dce->dce_ptpn = &dcb->dcb_dce;
+	dcb->dcb_dce = dce;
+	dce->dce_bucket = dcb;
+	/*
+	 * Account for the new entry. dce_delete_locked() decrements
+	 * dcb_cnt for every entry it unlinks, so each insertion has to
+	 * increment it (as dce_lookup_and_add_v6() does); otherwise the
+	 * unsigned counter would wrap on the first delete.
+	 */
+	atomic_add_32(&dcb->dcb_cnt, 1);
+	dce_refhold(dce);	/* For the caller */
+	rw_exit(&dcb->dcb_lock);
+
+	/* Initialize dce_ident to be different than for the last packet */
+	dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
+
+	dce_increment_generation(ipst->ips_dce_default);
+	return (dce);
+}
+
+/*
+ * IPv6 flavour of the atomic lookup-or-create operation.
+ *
+ * Returns the existing non-condemned DCE matching (dst, ifindex) or, if
+ * there is none, a freshly allocated one that has been linked at the head
+ * of its hash bucket. Either way a reference is held for the caller.
+ * If there is no memory it returns NULL.
+ * When an entry is created we increase the generation number on
+ * the default DCE so that conn_ip_output will detect there is a new DCE.
+ * ifindex should only be used with link-local addresses.
+ */
+dce_t *
+dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
+{
+	dcb_t	*bucket;
+	dce_t	*entry;
+
+	/* We should not create entries for link-locals w/o an ifindex */
+	ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);
+
+	bucket = &ipst->ips_dce_hash_v6[
+	    IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize)];
+
+	/* Writer lock covers both the search and the insertion */
+	rw_enter(&bucket->dcb_lock, RW_WRITER);
+	for (entry = bucket->dcb_dce; entry != NULL; entry = entry->dce_next) {
+		if (!IN6_ARE_ADDR_EQUAL(&entry->dce_v6addr, dst) ||
+		    entry->dce_ifindex != ifindex)
+			continue;
+
+		mutex_enter(&entry->dce_lock);
+		if (DCE_IS_CONDEMNED(entry)) {
+			mutex_exit(&entry->dce_lock);
+			continue;
+		}
+		dce_refhold(entry);
+		mutex_exit(&entry->dce_lock);
+		rw_exit(&bucket->dcb_lock);
+		return (entry);
+	}
+
+	/* No usable entry; build a new one */
+	entry = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
+	if (entry == NULL) {
+		rw_exit(&bucket->dcb_lock);
+		return (NULL);
+	}
+	bzero(entry, sizeof (dce_t));
+	entry->dce_ipst = ipst;	/* No netstack_hold */
+	entry->dce_v6addr = *dst;
+	entry->dce_ifindex = ifindex;
+	entry->dce_generation = DCE_GENERATION_INITIAL;
+	entry->dce_ipversion = IPV6_VERSION;
+	entry->dce_last_change_time = TICK_TO_SEC(lbolt64);
+	dce_refhold(entry);	/* For the hash list */
+
+	/* Insert at the head of the bucket's list */
+	if (bucket->dcb_dce != NULL)
+		bucket->dcb_dce->dce_ptpn = &entry->dce_next;
+	entry->dce_next = bucket->dcb_dce;
+	entry->dce_ptpn = &bucket->dcb_dce;
+	bucket->dcb_dce = entry;
+	entry->dce_bucket = bucket;
+	atomic_add_32(&bucket->dcb_cnt, 1);
+	dce_refhold(entry);	/* For the caller */
+	rw_exit(&bucket->dcb_lock);
+
+	/* Initialize dce_ident to be different than for the last packet */
+	entry->dce_ident = ipst->ips_dce_default->dce_ident + 1;
+	dce_increment_generation(ipst->ips_dce_default);
+	return (entry);
+}
+
+/*
+ * Merge the upper-layer information in uinfo into an existing DCE.
+ * (The DCE itself is looked up or created by the dce_update_uinfo*()
+ * callers.)
+ *
+ * Zero fields in uinfo mean "no new value" and are skipped. Non-zero
+ * values are averaged with what is already cached, or used to seed the
+ * cache when nothing is cached yet.
+ *
+ * Note that we do not bump the generation number here.
+ * New connections will find the new uinfo.
+ *
+ * The only use of this (tcp, sctp using iulp_t) is to set rtt+rtt_sd.
+ */
+static void
+dce_setuinfo(dce_t *dce, iulp_t *uinfo)
+{
+	iulp_t	*cached = &dce->dce_uinfo;
+
+	/* Multiple advises are serialized using dce_lock */
+	mutex_enter(&dce->dce_lock);
+
+	/* Round trip time and its deviation; guard against setting to zero */
+	if (uinfo->iulp_rtt != 0) {
+		/*
+		 * With an old value cached take the mean of old and new.
+		 * Without one, start out conservatively at 1.5 * new value.
+		 */
+		if (cached->iulp_rtt == 0) {
+			cached->iulp_rtt = uinfo->iulp_rtt +
+			    (uinfo->iulp_rtt >> 1);
+		} else {
+			cached->iulp_rtt = (cached->iulp_rtt +
+			    uinfo->iulp_rtt) >> 1;
+		}
+
+		if (cached->iulp_rtt_sd == 0) {
+			cached->iulp_rtt_sd = uinfo->iulp_rtt_sd +
+			    (uinfo->iulp_rtt_sd >> 1);
+		} else {
+			cached->iulp_rtt_sd = (cached->iulp_rtt_sd +
+			    uinfo->iulp_rtt_sd) >> 1;
+		}
+	}
+
+	/* Path MTU: only ever lowered while DCEF_PMTU is already set */
+	if (uinfo->iulp_mtu != 0) {
+		if (!(dce->dce_flags & DCEF_PMTU)) {
+			dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
+			dce->dce_flags |= DCEF_PMTU;
+		} else {
+			dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
+		}
+		dce->dce_last_change_time = TICK_TO_SEC(lbolt64);
+	}
+
+	/* Slow start threshold: mean of old and new, or just the new one */
+	if (uinfo->iulp_ssthresh != 0) {
+		if (cached->iulp_ssthresh == 0) {
+			cached->iulp_ssthresh = uinfo->iulp_ssthresh;
+		} else {
+			cached->iulp_ssthresh = (uinfo->iulp_ssthresh +
+			    cached->iulp_ssthresh) >> 1;
+		}
+	}
+
+	/* We have uinfo for sure */
+	dce->dce_flags |= DCEF_UINFO;
+	mutex_exit(&dce->dce_lock);
+}
+
+
+/*
+ * Set/update uinfo for an IPv4 destination, creating the per-destination
+ * DCE if none exists. Returns 0 on success and ENOMEM when the DCE could
+ * not be allocated.
+ */
+int
+dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
+{
+	dce_t	*entry = dce_lookup_and_add_v4(dst, ipst);
+
+	if (entry != NULL) {
+		dce_setuinfo(entry, uinfo);
+		/* Done with the reference the lookup gave us */
+		dce_refrele(entry);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Set/update uinfo for an IPv6 destination (keyed by address and
+ * ifindex), creating the per-destination DCE if none exists. Returns 0
+ * on success and ENOMEM when the DCE could not be allocated.
+ */
+int
+dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
+    ip_stack_t *ipst)
+{
+	dce_t	*entry = dce_lookup_and_add_v6(dst, ifindex, ipst);
+
+	if (entry != NULL) {
+		dce_setuinfo(entry, uinfo);
+		/* Done with the reference the lookup gave us */
+		dce_refrele(entry);
+		return (0);
+	}
+	return (ENOMEM);
+}
+
+/*
+ * Common routine for IPv4 and IPv6.
+ * A destination for which IN6_IS_ADDR_V4MAPPED_ANY() is true is converted
+ * to an ipaddr_t and handled by the IPv4 routine (ifindex is not used in
+ * that case); everything else goes to the IPv6 routine.
+ */
+int
+dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
+    ip_stack_t *ipst)
+{
+	ipaddr_t	v4dst;
+
+	if (!IN6_IS_ADDR_V4MAPPED_ANY(dst))
+		return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));
+
+	IN6_V4MAPPED_TO_IPADDR(dst, v4dst);
+	return (dce_update_uinfo_v4(v4dst, uinfo, ipst));
+}
+
+/*
+ * Mark a DCE as condemned by storing the reserved generation value
+ * DCE_GENERATION_CONDEMNED in it, and count it in the stack's
+ * ips_num_dce_condemned. The DCE must not already be condemned.
+ */
+static void
+dce_make_condemned(dce_t *dce)
+{
+	ip_stack_t	*stack = dce->dce_ipst;
+
+	mutex_enter(&dce->dce_lock);
+	ASSERT(!DCE_IS_CONDEMNED(dce));
+	dce->dce_generation = DCE_GENERATION_CONDEMNED;
+	mutex_exit(&dce->dce_lock);
+
+	/* Count how many condemned dces for kmem_cache callback */
+	atomic_add_32(&stack->ips_num_dce_condemned, 1);
+}
+
+/*
+ * Advance the generation number of a DCE under dce_lock.
+ * A condemned DCE is left untouched, and the increment skips over the
+ * special condemned value by wrapping to DCE_GENERATION_INITIAL.
+ */
+void
+dce_increment_generation(dce_t *dce)
+{
+	uint_t	next;
+
+	mutex_enter(&dce->dce_lock);
+	if (DCE_IS_CONDEMNED(dce)) {
+		/* Once condemned, always condemned */
+		mutex_exit(&dce->dce_lock);
+		return;
+	}
+
+	next = dce->dce_generation + 1;
+	if (next == DCE_GENERATION_CONDEMNED)
+		next = DCE_GENERATION_INITIAL;
+	ASSERT(next != DCE_GENERATION_VERIFY);
+	dce->dce_generation = next;
+	mutex_exit(&dce->dce_lock);
+}
+
+/*
+ * Increment the generation number on every non-condemned DCE in the
+ * IPv6 (isv6 set) or IPv4 hash table, and on the default DCE.
+ * Note that entries are bumped whether or not they have DCEF_PMTU set.
+ * Used when ill_mtu changes.
+ */
+void
+dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
+{
+	int	idx;
+
+	for (idx = 0; idx < ipst->ips_dce_hashsize; idx++) {
+		dcb_t	*bucket;
+		dce_t	*cur;
+
+		bucket = isv6 ? &ipst->ips_dce_hash_v6[idx] :
+		    &ipst->ips_dce_hash_v4[idx];
+
+		rw_enter(&bucket->dcb_lock, RW_WRITER);
+		for (cur = bucket->dcb_dce; cur != NULL; cur = cur->dce_next) {
+			if (!DCE_IS_CONDEMNED(cur))
+				dce_increment_generation(cur);
+		}
+		rw_exit(&bucket->dcb_lock);
+	}
+	dce_increment_generation(ipst->ips_dce_default);
+}
+
+/*
+ * Unlink a DCE from its hash bucket and condemn it.
+ * The bucket's entry count is decremented, but the reference held on
+ * behalf of the hash list is NOT dropped here: the caller needs to do a
+ * dce_refrele afterwards.
+ */
+static void
+dce_delete_locked(dcb_t *dcb, dce_t *dce)
+{
+	dce_t	*succ = dce->dce_next;
+	dce_t	**backp = dce->dce_ptpn;
+
+	dce->dce_bucket = NULL;
+
+	/* Splice the entry out of the doubly linked list */
+	*backp = succ;
+	if (succ != NULL)
+		succ->dce_ptpn = backp;
+	dce->dce_ptpn = NULL;
+	dce->dce_next = NULL;
+
+	atomic_add_32(&dcb->dcb_cnt, -1);
+	dce_make_condemned(dce);
+}
+
+/*
+ * Free a DCE whose last reference has been dropped.
+ * The DCE must already be off its hash list and must not be the default
+ * DCE. If it was condemned, the stack's condemned count is decremented.
+ */
+static void
+dce_inactive(dce_t *dce)
+{
+	ip_stack_t	*stack = dce->dce_ipst;
+
+	ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
+	ASSERT(dce->dce_ptpn == NULL);
+	ASSERT(dce->dce_bucket == NULL);
+
+	/* Count how many condemned dces for kmem_cache callback */
+	if (DCE_IS_CONDEMNED(dce)) {
+		atomic_add_32(&stack->ips_num_dce_condemned, -1);
+	}
+
+	kmem_cache_free(dce_cache, dce);
+}
+
+/*
+ * Drop one reference on a DCE; the DCE is freed through dce_inactive()
+ * when this was the last reference.
+ */
+void
+dce_refrele(dce_t *dce)
+{
+	ASSERT(dce->dce_refcnt != 0);
+	if (atomic_add_32_nv(&dce->dce_refcnt, -1) != 0)
+		return;
+
+	/* Last reference is gone */
+	dce_inactive(dce);
+}
+
+/*
+ * Take an additional reference on a DCE by atomically incrementing
+ * dce_refcnt. The ASSERT catches the counter having wrapped to zero.
+ */
+void
+dce_refhold(dce_t *dce)
+{
+ atomic_add_32(&dce->dce_refcnt, 1);
+ ASSERT(dce->dce_refcnt != 0);
+}
+
+/*
+ * "No trace" variant of dce_refrele().
+ * No tracing support yet hence the same as the above functions
+ */
+void
+dce_refrele_notr(dce_t *dce)
+{
+	ASSERT(dce->dce_refcnt != 0);
+	if (atomic_add_32_nv(&dce->dce_refcnt, -1) != 0)
+		return;
+
+	/* Last reference is gone */
+	dce_inactive(dce);
+}
+
+/*
+ * "No trace" variant of dce_refhold(); currently performs exactly the
+ * same atomic increment of dce_refcnt.
+ */
+void
+dce_refhold_notr(dce_t *dce)
+{
+ atomic_add_32(&dce->dce_refcnt, 1);
+ ASSERT(dce->dce_refcnt != 0);
+}
+
+/*
+ * Report both the IPv4 and IPv6 DCEs.
+ *
+ * Two replies are sent up q: mpctl itself carries the IPv4 entries
+ * (level MIB2_IP, name EXPER_IP_DCE) and a copy of it carries the IPv6
+ * entries (level MIB2_IP6, name EXPER_IP_DCE). One dest_cache_entry_t is
+ * appended per DCE on the hash lists; allocation failures while appending
+ * are only logged.
+ *
+ * Returns a further copy of the control message, or NULL when a
+ * copymsg() failed. If the first copy fails only the IPv4 reply is sent.
+ */
+mblk_t *
+ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
+{
+	struct opthdr		*optp;
+	mblk_t			*mp2ctl;
+	dest_cache_entry_t	dest_cache;
+	mblk_t			*mp_tail = NULL;
+	dce_t			*dce;
+	dcb_t			*dcb;
+	int			i;
+	uint64_t		current_time;
+
+	current_time = TICK_TO_SEC(lbolt64);
+
+	/*
+	 * make a copy of the original message
+	 */
+	mp2ctl = copymsg(mpctl);
+
+	/*
+	 * dest_cache lives on the stack and is copied wholesale into the
+	 * reply. The IPv4 pass never sets DestIpv6Address, so zero the
+	 * structure first; otherwise uninitialized kernel stack contents
+	 * (including any padding) would be handed to the consumer.
+	 */
+	bzero(&dest_cache, sizeof (dest_cache));
+
+	/* First we do IPv4 entries */
+	optp = (struct opthdr *)&mpctl->b_rptr[
+	    sizeof (struct T_optmgmt_ack)];
+	optp->level = MIB2_IP;
+	optp->name = EXPER_IP_DCE;
+
+	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+		dcb = &ipst->ips_dce_hash_v4[i];
+		rw_enter(&dcb->dcb_lock, RW_READER);
+		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+			dest_cache.DestIpv4Address = dce->dce_v4addr;
+			dest_cache.DestFlags = dce->dce_flags;
+			if (dce->dce_flags & DCEF_PMTU)
+				dest_cache.DestPmtu = dce->dce_pmtu;
+			else
+				dest_cache.DestPmtu = 0;
+			dest_cache.DestIdent = dce->dce_ident;
+			dest_cache.DestIfindex = 0;
+			dest_cache.DestAge = current_time -
+			    dce->dce_last_change_time;
+			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
+			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
+				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
+				    "failed to allocate %u bytes\n",
+				    (uint_t)sizeof (dest_cache)));
+			}
+		}
+		rw_exit(&dcb->dcb_lock);
+	}
+	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
+	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
+	    (int)optp->level, (int)optp->name, (int)optp->len));
+	qreply(q, mpctl);
+
+	if (mp2ctl == NULL) {
+		/* Copymsg failed above */
+		return (NULL);
+	}
+
+	/* Now for IPv6 */
+	mpctl = mp2ctl;
+	mp_tail = NULL;
+	mp2ctl = copymsg(mpctl);
+	optp = (struct opthdr *)&mpctl->b_rptr[
+	    sizeof (struct T_optmgmt_ack)];
+	optp->level = MIB2_IP6;
+	optp->name = EXPER_IP_DCE;
+
+	/*
+	 * Start from a clean record again so that the IPv6 entries do not
+	 * carry the DestIpv4Address of the last IPv4 entry reported above.
+	 */
+	bzero(&dest_cache, sizeof (dest_cache));
+
+	for (i = 0; i < ipst->ips_dce_hashsize; i++) {
+		dcb = &ipst->ips_dce_hash_v6[i];
+		rw_enter(&dcb->dcb_lock, RW_READER);
+		for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
+			dest_cache.DestIpv6Address = dce->dce_v6addr;
+			dest_cache.DestFlags = dce->dce_flags;
+			if (dce->dce_flags & DCEF_PMTU)
+				dest_cache.DestPmtu = dce->dce_pmtu;
+			else
+				dest_cache.DestPmtu = 0;
+			dest_cache.DestIdent = dce->dce_ident;
+			/* The ifindex is only meaningful for link-locals */
+			if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
+				dest_cache.DestIfindex = dce->dce_ifindex;
+			else
+				dest_cache.DestIfindex = 0;
+			dest_cache.DestAge = current_time -
+			    dce->dce_last_change_time;
+			if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
+			    (char *)&dest_cache, (int)sizeof (dest_cache))) {
+				ip1dbg(("ip_snmp_get_mib2_ip_dce: "
+				    "failed to allocate %u bytes\n",
+				    (uint_t)sizeof (dest_cache)));
+			}
+		}
+		rw_exit(&dcb->dcb_lock);
+	}
+	optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
+	ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
+	    (int)optp->level, (int)optp->name, (int)optp->len));
+	qreply(q, mpctl);
+
+	return (mp2ctl);
+}
+
+/*
+ * Remove IPv6 DCEs which refer to an ifindex that is going away.
+ * This is not required for correctness, but it avoids netstat -d
+ * showing stale stuff that will never be used.
+ *
+ * Every IPv6 bucket is scanned under its writer lock; matching entries
+ * are unlinked, condemned and have the hash list's reference dropped.
+ */
+void
+dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
+{
+	uint_t	idx;
+
+	for (idx = 0; idx < ipst->ips_dce_hashsize; idx++) {
+		dcb_t	*bucket = &ipst->ips_dce_hash_v6[idx];
+		dce_t	*cur, *next;
+
+		rw_enter(&bucket->dcb_lock, RW_WRITER);
+		for (cur = bucket->dcb_dce; cur != NULL; cur = next) {
+			/* Fetch the successor before cur can be unlinked */
+			next = cur->dce_next;
+			if (cur->dce_ifindex != ifindex)
+				continue;
+
+			dce_delete_locked(bucket, cur);
+			dce_refrele(cur);
+		}
+		rw_exit(&bucket->dcb_lock);
+	}
+}