summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/ip/ip_arp.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_arp.c')
-rw-r--r--usr/src/uts/common/inet/ip/ip_arp.c2468
1 files changed, 2468 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_arp.c b/usr/src/uts/common/inet/ip/ip_arp.c
new file mode 100644
index 0000000000..489d59dbf6
--- /dev/null
+++ b/usr/src/uts/common/inet/ip/ip_arp.c
@@ -0,0 +1,2468 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <inet/ip_arp.h>
+#include <inet/ip_ndp.h>
+#include <net/if_arp.h>
+#include <netinet/if_ether.h>
+#include <sys/strsubr.h>
+#include <inet/ip6.h>
+#include <inet/ip.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_if.h>
+#include <sys/dlpi.h>
+#include <sys/sunddi.h>
+#include <sys/strsun.h>
+#include <sys/sdt.h>
+#include <inet/mi.h>
+#include <inet/arp.h>
+#include <inet/ipdrop.h>
+#include <sys/sockio.h>
+#include <inet/ip_impl.h>
+#include <sys/policy.h>
+
+/*
+ * Byte offset of the link-layer address within an outgoing
+ * dl_unitdata_req_t.  Per DLPI, a negative arl_sap_length means the
+ * address precedes the sap (so the address starts right after the
+ * fixed header); otherwise the sap comes first and the address is
+ * shifted by |arl_sap_length| bytes.
+ */
+#define ARL_LL_ADDR_OFFSET(arl) (((arl)->arl_sap_length) < 0 ? \
+ (sizeof (dl_unitdata_req_t)) : \
+ ((sizeof (dl_unitdata_req_t)) + (ABS((arl)->arl_sap_length))))
+
+/*
+ * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK
+ * doesn't quite do it for us.
+ */
+typedef struct arp_m_s {
+ t_uscalar_t arp_mac_type; /* DLPI MAC type (DL_*) */
+ uint32_t arp_mac_arp_hw_type; /* ARP hardware type (ARPHRD_*) */
+ t_scalar_t arp_mac_sap_length; /* default sap length for this MAC */
+ uint32_t arp_mac_hw_addr_length; /* hardware address length, bytes */
+} arp_m_t;
+
+static int arp_close(queue_t *, int);
+static void arp_rput(queue_t *, mblk_t *);
+static void arp_wput(queue_t *, mblk_t *);
+static arp_m_t *arp_m_lookup(t_uscalar_t mac_type);
+static void arp_notify(ipaddr_t, mblk_t *, uint32_t, ip_recv_attr_t *,
+ ncec_t *);
+static int arp_output(ill_t *, uint32_t, const uchar_t *, const uchar_t *,
+ const uchar_t *, const uchar_t *, uchar_t *);
+static int arp_modclose(arl_t *);
+static void arp_mod_close_tail(arl_t *);
+static mblk_t *arl_unbind(arl_t *);
+static void arp_process_packet(ill_t *, mblk_t *);
+static void arp_excl(ipsq_t *, queue_t *, mblk_t *, void *);
+static void arp_drop_packet(const char *str, mblk_t *, ill_t *);
+static int arp_open(queue_t *, dev_t *, int, int, cred_t *);
+static int ip_sioctl_ifunitsel_arp(queue_t *, int *);
+static int ip_sioctl_slifname_arp(queue_t *, void *);
+static void arp_dlpi_send(arl_t *, mblk_t *);
+static void arl_defaults_common(arl_t *, mblk_t *);
+static int arp_modopen(queue_t *, dev_t *, int, int, cred_t *);
+static void arp_ifname_notify(arl_t *);
+static void arp_rput_dlpi_writer(ipsq_t *, queue_t *, mblk_t *, void *);
+static arl_t *ill_to_arl(ill_t *);
+
+#define DL_PRIM(mp) (((union DL_primitives *)(mp)->b_rptr)->dl_primitive)
+#define IS_DLPI_DATA(mp) \
+ ((DB_TYPE(mp) == M_PROTO) && \
+ MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && \
+ (DL_PRIM(mp) == DL_UNITDATA_IND))
+
+#define AR_NOTFOUND 1 /* No matching ace found in cache */
+#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */
+#define AR_LOOPBACK 3 /* Our own arp packet was received */
+#define AR_BOGON 4 /* Another host has our IP addr. */
+#define AR_FAILED 5 /* Duplicate Address Detection has failed */
+#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */
+
+boolean_t arp_no_defense;
+
+/* STREAMS module information for the arp module ("arpip"). */
+struct module_info arp_mod_info = {
+ IP_MOD_ID, "arpip", 1, INFPSZ, 65536, 1024
+};
+/* Read side: arp_rput receives DLPI messages and ARP packets from below. */
+static struct qinit rinit_arp = {
+ (pfi_t)arp_rput, NULL, arp_open, arp_close, NULL, &arp_mod_info
+};
+/* Write side: arp_wput handles ioctls from above. */
+static struct qinit winit_arp = {
+ (pfi_t)arp_wput, NULL, arp_open, arp_close, NULL,
+ &arp_mod_info
+};
+/* streamtab tying the read/write qinits together for the module. */
+struct streamtab arpinfo = {
+ &rinit_arp, &winit_arp
+};
+#define ARH_FIXED_LEN 8
+#define AR_LL_HDR_SLACK 32
+
+/*
+ * pfhooks for ARP.
+ */
+/*
+ * Run the registered packet-filter hooks for an inbound ARP packet.
+ * If a hook consumes the packet (non-zero hook_run() return), the
+ * message _fm is freed and _fm/_hdr/_m are all reset to NULL so the
+ * caller knows to stop processing.  Otherwise _hdr and _m are
+ * refreshed from the hook info, since a hook may substitute new
+ * buffers.
+ */
+#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, ipst) \
+ \
+ if ((_hook).he_interested) { \
+ hook_pkt_event_t info; \
+ \
+ info.hpe_protocol = ipst->ips_arp_net_data; \
+ info.hpe_ifp = _ilp; \
+ info.hpe_ofp = 0; \
+ info.hpe_hdr = _hdr; \
+ info.hpe_mp = &(_fm); \
+ info.hpe_mb = _m; \
+ if (hook_run(ipst->ips_arp_net_data->netd_hooks, \
+ _event, (hook_data_t)&info) != 0) { \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
+ _hdr = NULL; \
+ _m = NULL; \
+ } else { \
+ _hdr = info.hpe_hdr; \
+ _m = info.hpe_mb; \
+ } \
+ }
+
+/*
+ * Outbound counterpart of ARP_HOOK_IN: identical logic, except _olp
+ * names the output interface (hpe_ofp) instead of the input one.
+ */
+#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, ipst) \
+ \
+ if ((_hook).he_interested) { \
+ hook_pkt_event_t info; \
+ \
+ info.hpe_protocol = ipst->ips_arp_net_data; \
+ info.hpe_ifp = 0; \
+ info.hpe_ofp = _olp; \
+ info.hpe_hdr = _hdr; \
+ info.hpe_mp = &(_fm); \
+ info.hpe_mb = _m; \
+ if (hook_run(ipst->ips_arp_net_data->netd_hooks, \
+ _event, (hook_data_t)&info) != 0) { \
+ if (_fm != NULL) { \
+ freemsg(_fm); \
+ _fm = NULL; \
+ } \
+ _hdr = NULL; \
+ _m = NULL; \
+ } else { \
+ _hdr = info.hpe_hdr; \
+ _m = info.hpe_mb; \
+ } \
+ }
+
+/*
+ * Per-MAC-type defaults; DL_OTHER is the catch-all entry used when the
+ * MAC type is not listed (see arp_m_lookup()/arp_hw_type()).
+ */
+static arp_m_t arp_m_tbl[] = {
+ { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */
+ { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */
+ { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */
+ { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */
+ { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */
+ { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */
+ { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */
+ { DL_OTHER, ARPHRD_ETHER, -2, 6} /* unknown */
+};
+
+/*
+ * Take a reference on the arl.  The caller must already hold arl_lock;
+ * the post-increment assertion catches counter wrap-around.
+ */
+static void
+arl_refhold_locked(arl_t *arl)
+{
+ ASSERT(MUTEX_HELD(&arl->arl_lock));
+ ++arl->arl_refcnt;
+ ASSERT(arl->arl_refcnt != 0);
+}
+
+/*
+ * Drop a reference on the arl.  Once the count drops to 1 or below,
+ * broadcast on arl_cv so threads blocked waiting for references to
+ * drain (ill_close or arp_unbind_complete, per the comment below) can
+ * re-check.  NOTE(review): the wakeup fires at refcnt <= 1 rather
+ * than 0 -- presumably because the waiter itself holds a reference;
+ * confirm against the waiters before changing this threshold.
+ */
+static void
+arl_refrele(arl_t *arl)
+{
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_refcnt != 0);
+ arl->arl_refcnt--;
+ if (arl->arl_refcnt > 1) {
+ mutex_exit(&arl->arl_lock);
+ return;
+ }
+
+ /* ill_close or arp_unbind_complete may be waiting */
+ cv_broadcast(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+}
+
+/*
+ * Wake up any IP ioctl waiting for the DLPI exchange to finish: a
+ * completed unbind during a replumb reports through arp_replumb_done(),
+ * everything else through arp_bringup_done().
+ */
+static void
+arp_cmd_done(ill_t *ill, int err, t_uscalar_t lastprim)
+{
+ boolean_t replumbing;
+
+ replumbing = (lastprim == DL_UNBIND_REQ && ill->ill_replumbing);
+ if (replumbing) {
+ arp_replumb_done(ill, 0);
+ return;
+ }
+ arp_bringup_done(ill, err);
+}
+
+/*
+ * RFC 826-style processing of the sender information in an incoming
+ * ARP packet: look up the ncec for src_paddr on ill's illgrp and merge
+ * in the advertised hardware address where appropriate.  Returns one
+ * of the AR_* dispositions (AR_NOTFOUND, AR_MERGED, AR_LOOPBACK,
+ * AR_BOGON, AR_FAILED, AR_CHANGED).  On return *sncec carries the held
+ * ncec (NULL for AR_NOTFOUND and AR_FAILED); the caller is responsible
+ * for the eventual ncec_refrele.
+ */
+static int
+ip_nce_resolve_all(ill_t *ill, uchar_t *src_haddr, uint32_t hlen,
+    const in_addr_t *src_paddr, ncec_t **sncec, int op)
+{
+ int retv;
+ ncec_t *ncec;
+ boolean_t ll_changed;
+ uchar_t *lladdr = NULL;
+ int new_state;
+
+ ASSERT(ill != NULL);
+
+ ncec = ncec_lookup_illgrp_v4(ill, src_paddr);
+ *sncec = ncec;
+
+ if (ncec == NULL) {
+ retv = AR_NOTFOUND;
+ goto done;
+ }
+
+ mutex_enter(&ncec->ncec_lock);
+ /*
+ * If the IP addr and hardware address match what we already
+ * have, then this is a broadcast packet emitted by one of our
+ * interfaces, reflected by the switch and received on another
+ * interface. We return AR_LOOPBACK.
+ */
+ lladdr = ncec->ncec_lladdr;
+ if (NCE_MYADDR(ncec) && hlen == ncec->ncec_ill->ill_phys_addr_length &&
+ bcmp(lladdr, src_haddr, hlen) == 0) {
+ mutex_exit(&ncec->ncec_lock);
+ retv = AR_LOOPBACK;
+ goto done;
+ }
+ /*
+ * If the entry is unverified, then we've just verified that
+ * someone else already owns this address, because this is a
+ * message with the same protocol address but different
+ * hardware address.
+ */
+ if (ncec->ncec_flags & NCE_F_UNVERIFIED) {
+ /* DAD failed: delete the losing entry and report AR_FAILED. */
+ mutex_exit(&ncec->ncec_lock);
+ ncec_delete(ncec);
+ ncec_refrele(ncec);
+ *sncec = NULL;
+ retv = AR_FAILED;
+ goto done;
+ }
+
+ /*
+ * If the IP address matches ours and we're authoritative for
+ * this entry, then some other node is using our IP addr, so
+ * return AR_BOGON. Also reset the transmit count to zero so
+ * that, if we're currently in initial announcement mode, we
+ * switch back to the lazier defense mode. Knowing that
+ * there's at least one duplicate out there, we ought not
+ * blindly announce.
+ *
+ * NCE_F_AUTHORITY is set in one of two ways:
+ * 1. /sbin/arp told us so, via the "permanent" flag.
+ * 2. This is one of my addresses.
+ */
+ if (ncec->ncec_flags & NCE_F_AUTHORITY) {
+ ncec->ncec_unsolicit_count = 0;
+ mutex_exit(&ncec->ncec_lock);
+ retv = AR_BOGON;
+ goto done;
+ }
+
+ /*
+ * No address conflict was detected, and we are getting
+ * ready to update the ncec's hwaddr. The nce MUST NOT be on an
+ * under interface, because all dynamic nce's are created on the
+ * native interface (in the non-IPMP case) or on the IPMP
+ * meta-interface (in the IPMP case)
+ */
+ ASSERT(!IS_UNDER_IPMP(ncec->ncec_ill));
+
+ /*
+ * update ncec with src_haddr, hlen.
+ *
+ * We are trying to resolve this ncec_addr/src_paddr and we
+ * got a REQUEST/RESPONSE from the ncec_addr/src_paddr.
+ * So the new_state is at least "STALE". If, in addition,
+ * this a solicited, unicast ARP_RESPONSE, we can transition
+ * to REACHABLE.
+ */
+ new_state = ND_STALE;
+ ip1dbg(("got info for ncec %p from addr %x\n",
+ (void *)ncec, *src_paddr));
+ retv = AR_MERGED;
+ if (ncec->ncec_state == ND_INCOMPLETE ||
+ ncec->ncec_state == ND_INITIAL) {
+ /* No previous lladdr to compare with: treat as changed. */
+ ll_changed = B_TRUE;
+ } else {
+ ll_changed = nce_cmp_ll_addr(ncec, src_haddr, hlen);
+ if (!ll_changed)
+ new_state = ND_UNCHANGED;
+ else
+ retv = AR_CHANGED;
+ }
+ /*
+ * We don't have the equivalent of the IPv6 'S' flag indicating
+ * a solicited response, so we assume that if we are in
+ * INCOMPLETE, or got back an unchanged lladdr in PROBE state,
+ * and this is an ARP_RESPONSE, it must be a
+ * solicited response allowing us to transtion to REACHABLE.
+ */
+ if (op == ARP_RESPONSE) {
+ switch (ncec->ncec_state) {
+ case ND_PROBE:
+ new_state = (ll_changed ? ND_STALE : ND_REACHABLE);
+ break;
+ case ND_INCOMPLETE:
+ new_state = ND_REACHABLE;
+ break;
+ }
+ }
+ /*
+ * Call nce_update() to refresh fastpath information on any
+ * dependent nce_t entries.
+ */
+ nce_update(ncec, new_state, (ll_changed ? src_haddr : NULL));
+ mutex_exit(&ncec->ncec_lock);
+ nce_resolv_ok(ncec);
+done:
+ return (retv);
+}
+
+/* Return the arp_m_tbl entry for the given MAC type, or NULL if none. */
+static arp_m_t *
+arp_m_lookup(t_uscalar_t mac_type)
+{
+ arp_m_t *arm = arp_m_tbl;
+
+ while (arm < A_END(arp_m_tbl)) {
+ if (arm->arp_mac_type == mac_type)
+ return (arm);
+ arm++;
+ }
+ return (NULL);
+}
+
+/*
+ * Map a DLPI MAC type to its ARP hardware type, using the DL_OTHER
+ * table entry as the fallback for MAC types we do not know about.
+ */
+static uint32_t
+arp_hw_type(t_uscalar_t mactype)
+{
+ arp_m_t *arm = arp_m_lookup(mactype);
+
+ if (arm == NULL)
+ arm = arp_m_lookup(DL_OTHER);
+ return (arm->arp_mac_arp_hw_type);
+}
+
+/*
+ * Called when an DLPI control message has been acked; send down the next
+ * queued message (if any).
+ * The DLPI messages of interest being bind, attach and unbind since
+ * these are the only ones sent by ARP via arp_dlpi_send.
+ */
+static void
+arp_dlpi_done(arl_t *arl, ill_t *ill)
+{
+ mblk_t *mp;
+ int err;
+ t_uscalar_t prim;
+
+ mutex_enter(&arl->arl_lock);
+ prim = arl->arl_dlpi_pending;
+
+ if ((mp = arl->arl_dlpi_deferred) == NULL) {
+ /*
+ * Nothing else queued: clear the pending state, mirror
+ * that on the ill, and report completion of `prim' to IP.
+ * ARL_LL_DOWN maps to ENETDOWN for the waiting ioctl.
+ */
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ if (arl->arl_state_flags & ARL_LL_DOWN)
+ err = ENETDOWN;
+ else
+ err = 0;
+ mutex_exit(&arl->arl_lock);
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 0;
+ mutex_exit(&ill->ill_lock);
+ arp_cmd_done(ill, err, prim);
+ return;
+ }
+
+ /* Unlink the next deferred request and make it the pending one. */
+ arl->arl_dlpi_deferred = mp->b_next;
+ mp->b_next = NULL;
+
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+
+ arl->arl_dlpi_pending = DL_PRIM(mp);
+ mutex_exit(&arl->arl_lock);
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 1;
+ mutex_exit(&ill->ill_lock);
+
+ /* Send the next queued DLPI request down to the driver. */
+ putnext(arl->arl_wq, mp);
+}
+
+/*
+ * This routine is called during module initialization when the DL_INFO_ACK
+ * comes back from the device. We set up defaults for all the device dependent
+ * doo-dads we are going to need. This will leave us ready to roll if we are
+ * attempting auto-configuration. Alternatively, these defaults can be
+ * overridden by initialization procedures possessing higher intelligence.
+ *
+ * Caller will free the mp.
+ */
+static void
+arp_ll_set_defaults(arl_t *arl, mblk_t *mp)
+{
+ arp_m_t *arm;
+ dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
+
+ if ((arm = arp_m_lookup(dlia->dl_mac_type)) == NULL)
+ arm = arp_m_lookup(DL_OTHER);
+ ASSERT(arm != NULL);
+
+ /*
+ * We initialize based on parameters in the (currently) not too
+ * exhaustive arp_m_tbl.
+ */
+ if (dlia->dl_version == DL_VERSION_2) {
+ /* DLPI v2 providers report these parameters directly. */
+ arl->arl_sap_length = dlia->dl_sap_length;
+ arl->arl_phys_addr_length = dlia->dl_brdcst_addr_length;
+ if (dlia->dl_provider_style == DL_STYLE2)
+ arl->arl_needs_attach = 1;
+ } else {
+ /* Older DLPI: fall back to the arp_m_tbl defaults. */
+ arl->arl_sap_length = arm->arp_mac_sap_length;
+ arl->arl_phys_addr_length = arm->arp_mac_hw_addr_length;
+ }
+ /*
+ * Note: the arp_hw_type in the arp header may be derived from
+ * the ill_mac_type and arp_m_lookup().
+ */
+ arl->arl_sap = ETHERTYPE_ARP;
+ arl_defaults_common(arl, mp);
+}
+
+/*
+ * ARP write-side put procedure.  Only the SIOCSLIFNAME and IF_UNITSEL
+ * M_IOCTLs are handled here (they associate this arl with an interface
+ * by name/unit); they are acked or nak'ed in place.  Everything else
+ * is passed downstream unchanged.
+ */
+static void
+arp_wput(queue_t *q, mblk_t *mp)
+{
+ int err = EINVAL;
+ struct iocblk *ioc;
+ mblk_t *mp1;
+
+ switch (DB_TYPE(mp)) {
+ case M_IOCTL:
+ ASSERT(q->q_next != NULL);
+ ioc = (struct iocblk *)mp->b_rptr;
+ if (ioc->ioc_cmd != SIOCSLIFNAME &&
+ ioc->ioc_cmd != IF_UNITSEL) {
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_wput",
+ char *, "<some ioctl>", char *, "-",
+ arl_t *, (arl_t *)q->q_ptr);
+ putnext(q, mp);
+ return;
+ }
+ /* Pointer check: compare against NULL, not the integer 0. */
+ if ((mp1 = mp->b_cont) == NULL)
+ err = EINVAL;
+ else if (ioc->ioc_cmd == SIOCSLIFNAME)
+ err = ip_sioctl_slifname_arp(q, mp1->b_rptr);
+ else if (ioc->ioc_cmd == IF_UNITSEL)
+ err = ip_sioctl_ifunitsel_arp(q, (int *)mp1->b_rptr);
+ if (err == 0)
+ miocack(q, mp, 0, 0);
+ else
+ miocnak(q, mp, 0, err);
+ return;
+ default:
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_wput default",
+ char *, "default mblk", char *, "-",
+ arl_t *, (arl_t *)q->q_ptr);
+ putnext(q, mp);
+ return;
+ }
+}
+
+/*
+ * similar to ill_dlpi_pending(): verify that the received DLPI response
+ * matches the one that is pending for the arl.
+ */
+static boolean_t
+arl_dlpi_pending(arl_t *arl, t_uscalar_t prim)
+{
+ t_uscalar_t expected;
+ boolean_t condemned;
+
+ mutex_enter(&arl->arl_lock);
+ expected = arl->arl_dlpi_pending;
+ condemned = (arl->arl_state_flags & ARL_CONDEMNED) != 0;
+ mutex_exit(&arl->arl_lock);
+
+ if (expected == prim)
+ return (B_TRUE);
+
+ /* A condemned arl is being torn down; drop the ack silently. */
+ if (condemned)
+ return (B_FALSE);
+
+ if (expected == DL_PRIM_INVAL) {
+ ip0dbg(("arl_dlpi_pending unsolicited ack for %s on %s",
+ dl_primstr(prim), arl->arl_name));
+ } else {
+ ip0dbg(("arl_dlpi_pending ack for %s on %s expect %s",
+ dl_primstr(prim), arl->arl_name, dl_primstr(expected)));
+ }
+ return (B_FALSE);
+}
+
+/*
+ * DLPI messages, other than DL_UNITDATA_IND are handled here.  Maps the
+ * received ack back to the request primitive it answers, verifies it is
+ * actually pending on the arl, handles DL_INFO_ACK/DL_UNBIND completion
+ * inline, and defers the rest to arp_rput_dlpi_writer() via qwriter_ip.
+ */
+static void
+arp_rput_dlpi(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl = (arl_t *)q->q_ptr;
+ union DL_primitives *dlp;
+ t_uscalar_t prim;
+ t_uscalar_t reqprim = DL_PRIM_INVAL;
+ ill_t *ill;
+
+ /* Too short to even carry a primitive: pass it upstream untouched. */
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (dlp->dl_primitive)) {
+ putnext(q, mp);
+ return;
+ }
+ dlp = (union DL_primitives *)mp->b_rptr;
+ prim = dlp->dl_primitive;
+
+ /*
+ * If we received an ACK but didn't send a request for it, then it
+ * can't be part of any pending operation; discard up-front.
+ */
+ switch (prim) {
+ case DL_ERROR_ACK:
+ /*
+ * ce is confused about how DLPI works, so we have to interpret
+ * an "error" on DL_NOTIFY_ACK (which we never could have sent)
+ * as really meaning an error on DL_NOTIFY_REQ.
+ *
+ * Note that supporting DL_NOTIFY_REQ is optional, so printing
+ * out an error message on the console isn't warranted except
+ * for debug.
+ */
+ if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK ||
+ dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) {
+ reqprim = DL_NOTIFY_REQ;
+ } else {
+ reqprim = dlp->error_ack.dl_error_primitive;
+ }
+ break;
+ case DL_INFO_ACK:
+ reqprim = DL_INFO_REQ;
+ break;
+ case DL_OK_ACK:
+ reqprim = dlp->ok_ack.dl_correct_primitive;
+ break;
+ case DL_BIND_ACK:
+ reqprim = DL_BIND_REQ;
+ break;
+ default:
+ DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
+ union DL_primitives *, dlp);
+ putnext(q, mp);
+ return;
+ }
+ if (reqprim == DL_PRIM_INVAL || !arl_dlpi_pending(arl, reqprim)) {
+ freemsg(mp);
+ return;
+ }
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi received",
+ char *, dl_primstr(prim), char *, dl_primstr(reqprim),
+ arl_t *, arl);
+
+ ASSERT(prim != DL_NOTIFY_IND);
+
+ /* May be NULL early in setup or during teardown; checked below. */
+ ill = arl_to_ill(arl);
+
+ switch (reqprim) {
+ case DL_INFO_REQ:
+ /*
+ * ill has not been set up yet for this case. This is the
+ * DL_INFO_ACK for the first DL_INFO_REQ sent from
+ * arp_modopen(). There should be no other arl_dlpi_deferred
+ * messages pending. We initialize the arl here.
+ */
+ ASSERT(!arl->arl_dlpi_style_set);
+ ASSERT(arl->arl_dlpi_pending == DL_INFO_REQ);
+ ASSERT(arl->arl_dlpi_deferred == NULL);
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ arp_ll_set_defaults(arl, mp);
+ freemsg(mp);
+ return;
+ case DL_UNBIND_REQ:
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS;
+ /*
+ * This is not an error, so we don't set ARL_LL_DOWN
+ */
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ if (arl->arl_state_flags & ARL_CONDEMNED) {
+ /*
+ * if this is part of the unplumb the arl may
+ * vaporize any moment after we cv_signal the
+ * arl_cv so we reset arl_dlpi_pending here.
+ * All other cases (including replumb) will
+ * have the arl_dlpi_pending reset in
+ * arp_dlpi_done.
+ */
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ }
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ }
+ if (ill != NULL) {
+ /*
+ * ill ref obtained by arl_to_ill() will be released
+ * by qwriter_ip()
+ */
+ qwriter_ip(ill, ill->ill_wq, mp, arp_rput_dlpi_writer,
+ CUR_OP, B_TRUE);
+ return;
+ }
+ freemsg(mp);
+}
+
+/*
+ * Handling of DLPI messages that require exclusive access to the ipsq.
+ * Updates the arl state flags for bind/unbind outcomes, wakes waiters
+ * on arl_cv, and finishes via arp_dlpi_done() which kicks off the next
+ * deferred DLPI request.
+ */
+/* ARGSUSED */
+static void
+arp_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
+{
+ union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
+ ill_t *ill = (ill_t *)q->q_ptr;
+ arl_t *arl = ill_to_arl(ill);
+
+ if (arl == NULL) {
+ /*
+ * happens as a result arp_modclose triggering unbind.
+ * arp_rput_dlpi will cv_signal the arl_cv and the modclose
+ * will complete, but when it does ipsq_exit, the waiting
+ * qwriter_ip gets into the ipsq but will find the arl null.
+ * There should be no deferred messages in this case, so
+ * just complete and exit.
+ */
+ arp_cmd_done(ill, 0, DL_UNBIND_REQ);
+ freemsg(mp);
+ return;
+ }
+ switch (dlp->dl_primitive) {
+ case DL_ERROR_ACK:
+ switch (dlp->error_ack.dl_error_primitive) {
+ case DL_UNBIND_REQ:
+ /* Failed unbind: mark link down and wake waiters. */
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS;
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ arl->arl_state_flags |= ARL_LL_DOWN;
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_BIND_REQ:
+ /* Failed bind: link stays down/unbound. */
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_LL_UP;
+ arl->arl_state_flags |= ARL_LL_DOWN;
+ arl->arl_state_flags |= ARL_LL_UNBOUND;
+ cv_signal(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_ATTACH_REQ:
+ break;
+ default:
+ /* If it's anything else, we didn't send it. */
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ }
+ break;
+ case DL_OK_ACK:
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi_writer ok",
+ char *, dl_primstr(dlp->ok_ack.dl_correct_primitive),
+ char *, dl_primstr(dlp->ok_ack.dl_correct_primitive),
+ arl_t *, arl);
+ mutex_enter(&arl->arl_lock);
+ switch (dlp->ok_ack.dl_correct_primitive) {
+ case DL_UNBIND_REQ:
+ case DL_ATTACH_REQ:
+ break;
+ default:
+ ip0dbg(("Dropping unrecognized DL_OK_ACK for %s",
+ dl_primstr(dlp->ok_ack.dl_correct_primitive)));
+ mutex_exit(&arl->arl_lock);
+ arl_refrele(arl);
+ freemsg(mp);
+ return;
+ }
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_BIND_ACK:
+ DTRACE_PROBE2(rput_dl_bind, arl_t *, arl,
+ dl_bind_ack_t *, &dlp->bind_ack);
+
+ /* Successful bind: link is now up. */
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_state_flags & ARL_LL_BIND_PENDING);
+ arl->arl_state_flags &=
+ ~(ARL_LL_BIND_PENDING|ARL_LL_DOWN|ARL_LL_UNBOUND);
+ arl->arl_state_flags |= ARL_LL_UP;
+ mutex_exit(&arl->arl_lock);
+ break;
+ case DL_UDERROR_IND:
+ DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl,
+ dl_uderror_ind_t *, &dlp->uderror_ind);
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ default:
+ DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl,
+ union DL_primitives *, dlp);
+ arl_refrele(arl);
+ putnext(q, mp);
+ return;
+ }
+ /* Report completion and send the next deferred DLPI request. */
+ arp_dlpi_done(arl, ill);
+ arl_refrele(arl);
+ freemsg(mp);
+}
+
+/*
+ * ARP read-side put procedure.  Classifies inbound messages from the
+ * driver: DL_UNITDATA_IND messages carrying ARP packets are handed to
+ * arp_process_packet(), other DLPI messages to arp_rput_dlpi(),
+ * M_ERROR/M_HANGUP record an error on the arl, and anything else is
+ * passed upstream.  While the arl is condemned or replumbing, only
+ * high-priority (M_PCPROTO) DLPI messages are admitted and no arl
+ * reference is taken for them; otherwise a reference is held across
+ * the processing of the message.
+ *
+ * Declared static to match the forward declaration at the top of the
+ * file (the definition previously omitted the storage class).
+ */
+static void
+arp_rput(queue_t *q, mblk_t *mp)
+{
+ arl_t *arl = q->q_ptr;
+ boolean_t need_refrele = B_FALSE;
+
+ mutex_enter(&arl->arl_lock);
+ if (((arl->arl_state_flags &
+ (ARL_CONDEMNED | ARL_LL_REPLUMBING)) != 0)) {
+ /*
+ * Only allow high priority DLPI messages during unplumb or
+ * replumb, and we don't take an arl_refcnt for that case.
+ */
+ if (DB_TYPE(mp) != M_PCPROTO) {
+ mutex_exit(&arl->arl_lock);
+ freemsg(mp);
+ return;
+ }
+ } else {
+ arl_refhold_locked(arl);
+ need_refrele = B_TRUE;
+ }
+ mutex_exit(&arl->arl_lock);
+
+ switch (DB_TYPE(mp)) {
+ case M_PCPROTO:
+ case M_PROTO: {
+ ill_t *ill;
+
+ /*
+ * could be one of
+ * (i) real message from the wire, (DLPI_DATA)
+ * (ii) DLPI message
+ * Take a ref on the ill associated with this arl to
+ * prevent the ill from being unplumbed until this thread
+ * is done.
+ */
+ if (IS_DLPI_DATA(mp)) {
+ ill = arl_to_ill(arl);
+ if (ill == NULL) {
+ arp_drop_packet("No ill", mp, ill);
+ break;
+ }
+ arp_process_packet(ill, mp);
+ ill_refrele(ill);
+ break;
+ }
+ /* Miscellaneous DLPI messages get shuffled off. */
+ arp_rput_dlpi(q, mp);
+ break;
+ }
+ case M_ERROR:
+ case M_HANGUP:
+ /* Record the driver-reported error; default to ENXIO. */
+ if (mp->b_rptr < mp->b_wptr)
+ arl->arl_error = (int)(*mp->b_rptr & 0xFF);
+ if (arl->arl_error == 0)
+ arl->arl_error = ENXIO;
+ freemsg(mp);
+ break;
+ default:
+ ip1dbg(("arp_rput other db type %x\n", DB_TYPE(mp)));
+ putnext(q, mp);
+ break;
+ }
+ if (need_refrele)
+ arl_refrele(arl);
+}
+
+/*
+ * Process one inbound ARP packet: mp is the DL_UNITDATA_IND, mp->b_cont
+ * the ARP payload.  Implements the RFC 826 receive algorithm plus
+ * duplicate-address detection/defense: the sender information is merged
+ * via ip_nce_resolve_all(), conflicts are reported to IP through
+ * arp_notify(), and requests targeting one of our published addresses
+ * are answered via arp_output().  Message ownership is subtle: after
+ * mp->b_cont is detached, mp (the DLPI header) and mp1 (the ARP packet)
+ * are freed separately on every path.
+ */
+static void
+arp_process_packet(ill_t *ill, mblk_t *mp)
+{
+ mblk_t *mp1;
+ arh_t *arh;
+ in_addr_t src_paddr, dst_paddr;
+ uint32_t hlen, plen;
+ boolean_t is_probe;
+ int op;
+ ncec_t *dst_ncec, *src_ncec = NULL;
+ uchar_t *src_haddr, *arhp, *dst_haddr, *dp, *sp;
+ int err;
+ ip_stack_t *ipst;
+ boolean_t need_ill_refrele = B_FALSE;
+ nce_t *nce;
+ uchar_t *src_lladdr;
+ dl_unitdata_ind_t *dlui;
+ ip_recv_attr_t iras;
+
+ ASSERT(ill != NULL);
+ if (ill->ill_flags & ILLF_NOARP) {
+ arp_drop_packet("Interface does not support ARP", mp, ill);
+ return;
+ }
+ ipst = ill->ill_ipst;
+ /*
+ * What we should have at this point is a DL_UNITDATA_IND message
+ * followed by an ARP packet. We do some initial checks and then
+ * get to work.
+ */
+ dlui = (dl_unitdata_ind_t *)mp->b_rptr;
+ if (dlui->dl_group_address == 1) {
+ /*
+ * multicast or broadcast packet. Only accept on the ipmp
+ * nominated interface for multicasts ('cast_ill').
+ * If we have no cast_ill we are liberal and accept everything.
+ */
+ if (IS_UNDER_IPMP(ill)) {
+ /* For an under ill_grp can change under lock */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (!ill->ill_nom_cast && ill->ill_grp != NULL &&
+ ill->ill_grp->ig_cast_ill != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ arp_drop_packet("Interface is not nominated "
+ "for multicast sends and receives",
+ mp, ill);
+ return;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+ }
+ mp1 = mp->b_cont;
+ if (mp1 == NULL) {
+ arp_drop_packet("Missing ARP packet", mp, ill);
+ return;
+ }
+ if (mp1->b_cont != NULL) {
+ /* No fooling around with funny messages. */
+ if (!pullupmsg(mp1, -1)) {
+ arp_drop_packet("Funny message: pullup failed",
+ mp, ill);
+ return;
+ }
+ }
+ arh = (arh_t *)mp1->b_rptr;
+ hlen = arh->arh_hlen;
+ plen = arh->arh_plen;
+ /* Ensure the fixed header plus both address pairs are present. */
+ if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) {
+ arp_drop_packet("mblk len too small", mp, ill);
+ return;
+ }
+ /*
+ * hlen 0 is used for RFC 1868 UnARP.
+ *
+ * Note that the rest of the code checks that hlen is what we expect
+ * for this hardware address type, so might as well discard packets
+ * here that don't match.
+ */
+ if ((hlen > 0 && hlen != ill->ill_phys_addr_length) || plen == 0) {
+ DTRACE_PROBE2(rput_bogus, ill_t *, ill, mblk_t *, mp1);
+ arp_drop_packet("Bogus hlen or plen", mp, ill);
+ return;
+ }
+ /*
+ * Historically, Solaris has been lenient about hardware type numbers.
+ * We should check here, but don't.
+ */
+ DTRACE_PROBE3(arp__physical__in__start, ill_t *, ill, arh_t *, arh,
+ mblk_t *, mp);
+ /*
+ * If ill is in an ipmp group, it will be the under ill. If we want
+ * to report the packet as coming up the IPMP interface, we should
+ * convert it to the ipmp ill.
+ */
+ ARP_HOOK_IN(ipst->ips_arp_physical_in_event, ipst->ips_arp_physical_in,
+ ill->ill_phyint->phyint_ifindex, arh, mp, mp1, ipst);
+ DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp);
+ /* A hook may have consumed the packet; mp was NULLed if so. */
+ if (mp == NULL)
+ return;
+ arhp = (uchar_t *)arh + ARH_FIXED_LEN;
+ src_haddr = arhp; /* ar$sha */
+ arhp += hlen;
+ bcopy(arhp, &src_paddr, IP_ADDR_LEN); /* ar$spa */
+ sp = arhp;
+ arhp += IP_ADDR_LEN;
+ dst_haddr = arhp; /* ar$dha */
+ arhp += hlen;
+ bcopy(arhp, &dst_paddr, IP_ADDR_LEN); /* ar$tpa */
+ dp = arhp;
+ op = BE16_TO_U16(arh->arh_operation);
+
+ DTRACE_PROBE2(ip__arp__input, (in_addr_t), src_paddr,
+ (in_addr_t), dst_paddr);
+
+ /* Determine if this is just a probe */
+ is_probe = (src_paddr == INADDR_ANY);
+
+ /*
+ * ira_ill is the only field used down the arp_notify path.
+ */
+ bzero(&iras, sizeof (iras));
+ iras.ira_ill = iras.ira_rill = ill;
+ /*
+ * RFC 826: first check if the <protocol, sender protocol address> is
+ * in the cache, if there is a sender protocol address. Note that this
+ * step also handles resolutions based on source.
+ */
+ /* Note: after here we need to freeb(mp) and freemsg(mp1) separately */
+ mp->b_cont = NULL;
+ if (is_probe) {
+ err = AR_NOTFOUND;
+ } else {
+ if (plen != 4) {
+ arp_drop_packet("bad protocol len", mp, ill);
+ return;
+ }
+ err = ip_nce_resolve_all(ill, src_haddr, hlen, &src_paddr,
+ &src_ncec, op);
+ switch (err) {
+ case AR_BOGON:
+ ASSERT(src_ncec != NULL);
+ arp_notify(src_paddr, mp1, AR_CN_BOGON,
+ &iras, src_ncec);
+ break;
+ case AR_FAILED:
+ arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras,
+ src_ncec);
+ break;
+ case AR_LOOPBACK:
+ DTRACE_PROBE2(rput_loopback, ill_t *, ill, arh_t *,
+ arh);
+ freemsg(mp1);
+ break;
+ default:
+ /* AR_NOTFOUND/AR_MERGED/AR_CHANGED: keep going. */
+ goto update;
+ }
+ freemsg(mp);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+update:
+ /*
+ * Now look up the destination address. By RFC 826, we ignore the
+ * packet at this step if the target isn't one of our addresses (i.e.,
+ * one we have been asked to PUBLISH). This is true even if the
+ * target is something we're trying to resolve and the packet
+ * is a response.
+ */
+ dst_ncec = ncec_lookup_illgrp_v4(ill, &dst_paddr);
+ if (dst_ncec == NULL || !NCE_PUBLISH(dst_ncec)) {
+ /*
+ * Let the client know if the source mapping has changed, even
+ * if the destination provides no useful information for the
+ * client.
+ */
+ if (err == AR_CHANGED) {
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras,
+ NULL);
+ freemsg(mp);
+ } else {
+ freemsg(mp);
+ arp_drop_packet("Target is not interesting", mp1, ill);
+ }
+ if (dst_ncec != NULL)
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+
+ if (dst_ncec->ncec_flags & NCE_F_UNVERIFIED) {
+ /*
+ * Check for a reflection. Some misbehaving bridges will
+ * reflect our own transmitted packets back to us.
+ */
+ ASSERT(NCE_PUBLISH(dst_ncec));
+ if (hlen != dst_ncec->ncec_ill->ill_phys_addr_length) {
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet("bad arh_len", mp1, ill);
+ return;
+ }
+ if (!nce_cmp_ll_addr(dst_ncec, src_haddr, hlen)) {
+ DTRACE_PROBE3(rput_probe_reflected, ill_t *, ill,
+ arh_t *, arh, ncec_t *, dst_ncec);
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet("Reflected probe", mp1, ill);
+ return;
+ }
+ /*
+ * Responses targeting our HW address that are not responses to
+ * our DAD probe must be ignored as they are related to requests
+ * sent before DAD was restarted.
+ */
+ if (op == ARP_RESPONSE &&
+ (nce_cmp_ll_addr(dst_ncec, dst_haddr, hlen) == 0)) {
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ freemsg(mp);
+ arp_drop_packet(
+ "Response to request that was sent before DAD",
+ mp1, ill);
+ return;
+ }
+ /*
+ * Responses targeted to HW addresses which are not ours but
+ * sent to our unverified proto address are also conflicts.
+ * These may be reported by a proxy rather than the interface
+ * with the conflicting address, dst_paddr is in conflict
+ * rather than src_paddr. To ensure IP can locate the correct
+ * ipif to take down, it is necessary to copy dst_paddr to
+ * the src_paddr field before sending it to IP. The same is
+ * required for probes, where src_paddr will be INADDR_ANY.
+ */
+ if (is_probe || op == ARP_RESPONSE) {
+ bcopy(dp, sp, plen);
+ arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras,
+ NULL);
+ ncec_delete(dst_ncec);
+ } else if (err == AR_CHANGED) {
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras,
+ NULL);
+ } else {
+ DTRACE_PROBE3(rput_request_unverified,
+ ill_t *, ill, arh_t *, arh, ncec_t *, dst_ncec);
+ arp_drop_packet("Unverified request", mp1, ill);
+ }
+ freemsg(mp);
+ ncec_refrele(dst_ncec);
+ if (src_ncec != NULL)
+ ncec_refrele(src_ncec);
+ return;
+ }
+ /*
+ * If it's a request, then we reply to this, and if we think the
+ * sender's unknown, then we create an entry to avoid unnecessary ARPs.
+ * The design assumption is that someone ARPing us is likely to send us
+ * a packet soon, and that we'll want to reply to it.
+ */
+ if (op == ARP_REQUEST) {
+ const uchar_t *nce_hwaddr;
+ struct in_addr nce_paddr;
+ clock_t now;
+ ill_t *under_ill = ill;
+ boolean_t send_unicast = B_TRUE;
+
+ ASSERT(NCE_PUBLISH(dst_ncec));
+
+ if ((dst_ncec->ncec_flags & (NCE_F_BCAST|NCE_F_MCAST)) != 0) {
+ /*
+ * Ignore senders who are deliberately or accidentally
+ * confused.
+ */
+ goto bail;
+ }
+
+ if (!is_probe && err == AR_NOTFOUND) {
+ ASSERT(src_ncec == NULL);
+
+ if (IS_UNDER_IPMP(under_ill)) {
+ /*
+ * create the ncec for the sender on ipmp_ill.
+ * We pass in the ipmp_ill itself to avoid
+ * creating an nce_t on the under_ill.
+ */
+ ill = ipmp_ill_hold_ipmp_ill(under_ill);
+ if (ill == NULL)
+ ill = under_ill;
+ else
+ need_ill_refrele = B_TRUE;
+ }
+
+ err = nce_lookup_then_add_v4(ill, src_haddr, hlen,
+ &src_paddr, 0, ND_STALE, &nce);
+
+ switch (err) {
+ case 0:
+ case EEXIST:
+ ip1dbg(("added ncec %p in state %d ill %s\n",
+ (void *)src_ncec, src_ncec->ncec_state,
+ ill->ill_name));
+ src_ncec = nce->nce_common;
+ break;
+ default:
+ /*
+ * Either no memory, or the outgoing interface
+ * is in the process of down/unplumb. In the
+ * latter case, we will fail the send anyway,
+ * and in the former case, we should try to send
+ * the ARP response.
+ */
+ src_lladdr = src_haddr;
+ goto send_response;
+ }
+ ncec_refhold(src_ncec);
+ nce_refrele(nce);
+ /* set up cleanup interval on ncec */
+ }
+
+ /*
+ * This implements periodic address defense based on a modified
+ * version of the RFC 3927 requirements. Instead of sending a
+ * broadcasted reply every time, as demanded by the RFC, we
+ * send at most one broadcast reply per arp_broadcast_interval.
+ */
+ now = ddi_get_lbolt();
+ if ((now - dst_ncec->ncec_last_time_defended) >
+ MSEC_TO_TICK(ipst->ips_ipv4_dad_announce_interval)) {
+ dst_ncec->ncec_last_time_defended = now;
+ /*
+ * If this is one of the long-suffering entries,
+ * pull it out now. It no longer needs separate
+ * defense, because we're now doing that with this
+ * broadcasted reply.
+ */
+ dst_ncec->ncec_flags &= ~NCE_F_DELAYED;
+ send_unicast = B_FALSE;
+ }
+ if (src_ncec != NULL && send_unicast) {
+ src_lladdr = src_ncec->ncec_lladdr;
+ } else {
+ /* Broadcast the reply: use the ill's broadcast addr. */
+ src_lladdr = under_ill->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(under_ill);
+ }
+send_response:
+ nce_hwaddr = dst_ncec->ncec_lladdr;
+ IN6_V4MAPPED_TO_INADDR(&dst_ncec->ncec_addr, &nce_paddr);
+
+ (void) arp_output(under_ill, ARP_RESPONSE,
+ nce_hwaddr, (uchar_t *)&nce_paddr, src_haddr,
+ (uchar_t *)&src_paddr, src_lladdr);
+ }
+bail:
+ if (dst_ncec != NULL) {
+ ncec_refrele(dst_ncec);
+ }
+ if (src_ncec != NULL) {
+ ncec_refrele(src_ncec);
+ }
+ if (err == AR_CHANGED) {
+ mp->b_cont = NULL;
+ arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, NULL);
+ mp1 = NULL;
+ }
+ if (need_ill_refrele)
+ ill_refrele(ill);
+/*
+ * NOTE(review): no goto targets this label in the visible portion of
+ * the file; control only falls through from above.  Confirm before
+ * removing it.
+ */
+done:
+ freemsg(mp);
+ freemsg(mp1);
+}
+
+/*
+ * Basic initialization of the arl_t and the arl_common structure shared with
+ * the ill_t that is done after SLIFNAME/IF_UNITSEL.
+ */
+static int
+arl_ill_init(arl_t *arl, char *ill_name)
+{
+ ill_t *ill;
+ arl_ill_common_t *ai;
+
+ ill = ill_lookup_on_name(ill_name, B_FALSE, B_FALSE, B_FALSE,
+ arl->arl_ipst);
+
+ if (ill == NULL)
+ return (ENXIO);
+
+ /*
+ * By the time we set up the arl, we expect the ETHERTYPE_IP
+ * stream to be fully bound and attached. So we copy/verify
+ * relevant information as possible from/against the ill.
+ *
+ * The following should have been set up in arp_ll_set_defaults()
+ * after the first DL_INFO_ACK was received.
+ */
+ ASSERT(arl->arl_phys_addr_length == ill->ill_phys_addr_length);
+ ASSERT(arl->arl_sap == ETHERTYPE_ARP);
+ ASSERT(arl->arl_mactype == ill->ill_mactype);
+ ASSERT(arl->arl_sap_length == ill->ill_sap_length);
+
+ ai = kmem_zalloc(sizeof (*ai), KM_SLEEP);
+ mutex_enter(&ill->ill_lock);
+ /* First ensure that the ill is not CONDEMNED. */
+ if (ill->ill_state_flags & ILL_CONDEMNED) {
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ kmem_free(ai, sizeof (*ai));
+ return (ENXIO);
+ }
+ if (ill->ill_common != NULL || arl->arl_common != NULL) {
+ mutex_exit(&ill->ill_lock);
+ ip0dbg(("%s: PPA already exists", ill->ill_name));
+ ill_refrele(ill);
+ kmem_free(ai, sizeof (*ai));
+ return (EEXIST);
+ }
+ mutex_init(&ai->ai_lock, NULL, MUTEX_DEFAULT, NULL);
+ ai->ai_arl = arl;
+ ai->ai_ill = ill;
+ ill->ill_common = ai;
+ arl->arl_common = ai;
+ mutex_exit(&ill->ill_lock);
+ (void) strlcpy(arl->arl_name, ill->ill_name, LIFNAMSIZ);
+ arl->arl_name_length = ill->ill_name_length;
+ ill_refrele(ill);
+ arp_ifname_notify(arl);
+ return (0);
+}
+
+/* Allocate and do common initializations for DLPI messages. */
+static mblk_t *
+ip_ar_dlpi_comm(t_uscalar_t prim, size_t size)
+{
+ mblk_t *mp;
+
+ if ((mp = allocb(size, BPRI_HI)) == NULL)
+ return (NULL);
+
+ /*
+ * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter
+ * of which we don't seem to use) are sent with M_PCPROTO, and
+ * that other DLPI are M_PROTO.
+ */
+ DB_TYPE(mp) = (prim == DL_INFO_REQ) ? M_PCPROTO : M_PROTO;
+
+ mp->b_wptr = mp->b_rptr + size;
+ bzero(mp->b_rptr, size);
+ DL_PRIM(mp) = prim;
+ return (mp);
+}
+
+
+int
+ip_sioctl_ifunitsel_arp(queue_t *q, int *ppa)
+{
+ arl_t *arl;
+ char *cp, ill_name[LIFNAMSIZ];
+
+ if (q->q_next == NULL)
+ return (EINVAL);
+
+ do {
+ q = q->q_next;
+ } while (q->q_next != NULL);
+ cp = q->q_qinfo->qi_minfo->mi_idname;
+
+ arl = (arl_t *)q->q_ptr;
+ (void) snprintf(ill_name, sizeof (ill_name), "%s%d", cp, *ppa);
+ arl->arl_ppa = *ppa;
+ return (arl_ill_init(arl, ill_name));
+}
+
+int
+ip_sioctl_slifname_arp(queue_t *q, void *lifreq)
+{
+ arl_t *arl;
+ struct lifreq *lifr = lifreq;
+
+ /* ioctl not valid when IP opened as a device */
+ if (q->q_next == NULL)
+ return (EINVAL);
+
+ arl = (arl_t *)q->q_ptr;
+ arl->arl_ppa = lifr->lifr_ppa;
+ return (arl_ill_init(arl, lifr->lifr_name));
+}
+
+arl_t *
+ill_to_arl(ill_t *ill)
+{
+ arl_ill_common_t *ai = ill->ill_common;
+ arl_t *arl = NULL;
+
+ if (ai == NULL)
+ return (NULL);
+ /*
+ * Find the arl_t that corresponds to this ill_t from the shared
+ * ill_common structure. We can safely access the ai here as it
+ * will only be freed in arp_modclose() after we have become
+ * single-threaded.
+ */
+ mutex_enter(&ai->ai_lock);
+ if ((arl = ai->ai_arl) != NULL) {
+ mutex_enter(&arl->arl_lock);
+ if (!(arl->arl_state_flags & ARL_CONDEMNED)) {
+ arl_refhold_locked(arl);
+ mutex_exit(&arl->arl_lock);
+ } else {
+ mutex_exit(&arl->arl_lock);
+ arl = NULL;
+ }
+ }
+ mutex_exit(&ai->ai_lock);
+ return (arl);
+}
+
+ill_t *
+arl_to_ill(arl_t *arl)
+{
+ arl_ill_common_t *ai = arl->arl_common;
+ ill_t *ill = NULL;
+
+ if (ai == NULL) {
+ /*
+ * happens when the arp stream is just being opened, and
+ * arl_ill_init has not been executed yet.
+ */
+ return (NULL);
+ }
+ /*
+ * Find the ill_t that corresponds to this arl_t from the shared
+ * arl_common structure. We can safely access the ai here as it
+ * will only be freed in arp_modclose() after we have become
+ * single-threaded.
+ */
+ mutex_enter(&ai->ai_lock);
+ if ((ill = ai->ai_ill) != NULL) {
+ mutex_enter(&ill->ill_lock);
+ if (!ILL_IS_CONDEMNED(ill)) {
+ ill_refhold_locked(ill);
+ mutex_exit(&ill->ill_lock);
+ } else {
+ mutex_exit(&ill->ill_lock);
+ ill = NULL;
+ }
+ }
+ mutex_exit(&ai->ai_lock);
+ return (ill);
+}
+
/*
 * Bring the ARP stream's link layer up: attach (for DL_STYLE2 drivers)
 * and bind to ETHERTYPE_ARP. The DL_UNBIND_REQ needed by a later
 * arp_ll_down() is pre-allocated here so the down path cannot fail for
 * lack of memory.
 *
 * Returns 0 if already up, EINPROGRESS once the bind has been initiated
 * (completion is asynchronous via the DL_BIND_ACK), ENXIO if no arl
 * exists for the ill, and ENOMEM on allocation failure.
 */
int
arp_ll_up(ill_t *ill)
{
	mblk_t *attach_mp = NULL;
	mblk_t *bind_mp = NULL;
	mblk_t *unbind_mp = NULL;
	arl_t *arl;

	ASSERT(IAM_WRITER_ILL(ill));
	arl = ill_to_arl(ill);

	DTRACE_PROBE2(ill__downup, char *, "arp_ll_up", ill_t *, ill);
	if (arl == NULL)
		return (ENXIO);
	DTRACE_PROBE2(arl__downup, char *, "arp_ll_up", arl_t *, arl);
	if ((arl->arl_state_flags & ARL_LL_UP) != 0) {
		arl_refrele(arl);
		return (0);
	}
	if (arl->arl_needs_attach) { /* DL_STYLE2 */
		attach_mp =
		    ip_ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t));
		if (attach_mp == NULL)
			goto bad;
		((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = arl->arl_ppa;
	}

	/* Allocate and initialize a bind message. */
	bind_mp = ip_ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t));
	if (bind_mp == NULL)
		goto bad;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP;
	((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS;

	/* Reserve the unbind message before committing to the bind. */
	unbind_mp = ip_ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t));
	if (unbind_mp == NULL)
		goto bad;
	if (arl->arl_needs_attach) {
		arp_dlpi_send(arl, attach_mp);
	}
	arl->arl_unbind_mp = unbind_mp;

	arl->arl_state_flags |= ARL_LL_BIND_PENDING;
	arp_dlpi_send(arl, bind_mp);
	arl_refrele(arl);
	return (EINPROGRESS);

bad:
	/* freemsg(NULL) is a no-op, so unallocated messages are harmless. */
	freemsg(attach_mp);
	freemsg(bind_mp);
	freemsg(unbind_mp);
	arl_refrele(arl);
	return (ENOMEM);
}
+
+/*
+ * consumes/frees mp
+ */
+static void
+arp_notify(in_addr_t src, mblk_t *mp, uint32_t arcn_code,
+ ip_recv_attr_t *ira, ncec_t *ncec)
+{
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET_ADDRSTRLEN];
+ ill_t *ill = ira->ira_ill;
+ ip_stack_t *ipst = ill->ill_ipst;
+ arh_t *arh = (arh_t *)mp->b_rptr;
+
+ switch (arcn_code) {
+ case AR_CN_BOGON:
+ /*
+ * Someone is sending ARP packets with a source protocol
+ * address that we have published and for which we believe our
+ * entry is authoritative and verified to be unique on
+ * the network.
+ *
+ * arp_process_packet() sends AR_CN_FAILED for the case when
+ * a DAD probe is received and the hardware address of a
+ * non-authoritative entry has changed. Thus, AR_CN_BOGON
+ * indicates a real conflict, and we have to do resolution.
+ *
+ * We back away quickly from the address if it's from DHCP or
+ * otherwise temporary and hasn't been used recently (or at
+ * all). We'd like to include "deprecated" addresses here as
+ * well (as there's no real reason to defend something we're
+ * discarding), but IPMP "reuses" this flag to mean something
+ * other than the standard meaning.
+ */
+ if (ip_nce_conflict(mp, ira, ncec)) {
+ (void) mac_colon_addr((uint8_t *)(arh + 1),
+ arh->arh_hlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+ cmn_err(CE_WARN,
+ "proxy ARP problem? Node '%s' is using %s on %s",
+ hbuf, sbuf, ill->ill_name);
+ if (!arp_no_defense)
+ (void) arp_announce(ncec);
+ /*
+ * ncec_last_time_defended has been adjusted in
+ * ip_nce_conflict.
+ */
+ } else {
+ ncec_delete(ncec);
+ }
+ freemsg(mp);
+ break;
+ case AR_CN_ANNOUNCE: {
+ nce_hw_map_t hwm;
+ /*
+ * ARP gives us a copy of any packet where it thinks
+ * the address has changed, so that we can update our
+ * caches. We're responsible for caching known answers
+ * in the current design. We check whether the
+ * hardware address really has changed in all of our
+ * entries that have cached this mapping, and if so, we
+ * blow them away. This way we will immediately pick
+ * up the rare case of a host changing hardware
+ * address.
+ */
+ if (src == 0) {
+ freemsg(mp);
+ break;
+ }
+ hwm.hwm_addr = src;
+ hwm.hwm_hwlen = arh->arh_hlen;
+ hwm.hwm_hwaddr = (uchar_t *)(arh + 1);
+ hwm.hwm_flags = 0;
+ ncec_walk_common(ipst->ips_ndp4, NULL,
+ (pfi_t)nce_update_hw_changed, &hwm, B_TRUE);
+ freemsg(mp);
+ break;
+ }
+ case AR_CN_FAILED:
+ if (arp_no_defense) {
+ (void) mac_colon_addr((uint8_t *)(arh + 1),
+ arh->arh_hlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+
+ cmn_err(CE_WARN,
+ "node %s is using our IP address %s on %s",
+ hbuf, sbuf, ill->ill_name);
+ freemsg(mp);
+ break;
+ }
+ /*
+ * mp will be freed by arp_excl.
+ */
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE);
+ return;
+ default:
+ ASSERT(0);
+ freemsg(mp);
+ break;
+ }
+}
+
+/*
+ * arp_output is called to transmit an ARP Request or Response. The mapping
+ * to RFC 826 variables is:
+ * haddr1 == ar$sha
+ * paddr1 == ar$spa
+ * haddr2 == ar$tha
+ * paddr2 == ar$tpa
+ * The ARP frame is sent to the ether_dst in dst_lladdr.
+ */
+static int
+arp_output(ill_t *ill, uint32_t operation,
+ const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2,
+ const uchar_t *paddr2, uchar_t *dst_lladdr)
+{
+ arh_t *arh;
+ uint8_t *cp;
+ uint_t hlen;
+ uint32_t plen = IPV4_ADDR_LEN; /* ar$pln from RFC 826 */
+ uint32_t proto = IP_ARP_PROTO_TYPE;
+ mblk_t *mp;
+ arl_t *arl;
+
+ ASSERT(dst_lladdr != NULL);
+ hlen = ill->ill_phys_addr_length; /* ar$hln from RFC 826 */
+ mp = ill_dlur_gen(dst_lladdr, hlen, ETHERTYPE_ARP, ill->ill_sap_length);
+
+ if (mp == NULL)
+ return (ENOMEM);
+
+ /* IFF_NOARP flag is set or link down: do not send arp messages */
+ if ((ill->ill_flags & ILLF_NOARP) || !ill->ill_dl_up) {
+ freemsg(mp);
+ return (ENXIO);
+ }
+
+ mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) +
+ plen + plen, BPRI_MED);
+ if (mp->b_cont == NULL) {
+ freeb(mp);
+ return (ENOMEM);
+ }
+
+ /* Fill in the ARP header. */
+ cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen);
+ mp->b_cont->b_rptr = cp;
+ arh = (arh_t *)cp;
+ U16_TO_BE16(arp_hw_type(ill->ill_mactype), arh->arh_hardware);
+ U16_TO_BE16(proto, arh->arh_proto);
+ arh->arh_hlen = (uint8_t)hlen;
+ arh->arh_plen = (uint8_t)plen;
+ U16_TO_BE16(operation, arh->arh_operation);
+ cp += ARH_FIXED_LEN;
+ bcopy(haddr1, cp, hlen);
+ cp += hlen;
+ if (paddr1 == NULL)
+ bzero(cp, plen);
+ else
+ bcopy(paddr1, cp, plen);
+ cp += plen;
+ if (haddr2 == NULL)
+ bzero(cp, hlen);
+ else
+ bcopy(haddr2, cp, hlen);
+ cp += hlen;
+ bcopy(paddr2, cp, plen);
+ cp += plen;
+ mp->b_cont->b_wptr = cp;
+
+ DTRACE_PROBE3(arp__physical__out__start,
+ ill_t *, ill, arh_t *, arh, mblk_t *, mp);
+ ARP_HOOK_OUT(ill->ill_ipst->ips_arp_physical_out_event,
+ ill->ill_ipst->ips_arp_physical_out,
+ ill->ill_phyint->phyint_ifindex, arh, mp, mp->b_cont,
+ ill->ill_ipst);
+ DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp);
+ if (mp == NULL)
+ return (0);
+
+ /* Ship it out. */
+ arl = ill_to_arl(ill);
+ if (arl == NULL) {
+ freemsg(mp);
+ return (0);
+ }
+ if (canputnext(arl->arl_wq))
+ putnext(arl->arl_wq, mp);
+ else
+ freemsg(mp);
+ arl_refrele(arl);
+ return (0);
+}
+
+/*
+ * Process resolve requests.
+ * If we are not yet reachable then we check and decrease ncec_rcnt; otherwise
+ * we leave it alone (the caller will check and manage ncec_pcnt in those
+ * cases.)
+ */
+int
+arp_request(ncec_t *ncec, in_addr_t sender, ill_t *ill)
+{
+ int err;
+ const uchar_t *target_hwaddr;
+ struct in_addr nce_paddr;
+ uchar_t *dst_lladdr;
+ boolean_t use_rcnt = !NCE_ISREACHABLE(ncec);
+
+ ASSERT(MUTEX_HELD(&ncec->ncec_lock));
+ ASSERT(!IS_IPMP(ill));
+
+ if (use_rcnt && ncec->ncec_rcnt == 0) {
+ /* not allowed any more retransmits. */
+ return (0);
+ }
+
+ if ((ill->ill_flags & ILLF_NOARP) != 0)
+ return (0);
+
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nce_paddr);
+
+ target_hwaddr =
+ ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+
+ if (NCE_ISREACHABLE(ncec)) {
+ dst_lladdr = ncec->ncec_lladdr;
+ } else {
+ dst_lladdr = ill->ill_bcast_mp->b_rptr +
+ NCE_LL_ADDR_OFFSET(ill);
+ }
+
+ mutex_exit(&ncec->ncec_lock);
+ err = arp_output(ill, ARP_REQUEST,
+ ill->ill_phys_addr, (uchar_t *)&sender, target_hwaddr,
+ (uchar_t *)&nce_paddr, dst_lladdr);
+ mutex_enter(&ncec->ncec_lock);
+
+ if (err != 0) {
+ /*
+ * Some transient error such as ENOMEM or a down link was
+ * encountered. If the link has been taken down permanently,
+ * the ncec will eventually be cleaned up (ipif_down_tail()
+ * will call ipif_nce_down() and flush the ncec), to terminate
+ * recurring attempts to send ARP requests. In all other cases,
+ * allow the caller another chance at success next time.
+ */
+ return (ncec->ncec_ill->ill_reachable_retrans_time);
+ }
+
+ if (use_rcnt)
+ ncec->ncec_rcnt--;
+
+ return (ncec->ncec_ill->ill_reachable_retrans_time);
+}
+
+/* return B_TRUE if dropped */
+boolean_t
+arp_announce(ncec_t *ncec)
+{
+ ill_t *ill;
+ int err;
+ uchar_t *sphys_addr, *bcast_addr;
+ struct in_addr ncec_addr;
+ boolean_t need_refrele = B_FALSE;
+
+ ASSERT((ncec->ncec_flags & NCE_F_BCAST) == 0);
+ ASSERT((ncec->ncec_flags & NCE_F_MCAST) == 0);
+
+ if (IS_IPMP(ncec->ncec_ill)) {
+ /* sent on the cast_ill */
+ ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE);
+ if (ill == NULL)
+ return (B_TRUE);
+ need_refrele = B_TRUE;
+ } else {
+ ill = ncec->ncec_ill;
+ }
+
+ /*
+ * broadcast an announce to ill_bcast address.
+ */
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr);
+
+ sphys_addr = ncec->ncec_lladdr;
+ bcast_addr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+
+ err = arp_output(ill, ARP_REQUEST,
+ sphys_addr, (uchar_t *)&ncec_addr, bcast_addr,
+ (uchar_t *)&ncec_addr, bcast_addr);
+
+ if (need_refrele)
+ ill_refrele(ill);
+ return (err != 0);
+}
+
+/* return B_TRUE if dropped */
+boolean_t
+arp_probe(ncec_t *ncec)
+{
+ ill_t *ill;
+ int err;
+ struct in_addr ncec_addr;
+ uchar_t *sphys_addr, *dst_lladdr;
+
+ if (IS_IPMP(ncec->ncec_ill)) {
+ ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE);
+ if (ill == NULL)
+ return (B_TRUE);
+ } else {
+ ill = ncec->ncec_ill;
+ }
+
+ IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr);
+
+ sphys_addr = ncec->ncec_lladdr;
+ dst_lladdr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill);
+ err = arp_output(ill, ARP_REQUEST,
+ sphys_addr, NULL, NULL, (uchar_t *)&ncec_addr, dst_lladdr);
+
+ if (IS_IPMP(ncec->ncec_ill))
+ ill_refrele(ill);
+ return (err != 0);
+}
+
+static mblk_t *
+arl_unbind(arl_t *arl)
+{
+ mblk_t *mp;
+
+ if ((mp = arl->arl_unbind_mp) != NULL) {
+ arl->arl_unbind_mp = NULL;
+ arl->arl_state_flags |= ARL_DL_UNBIND_IN_PROGRESS;
+ }
+ return (mp);
+}
+
/*
 * Take the ARP stream's link layer down by sending the pre-allocated
 * DL_UNBIND_REQ. Returns EINPROGRESS when the unbind was initiated
 * (the DL_UNBIND_ACK completes it asynchronously), 0 if there was no
 * unbind message to send, and ENXIO if the ill has no arl.
 */
int
arp_ll_down(ill_t *ill)
{
	arl_t	*arl;
	mblk_t *unbind_mp;
	int err = 0;
	boolean_t replumb = (ill->ill_replumbing == 1);

	DTRACE_PROBE2(ill__downup, char *, "arp_ll_down", ill_t *, ill);
	if ((arl = ill_to_arl(ill)) == NULL)
		return (ENXIO);
	DTRACE_PROBE2(arl__downup, char *, "arp_ll_down", arl_t *, arl);
	mutex_enter(&arl->arl_lock);
	unbind_mp = arl_unbind(arl);
	if (unbind_mp != NULL) {
		ASSERT(arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS);
		DTRACE_PROBE2(arp__unbinding, mblk_t *, unbind_mp,
		    arl_t *, arl);
		err = EINPROGRESS;
		/* Remember that this down is part of a replumb sequence. */
		if (replumb)
			arl->arl_state_flags |= ARL_LL_REPLUMBING;
	}
	mutex_exit(&arl->arl_lock);
	/* Send outside arl_lock; arp_dlpi_send takes the lock itself. */
	if (unbind_mp != NULL)
		arp_dlpi_send(arl, unbind_mp);
	arl_refrele(arl);
	return (err);
}
+
+/* ARGSUSED */
+int
+arp_close(queue_t *q, int flags)
+{
+ if (WR(q)->q_next != NULL) {
+ /* This is a module close */
+ return (arp_modclose(q->q_ptr));
+ }
+ qprocsoff(q);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+ return (0);
+}
+
/*
 * Module close of the ARP stream. Serializes with IP by entering the
 * ill's ipsq when possible; otherwise waits for the ill to finish its
 * own unplumb. Then marks the arl condemned, drains deferred DLPI
 * messages, quiesces the data path, unbinds from the driver, and
 * finally detaches the arl from the shared arl_ill_common_t and frees
 * everything via arp_mod_close_tail().
 */
static int
arp_modclose(arl_t *arl)
{
	arl_ill_common_t *ai = arl->arl_common;
	ill_t		*ill;
	queue_t		*q = arl->arl_rq;
	mblk_t		*mp, *nextmp;
	ipsq_t		*ipsq = NULL;

	ill = arl_to_ill(arl);
	if (ill != NULL) {
		if (!ill_waiter_inc(ill)) {
			ill_refrele(ill);
		} else {
			ill_refrele(ill);
			if (ipsq_enter(ill, B_FALSE, NEW_OP))
				ipsq = ill->ill_phyint->phyint_ipsq;
			ill_waiter_dcr(ill);
		}
		if (ipsq == NULL) {
			/*
			 * could not enter the ipsq because ill is already
			 * marked CONDEMNED.
			 */
			ill = NULL;
		}
	}
	if (ai != NULL && ipsq == NULL) {
		/*
		 * Either we did not get an ill because it was marked CONDEMNED
		 * or we could not enter the ipsq because it was unplumbing.
		 * In both cases, wait for the ill to complete ip_modclose().
		 *
		 * If the arp_modclose happened even before SLIFNAME, the ai
		 * itself would be NULL, in which case we can complete the close
		 * without waiting.
		 */
		mutex_enter(&ai->ai_lock);
		while (ai->ai_ill != NULL)
			cv_wait(&ai->ai_ill_unplumb_done, &ai->ai_lock);
		mutex_exit(&ai->ai_lock);
	}
	ASSERT(ill == NULL || IAM_WRITER_ILL(ill));

	mutex_enter(&arl->arl_lock);
	/*
	 * If the ill had completed unplumbing before arp_modclose(), there
	 * would be no ill (and therefore, no ipsq) to serialize arp_modclose()
	 * so that we need to explicitly check for ARL_CONDEMNED and back off
	 * if it is set.
	 */
	if ((arl->arl_state_flags & ARL_CONDEMNED) != 0) {
		mutex_exit(&arl->arl_lock);
		ASSERT(ipsq == NULL);
		return (0);
	}
	arl->arl_state_flags |= ARL_CONDEMNED;

	/*
	 * send out all pending dlpi messages, don't wait for the ack (which
	 * will be ignored in arp_rput when CONDEMNED is set)
	 *
	 * We have to check for pending DL_UNBIND_REQ because, in the case
	 * that ip_modclose() executed before arp_modclose(), the call to
	 * ill_delete_tail->ipif_arp_down() would have triggered a
	 * DL_UNBIND_REQ. When arp_modclose() executes ipsq_enter() will fail
	 * (since ip_modclose() is in the ipsq) but the DL_UNBIND_ACK may not
	 * have been processed yet. In this scenario, we cannot reset
	 * arl_dlpi_pending, because the setting/clearing of arl_state_flags
	 * related to unbind, and the associated cv_waits must be allowed to
	 * continue.
	 */
	if (arl->arl_dlpi_pending != DL_UNBIND_REQ)
		arl->arl_dlpi_pending = DL_PRIM_INVAL;
	mp = arl->arl_dlpi_deferred;
	arl->arl_dlpi_deferred = NULL;
	mutex_exit(&arl->arl_lock);

	/* Flush the deferred queue straight to the driver. */
	for (; mp != NULL; mp = nextmp) {
		nextmp = mp->b_next;
		mp->b_next = NULL;
		putnext(arl->arl_wq, mp);
	}

	/* Wait for data paths to quiesce */
	mutex_enter(&arl->arl_lock);
	while (arl->arl_refcnt != 0)
		cv_wait(&arl->arl_cv, &arl->arl_lock);

	/*
	 * unbind, so that nothing else can come up from driver.
	 */
	mp = arl_unbind(arl);
	mutex_exit(&arl->arl_lock);
	if (mp != NULL)
		arp_dlpi_send(arl, mp);
	mutex_enter(&arl->arl_lock);

	/* wait for unbind ack  */
	while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS)
		cv_wait(&arl->arl_cv, &arl->arl_lock);
	mutex_exit(&arl->arl_lock);

	qprocsoff(q);

	if (ill != NULL) {
		mutex_enter(&ill->ill_lock);
		ill->ill_arl_dlpi_pending = 0;
		mutex_exit(&ill->ill_lock);
	}

	if (ai != NULL) {
		mutex_enter(&ai->ai_lock);
		ai->ai_arl = NULL;
		if (ai->ai_ill == NULL) {
			/* Last one out frees the shared structure. */
			mutex_destroy(&ai->ai_lock);
			kmem_free(ai, sizeof (*ai));
		} else {
			mutex_exit(&ai->ai_lock);
		}
	}

	/* free up the rest */
	arp_mod_close_tail(arl);

	q->q_ptr = WR(q)->q_ptr = NULL;

	if (ipsq != NULL)
		ipsq_exit(ipsq);

	return (0);
}
+
/*
 * Final teardown of the arl: unlink it from the per-stack instance list,
 * release the credential, free all retained control messages, and free
 * the arl itself.
 */
static void
arp_mod_close_tail(arl_t *arl)
{
	ip_stack_t	*ipst = arl->arl_ipst;
	mblk_t		**mpp;

	/* Keep the netstack alive until we are completely unlinked. */
	netstack_hold(ipst->ips_netstack);

	mutex_enter(&ipst->ips_ip_mi_lock);
	mi_close_unlink(&ipst->ips_arp_g_head, (IDP)arl);
	mutex_exit(&ipst->ips_ip_mi_lock);

	/*
	 * credp could be null if the open didn't succeed and ip_modopen
	 * itself calls ip_close.
	 */
	if (arl->arl_credp != NULL)
		crfree(arl->arl_credp);

	/* Free all retained control messages. */
	mpp = &arl->arl_first_mp_to_free;
	do {
		/*
		 * Walk the contiguous range of retained-mblk pointers from
		 * arl_first_mp_to_free through arl_last_mp_to_free, freeing
		 * each chain after clearing its b_next/b_prev linkage.
		 */
		while (mpp[0]) {
			mblk_t  *mp;
			mblk_t  *mp1;

			mp = mpp[0];
			mpp[0] = mp->b_next;
			for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) {
				mp1->b_next = NULL;
				mp1->b_prev = NULL;
			}
			freemsg(mp);
		}
	} while (mpp++ != &arl->arl_last_mp_to_free);

	netstack_rele(ipst->ips_netstack);
	mi_free(arl->arl_name);
	mi_close_free((IDP)arl);
}
+
+/*
+ * DAD failed. Tear down ipifs with the specified srce address. Note that
+ * tearing down the ipif also meas deleting the ncec through ipif_down,
+ * so it is not possible to use nce_timer for recovery. Instead we start
+ * a timer on the ipif. Caller has to free the mp.
+ */
+void
+arp_failure(mblk_t *mp, ip_recv_attr_t *ira)
+{
+ ill_t *ill = ira->ira_ill;
+
+ if ((mp = copymsg(mp)) != NULL) {
+ ill_refhold(ill);
+ qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE);
+ }
+}
+
+/*
+ * This is for exclusive changes due to ARP. Tear down an interface due
+ * to AR_CN_FAILED and AR_CN_BOGON.
+ */
+/* ARGSUSED */
+static void
+arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg)
+{
+ ill_t *ill = rq->q_ptr;
+ arh_t *arh;
+ ipaddr_t src;
+ ipif_t *ipif;
+ ip_stack_t *ipst = ill->ill_ipst;
+ uchar_t *haddr;
+ uint_t haddrlen;
+
+ /* first try src = ar$spa */
+ arh = (arh_t *)mp->b_rptr;
+ bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN);
+
+ haddrlen = arh->arh_hlen;
+ haddr = (uint8_t *)(arh + 1);
+
+ if (haddrlen == ill->ill_phys_addr_length) {
+ /*
+ * Ignore conflicts generated by misbehaving switches that
+ * just reflect our own messages back to us. For IPMP, we may
+ * see reflections across any ill in the illgrp.
+ */
+ /* For an under ill_grp can change under lock */
+ rw_enter(&ipst->ips_ill_g_lock, RW_READER);
+ if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 ||
+ IS_UNDER_IPMP(ill) && ill->ill_grp != NULL &&
+ ipmp_illgrp_find_ill(ill->ill_grp, haddr,
+ haddrlen) != NULL) {
+ rw_exit(&ipst->ips_ill_g_lock);
+ goto ignore_conflict;
+ }
+ rw_exit(&ipst->ips_ill_g_lock);
+ }
+
+ /*
+ * Look up the appropriate ipif.
+ */
+ ipif = ipif_lookup_addr(src, ill, ALL_ZONES, ipst);
+ if (ipif == NULL)
+ goto ignore_conflict;
+
+ /* Reload the ill to match the ipif */
+ ill = ipif->ipif_ill;
+
+ /* If it's already duplicate or ineligible, then don't do anything. */
+ if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) {
+ ipif_refrele(ipif);
+ goto ignore_conflict;
+ }
+
+ /*
+ * If we failed on a recovery probe, then restart the timer to
+ * try again later.
+ */
+ if (!ipif->ipif_was_dup) {
+ char hbuf[MAC_STR_LEN];
+ char sbuf[INET_ADDRSTRLEN];
+ char ibuf[LIFNAMSIZ];
+
+ (void) mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf));
+ (void) ip_dot_addr(src, sbuf);
+ ipif_get_name(ipif, ibuf, sizeof (ibuf));
+
+ cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);"
+ " disabled", ibuf, sbuf, hbuf);
+ }
+ mutex_enter(&ill->ill_lock);
+ ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE));
+ ipif->ipif_flags |= IPIF_DUPLICATE;
+ ill->ill_ipif_dup_count++;
+ mutex_exit(&ill->ill_lock);
+ (void) ipif_down(ipif, NULL, NULL);
+ (void) ipif_down_tail(ipif);
+ mutex_enter(&ill->ill_lock);
+ if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) &&
+ ill->ill_net_type == IRE_IF_RESOLVER &&
+ !(ipif->ipif_state_flags & IPIF_CONDEMNED) &&
+ ipst->ips_ip_dup_recovery > 0) {
+ ASSERT(ipif->ipif_recovery_id == 0);
+ ipif->ipif_recovery_id = timeout(ipif_dup_recovery,
+ ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery));
+ }
+ mutex_exit(&ill->ill_lock);
+ ipif_refrele(ipif);
+
+ignore_conflict:
+ freemsg(mp);
+}
+
+/*
+ * This is a place for a dtrace hook.
+ * Note that mp can be either the DL_UNITDATA_IND with a b_cont payload,
+ * or just the ARP packet payload as an M_DATA.
+ */
+/* ARGSUSED */
+static void
+arp_drop_packet(const char *str, mblk_t *mp, ill_t *ill)
+{
+ freemsg(mp);
+}
+
+static boolean_t
+arp_over_driver(queue_t *q)
+{
+ queue_t *qnext = STREAM(q)->sd_wrq->q_next;
+
+ /*
+ * check if first module below stream head is IP or UDP.
+ */
+ ASSERT(qnext != NULL);
+ if (strcmp(Q2NAME(qnext), "ip") != 0 &&
+ strcmp(Q2NAME(qnext), "udp") != 0) {
+ /*
+ * module below is not ip or udp, so arp has been pushed
+ * on the driver.
+ */
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static int
+arp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
+{
+ int err;
+
+ ASSERT(sflag & MODOPEN);
+ if (!arp_over_driver(q)) {
+ q->q_qinfo = dummymodinfo.st_rdinit;
+ WR(q)->q_qinfo = dummymodinfo.st_wrinit;
+ return ((*dummymodinfo.st_rdinit->qi_qopen)(q, devp, flag,
+ sflag, credp));
+ }
+ err = arp_modopen(q, devp, flag, sflag, credp);
+ return (err);
+}
+
+/*
+ * In most cases we must be a writer on the IP stream before coming to
+ * arp_dlpi_send(), to serialize DLPI sends to the driver. The exceptions
+ * when we are not a writer are very early duing initialization (in
+ * arl_init, before the arl has done a SLIFNAME, so that we don't yet know
+ * the associated ill) or during arp_mod_close, when we could not enter the
+ * ipsq because the ill has already unplumbed.
+ */
+static void
+arp_dlpi_send(arl_t *arl, mblk_t *mp)
+{
+ mblk_t **mpp;
+ t_uscalar_t prim;
+ arl_ill_common_t *ai;
+
+ ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO);
+
+#ifdef DEBUG
+ ai = arl->arl_common;
+ if (ai != NULL) {
+ mutex_enter(&ai->ai_lock);
+ if (ai->ai_ill != NULL)
+ ASSERT(IAM_WRITER_ILL(ai->ai_ill));
+ mutex_exit(&ai->ai_lock);
+ }
+#endif /* DEBUG */
+
+ mutex_enter(&arl->arl_lock);
+ if (arl->arl_dlpi_pending != DL_PRIM_INVAL) {
+ /* Must queue message. Tail insertion */
+ mpp = &arl->arl_dlpi_deferred;
+ while (*mpp != NULL)
+ mpp = &((*mpp)->b_next);
+
+ *mpp = mp;
+ mutex_exit(&arl->arl_lock);
+ return;
+ }
+ mutex_exit(&arl->arl_lock);
+ if ((prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive)
+ == DL_BIND_REQ) {
+ ASSERT((arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) == 0);
+ }
+ /*
+ * No need to take the arl_lock to examine ARL_CONDEMNED at this point
+ * because the only thread that can see ARL_CONDEMNED here is the
+ * closing arp_modclose() thread which sets the flag after becoming a
+ * writer on the ipsq. Threads from IP must have finished and
+ * cannot be active now.
+ */
+ if (!(arl->arl_state_flags & ARL_CONDEMNED) ||
+ (prim == DL_UNBIND_REQ)) {
+ if (prim != DL_NOTIFY_CONF) {
+ ill_t *ill = arl_to_ill(arl);
+
+ arl->arl_dlpi_pending = prim;
+ if (ill != NULL) {
+ mutex_enter(&ill->ill_lock);
+ ill->ill_arl_dlpi_pending = 1;
+ mutex_exit(&ill->ill_lock);
+ ill_refrele(ill);
+ }
+ }
+ }
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_dlpi_send",
+ char *, dl_primstr(prim), char *, "-", arl_t *, arl);
+ putnext(arl->arl_wq, mp);
+}
+
+static void
+arl_defaults_common(arl_t *arl, mblk_t *mp)
+{
+ dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr;
+ /*
+ * Till the ill is fully up the ill is not globally visible.
+ * So no need for a lock.
+ */
+ arl->arl_mactype = dlia->dl_mac_type;
+ arl->arl_sap_length = dlia->dl_sap_length;
+
+ if (!arl->arl_dlpi_style_set) {
+ if (dlia->dl_provider_style == DL_STYLE2)
+ arl->arl_needs_attach = 1;
+ mutex_enter(&arl->arl_lock);
+ ASSERT(arl->arl_dlpi_style_set == 0);
+ arl->arl_dlpi_style_set = 1;
+ arl->arl_state_flags &= ~ARL_LL_SUBNET_PENDING;
+ cv_broadcast(&arl->arl_cv);
+ mutex_exit(&arl->arl_lock);
+ }
+}
+
+int
+arl_init(queue_t *q, arl_t *arl)
+{
+ mblk_t *info_mp;
+ dl_info_req_t *dlir;
+
+ /* subset of ill_init */
+ mutex_init(&arl->arl_lock, NULL, MUTEX_DEFAULT, 0);
+
+ arl->arl_rq = q;
+ arl->arl_wq = WR(q);
+
+ info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)),
+ BPRI_HI);
+ if (info_mp == NULL)
+ return (ENOMEM);
+ /*
+ * allocate sufficient space to contain device name.
+ */
+ arl->arl_name = (char *)(mi_zalloc(2 * LIFNAMSIZ));
+ arl->arl_ppa = UINT_MAX;
+ arl->arl_state_flags |= (ARL_LL_SUBNET_PENDING | ARL_LL_UNBOUND);
+
+ /* Send down the Info Request to the driver. */
+ info_mp->b_datap->db_type = M_PCPROTO;
+ dlir = (dl_info_req_t *)info_mp->b_rptr;
+ info_mp->b_wptr = (uchar_t *)&dlir[1];
+ dlir->dl_primitive = DL_INFO_REQ;
+ arl->arl_dlpi_pending = DL_PRIM_INVAL;
+ qprocson(q);
+
+ arp_dlpi_send(arl, info_mp);
+ return (0);
+}
+
+int
+arl_wait_for_info_ack(arl_t *arl)
+{
+ int err;
+
+ mutex_enter(&arl->arl_lock);
+ while (arl->arl_state_flags & ARL_LL_SUBNET_PENDING) {
+ /*
+ * Return value of 0 indicates a pending signal.
+ */
+ err = cv_wait_sig(&arl->arl_cv, &arl->arl_lock);
+ if (err == 0) {
+ mutex_exit(&arl->arl_lock);
+ return (EINTR);
+ }
+ }
+ mutex_exit(&arl->arl_lock);
+ /*
+ * ip_rput_other could have set an error in ill_error on
+ * receipt of M_ERROR.
+ */
+ return (arl->arl_error);
+}
+
+void
+arl_set_muxid(ill_t *ill, int muxid)
+{
+ arl_t *arl;
+
+ arl = ill_to_arl(ill);
+ if (arl != NULL) {
+ arl->arl_muxid = muxid;
+ arl_refrele(arl);
+ }
+}
+
+int
+arl_get_muxid(ill_t *ill)
+{
+ arl_t *arl;
+ int muxid = 0;
+
+ arl = ill_to_arl(ill);
+ if (arl != NULL) {
+ muxid = arl->arl_muxid;
+ arl_refrele(arl);
+ }
+ return (muxid);
+}
+
/*
 * Module open of the ARP stream: allocate and initialize the arl_t, send
 * the initial DL_INFO_REQ, wait for the DL_INFO_ACK, and link the new
 * instance into the per-stack list. On failure the partially set up
 * instance is torn down via arp_close().
 */
static int
arp_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
{
	int	err;
	zoneid_t zoneid;
	netstack_t *ns;
	ip_stack_t *ipst;
	arl_t	*arl = NULL;

	/*
	 * Prevent unprivileged processes from pushing IP so that
	 * they can't send raw IP.
	 */
	if (secpolicy_net_rawaccess(credp) != 0)
		return (EPERM);

	ns = netstack_find_by_cred(credp);
	ASSERT(ns != NULL);
	ipst = ns->netstack_ip;
	ASSERT(ipst != NULL);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * to make IP operate as if in the global zone.
	 */
	if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	else
		zoneid = crgetzoneid(credp);

	arl = (arl_t *)mi_open_alloc_sleep(sizeof (arl_t));
	q->q_ptr = WR(q)->q_ptr = arl;
	arl->arl_ipst = ipst;
	arl->arl_zoneid = zoneid;
	err = arl_init(q, arl);

	if (err != 0) {
		/* arl_init failed before qprocson: free by hand. */
		mi_free(arl->arl_name);
		mi_free(arl);
		netstack_rele(ipst->ips_netstack);
		q->q_ptr = NULL;
		WR(q)->q_ptr = NULL;
		return (err);
	}

	/*
	 * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent.
	 */
	err = arl_wait_for_info_ack(arl);
	if (err == 0)
		arl->arl_credp = credp;
	else
		goto fail;

	/* Held until arp_mod_close_tail() releases it via crfree(). */
	crhold(credp);

	mutex_enter(&ipst->ips_ip_mi_lock);
	err = mi_open_link(&ipst->ips_arp_g_head, (IDP)q->q_ptr, devp, flag,
	    sflag, credp);
	mutex_exit(&ipst->ips_ip_mi_lock);
fail:
	if (err) {
		/* Full teardown path; also releases the netstack. */
		(void) arp_close(q, 0);
		return (err);
	}
	return (0);
}
+
+/*
+ * Notify any downstream modules (esp softmac and hitbox) of the name
+ * of this interface using an M_CTL.
+ */
+static void
+arp_ifname_notify(arl_t *arl)
+{
+ mblk_t *mp1, *mp2;
+ struct iocblk *iocp;
+ struct lifreq *lifr;
+
+ if ((mp1 = mkiocb(SIOCSLIFNAME)) == NULL)
+ return;
+ if ((mp2 = allocb(sizeof (struct lifreq), BPRI_HI)) == NULL) {
+ freemsg(mp1);
+ return;
+ }
+
+ lifr = (struct lifreq *)mp2->b_rptr;
+ mp2->b_wptr += sizeof (struct lifreq);
+ bzero(lifr, sizeof (struct lifreq));
+
+ (void) strncpy(lifr->lifr_name, arl->arl_name, LIFNAMSIZ);
+ lifr->lifr_ppa = arl->arl_ppa;
+ lifr->lifr_flags = ILLF_IPV4;
+
+ /* Use M_CTL to avoid confusing anyone else who might be listening. */
+ DB_TYPE(mp1) = M_CTL;
+ mp1->b_cont = mp2;
+ iocp = (struct iocblk *)mp1->b_rptr;
+ iocp->ioc_count = msgsize(mp1->b_cont);
+ DTRACE_PROBE4(arl__dlpi, char *, "arp_ifname_notify",
+ char *, "SIOCSLIFNAME", char *, "-", arl_t *, arl);
+ putnext(arl->arl_wq, mp1);
+}
+
+void
+arp_send_replumb_conf(ill_t *ill)
+{
+ mblk_t *mp;
+ arl_t *arl = ill_to_arl(ill);
+
+ if (arl == NULL)
+ return;
+ /*
+ * arl_got_replumb and arl_got_unbind to be cleared after we complete
+ * arp_cmd_done.
+ */
+ mp = mexchange(NULL, NULL, sizeof (dl_notify_conf_t), M_PROTO,
+ DL_NOTIFY_CONF);
+ ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification =
+ DL_NOTE_REPLUMB_DONE;
+ arp_dlpi_send(arl, mp);
+ mutex_enter(&arl->arl_lock);
+ arl->arl_state_flags &= ~ARL_LL_REPLUMBING;
+ mutex_exit(&arl->arl_lock);
+ arl_refrele(arl);
+}
+
+/*
+ * The unplumb code paths call arp_unbind_complete() to make sure that it is
+ * safe to tear down the ill. We wait for DL_UNBIND_ACK to complete, and also
+ * for the arl_refcnt to fall to one so that, when we return from
+ * arp_unbind_complete(), we know for certain that there are no threads in
+ * arp_rput() that might access the arl_ill.
+ */
+void
+arp_unbind_complete(ill_t *ill)
+{
+ arl_t *arl = ill_to_arl(ill);
+
+ if (arl == NULL)
+ return;
+ mutex_enter(&arl->arl_lock);
+ /*
+ * wait for unbind ack and arl_refcnt to drop to 1. Note that the
+ * quiescent arl_refcnt for this function is 1 (and not 0) because
+ * ill_to_arl() will itself return after taking a ref on the arl_t.
+ */
+ while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+ while (arl->arl_refcnt != 1)
+ cv_wait(&arl->arl_cv, &arl->arl_lock);
+ mutex_exit(&arl->arl_lock);
+ arl_refrele(arl);
+}