diff options
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_arp.c')
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_arp.c | 2468 |
1 files changed, 2468 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_arp.c b/usr/src/uts/common/inet/ip/ip_arp.c new file mode 100644 index 0000000000..489d59dbf6 --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_arp.c @@ -0,0 +1,2468 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <inet/ip_arp.h> +#include <inet/ip_ndp.h> +#include <net/if_arp.h> +#include <netinet/if_ether.h> +#include <sys/strsubr.h> +#include <inet/ip6.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip_if.h> +#include <sys/dlpi.h> +#include <sys/sunddi.h> +#include <sys/strsun.h> +#include <sys/sdt.h> +#include <inet/mi.h> +#include <inet/arp.h> +#include <inet/ipdrop.h> +#include <sys/sockio.h> +#include <inet/ip_impl.h> +#include <sys/policy.h> + +#define ARL_LL_ADDR_OFFSET(arl) (((arl)->arl_sap_length) < 0 ? \ + (sizeof (dl_unitdata_req_t)) : \ + ((sizeof (dl_unitdata_req_t)) + (ABS((arl)->arl_sap_length)))) + +/* + * MAC-specific intelligence. Shouldn't be needed, but the DL_INFO_ACK + * doesn't quite do it for us. + */ +typedef struct arp_m_s { + t_uscalar_t arp_mac_type; + uint32_t arp_mac_arp_hw_type; + t_scalar_t arp_mac_sap_length; + uint32_t arp_mac_hw_addr_length; +} arp_m_t; + +static int arp_close(queue_t *, int); +static void arp_rput(queue_t *, mblk_t *); +static void arp_wput(queue_t *, mblk_t *); +static arp_m_t *arp_m_lookup(t_uscalar_t mac_type); +static void arp_notify(ipaddr_t, mblk_t *, uint32_t, ip_recv_attr_t *, + ncec_t *); +static int arp_output(ill_t *, uint32_t, const uchar_t *, const uchar_t *, + const uchar_t *, const uchar_t *, uchar_t *); +static int arp_modclose(arl_t *); +static void arp_mod_close_tail(arl_t *); +static mblk_t *arl_unbind(arl_t *); +static void arp_process_packet(ill_t *, mblk_t *); +static void arp_excl(ipsq_t *, queue_t *, mblk_t *, void *); +static void arp_drop_packet(const char *str, mblk_t *, ill_t *); +static int arp_open(queue_t *, dev_t *, int, int, cred_t *); +static int ip_sioctl_ifunitsel_arp(queue_t *, int *); +static int ip_sioctl_slifname_arp(queue_t *, void *); +static void arp_dlpi_send(arl_t *, mblk_t *); +static void arl_defaults_common(arl_t *, mblk_t *); +static int arp_modopen(queue_t *, dev_t *, int, int, cred_t *); +static void arp_ifname_notify(arl_t *); +static void arp_rput_dlpi_writer(ipsq_t *, queue_t *, mblk_t *, void *); +static arl_t *ill_to_arl(ill_t *); + +#define DL_PRIM(mp) (((union DL_primitives *)(mp)->b_rptr)->dl_primitive) +#define IS_DLPI_DATA(mp) \ + ((DB_TYPE(mp) == M_PROTO) && \ + MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && \ + (DL_PRIM(mp) == DL_UNITDATA_IND)) + +#define AR_NOTFOUND 1 /* No matching ace found in cache */ +#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ +#define AR_LOOPBACK 3 /* Our own arp packet was received */ +#define AR_BOGON 4 /* Another host has our IP addr. */ +#define AR_FAILED 5 /* Duplicate Address Detection has failed */ +#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ + +boolean_t arp_no_defense; + +struct module_info arp_mod_info = { + IP_MOD_ID, "arpip", 1, INFPSZ, 65536, 1024 +}; +static struct qinit rinit_arp = { + (pfi_t)arp_rput, NULL, arp_open, arp_close, NULL, &arp_mod_info +}; +static struct qinit winit_arp = { + (pfi_t)arp_wput, NULL, arp_open, arp_close, NULL, + &arp_mod_info +}; +struct streamtab arpinfo = { + &rinit_arp, &winit_arp +}; +#define ARH_FIXED_LEN 8 +#define AR_LL_HDR_SLACK 32 + +/* + * pfhooks for ARP. + */ +#define ARP_HOOK_IN(_hook, _event, _ilp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = _ilp; \ + info.hpe_ofp = 0; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +#define ARP_HOOK_OUT(_hook, _event, _olp, _hdr, _fm, _m, ipst) \ + \ + if ((_hook).he_interested) { \ + hook_pkt_event_t info; \ + \ + info.hpe_protocol = ipst->ips_arp_net_data; \ + info.hpe_ifp = 0; \ + info.hpe_ofp = _olp; \ + info.hpe_hdr = _hdr; \ + info.hpe_mp = &(_fm); \ + info.hpe_mb = _m; \ + if (hook_run(ipst->ips_arp_net_data->netd_hooks, \ + _event, (hook_data_t)&info) != 0) { \ + if (_fm != NULL) { \ + freemsg(_fm); \ + _fm = NULL; \ + } \ + _hdr = NULL; \ + _m = NULL; \ + } else { \ + _hdr = info.hpe_hdr; \ + _m = info.hpe_mb; \ + } \ + } + +static arp_m_t arp_m_tbl[] = { + { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ + { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ + { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ + { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ + { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */ + { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ + { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ + { DL_OTHER, ARPHRD_ETHER, -2, 6} /* unknown */ +}; + +static void +arl_refhold_locked(arl_t *arl) +{ + ASSERT(MUTEX_HELD(&arl->arl_lock)); + arl->arl_refcnt++; + ASSERT(arl->arl_refcnt != 0); +} + +static void +arl_refrele(arl_t *arl) +{ + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_refcnt != 0); + arl->arl_refcnt--; + if (arl->arl_refcnt > 1) { + mutex_exit(&arl->arl_lock); + return; + } + + /* ill_close or arp_unbind_complete may be waiting */ + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); +} + +/* + * wake up any pending ip ioctls. + */ +static void +arp_cmd_done(ill_t *ill, int err, t_uscalar_t lastprim) +{ + if (lastprim == DL_UNBIND_REQ && ill->ill_replumbing) + arp_replumb_done(ill, 0); + else + arp_bringup_done(ill, err); +} + +static int +ip_nce_resolve_all(ill_t *ill, uchar_t *src_haddr, uint32_t hlen, + const in_addr_t *src_paddr, ncec_t **sncec, int op) +{ + int retv; + ncec_t *ncec; + boolean_t ll_changed; + uchar_t *lladdr = NULL; + int new_state; + + ASSERT(ill != NULL); + + ncec = ncec_lookup_illgrp_v4(ill, src_paddr); + *sncec = ncec; + + if (ncec == NULL) { + retv = AR_NOTFOUND; + goto done; + } + + mutex_enter(&ncec->ncec_lock); + /* + * IP addr and hardware address match what we already + * have, then this is a broadcast packet emitted by one of our + * interfaces, reflected by the switch and received on another + * interface. We return AR_LOOPBACK. + */ + lladdr = ncec->ncec_lladdr; + if (NCE_MYADDR(ncec) && hlen == ncec->ncec_ill->ill_phys_addr_length && + bcmp(lladdr, src_haddr, hlen) == 0) { + mutex_exit(&ncec->ncec_lock); + retv = AR_LOOPBACK; + goto done; + } + /* + * If the entry is unverified, then we've just verified that + * someone else already owns this address, because this is a + * message with the same protocol address but different + * hardware address. + */ + if (ncec->ncec_flags & NCE_F_UNVERIFIED) { + mutex_exit(&ncec->ncec_lock); + ncec_delete(ncec); + ncec_refrele(ncec); + *sncec = NULL; + retv = AR_FAILED; + goto done; + } + + /* + * If the IP address matches ours and we're authoritative for + * this entry, then some other node is using our IP addr, so + * return AR_BOGON. Also reset the transmit count to zero so + * that, if we're currently in initial announcement mode, we + * switch back to the lazier defense mode. Knowing that + * there's at least one duplicate out there, we ought not + * blindly announce. + * + * NCE_F_AUTHORITY is set in one of two ways: + * 1. /sbin/arp told us so, via the "permanent" flag. + * 2. This is one of my addresses. + */ + if (ncec->ncec_flags & NCE_F_AUTHORITY) { + ncec->ncec_unsolicit_count = 0; + mutex_exit(&ncec->ncec_lock); + retv = AR_BOGON; + goto done; + } + + /* + * No address conflict was detected, and we are getting + * ready to update the ncec's hwaddr. The nce MUST NOT be on an + * under interface, because all dynamic nce's are created on the + * native interface (in the non-IPMP case) or on the IPMP + * meta-interface (in the IPMP case) + */ + ASSERT(!IS_UNDER_IPMP(ncec->ncec_ill)); + + /* + * update ncec with src_haddr, hlen. + * + * We are trying to resolve this ncec_addr/src_paddr and we + * got a REQUEST/RESPONSE from the ncec_addr/src_paddr. + * So the new_state is at least "STALE". If, in addition, + * this a solicited, unicast ARP_RESPONSE, we can transition + * to REACHABLE. + */ + new_state = ND_STALE; + ip1dbg(("got info for ncec %p from addr %x\n", + (void *)ncec, *src_paddr)); + retv = AR_MERGED; + if (ncec->ncec_state == ND_INCOMPLETE || + ncec->ncec_state == ND_INITIAL) { + ll_changed = B_TRUE; + } else { + ll_changed = nce_cmp_ll_addr(ncec, src_haddr, hlen); + if (!ll_changed) + new_state = ND_UNCHANGED; + else + retv = AR_CHANGED; + } + /* + * We don't have the equivalent of the IPv6 'S' flag indicating + * a solicited response, so we assume that if we are in + * INCOMPLETE, or got back an unchanged lladdr in PROBE state, + * and this is an ARP_RESPONSE, it must be a + * solicited response allowing us to transtion to REACHABLE. + */ + if (op == ARP_RESPONSE) { + switch (ncec->ncec_state) { + case ND_PROBE: + new_state = (ll_changed ? ND_STALE : ND_REACHABLE); + break; + case ND_INCOMPLETE: + new_state = ND_REACHABLE; + break; + } + } + /* + * Call nce_update() to refresh fastpath information on any + * dependent nce_t entries. + */ + nce_update(ncec, new_state, (ll_changed ? src_haddr : NULL)); + mutex_exit(&ncec->ncec_lock); + nce_resolv_ok(ncec); +done: + return (retv); +} + +/* Find an entry for a particular MAC type in the arp_m_tbl. */ +static arp_m_t * +arp_m_lookup(t_uscalar_t mac_type) +{ + arp_m_t *arm; + + for (arm = arp_m_tbl; arm < A_END(arp_m_tbl); arm++) { + if (arm->arp_mac_type == mac_type) + return (arm); + } + return (NULL); +} + +static uint32_t +arp_hw_type(t_uscalar_t mactype) +{ + arp_m_t *arm; + + if ((arm = arp_m_lookup(mactype)) == NULL) + arm = arp_m_lookup(DL_OTHER); + return (arm->arp_mac_arp_hw_type); +} + +/* + * Called when an DLPI control message has been acked; send down the next + * queued message (if any). + * The DLPI messages of interest being bind, attach and unbind since + * these are the only ones sent by ARP via arp_dlpi_send. + */ +static void +arp_dlpi_done(arl_t *arl, ill_t *ill) +{ + mblk_t *mp; + int err; + t_uscalar_t prim; + + mutex_enter(&arl->arl_lock); + prim = arl->arl_dlpi_pending; + + if ((mp = arl->arl_dlpi_deferred) == NULL) { + arl->arl_dlpi_pending = DL_PRIM_INVAL; + if (arl->arl_state_flags & ARL_LL_DOWN) + err = ENETDOWN; + else + err = 0; + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + arp_cmd_done(ill, err, prim); + return; + } + + arl->arl_dlpi_deferred = mp->b_next; + mp->b_next = NULL; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + + arl->arl_dlpi_pending = DL_PRIM(mp); + mutex_exit(&arl->arl_lock); + + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + + putnext(arl->arl_wq, mp); +} + +/* + * This routine is called during module initialization when the DL_INFO_ACK + * comes back from the device. We set up defaults for all the device dependent + * doo-dads we are going to need. This will leave us ready to roll if we are + * attempting auto-configuration. Alternatively, these defaults can be + * overridden by initialization procedures possessing higher intelligence. + * + * Caller will free the mp. + */ +static void +arp_ll_set_defaults(arl_t *arl, mblk_t *mp) +{ + arp_m_t *arm; + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + + if ((arm = arp_m_lookup(dlia->dl_mac_type)) == NULL) + arm = arp_m_lookup(DL_OTHER); + ASSERT(arm != NULL); + + /* + * We initialize based on parameters in the (currently) not too + * exhaustive arp_m_tbl. + */ + if (dlia->dl_version == DL_VERSION_2) { + arl->arl_sap_length = dlia->dl_sap_length; + arl->arl_phys_addr_length = dlia->dl_brdcst_addr_length; + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + } else { + arl->arl_sap_length = arm->arp_mac_sap_length; + arl->arl_phys_addr_length = arm->arp_mac_hw_addr_length; + } + /* + * Note: the arp_hw_type in the arp header may be derived from + * the ill_mac_type and arp_m_lookup(). + */ + arl->arl_sap = ETHERTYPE_ARP; + arl_defaults_common(arl, mp); +} + +static void +arp_wput(queue_t *q, mblk_t *mp) +{ + int err = EINVAL; + struct iocblk *ioc; + mblk_t *mp1; + + switch (DB_TYPE(mp)) { + case M_IOCTL: + ASSERT(q->q_next != NULL); + ioc = (struct iocblk *)mp->b_rptr; + if (ioc->ioc_cmd != SIOCSLIFNAME && + ioc->ioc_cmd != IF_UNITSEL) { + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput", + char *, "<some ioctl>", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } + if ((mp1 = mp->b_cont) == 0) + err = EINVAL; + else if (ioc->ioc_cmd == SIOCSLIFNAME) + err = ip_sioctl_slifname_arp(q, mp1->b_rptr); + else if (ioc->ioc_cmd == IF_UNITSEL) + err = ip_sioctl_ifunitsel_arp(q, (int *)mp1->b_rptr); + if (err == 0) + miocack(q, mp, 0, 0); + else + miocnak(q, mp, 0, err); + return; + default: + DTRACE_PROBE4(arl__dlpi, char *, "arp_wput default", + char *, "default mblk", char *, "-", + arl_t *, (arl_t *)q->q_ptr); + putnext(q, mp); + return; + } +} + +/* + * similar to ill_dlpi_pending(): verify that the received DLPI response + * matches the one that is pending for the arl. + */ +static boolean_t +arl_dlpi_pending(arl_t *arl, t_uscalar_t prim) +{ + t_uscalar_t pending; + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending == prim) { + mutex_exit(&arl->arl_lock); + return (B_TRUE); + } + + if (arl->arl_state_flags & ARL_CONDEMNED) { + mutex_exit(&arl->arl_lock); + return (B_FALSE); + } + pending = arl->arl_dlpi_pending; + mutex_exit(&arl->arl_lock); + + if (pending == DL_PRIM_INVAL) { + ip0dbg(("arl_dlpi_pending unsolicited ack for %s on %s", + dl_primstr(prim), arl->arl_name)); + } else { + ip0dbg(("arl_dlpi_pending ack for %s on %s expect %s", + dl_primstr(prim), arl->arl_name, dl_primstr(pending))); + } + return (B_FALSE); +} + +/* DLPI messages, other than DL_UNITDATA_IND are handled here. */ +static void +arp_rput_dlpi(queue_t *q, mblk_t *mp) +{ + arl_t *arl = (arl_t *)q->q_ptr; + union DL_primitives *dlp; + t_uscalar_t prim; + t_uscalar_t reqprim = DL_PRIM_INVAL; + ill_t *ill; + + if ((mp->b_wptr - mp->b_rptr) < sizeof (dlp->dl_primitive)) { + putnext(q, mp); + return; + } + dlp = (union DL_primitives *)mp->b_rptr; + prim = dlp->dl_primitive; + + /* + * If we received an ACK but didn't send a request for it, then it + * can't be part of any pending operation; discard up-front. + */ + switch (prim) { + case DL_ERROR_ACK: + /* + * ce is confused about how DLPI works, so we have to interpret + * an "error" on DL_NOTIFY_ACK (which we never could have sent) + * as really meaning an error on DL_NOTIFY_REQ. + * + * Note that supporting DL_NOTIFY_REQ is optional, so printing + * out an error message on the console isn't warranted except + * for debug. + */ + if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || + dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { + reqprim = DL_NOTIFY_REQ; + } else { + reqprim = dlp->error_ack.dl_error_primitive; + } + break; + case DL_INFO_ACK: + reqprim = DL_INFO_REQ; + break; + case DL_OK_ACK: + reqprim = dlp->ok_ack.dl_correct_primitive; + break; + case DL_BIND_ACK: + reqprim = DL_BIND_REQ; + break; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + putnext(q, mp); + return; + } + if (reqprim == DL_PRIM_INVAL || !arl_dlpi_pending(arl, reqprim)) { + freemsg(mp); + return; + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi received", + char *, dl_primstr(prim), char *, dl_primstr(reqprim), + arl_t *, arl); + + ASSERT(prim != DL_NOTIFY_IND); + + ill = arl_to_ill(arl); + + switch (reqprim) { + case DL_INFO_REQ: + /* + * ill has not been set up yet for this case. This is the + * DL_INFO_ACK for the first DL_INFO_REQ sent from + * arp_modopen(). There should be no other arl_dlpi_deferred + * messages pending. We initialize the arl here. + */ + ASSERT(!arl->arl_dlpi_style_set); + ASSERT(arl->arl_dlpi_pending == DL_INFO_REQ); + ASSERT(arl->arl_dlpi_deferred == NULL); + arl->arl_dlpi_pending = DL_PRIM_INVAL; + arp_ll_set_defaults(arl, mp); + freemsg(mp); + return; + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + /* + * This is not an error, so we don't set ARL_LL_DOWN + */ + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + if (arl->arl_state_flags & ARL_CONDEMNED) { + /* + * if this is part of the unplumb the arl may + * vaporize any moment after we cv_signal the + * arl_cv so we reset arl_dlpi_pending here. + * All other cases (including replumb) will + * have the arl_dlpi_pending reset in + * arp_dlpi_done. + */ + arl->arl_dlpi_pending = DL_PRIM_INVAL; + } + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + } + if (ill != NULL) { + /* + * ill ref obtained by arl_to_ill() will be released + * by qwriter_ip() + */ + qwriter_ip(ill, ill->ill_wq, mp, arp_rput_dlpi_writer, + CUR_OP, B_TRUE); + return; + } + freemsg(mp); +} + +/* + * Handling of DLPI messages that require exclusive access to the ipsq. + */ +/* ARGSUSED */ +static void +arp_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) +{ + union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; + ill_t *ill = (ill_t *)q->q_ptr; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) { + /* + * happens as a result arp_modclose triggering unbind. + * arp_rput_dlpi will cv_signal the arl_cv and the modclose + * will complete, but when it does ipsq_exit, the waiting + * qwriter_ip gets into the ipsq but will find the arl null. + * There should be no deferred messages in this case, so + * just complete and exit. + */ + arp_cmd_done(ill, 0, DL_UNBIND_REQ); + freemsg(mp); + return; + } + switch (dlp->dl_primitive) { + case DL_ERROR_ACK: + switch (dlp->error_ack.dl_error_primitive) { + case DL_UNBIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_DL_UNBIND_IN_PROGRESS; + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_UNBOUND; + arl->arl_state_flags |= ARL_LL_DOWN; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_REQ: + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_UP; + arl->arl_state_flags |= ARL_LL_DOWN; + arl->arl_state_flags |= ARL_LL_UNBOUND; + cv_signal(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + break; + case DL_ATTACH_REQ: + break; + default: + /* If it's anything else, we didn't send it. */ + arl_refrele(arl); + putnext(q, mp); + return; + } + break; + case DL_OK_ACK: + DTRACE_PROBE4(arl__dlpi, char *, "arp_rput_dlpi_writer ok", + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + char *, dl_primstr(dlp->ok_ack.dl_correct_primitive), + arl_t *, arl); + mutex_enter(&arl->arl_lock); + switch (dlp->ok_ack.dl_correct_primitive) { + case DL_UNBIND_REQ: + case DL_ATTACH_REQ: + break; + default: + ip0dbg(("Dropping unrecognized DL_OK_ACK for %s", + dl_primstr(dlp->ok_ack.dl_correct_primitive))); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); + freemsg(mp); + return; + } + mutex_exit(&arl->arl_lock); + break; + case DL_BIND_ACK: + DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, + dl_bind_ack_t *, &dlp->bind_ack); + + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_state_flags & ARL_LL_BIND_PENDING); + arl->arl_state_flags &= + ~(ARL_LL_BIND_PENDING|ARL_LL_DOWN|ARL_LL_UNBOUND); + arl->arl_state_flags |= ARL_LL_UP; + mutex_exit(&arl->arl_lock); + break; + case DL_UDERROR_IND: + DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, + dl_uderror_ind_t *, &dlp->uderror_ind); + arl_refrele(arl); + putnext(q, mp); + return; + default: + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); + arl_refrele(arl); + putnext(q, mp); + return; + } + arp_dlpi_done(arl, ill); + arl_refrele(arl); + freemsg(mp); +} + +void +arp_rput(queue_t *q, mblk_t *mp) +{ + arl_t *arl = q->q_ptr; + boolean_t need_refrele = B_FALSE; + + mutex_enter(&arl->arl_lock); + if (((arl->arl_state_flags & + (ARL_CONDEMNED | ARL_LL_REPLUMBING)) != 0)) { + /* + * Only allow high priority DLPI messages during unplumb or + * replumb, and we don't take an arl_refcnt for that case. + */ + if (DB_TYPE(mp) != M_PCPROTO) { + mutex_exit(&arl->arl_lock); + freemsg(mp); + return; + } + } else { + arl_refhold_locked(arl); + need_refrele = B_TRUE; + } + mutex_exit(&arl->arl_lock); + + switch (DB_TYPE(mp)) { + case M_PCPROTO: + case M_PROTO: { + ill_t *ill; + + /* + * could be one of + * (i) real message from the wire, (DLPI_DATA) + * (ii) DLPI message + * Take a ref on the ill associated with this arl to + * prevent the ill from being unplumbed until this thread + * is done. + */ + if (IS_DLPI_DATA(mp)) { + ill = arl_to_ill(arl); + if (ill == NULL) { + arp_drop_packet("No ill", mp, ill); + break; + } + arp_process_packet(ill, mp); + ill_refrele(ill); + break; + } + /* Miscellaneous DLPI messages get shuffled off. */ + arp_rput_dlpi(q, mp); + break; + } + case M_ERROR: + case M_HANGUP: + if (mp->b_rptr < mp->b_wptr) + arl->arl_error = (int)(*mp->b_rptr & 0xFF); + if (arl->arl_error == 0) + arl->arl_error = ENXIO; + freemsg(mp); + break; + default: + ip1dbg(("arp_rput other db type %x\n", DB_TYPE(mp))); + putnext(q, mp); + break; + } + if (need_refrele) + arl_refrele(arl); +} + +static void +arp_process_packet(ill_t *ill, mblk_t *mp) +{ + mblk_t *mp1; + arh_t *arh; + in_addr_t src_paddr, dst_paddr; + uint32_t hlen, plen; + boolean_t is_probe; + int op; + ncec_t *dst_ncec, *src_ncec = NULL; + uchar_t *src_haddr, *arhp, *dst_haddr, *dp, *sp; + int err; + ip_stack_t *ipst; + boolean_t need_ill_refrele = B_FALSE; + nce_t *nce; + uchar_t *src_lladdr; + dl_unitdata_ind_t *dlui; + ip_recv_attr_t iras; + + ASSERT(ill != NULL); + if (ill->ill_flags & ILLF_NOARP) { + arp_drop_packet("Interface does not support ARP", mp, ill); + return; + } + ipst = ill->ill_ipst; + /* + * What we should have at this point is a DL_UNITDATA_IND message + * followed by an ARP packet. We do some initial checks and then + * get to work. + */ + dlui = (dl_unitdata_ind_t *)mp->b_rptr; + if (dlui->dl_group_address == 1) { + /* + * multicast or broadcast packet. Only accept on the ipmp + * nominated interface for multicasts ('cast_ill'). + * If we have no cast_ill we are liberal and accept everything. + */ + if (IS_UNDER_IPMP(ill)) { + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (!ill->ill_nom_cast && ill->ill_grp != NULL && + ill->ill_grp->ig_cast_ill != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + arp_drop_packet("Interface is not nominated " + "for multicast sends and receives", + mp, ill); + return; + } + rw_exit(&ipst->ips_ill_g_lock); + } + } + mp1 = mp->b_cont; + if (mp1 == NULL) { + arp_drop_packet("Missing ARP packet", mp, ill); + return; + } + if (mp1->b_cont != NULL) { + /* No fooling around with funny messages. */ + if (!pullupmsg(mp1, -1)) { + arp_drop_packet("Funny message: pullup failed", + mp, ill); + return; + } + } + arh = (arh_t *)mp1->b_rptr; + hlen = arh->arh_hlen; + plen = arh->arh_plen; + if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { + arp_drop_packet("mblk len too small", mp, ill); + return; + } + /* + * hlen 0 is used for RFC 1868 UnARP. + * + * Note that the rest of the code checks that hlen is what we expect + * for this hardware address type, so might as well discard packets + * here that don't match. + */ + if ((hlen > 0 && hlen != ill->ill_phys_addr_length) || plen == 0) { + DTRACE_PROBE2(rput_bogus, ill_t *, ill, mblk_t *, mp1); + arp_drop_packet("Bogus hlen or plen", mp, ill); + return; + } + /* + * Historically, Solaris has been lenient about hardware type numbers. + * We should check here, but don't. + */ + DTRACE_PROBE3(arp__physical__in__start, ill_t *, ill, arh_t *, arh, + mblk_t *, mp); + /* + * If ill is in an ipmp group, it will be the under ill. If we want + * to report the packet as coming up the IPMP interface, we should + * convert it to the ipmp ill. + */ + ARP_HOOK_IN(ipst->ips_arp_physical_in_event, ipst->ips_arp_physical_in, + ill->ill_phyint->phyint_ifindex, arh, mp, mp1, ipst); + DTRACE_PROBE1(arp__physical__in__end, mblk_t *, mp); + if (mp == NULL) + return; + arhp = (uchar_t *)arh + ARH_FIXED_LEN; + src_haddr = arhp; /* ar$sha */ + arhp += hlen; + bcopy(arhp, &src_paddr, IP_ADDR_LEN); /* ar$spa */ + sp = arhp; + arhp += IP_ADDR_LEN; + dst_haddr = arhp; /* ar$dha */ + arhp += hlen; + bcopy(arhp, &dst_paddr, IP_ADDR_LEN); /* ar$tpa */ + dp = arhp; + op = BE16_TO_U16(arh->arh_operation); + + DTRACE_PROBE2(ip__arp__input, (in_addr_t), src_paddr, + (in_addr_t), dst_paddr); + + /* Determine if this is just a probe */ + is_probe = (src_paddr == INADDR_ANY); + + /* + * ira_ill is the only field used down the arp_notify path. + */ + bzero(&iras, sizeof (iras)); + iras.ira_ill = iras.ira_rill = ill; + /* + * RFC 826: first check if the <protocol, sender protocol address> is + * in the cache, if there is a sender protocol address. Note that this + * step also handles resolutions based on source. + */ + /* Note: after here we need to freeb(mp) and freemsg(mp1) separately */ + mp->b_cont = NULL; + if (is_probe) { + err = AR_NOTFOUND; + } else { + if (plen != 4) { + arp_drop_packet("bad protocol len", mp, ill); + return; + } + err = ip_nce_resolve_all(ill, src_haddr, hlen, &src_paddr, + &src_ncec, op); + switch (err) { + case AR_BOGON: + ASSERT(src_ncec != NULL); + arp_notify(src_paddr, mp1, AR_CN_BOGON, + &iras, src_ncec); + break; + case AR_FAILED: + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + src_ncec); + break; + case AR_LOOPBACK: + DTRACE_PROBE2(rput_loopback, ill_t *, ill, arh_t *, + arh); + freemsg(mp1); + break; + default: + goto update; + } + freemsg(mp); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } +update: + /* + * Now look up the destination address. By RFC 826, we ignore the + * packet at this step if the target isn't one of our addresses (i.e., + * one we have been asked to PUBLISH). This is true even if the + * target is something we're trying to resolve and the packet + * is a response. + */ + dst_ncec = ncec_lookup_illgrp_v4(ill, &dst_paddr); + if (dst_ncec == NULL || !NCE_PUBLISH(dst_ncec)) { + /* + * Let the client know if the source mapping has changed, even + * if the destination provides no useful information for the + * client. + */ + if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + freemsg(mp); + } else { + freemsg(mp); + arp_drop_packet("Target is not interesting", mp1, ill); + } + if (dst_ncec != NULL) + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + + if (dst_ncec->ncec_flags & NCE_F_UNVERIFIED) { + /* + * Check for a reflection. Some misbehaving bridges will + * reflect our own transmitted packets back to us. + */ + ASSERT(NCE_PUBLISH(dst_ncec)); + if (hlen != dst_ncec->ncec_ill->ill_phys_addr_length) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("bad arh_len", mp1, ill); + return; + } + if (!nce_cmp_ll_addr(dst_ncec, src_haddr, hlen)) { + DTRACE_PROBE3(rput_probe_reflected, ill_t *, ill, + arh_t *, arh, ncec_t *, dst_ncec); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet("Reflected probe", mp1, ill); + return; + } + /* + * Responses targeting our HW address that are not responses to + * our DAD probe must be ignored as they are related to requests + * sent before DAD was restarted. + */ + if (op == ARP_RESPONSE && + (nce_cmp_ll_addr(dst_ncec, dst_haddr, hlen) == 0)) { + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + freemsg(mp); + arp_drop_packet( + "Response to request that was sent before DAD", + mp1, ill); + return; + } + /* + * Responses targeted to HW addresses which are not ours but + * sent to our unverified proto address are also conflicts. + * These may be reported by a proxy rather than the interface + * with the conflicting address, dst_paddr is in conflict + * rather than src_paddr. To ensure IP can locate the correct + * ipif to take down, it is necessary to copy dst_paddr to + * the src_paddr field before sending it to IP. The same is + * required for probes, where src_paddr will be INADDR_ANY. + */ + if (is_probe || op == ARP_RESPONSE) { + bcopy(dp, sp, plen); + arp_notify(src_paddr, mp1, AR_CN_FAILED, &iras, + NULL); + ncec_delete(dst_ncec); + } else if (err == AR_CHANGED) { + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, + NULL); + } else { + DTRACE_PROBE3(rput_request_unverified, + ill_t *, ill, arh_t *, arh, ncec_t *, dst_ncec); + arp_drop_packet("Unverified request", mp1, ill); + } + freemsg(mp); + ncec_refrele(dst_ncec); + if (src_ncec != NULL) + ncec_refrele(src_ncec); + return; + } + /* + * If it's a request, then we reply to this, and if we think the + * sender's unknown, then we create an entry to avoid unnecessary ARPs. + * The design assumption is that someone ARPing us is likely to send us + * a packet soon, and that we'll want to reply to it. + */ + if (op == ARP_REQUEST) { + const uchar_t *nce_hwaddr; + struct in_addr nce_paddr; + clock_t now; + ill_t *under_ill = ill; + boolean_t send_unicast = B_TRUE; + + ASSERT(NCE_PUBLISH(dst_ncec)); + + if ((dst_ncec->ncec_flags & (NCE_F_BCAST|NCE_F_MCAST)) != 0) { + /* + * Ignore senders who are deliberately or accidentally + * confused. + */ + goto bail; + } + + if (!is_probe && err == AR_NOTFOUND) { + ASSERT(src_ncec == NULL); + + if (IS_UNDER_IPMP(under_ill)) { + /* + * create the ncec for the sender on ipmp_ill. + * We pass in the ipmp_ill itself to avoid + * creating an nce_t on the under_ill. + */ + ill = ipmp_ill_hold_ipmp_ill(under_ill); + if (ill == NULL) + ill = under_ill; + else + need_ill_refrele = B_TRUE; + } + + err = nce_lookup_then_add_v4(ill, src_haddr, hlen, + &src_paddr, 0, ND_STALE, &nce); + + switch (err) { + case 0: + case EEXIST: + ip1dbg(("added ncec %p in state %d ill %s\n", + (void *)src_ncec, src_ncec->ncec_state, + ill->ill_name)); + src_ncec = nce->nce_common; + break; + default: + /* + * Either no memory, or the outgoing interface + * is in the process of down/unplumb. In the + * latter case, we will fail the send anyway, + * and in the former case, we should try to send + * the ARP response. + */ + src_lladdr = src_haddr; + goto send_response; + } + ncec_refhold(src_ncec); + nce_refrele(nce); + /* set up cleanup interval on ncec */ + } + + /* + * This implements periodic address defense based on a modified + * version of the RFC 3927 requirements. Instead of sending a + * broadcasted reply every time, as demanded by the RFC, we + * send at most one broadcast reply per arp_broadcast_interval. + */ + now = ddi_get_lbolt(); + if ((now - dst_ncec->ncec_last_time_defended) > + MSEC_TO_TICK(ipst->ips_ipv4_dad_announce_interval)) { + dst_ncec->ncec_last_time_defended = now; + /* + * If this is one of the long-suffering entries, + * pull it out now. It no longer needs separate + * defense, because we're now doing that with this + * broadcasted reply. + */ + dst_ncec->ncec_flags &= ~NCE_F_DELAYED; + send_unicast = B_FALSE; + } + if (src_ncec != NULL && send_unicast) { + src_lladdr = src_ncec->ncec_lladdr; + } else { + src_lladdr = under_ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(under_ill); + } +send_response: + nce_hwaddr = dst_ncec->ncec_lladdr; + IN6_V4MAPPED_TO_INADDR(&dst_ncec->ncec_addr, &nce_paddr); + + (void) arp_output(under_ill, ARP_RESPONSE, + nce_hwaddr, (uchar_t *)&nce_paddr, src_haddr, + (uchar_t *)&src_paddr, src_lladdr); + } +bail: + if (dst_ncec != NULL) { + ncec_refrele(dst_ncec); + } + if (src_ncec != NULL) { + ncec_refrele(src_ncec); + } + if (err == AR_CHANGED) { + mp->b_cont = NULL; + arp_notify(src_paddr, mp1, AR_CN_ANNOUNCE, &iras, NULL); + mp1 = NULL; + } + if (need_ill_refrele) + ill_refrele(ill); +done: + freemsg(mp); + freemsg(mp1); +} + +/* + * Basic initialization of the arl_t and the arl_common structure shared with + * the ill_t that is done after SLIFNAME/IF_UNITSEL. + */ +static int +arl_ill_init(arl_t *arl, char *ill_name) +{ + ill_t *ill; + arl_ill_common_t *ai; + + ill = ill_lookup_on_name(ill_name, B_FALSE, B_FALSE, B_FALSE, + arl->arl_ipst); + + if (ill == NULL) + return (ENXIO); + + /* + * By the time we set up the arl, we expect the ETHERTYPE_IP + * stream to be fully bound and attached. So we copy/verify + * relevant information as possible from/against the ill. + * + * The following should have been set up in arp_ll_set_defaults() + * after the first DL_INFO_ACK was received. + */ + ASSERT(arl->arl_phys_addr_length == ill->ill_phys_addr_length); + ASSERT(arl->arl_sap == ETHERTYPE_ARP); + ASSERT(arl->arl_mactype == ill->ill_mactype); + ASSERT(arl->arl_sap_length == ill->ill_sap_length); + + ai = kmem_zalloc(sizeof (*ai), KM_SLEEP); + mutex_enter(&ill->ill_lock); + /* First ensure that the ill is not CONDEMNED. */ + if (ill->ill_state_flags & ILL_CONDEMNED) { + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (ENXIO); + } + if (ill->ill_common != NULL || arl->arl_common != NULL) { + mutex_exit(&ill->ill_lock); + ip0dbg(("%s: PPA already exists", ill->ill_name)); + ill_refrele(ill); + kmem_free(ai, sizeof (*ai)); + return (EEXIST); + } + mutex_init(&ai->ai_lock, NULL, MUTEX_DEFAULT, NULL); + ai->ai_arl = arl; + ai->ai_ill = ill; + ill->ill_common = ai; + arl->arl_common = ai; + mutex_exit(&ill->ill_lock); + (void) strlcpy(arl->arl_name, ill->ill_name, LIFNAMSIZ); + arl->arl_name_length = ill->ill_name_length; + ill_refrele(ill); + arp_ifname_notify(arl); + return (0); +} + +/* Allocate and do common initializations for DLPI messages. */ +static mblk_t * +ip_ar_dlpi_comm(t_uscalar_t prim, size_t size) +{ + mblk_t *mp; + + if ((mp = allocb(size, BPRI_HI)) == NULL) + return (NULL); + + /* + * DLPIv2 says that DL_INFO_REQ and DL_TOKEN_REQ (the latter + * of which we don't seem to use) are sent with M_PCPROTO, and + * that other DLPI are M_PROTO. + */ + DB_TYPE(mp) = (prim == DL_INFO_REQ) ? M_PCPROTO : M_PROTO; + + mp->b_wptr = mp->b_rptr + size; + bzero(mp->b_rptr, size); + DL_PRIM(mp) = prim; + return (mp); +} + + +int +ip_sioctl_ifunitsel_arp(queue_t *q, int *ppa) +{ + arl_t *arl; + char *cp, ill_name[LIFNAMSIZ]; + + if (q->q_next == NULL) + return (EINVAL); + + do { + q = q->q_next; + } while (q->q_next != NULL); + cp = q->q_qinfo->qi_minfo->mi_idname; + + arl = (arl_t *)q->q_ptr; + (void) snprintf(ill_name, sizeof (ill_name), "%s%d", cp, *ppa); + arl->arl_ppa = *ppa; + return (arl_ill_init(arl, ill_name)); +} + +int +ip_sioctl_slifname_arp(queue_t *q, void *lifreq) +{ + arl_t *arl; + struct lifreq *lifr = lifreq; + + /* ioctl not valid when IP opened as a device */ + if (q->q_next == NULL) + return (EINVAL); + + arl = (arl_t *)q->q_ptr; + arl->arl_ppa = lifr->lifr_ppa; + return (arl_ill_init(arl, lifr->lifr_name)); +} + +arl_t * +ill_to_arl(ill_t *ill) +{ + arl_ill_common_t *ai = ill->ill_common; + arl_t *arl = NULL; + + if (ai == NULL) + return (NULL); + /* + * Find the arl_t that corresponds to this ill_t from the shared + * ill_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. + */ + mutex_enter(&ai->ai_lock); + if ((arl = ai->ai_arl) != NULL) { + mutex_enter(&arl->arl_lock); + if (!(arl->arl_state_flags & ARL_CONDEMNED)) { + arl_refhold_locked(arl); + mutex_exit(&arl->arl_lock); + } else { + mutex_exit(&arl->arl_lock); + arl = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (arl); +} + +ill_t * +arl_to_ill(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill = NULL; + + if (ai == NULL) { + /* + * happens when the arp stream is just being opened, and + * arl_ill_init has not been executed yet. + */ + return (NULL); + } + /* + * Find the ill_t that corresponds to this arl_t from the shared + * arl_common structure. We can safely access the ai here as it + * will only be freed in arp_modclose() after we have become + * single-threaded. + */ + mutex_enter(&ai->ai_lock); + if ((ill = ai->ai_ill) != NULL) { + mutex_enter(&ill->ill_lock); + if (!ILL_IS_CONDEMNED(ill)) { + ill_refhold_locked(ill); + mutex_exit(&ill->ill_lock); + } else { + mutex_exit(&ill->ill_lock); + ill = NULL; + } + } + mutex_exit(&ai->ai_lock); + return (ill); +} + +int +arp_ll_up(ill_t *ill) +{ + mblk_t *attach_mp = NULL; + mblk_t *bind_mp = NULL; + mblk_t *unbind_mp = NULL; + arl_t *arl; + + ASSERT(IAM_WRITER_ILL(ill)); + arl = ill_to_arl(ill); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_up", ill_t *, ill); + if (arl == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_up", arl_t *, arl); + if ((arl->arl_state_flags & ARL_LL_UP) != 0) { + arl_refrele(arl); + return (0); + } + if (arl->arl_needs_attach) { /* DL_STYLE2 */ + attach_mp = + ip_ar_dlpi_comm(DL_ATTACH_REQ, sizeof (dl_attach_req_t)); + if (attach_mp == NULL) + goto bad; + ((dl_attach_req_t *)attach_mp->b_rptr)->dl_ppa = arl->arl_ppa; + } + + /* Allocate and initialize a bind message. */ + bind_mp = ip_ar_dlpi_comm(DL_BIND_REQ, sizeof (dl_bind_req_t)); + if (bind_mp == NULL) + goto bad; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_sap = ETHERTYPE_ARP; + ((dl_bind_req_t *)bind_mp->b_rptr)->dl_service_mode = DL_CLDLS; + + unbind_mp = ip_ar_dlpi_comm(DL_UNBIND_REQ, sizeof (dl_unbind_req_t)); + if (unbind_mp == NULL) + goto bad; + if (arl->arl_needs_attach) { + arp_dlpi_send(arl, attach_mp); + } + arl->arl_unbind_mp = unbind_mp; + + arl->arl_state_flags |= ARL_LL_BIND_PENDING; + arp_dlpi_send(arl, bind_mp); + arl_refrele(arl); + return (EINPROGRESS); + +bad: + freemsg(attach_mp); + freemsg(bind_mp); + freemsg(unbind_mp); + arl_refrele(arl); + return (ENOMEM); +} + +/* + * consumes/frees mp + */ +static void +arp_notify(in_addr_t src, mblk_t *mp, uint32_t arcn_code, + ip_recv_attr_t *ira, ncec_t *ncec) +{ + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + ill_t *ill = ira->ira_ill; + ip_stack_t *ipst = ill->ill_ipst; + arh_t *arh = (arh_t *)mp->b_rptr; + + switch (arcn_code) { + case AR_CN_BOGON: + /* + * Someone is sending ARP packets with a source protocol + * address that we have published and for which we believe our + * entry is authoritative and verified to be unique on + * the network. + * + * arp_process_packet() sends AR_CN_FAILED for the case when + * a DAD probe is received and the hardware address of a + * non-authoritative entry has changed. Thus, AR_CN_BOGON + * indicates a real conflict, and we have to do resolution. + * + * We back away quickly from the address if it's from DHCP or + * otherwise temporary and hasn't been used recently (or at + * all). We'd like to include "deprecated" addresses here as + * well (as there's no real reason to defend something we're + * discarding), but IPMP "reuses" this flag to mean something + * other than the standard meaning. + */ + if (ip_nce_conflict(mp, ira, ncec)) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + cmn_err(CE_WARN, + "proxy ARP problem? Node '%s' is using %s on %s", + hbuf, sbuf, ill->ill_name); + if (!arp_no_defense) + (void) arp_announce(ncec); + /* + * ncec_last_time_defended has been adjusted in + * ip_nce_conflict. + */ + } else { + ncec_delete(ncec); + } + freemsg(mp); + break; + case AR_CN_ANNOUNCE: { + nce_hw_map_t hwm; + /* + * ARP gives us a copy of any packet where it thinks + * the address has changed, so that we can update our + * caches. We're responsible for caching known answers + * in the current design. We check whether the + * hardware address really has changed in all of our + * entries that have cached this mapping, and if so, we + * blow them away. This way we will immediately pick + * up the rare case of a host changing hardware + * address. + */ + if (src == 0) { + freemsg(mp); + break; + } + hwm.hwm_addr = src; + hwm.hwm_hwlen = arh->arh_hlen; + hwm.hwm_hwaddr = (uchar_t *)(arh + 1); + hwm.hwm_flags = 0; + ncec_walk_common(ipst->ips_ndp4, NULL, + (pfi_t)nce_update_hw_changed, &hwm, B_TRUE); + freemsg(mp); + break; + } + case AR_CN_FAILED: + if (arp_no_defense) { + (void) mac_colon_addr((uint8_t *)(arh + 1), + arh->arh_hlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + + cmn_err(CE_WARN, + "node %s is using our IP address %s on %s", + hbuf, sbuf, ill->ill_name); + freemsg(mp); + break; + } + /* + * mp will be freed by arp_excl. + */ + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + return; + default: + ASSERT(0); + freemsg(mp); + break; + } +} + +/* + * arp_output is called to transmit an ARP Request or Response. The mapping + * to RFC 826 variables is: + * haddr1 == ar$sha + * paddr1 == ar$spa + * haddr2 == ar$tha + * paddr2 == ar$tpa + * The ARP frame is sent to the ether_dst in dst_lladdr. + */ +static int +arp_output(ill_t *ill, uint32_t operation, + const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, + const uchar_t *paddr2, uchar_t *dst_lladdr) +{ + arh_t *arh; + uint8_t *cp; + uint_t hlen; + uint32_t plen = IPV4_ADDR_LEN; /* ar$pln from RFC 826 */ + uint32_t proto = IP_ARP_PROTO_TYPE; + mblk_t *mp; + arl_t *arl; + + ASSERT(dst_lladdr != NULL); + hlen = ill->ill_phys_addr_length; /* ar$hln from RFC 826 */ + mp = ill_dlur_gen(dst_lladdr, hlen, ETHERTYPE_ARP, ill->ill_sap_length); + + if (mp == NULL) + return (ENOMEM); + + /* IFF_NOARP flag is set or link down: do not send arp messages */ + if ((ill->ill_flags & ILLF_NOARP) || !ill->ill_dl_up) { + freemsg(mp); + return (ENXIO); + } + + mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + + plen + plen, BPRI_MED); + if (mp->b_cont == NULL) { + freeb(mp); + return (ENOMEM); + } + + /* Fill in the ARP header. */ + cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); + mp->b_cont->b_rptr = cp; + arh = (arh_t *)cp; + U16_TO_BE16(arp_hw_type(ill->ill_mactype), arh->arh_hardware); + U16_TO_BE16(proto, arh->arh_proto); + arh->arh_hlen = (uint8_t)hlen; + arh->arh_plen = (uint8_t)plen; + U16_TO_BE16(operation, arh->arh_operation); + cp += ARH_FIXED_LEN; + bcopy(haddr1, cp, hlen); + cp += hlen; + if (paddr1 == NULL) + bzero(cp, plen); + else + bcopy(paddr1, cp, plen); + cp += plen; + if (haddr2 == NULL) + bzero(cp, hlen); + else + bcopy(haddr2, cp, hlen); + cp += hlen; + bcopy(paddr2, cp, plen); + cp += plen; + mp->b_cont->b_wptr = cp; + + DTRACE_PROBE3(arp__physical__out__start, + ill_t *, ill, arh_t *, arh, mblk_t *, mp); + ARP_HOOK_OUT(ill->ill_ipst->ips_arp_physical_out_event, + ill->ill_ipst->ips_arp_physical_out, + ill->ill_phyint->phyint_ifindex, arh, mp, mp->b_cont, + ill->ill_ipst); + DTRACE_PROBE1(arp__physical__out__end, mblk_t *, mp); + if (mp == NULL) + return (0); + + /* Ship it out. */ + arl = ill_to_arl(ill); + if (arl == NULL) { + freemsg(mp); + return (0); + } + if (canputnext(arl->arl_wq)) + putnext(arl->arl_wq, mp); + else + freemsg(mp); + arl_refrele(arl); + return (0); +} + +/* + * Process resolve requests. + * If we are not yet reachable then we check and decrease ncec_rcnt; otherwise + * we leave it alone (the caller will check and manage ncec_pcnt in those + * cases.) + */ +int +arp_request(ncec_t *ncec, in_addr_t sender, ill_t *ill) +{ + int err; + const uchar_t *target_hwaddr; + struct in_addr nce_paddr; + uchar_t *dst_lladdr; + boolean_t use_rcnt = !NCE_ISREACHABLE(ncec); + + ASSERT(MUTEX_HELD(&ncec->ncec_lock)); + ASSERT(!IS_IPMP(ill)); + + if (use_rcnt && ncec->ncec_rcnt == 0) { + /* not allowed any more retransmits. */ + return (0); + } + + if ((ill->ill_flags & ILLF_NOARP) != 0) + return (0); + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &nce_paddr); + + target_hwaddr = + ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + if (NCE_ISREACHABLE(ncec)) { + dst_lladdr = ncec->ncec_lladdr; + } else { + dst_lladdr = ill->ill_bcast_mp->b_rptr + + NCE_LL_ADDR_OFFSET(ill); + } + + mutex_exit(&ncec->ncec_lock); + err = arp_output(ill, ARP_REQUEST, + ill->ill_phys_addr, (uchar_t *)&sender, target_hwaddr, + (uchar_t *)&nce_paddr, dst_lladdr); + mutex_enter(&ncec->ncec_lock); + + if (err != 0) { + /* + * Some transient error such as ENOMEM or a down link was + * encountered. If the link has been taken down permanently, + * the ncec will eventually be cleaned up (ipif_down_tail() + * will call ipif_nce_down() and flush the ncec), to terminate + * recurring attempts to send ARP requests. In all other cases, + * allow the caller another chance at success next time. + */ + return (ncec->ncec_ill->ill_reachable_retrans_time); + } + + if (use_rcnt) + ncec->ncec_rcnt--; + + return (ncec->ncec_ill->ill_reachable_retrans_time); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_announce(ncec_t *ncec) +{ + ill_t *ill; + int err; + uchar_t *sphys_addr, *bcast_addr; + struct in_addr ncec_addr; + boolean_t need_refrele = B_FALSE; + + ASSERT((ncec->ncec_flags & NCE_F_BCAST) == 0); + ASSERT((ncec->ncec_flags & NCE_F_MCAST) == 0); + + if (IS_IPMP(ncec->ncec_ill)) { + /* sent on the cast_ill */ + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + need_refrele = B_TRUE; + } else { + ill = ncec->ncec_ill; + } + + /* + * broadcast an announce to ill_bcast address. + */ + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + bcast_addr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + + err = arp_output(ill, ARP_REQUEST, + sphys_addr, (uchar_t *)&ncec_addr, bcast_addr, + (uchar_t *)&ncec_addr, bcast_addr); + + if (need_refrele) + ill_refrele(ill); + return (err != 0); +} + +/* return B_TRUE if dropped */ +boolean_t +arp_probe(ncec_t *ncec) +{ + ill_t *ill; + int err; + struct in_addr ncec_addr; + uchar_t *sphys_addr, *dst_lladdr; + + if (IS_IPMP(ncec->ncec_ill)) { + ill = ipmp_ill_get_xmit_ill(ncec->ncec_ill, B_FALSE); + if (ill == NULL) + return (B_TRUE); + } else { + ill = ncec->ncec_ill; + } + + IN6_V4MAPPED_TO_INADDR(&ncec->ncec_addr, &ncec_addr); + + sphys_addr = ncec->ncec_lladdr; + dst_lladdr = ill->ill_bcast_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); + err = arp_output(ill, ARP_REQUEST, + sphys_addr, NULL, NULL, (uchar_t *)&ncec_addr, dst_lladdr); + + if (IS_IPMP(ncec->ncec_ill)) + ill_refrele(ill); + return (err != 0); +} + +static mblk_t * +arl_unbind(arl_t *arl) +{ + mblk_t *mp; + + if ((mp = arl->arl_unbind_mp) != NULL) { + arl->arl_unbind_mp = NULL; + arl->arl_state_flags |= ARL_DL_UNBIND_IN_PROGRESS; + } + return (mp); +} + +int +arp_ll_down(ill_t *ill) +{ + arl_t *arl; + mblk_t *unbind_mp; + int err = 0; + boolean_t replumb = (ill->ill_replumbing == 1); + + DTRACE_PROBE2(ill__downup, char *, "arp_ll_down", ill_t *, ill); + if ((arl = ill_to_arl(ill)) == NULL) + return (ENXIO); + DTRACE_PROBE2(arl__downup, char *, "arp_ll_down", arl_t *, arl); + mutex_enter(&arl->arl_lock); + unbind_mp = arl_unbind(arl); + if (unbind_mp != NULL) { + ASSERT(arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS); + DTRACE_PROBE2(arp__unbinding, mblk_t *, unbind_mp, + arl_t *, arl); + err = EINPROGRESS; + if (replumb) + arl->arl_state_flags |= ARL_LL_REPLUMBING; + } + mutex_exit(&arl->arl_lock); + if (unbind_mp != NULL) + arp_dlpi_send(arl, unbind_mp); + arl_refrele(arl); + return (err); +} + +/* ARGSUSED */ +int +arp_close(queue_t *q, int flags) +{ + if (WR(q)->q_next != NULL) { + /* This is a module close */ + return (arp_modclose(q->q_ptr)); + } + qprocsoff(q); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +static int +arp_modclose(arl_t *arl) +{ + arl_ill_common_t *ai = arl->arl_common; + ill_t *ill; + queue_t *q = arl->arl_rq; + mblk_t *mp, *nextmp; + ipsq_t *ipsq = NULL; + + ill = arl_to_ill(arl); + if (ill != NULL) { + if (!ill_waiter_inc(ill)) { + ill_refrele(ill); + } else { + ill_refrele(ill); + if (ipsq_enter(ill, B_FALSE, NEW_OP)) + ipsq = ill->ill_phyint->phyint_ipsq; + ill_waiter_dcr(ill); + } + if (ipsq == NULL) { + /* + * could not enter the ipsq because ill is already + * marked CONDEMNED. + */ + ill = NULL; + } + } + if (ai != NULL && ipsq == NULL) { + /* + * Either we did not get an ill because it was marked CONDEMNED + * or we could not enter the ipsq because it was unplumbing. + * In both cases, wait for the ill to complete ip_modclose(). + * + * If the arp_modclose happened even before SLIFNAME, the ai + * itself would be NULL, in which case we can complete the close + * without waiting. + */ + mutex_enter(&ai->ai_lock); + while (ai->ai_ill != NULL) + cv_wait(&ai->ai_ill_unplumb_done, &ai->ai_lock); + mutex_exit(&ai->ai_lock); + } + ASSERT(ill == NULL || IAM_WRITER_ILL(ill)); + + mutex_enter(&arl->arl_lock); + /* + * If the ill had completed unplumbing before arp_modclose(), there + * would be no ill (and therefore, no ipsq) to serialize arp_modclose() + * so that we need to explicitly check for ARL_CONDEMNED and back off + * if it is set. + */ + if ((arl->arl_state_flags & ARL_CONDEMNED) != 0) { + mutex_exit(&arl->arl_lock); + ASSERT(ipsq == NULL); + return (0); + } + arl->arl_state_flags |= ARL_CONDEMNED; + + /* + * send out all pending dlpi messages, don't wait for the ack (which + * will be ignored in arp_rput when CONDEMNED is set) + * + * We have to check for pending DL_UNBIND_REQ because, in the case + * that ip_modclose() executed before arp_modclose(), the call to + * ill_delete_tail->ipif_arp_down() would have triggered a + * DL_UNBIND_REQ. When arp_modclose() executes ipsq_enter() will fail + * (since ip_modclose() is in the ipsq) but the DL_UNBIND_ACK may not + * have been processed yet. In this scenario, we cannot reset + * arl_dlpi_pending, because the setting/clearing of arl_state_flags + * related to unbind, and the associated cv_waits must be allowed to + * continue. + */ + if (arl->arl_dlpi_pending != DL_UNBIND_REQ) + arl->arl_dlpi_pending = DL_PRIM_INVAL; + mp = arl->arl_dlpi_deferred; + arl->arl_dlpi_deferred = NULL; + mutex_exit(&arl->arl_lock); + + for (; mp != NULL; mp = nextmp) { + nextmp = mp->b_next; + mp->b_next = NULL; + putnext(arl->arl_wq, mp); + } + + /* Wait for data paths to quiesce */ + mutex_enter(&arl->arl_lock); + while (arl->arl_refcnt != 0) + cv_wait(&arl->arl_cv, &arl->arl_lock); + + /* + * unbind, so that nothing else can come up from driver. + */ + mp = arl_unbind(arl); + mutex_exit(&arl->arl_lock); + if (mp != NULL) + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + + /* wait for unbind ack */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + + qprocsoff(q); + + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 0; + mutex_exit(&ill->ill_lock); + } + + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + ai->ai_arl = NULL; + if (ai->ai_ill == NULL) { + mutex_destroy(&ai->ai_lock); + kmem_free(ai, sizeof (*ai)); + } else { + mutex_exit(&ai->ai_lock); + } + } + + /* free up the rest */ + arp_mod_close_tail(arl); + + q->q_ptr = WR(q)->q_ptr = NULL; + + if (ipsq != NULL) + ipsq_exit(ipsq); + + return (0); +} + +static void +arp_mod_close_tail(arl_t *arl) +{ + ip_stack_t *ipst = arl->arl_ipst; + mblk_t **mpp; + + netstack_hold(ipst->ips_netstack); + + mutex_enter(&ipst->ips_ip_mi_lock); + mi_close_unlink(&ipst->ips_arp_g_head, (IDP)arl); + mutex_exit(&ipst->ips_ip_mi_lock); + + /* + * credp could be null if the open didn't succeed and ip_modopen + * itself calls ip_close. + */ + if (arl->arl_credp != NULL) + crfree(arl->arl_credp); + + /* Free all retained control messages. */ + mpp = &arl->arl_first_mp_to_free; + do { + while (mpp[0]) { + mblk_t *mp; + mblk_t *mp1; + + mp = mpp[0]; + mpp[0] = mp->b_next; + for (mp1 = mp; mp1 != NULL; mp1 = mp1->b_cont) { + mp1->b_next = NULL; + mp1->b_prev = NULL; + } + freemsg(mp); + } + } while (mpp++ != &arl->arl_last_mp_to_free); + + netstack_rele(ipst->ips_netstack); + mi_free(arl->arl_name); + mi_close_free((IDP)arl); +} + +/* + * DAD failed. Tear down ipifs with the specified srce address. Note that + * tearing down the ipif also meas deleting the ncec through ipif_down, + * so it is not possible to use nce_timer for recovery. Instead we start + * a timer on the ipif. Caller has to free the mp. + */ +void +arp_failure(mblk_t *mp, ip_recv_attr_t *ira) +{ + ill_t *ill = ira->ira_ill; + + if ((mp = copymsg(mp)) != NULL) { + ill_refhold(ill); + qwriter_ip(ill, ill->ill_rq, mp, arp_excl, NEW_OP, B_FALSE); + } +} + +/* + * This is for exclusive changes due to ARP. Tear down an interface due + * to AR_CN_FAILED and AR_CN_BOGON. + */ +/* ARGSUSED */ +static void +arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + arh_t *arh; + ipaddr_t src; + ipif_t *ipif; + ip_stack_t *ipst = ill->ill_ipst; + uchar_t *haddr; + uint_t haddrlen; + + /* first try src = ar$spa */ + arh = (arh_t *)mp->b_rptr; + bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); + + haddrlen = arh->arh_hlen; + haddr = (uint8_t *)(arh + 1); + + if (haddrlen == ill->ill_phys_addr_length) { + /* + * Ignore conflicts generated by misbehaving switches that + * just reflect our own messages back to us. For IPMP, we may + * see reflections across any ill in the illgrp. + */ + /* For an under ill_grp can change under lock */ + rw_enter(&ipst->ips_ill_g_lock, RW_READER); + if (bcmp(haddr, ill->ill_phys_addr, haddrlen) == 0 || + IS_UNDER_IPMP(ill) && ill->ill_grp != NULL && + ipmp_illgrp_find_ill(ill->ill_grp, haddr, + haddrlen) != NULL) { + rw_exit(&ipst->ips_ill_g_lock); + goto ignore_conflict; + } + rw_exit(&ipst->ips_ill_g_lock); + } + + /* + * Look up the appropriate ipif. + */ + ipif = ipif_lookup_addr(src, ill, ALL_ZONES, ipst); + if (ipif == NULL) + goto ignore_conflict; + + /* Reload the ill to match the ipif */ + ill = ipif->ipif_ill; + + /* If it's already duplicate or ineligible, then don't do anything. */ + if (ipif->ipif_flags & (IPIF_POINTOPOINT|IPIF_DUPLICATE)) { + ipif_refrele(ipif); + goto ignore_conflict; + } + + /* + * If we failed on a recovery probe, then restart the timer to + * try again later. + */ + if (!ipif->ipif_was_dup) { + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + char ibuf[LIFNAMSIZ]; + + (void) mac_colon_addr(haddr, haddrlen, hbuf, sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + ipif_get_name(ipif, ibuf, sizeof (ibuf)); + + cmn_err(CE_WARN, "%s has duplicate address %s (in use by %s);" + " disabled", ibuf, sbuf, hbuf); + } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + (void) ipif_down_tail(ipif); + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + !(ipif->ipif_state_flags & IPIF_CONDEMNED) && + ipst->ips_ip_dup_recovery > 0) { + ASSERT(ipif->ipif_recovery_id == 0); + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, + ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); + } + mutex_exit(&ill->ill_lock); + ipif_refrele(ipif); + +ignore_conflict: + freemsg(mp); +} + +/* + * This is a place for a dtrace hook. + * Note that mp can be either the DL_UNITDATA_IND with a b_cont payload, + * or just the ARP packet payload as an M_DATA. + */ +/* ARGSUSED */ +static void +arp_drop_packet(const char *str, mblk_t *mp, ill_t *ill) +{ + freemsg(mp); +} + +static boolean_t +arp_over_driver(queue_t *q) +{ + queue_t *qnext = STREAM(q)->sd_wrq->q_next; + + /* + * check if first module below stream head is IP or UDP. + */ + ASSERT(qnext != NULL); + if (strcmp(Q2NAME(qnext), "ip") != 0 && + strcmp(Q2NAME(qnext), "udp") != 0) { + /* + * module below is not ip or udp, so arp has been pushed + * on the driver. + */ + return (B_TRUE); + } + return (B_FALSE); +} + +static int +arp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + + ASSERT(sflag & MODOPEN); + if (!arp_over_driver(q)) { + q->q_qinfo = dummymodinfo.st_rdinit; + WR(q)->q_qinfo = dummymodinfo.st_wrinit; + return ((*dummymodinfo.st_rdinit->qi_qopen)(q, devp, flag, + sflag, credp)); + } + err = arp_modopen(q, devp, flag, sflag, credp); + return (err); +} + +/* + * In most cases we must be a writer on the IP stream before coming to + * arp_dlpi_send(), to serialize DLPI sends to the driver. The exceptions + * when we are not a writer are very early duing initialization (in + * arl_init, before the arl has done a SLIFNAME, so that we don't yet know + * the associated ill) or during arp_mod_close, when we could not enter the + * ipsq because the ill has already unplumbed. + */ +static void +arp_dlpi_send(arl_t *arl, mblk_t *mp) +{ + mblk_t **mpp; + t_uscalar_t prim; + arl_ill_common_t *ai; + + ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); + +#ifdef DEBUG + ai = arl->arl_common; + if (ai != NULL) { + mutex_enter(&ai->ai_lock); + if (ai->ai_ill != NULL) + ASSERT(IAM_WRITER_ILL(ai->ai_ill)); + mutex_exit(&ai->ai_lock); + } +#endif /* DEBUG */ + + mutex_enter(&arl->arl_lock); + if (arl->arl_dlpi_pending != DL_PRIM_INVAL) { + /* Must queue message. Tail insertion */ + mpp = &arl->arl_dlpi_deferred; + while (*mpp != NULL) + mpp = &((*mpp)->b_next); + + *mpp = mp; + mutex_exit(&arl->arl_lock); + return; + } + mutex_exit(&arl->arl_lock); + if ((prim = ((union DL_primitives *)mp->b_rptr)->dl_primitive) + == DL_BIND_REQ) { + ASSERT((arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) == 0); + } + /* + * No need to take the arl_lock to examine ARL_CONDEMNED at this point + * because the only thread that can see ARL_CONDEMNED here is the + * closing arp_modclose() thread which sets the flag after becoming a + * writer on the ipsq. Threads from IP must have finished and + * cannot be active now. + */ + if (!(arl->arl_state_flags & ARL_CONDEMNED) || + (prim == DL_UNBIND_REQ)) { + if (prim != DL_NOTIFY_CONF) { + ill_t *ill = arl_to_ill(arl); + + arl->arl_dlpi_pending = prim; + if (ill != NULL) { + mutex_enter(&ill->ill_lock); + ill->ill_arl_dlpi_pending = 1; + mutex_exit(&ill->ill_lock); + ill_refrele(ill); + } + } + } + DTRACE_PROBE4(arl__dlpi, char *, "arp_dlpi_send", + char *, dl_primstr(prim), char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp); +} + +static void +arl_defaults_common(arl_t *arl, mblk_t *mp) +{ + dl_info_ack_t *dlia = (dl_info_ack_t *)mp->b_rptr; + /* + * Till the ill is fully up the ill is not globally visible. + * So no need for a lock. + */ + arl->arl_mactype = dlia->dl_mac_type; + arl->arl_sap_length = dlia->dl_sap_length; + + if (!arl->arl_dlpi_style_set) { + if (dlia->dl_provider_style == DL_STYLE2) + arl->arl_needs_attach = 1; + mutex_enter(&arl->arl_lock); + ASSERT(arl->arl_dlpi_style_set == 0); + arl->arl_dlpi_style_set = 1; + arl->arl_state_flags &= ~ARL_LL_SUBNET_PENDING; + cv_broadcast(&arl->arl_cv); + mutex_exit(&arl->arl_lock); + } +} + +int +arl_init(queue_t *q, arl_t *arl) +{ + mblk_t *info_mp; + dl_info_req_t *dlir; + + /* subset of ill_init */ + mutex_init(&arl->arl_lock, NULL, MUTEX_DEFAULT, 0); + + arl->arl_rq = q; + arl->arl_wq = WR(q); + + info_mp = allocb(MAX(sizeof (dl_info_req_t), sizeof (dl_info_ack_t)), + BPRI_HI); + if (info_mp == NULL) + return (ENOMEM); + /* + * allocate sufficient space to contain device name. + */ + arl->arl_name = (char *)(mi_zalloc(2 * LIFNAMSIZ)); + arl->arl_ppa = UINT_MAX; + arl->arl_state_flags |= (ARL_LL_SUBNET_PENDING | ARL_LL_UNBOUND); + + /* Send down the Info Request to the driver. */ + info_mp->b_datap->db_type = M_PCPROTO; + dlir = (dl_info_req_t *)info_mp->b_rptr; + info_mp->b_wptr = (uchar_t *)&dlir[1]; + dlir->dl_primitive = DL_INFO_REQ; + arl->arl_dlpi_pending = DL_PRIM_INVAL; + qprocson(q); + + arp_dlpi_send(arl, info_mp); + return (0); +} + +int +arl_wait_for_info_ack(arl_t *arl) +{ + int err; + + mutex_enter(&arl->arl_lock); + while (arl->arl_state_flags & ARL_LL_SUBNET_PENDING) { + /* + * Return value of 0 indicates a pending signal. + */ + err = cv_wait_sig(&arl->arl_cv, &arl->arl_lock); + if (err == 0) { + mutex_exit(&arl->arl_lock); + return (EINTR); + } + } + mutex_exit(&arl->arl_lock); + /* + * ip_rput_other could have set an error in ill_error on + * receipt of M_ERROR. + */ + return (arl->arl_error); +} + +void +arl_set_muxid(ill_t *ill, int muxid) +{ + arl_t *arl; + + arl = ill_to_arl(ill); + if (arl != NULL) { + arl->arl_muxid = muxid; + arl_refrele(arl); + } +} + +int +arl_get_muxid(ill_t *ill) +{ + arl_t *arl; + int muxid = 0; + + arl = ill_to_arl(ill); + if (arl != NULL) { + muxid = arl->arl_muxid; + arl_refrele(arl); + } + return (muxid); +} + +static int +arp_modopen(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + int err; + zoneid_t zoneid; + netstack_t *ns; + ip_stack_t *ipst; + arl_t *arl = NULL; + + /* + * Prevent unprivileged processes from pushing IP so that + * they can't send raw IP. + */ + if (secpolicy_net_rawaccess(credp) != 0) + return (EPERM); + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + ipst = ns->netstack_ip; + ASSERT(ipst != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make IP operate as if in the global zone. + */ + if (ipst->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + + arl = (arl_t *)mi_open_alloc_sleep(sizeof (arl_t)); + q->q_ptr = WR(q)->q_ptr = arl; + arl->arl_ipst = ipst; + arl->arl_zoneid = zoneid; + err = arl_init(q, arl); + + if (err != 0) { + mi_free(arl->arl_name); + mi_free(arl); + netstack_rele(ipst->ips_netstack); + q->q_ptr = NULL; + WR(q)->q_ptr = NULL; + return (err); + } + + /* + * Wait for the DL_INFO_ACK if a DL_INFO_REQ was sent. + */ + err = arl_wait_for_info_ack(arl); + if (err == 0) + arl->arl_credp = credp; + else + goto fail; + + crhold(credp); + + mutex_enter(&ipst->ips_ip_mi_lock); + err = mi_open_link(&ipst->ips_arp_g_head, (IDP)q->q_ptr, devp, flag, + sflag, credp); + mutex_exit(&ipst->ips_ip_mi_lock); +fail: + if (err) { + (void) arp_close(q, 0); + return (err); + } + return (0); +} + +/* + * Notify any downstream modules (esp softmac and hitbox) of the name + * of this interface using an M_CTL. + */ +static void +arp_ifname_notify(arl_t *arl) +{ + mblk_t *mp1, *mp2; + struct iocblk *iocp; + struct lifreq *lifr; + + if ((mp1 = mkiocb(SIOCSLIFNAME)) == NULL) + return; + if ((mp2 = allocb(sizeof (struct lifreq), BPRI_HI)) == NULL) { + freemsg(mp1); + return; + } + + lifr = (struct lifreq *)mp2->b_rptr; + mp2->b_wptr += sizeof (struct lifreq); + bzero(lifr, sizeof (struct lifreq)); + + (void) strncpy(lifr->lifr_name, arl->arl_name, LIFNAMSIZ); + lifr->lifr_ppa = arl->arl_ppa; + lifr->lifr_flags = ILLF_IPV4; + + /* Use M_CTL to avoid confusing anyone else who might be listening. */ + DB_TYPE(mp1) = M_CTL; + mp1->b_cont = mp2; + iocp = (struct iocblk *)mp1->b_rptr; + iocp->ioc_count = msgsize(mp1->b_cont); + DTRACE_PROBE4(arl__dlpi, char *, "arp_ifname_notify", + char *, "SIOCSLIFNAME", char *, "-", arl_t *, arl); + putnext(arl->arl_wq, mp1); +} + +void +arp_send_replumb_conf(ill_t *ill) +{ + mblk_t *mp; + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + /* + * arl_got_replumb and arl_got_unbind to be cleared after we complete + * arp_cmd_done. + */ + mp = mexchange(NULL, NULL, sizeof (dl_notify_conf_t), M_PROTO, + DL_NOTIFY_CONF); + ((dl_notify_conf_t *)(mp->b_rptr))->dl_notification = + DL_NOTE_REPLUMB_DONE; + arp_dlpi_send(arl, mp); + mutex_enter(&arl->arl_lock); + arl->arl_state_flags &= ~ARL_LL_REPLUMBING; + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} + +/* + * The unplumb code paths call arp_unbind_complete() to make sure that it is + * safe to tear down the ill. We wait for DL_UNBIND_ACK to complete, and also + * for the arl_refcnt to fall to one so that, when we return from + * arp_unbind_complete(), we know for certain that there are no threads in + * arp_rput() that might access the arl_ill. + */ +void +arp_unbind_complete(ill_t *ill) +{ + arl_t *arl = ill_to_arl(ill); + + if (arl == NULL) + return; + mutex_enter(&arl->arl_lock); + /* + * wait for unbind ack and arl_refcnt to drop to 1. Note that the + * quiescent arl_refcnt for this function is 1 (and not 0) because + * ill_to_arl() will itself return after taking a ref on the arl_t. + */ + while (arl->arl_state_flags & ARL_DL_UNBIND_IN_PROGRESS) + cv_wait(&arl->arl_cv, &arl->arl_lock); + while (arl->arl_refcnt != 1) + cv_wait(&arl->arl_cv, &arl->arl_lock); + mutex_exit(&arl->arl_lock); + arl_refrele(arl); +} |