diff options
author | carlsonj <none@none> | 2006-08-14 14:10:48 -0700 |
---|---|---|
committer | carlsonj <none@none> | 2006-08-14 14:10:48 -0700 |
commit | 69bb4bb45c98da60d21839c4dc3c01ea1be60585 (patch) | |
tree | 81b0b1f2cca24e6379bc7933ea584bda2861f39f /usr/src/uts | |
parent | 0173c38a73f34277e0c97a19fedfd25d81ba8380 (diff) | |
download | illumos-joyent-69bb4bb45c98da60d21839c4dc3c01ea1be60585.tar.gz |
PSARC 2005/314 IP Duplicate Address Detection
PSARC 2006/017 Arp Single Entry Display
1248254 NDD doesn't have explicit range checking for ARP NDD variables.
1253974 Please make a "permanent contents" option for arp vs. static
4069191 ace_t::ace_query_count field is not used anywhere.
4157198 ARP cache inconsistency between arp and ip modules.
4396195 ar_rput: ar_cmd_dispatch not necessary for every packet?
4705220 No IPv6 DAD performed during boot
4728609 IPv4 Duplicate Address Detection (DAD) is broken
4971789 Need unsolicited neighbor advertisements when interface comes up
4978063 SO_DONTROUTE option causes ARP traffic for every frame.
4997903 /usr/sbin/arp accepts invalid modifier keywords
6266155 flag on the play: ndp gets its bits confused
6272993 in.ndpd could be free of lint
6273003 arp shouldn't export status via ndd
6360928 ipif_arp_down debug message wrongly expects DLPI
6363393 6281236 causes generation of bogus NS
6454158 need a temporary work-around for 6451644's IPv6 impact
6456379 need ffs(3C)-like function in modapi
--HG--
rename : usr/src/cmd/cmd-inet/sbin/dhcpagent/arp_check.c => deleted_files/usr/src/cmd/cmd-inet/sbin/dhcpagent/arp_check.c
rename : usr/src/cmd/cmd-inet/sbin/dhcpagent/arp_check.h => deleted_files/usr/src/cmd/cmd-inet/sbin/dhcpagent/arp_check.h
rename : usr/src/cmd/cmd-inet/usr.lib/in.ndpd/dupl_addr.c => deleted_files/usr/src/cmd/cmd-inet/usr.lib/in.ndpd/dupl_addr.c
rename : usr/src/cmd/cmd-inet/usr.sbin/ifconfig/dupl_addr.c => deleted_files/usr/src/cmd/cmd-inet/usr.sbin/ifconfig/dupl_addr.c
rename : usr/src/lib/libc/port/gen/ffs.c => deleted_files/usr/src/lib/libc/port/gen/ffs.c
rename : usr/src/lib/libinetcfg/common/inetcfg_dad.c => deleted_files/usr/src/lib/libinetcfg/common/inetcfg_dad.c
rename : usr/src/lib/libinetcfg/common/inetcfg_dad.h => deleted_files/usr/src/lib/libinetcfg/common/inetcfg_dad.h
Diffstat (limited to 'usr/src/uts')
-rw-r--r-- | usr/src/uts/common/Makefile.files | 1 | ||||
-rw-r--r-- | usr/src/uts/common/inet/arp.h | 72 | ||||
-rw-r--r-- | usr/src/uts/common/inet/arp/arp.c | 2112 | ||||
-rw-r--r-- | usr/src/uts/common/inet/arp_impl.h | 39 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip.h | 54 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip.c | 632 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip6.c | 162 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip6_if.c | 31 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 495 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_ndp.c | 897 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_squeue.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip6.h | 2 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip_if.h | 17 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip_ndp.h | 26 | ||||
-rw-r--r-- | usr/src/uts/common/net/if.h | 3 | ||||
-rw-r--r-- | usr/src/uts/common/net/if_arp.h | 14 | ||||
-rw-r--r-- | usr/src/uts/common/netinet/arp.h | 13 | ||||
-rw-r--r-- | usr/src/uts/common/os/subr.c | 26 | ||||
-rw-r--r-- | usr/src/uts/common/os/sunddi.c | 3 | ||||
-rw-r--r-- | usr/src/uts/common/sys/systm.h | 2 |
20 files changed, 3137 insertions, 1468 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index fa5d0c132d..ef97e42257 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -131,6 +131,7 @@ GENUNIX_OBJS += \ fdbuffer.o \ fdsync.o \ fem.o \ + ffs.o \ fio.o \ flock.o \ fm.o \ diff --git a/usr/src/uts/common/inet/arp.h b/usr/src/uts/common/inet/arp.h index c773d6354b..71fd056afc 100644 --- a/usr/src/uts/common/inet/arp.h +++ b/usr/src/uts/common/inet/arp.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 1992,1997-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -30,10 +29,18 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/types.h> + #ifdef __cplusplus extern "C" { #endif +/* + * Warning: the interfaces described in this file are private to the + * implementation. They may change at any time without notice and are not + * documented. Do not depend on them. + */ + #define ARP_REQUEST 1 #define ARP_RESPONSE 2 #define RARP_REQUEST 3 @@ -41,52 +48,40 @@ extern "C" { #define AR_IOCTL (((unsigned)'A' & 0xFF)<<8) #define CMD_IN_PROGRESS 0x10000 -/* - * The following ARP commands are private, and not part of a supported - * interface. They are subject to change without notice in any release. - */ + #define AR_ENTRY_ADD (AR_IOCTL + 1) #define AR_ENTRY_DELETE (AR_IOCTL + 2) #define AR_ENTRY_QUERY (AR_IOCTL + 3) -#define AR_XMIT_REQUEST (AR_IOCTL + 4) -#define AR_XMIT_TEMPLATE (AR_IOCTL + 5) #define AR_ENTRY_SQUERY (AR_IOCTL + 6) #define AR_MAPPING_ADD (AR_IOCTL + 7) #define AR_CLIENT_NOTIFY (AR_IOCTL + 8) #define AR_INTERFACE_UP (AR_IOCTL + 9) #define AR_INTERFACE_DOWN (AR_IOCTL + 10) -#define AR_XMIT_RESPONSE (AR_IOCTL + 11) #define AR_INTERFACE_ON (AR_IOCTL + 12) #define AR_INTERFACE_OFF (AR_IOCTL + 13) #define AR_DLPIOP_DONE (AR_IOCTL + 14) -#define AR_ENTRY_LLAQUERY (AR_IOCTL + 15) /* * This is not an ARP command per se, it is used to interface between * ARP and IP during close. */ #define AR_ARP_CLOSING (AR_IOCTL + 16) +#define AR_ARP_EXTEND (AR_IOCTL + 17) -/* - * The following ACE flags are private, and not part of a supported - * interface. They are subject to change without notice in any release. - */ -#define ACE_F_PERMANENT 0x1 -#define ACE_F_PUBLISH 0x2 -#define ACE_F_DYING 0x4 -#define ACE_F_RESOLVED 0x8 +/* Both ace_flags and area_flags; must also modify arp.c in mdb */ +#define ACE_F_PERMANENT 0x0001 +#define ACE_F_PUBLISH 0x0002 +#define ACE_F_DYING 0x0004 +#define ACE_F_RESOLVED 0x0008 /* Using bit mask extraction from target address */ -#define ACE_F_MAPPING 0x10 -#define ACE_F_MYADDR 0x20 /* Strong check for duplicate MACs */ - -/* ARP Cmd Table entry */ -typedef struct arct_s { - pfi_t arct_pfi; - uint32_t arct_cmd; - int arct_min_len; - uint32_t arct_flags; - int arct_priv_req; /* Privilege required for this cmd */ - const char *arct_txt; -} arct_t; +#define ACE_F_MAPPING 0x0010 +#define ACE_F_MYADDR 0x0020 /* IP claims to own this address */ +#define ACE_F_UNVERIFIED 0x0040 /* DAD not yet complete */ +#define ACE_F_AUTHORITY 0x0080 /* check for duplicate MACs */ +#define ACE_F_DEFEND 0x0100 /* single transmit (area_flags only) */ +#define ACE_F_OLD 0x0200 /* should revalidate when IP asks */ +#define ACE_F_FAST 0x0400 /* fast probe enabled */ +#define ACE_F_DELAYED 0x0800 /* rescheduled on arp_defend_rate */ +#define ACE_F_DAD_ABORTED 0x1000 /* DAD was aborted on link down */ /* ARP Command Structures */ @@ -98,12 +93,6 @@ typedef struct ar_cmd_s { } arc_t; /* - * The following ARP command structures are private, and not - * part of a supported interface. They are subject to change - * without notice in any release. - */ - -/* * NOTE: when using area_t for an AR_ENTRY_SQUERY, the area_hw_addr_offset * field isn't what you might think. See comments in ip_multi.c where * the routine ill_create_squery() is called, and also in the routine @@ -196,13 +185,10 @@ typedef struct ar_client_notify_s { } arcn_t; /* Client Notification Codes */ -/* - * The following Client Notification codes are private, and not - * part of a supported interface. They are subject to change - * without notice in any release. - */ #define AR_CN_BOGON 1 #define AR_CN_ANNOUNCE 2 +#define AR_CN_READY 3 /* DAD complete; address usable */ +#define AR_CN_FAILED 4 /* DAD failed; address unusable */ /* ARP Header */ typedef struct arh_s { diff --git a/usr/src/uts/common/inet/arp/arp.c b/usr/src/uts/common/inet/arp/arp.c index fd7d086933..17c81b9513 100644 --- a/usr/src/uts/common/inet/arp/arp.c +++ b/usr/src/uts/common/inet/arp/arp.c @@ -28,8 +28,6 @@ /* AR - Address Resolution Protocol */ -#define ARP_DEBUG - #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> @@ -47,6 +45,9 @@ #include <sys/strsun.h> #include <sys/policy.h> #include <sys/ethernet.h> +#include <sys/zone.h> +#include <sys/random.h> +#include <sys/sdt.h> #include <inet/common.h> #include <inet/optcom.h> @@ -56,24 +57,52 @@ #include <net/if.h> #include <inet/arp.h> #include <netinet/ip6.h> +#include <netinet/arp.h> #include <inet/ip.h> #include <inet/ip_ire.h> +#include <inet/ip_ndp.h> #include <inet/mib2.h> #include <inet/arp_impl.h> -#ifdef ARP_DEBUG -#define arp0dbg(a) printf a -#define arp1dbg(a) if (arp_debug) printf a -#define arp2dbg(a) if (arp_debug > 1) printf a -#define arp3dbg(a) if (arp_debug > 2) printf a -#else -#define arp0dbg(a) /* */ -#define arp1dbg(a) /* */ -#define arp2dbg(a) /* */ -#define arp3dbg(a) /* */ -#endif +/* + * ARP entry life time and design notes + * ------------------------------------ + * + * ARP entries (ACEs) must last at least as long as IP knows about a given + * MAC-IP translation (i.e., as long as the IRE cache entry exists). It's ok + * if the ARP entry lasts longer, but not ok if it is removed before the IP + * entry. The reason for this is that if ARP doesn't have an entry, we will be + * unable to detect the difference between an ARP broadcast that represents no + * change (same, known address of sender) and one that represents a change (new + * address for existing entry). In the former case, we must not notify IP, or + * we can suffer hurricane attack. In the latter case, we must notify IP, or + * IP will drift out of sync with the network. + * + * Note that IP controls the lifetime of entries, not ARP. + * + * We don't attempt to reconfirm aging entries. If the system is no longer + * talking to a given peer, then it doesn't matter if we have the right mapping + * for that peer. It would be possible to send queries on aging entries that + * are active, but this isn't done. + */ + +/* + * This is used when scanning for "old" (least recently broadcast) ACEs. We + * don't want to have to walk the list for every single one, so we gather up + * batches at a time. + */ +#define ACE_RESCHED_LIST_LEN 8 + +typedef struct { + arl_t *art_arl; + uint_t art_naces; + ace_t *art_aces[ACE_RESCHED_LIST_LEN]; +} ace_resched_t; #define ACE_RESOLVED(ace) ((ace)->ace_flags & ACE_F_RESOLVED) +#define ACE_NONPERM(ace) \ + (((ace)->ace_flags & (ACE_F_RESOLVED | ACE_F_PERMANENT)) == \ + ACE_F_RESOLVED) #define AR_DEF_XMIT_INTERVAL 500 /* time in milliseconds */ #define AR_LL_HDR_SLACK 32 /* Leave the lower layer some room */ @@ -82,6 +111,13 @@ #define AR_DRAINING (void *)0x11 /* + * The IPv4 Link Local address space is special; we do extra duplicate checking + * there, as the entire assignment mechanism rests on random numbers. + */ +#define IS_IPV4_LL_SPACE(ptr) (((uchar_t *)ptr)[0] == 169 && \ + ((uchar_t *)ptr)[1] == 254) + +/* * Check if the command needs to be enqueued by seeing if there are other * commands ahead of us or if some DLPI response is being awaited. Usually * there would be an enqueued command in the latter case, however if the @@ -94,33 +130,9 @@ (mp->b_prev != AR_DRAINING && (arl->arl_queue != NULL || \ arl->arl_dlpi_pending != DL_PRIM_INVAL)) -/* Ugly check to determine whether the module below is IP */ -#define MODULE_BELOW_IS_IP(q) \ - ((WR(q)->q_next != NULL && WR(q)->q_next->q_next != NULL) && \ - (strcmp(WR(q)->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0)) - -/* ARP Cache Entry */ -typedef struct ace_s { - struct ace_s *ace_next; /* Hash chain next pointer */ - struct ace_s **ace_ptpn; /* Pointer to previous next */ - struct arl_s *ace_arl; /* Associated arl */ - uint32_t ace_proto; /* Protocol for this ace */ - uint32_t ace_flags; - uchar_t *ace_proto_addr; - uint32_t ace_proto_addr_length; - uchar_t *ace_proto_mask; /* Mask for matching addr */ - uchar_t *ace_proto_extract_mask; /* For mappings */ - uchar_t *ace_hw_addr; - uint32_t ace_hw_addr_length; - uint32_t ace_hw_extract_start; /* For mappings */ - mblk_t *ace_mp; /* mblk we are in */ - uint32_t ace_query_count; - mblk_t *ace_query_mp; /* Head of outstanding query chain */ - int ace_publish_count; -} ace_t; - #define ACE_EXTERNAL_FLAGS_MASK \ -(ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MAPPING | ACE_F_MYADDR) + (ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MAPPING | ACE_F_MYADDR | \ + ACE_F_AUTHORITY) #define ARH_FIXED_LEN 8 @@ -165,8 +177,8 @@ static int ar_ce_create(arl_t *arl, uint32_t proto, uchar_t *hw_addr, uchar_t *proto_extract_mask, uint32_t hw_extract_start, uint32_t flags); static void ar_ce_delete(ace_t *ace); -static void ar_ce_delete_per_arl(ace_t *ace, arl_t *arl); -static ace_t **ar_ce_hash(uint32_t proto, uchar_t *proto_addr, +static void ar_ce_delete_per_arl(ace_t *ace, void *arg); +static ace_t **ar_ce_hash(uint32_t proto, const uchar_t *proto_addr, uint32_t proto_addr_length); static ace_t *ar_ce_lookup(arl_t *arl, uint32_t proto, uchar_t *proto_addr, uint32_t proto_addr_length); @@ -175,14 +187,12 @@ static ace_t *ar_ce_lookup_entry(arl_t *arl, uint32_t proto, static ace_t *ar_ce_lookup_from_area(mblk_t *mp, ace_t *matchfn()); static ace_t *ar_ce_lookup_mapping(arl_t *arl, uint32_t proto, uchar_t *proto_addr, uint32_t proto_addr_length); -static int ar_ce_report(queue_t *q, mblk_t *mp, caddr_t data, cred_t *cr); -static void ar_ce_report1(ace_t *ace, uchar_t *mp_arg); -static void ar_ce_resolve(ace_t *ace, uchar_t *hw_addr, +static boolean_t ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length); -static void ar_ce_walk(pfi_t pfi, void *arg1); +static void ar_ce_walk(void (*pfi)(ace_t *, void *), void *arg1); static void ar_cleanup(void); -static void ar_client_notify(arl_t *arl, mblk_t *mp, int code); +static void ar_client_notify(const arl_t *arl, mblk_t *mp, int code); static int ar_close(queue_t *q); static int ar_cmd_dispatch(queue_t *q, mblk_t *mp); static mblk_t *ar_dlpi_comm(t_uscalar_t prim, size_t size); @@ -215,7 +225,7 @@ static int ar_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t ar_param_register(arpparam_t *arppa, int cnt); static int ar_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static int ar_query_delete(ace_t *ace, uchar_t *ar); +static void ar_query_delete(ace_t *ace, void *ar); static void ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, uint32_t proto_addr_len); static clock_t ar_query_xmit(ace_t *ace, ace_t *src_ace); @@ -227,25 +237,16 @@ static int ar_slifname(queue_t *q, mblk_t *mp); static int ar_set_ppa(queue_t *q, mblk_t *mp); static int ar_snmp_msg(queue_t *q, mblk_t *mp_orig); static void ar_snmp_msg2(ace_t *, void *); -static void ar_timer_init(queue_t *q); -static int ar_trash(ace_t *ace, uchar_t *arg); static void ar_wput(queue_t *q, mblk_t *mp); static void ar_wsrv(queue_t *q); static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, - uint32_t plen, uchar_t *haddr1, uchar_t *paddr1, - uchar_t *haddr2, uchar_t *paddr2); -static int ar_xmit_request(queue_t *q, mblk_t *mp); -static int ar_xmit_response(queue_t *q, mblk_t *mp); + uint32_t plen, const uchar_t *haddr1, const uchar_t *paddr1, + const uchar_t *haddr2, const uchar_t *paddr2, const uchar_t *dstaddr); static uchar_t *ar_snmp_msg_element(mblk_t **, uchar_t *, size_t); static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd, boolean_t); static mblk_t *ar_cmd_dequeue(arl_t *arl); -#if 0 -static void show_ace(char *str, ace_t *ace); -static void show_arp(char *str, mblk_t *mp); -#endif - /* * All of these are alterable, within the min/max values given, * at run time. arp_publish_interval and arp_publish_count are @@ -256,16 +257,34 @@ static void show_arp(char *str, mblk_t *mp); */ static arpparam_t arp_param_arr[] = { /* min max value name */ - { 0, 10, 0, "arp_debug"}, { 30000, 3600000, 300000, "arp_cleanup_interval"}, { 1000, 20000, 2000, "arp_publish_interval"}, { 1, 20, 5, "arp_publish_count"}, + { 0, 20000, 1000, "arp_probe_delay"}, + { 10, 20000, 1500, "arp_probe_interval"}, + { 0, 20, 3, "arp_probe_count"}, + { 0, 20000, 100, "arp_fastprobe_delay"}, + { 10, 20000, 150, "arp_fastprobe_interval"}, + { 0, 20, 3, "arp_fastprobe_count"}, + { 0, 3600000, 300000, "arp_defend_interval"}, + { 0, 20000, 100, "arp_defend_rate"}, + { 0, 3600000, 15000, "arp_broadcast_interval"}, + { 5, 86400, 3600, "arp_defend_period"} }; -#define arp_debug arp_param_arr[0].arp_param_value -#define arp_timer_interval arp_param_arr[1].arp_param_value -#define arp_publish_interval arp_param_arr[2].arp_param_value -#define arp_publish_count arp_param_arr[3].arp_param_value +#define arp_cleanup_interval arp_param_arr[0].arp_param_value +#define arp_publish_interval arp_param_arr[1].arp_param_value +#define arp_publish_count arp_param_arr[2].arp_param_value +#define arp_probe_delay arp_param_arr[3].arp_param_value +#define arp_probe_interval arp_param_arr[4].arp_param_value +#define arp_probe_count arp_param_arr[5].arp_param_value +#define arp_fastprobe_delay arp_param_arr[6].arp_param_value +#define arp_fastprobe_interval arp_param_arr[7].arp_param_value +#define arp_fastprobe_count arp_param_arr[8].arp_param_value +#define arp_defend_interval arp_param_arr[9].arp_param_value +#define arp_defend_rate arp_param_arr[10].arp_param_value +#define arp_broadcast_interval arp_param_arr[11].arp_param_value +#define arp_defend_period arp_param_arr[12].arp_param_value static struct module_info info = { 0, "arp", 0, INFPSZ, 512, 128 @@ -289,27 +308,24 @@ static arl_t *arl_g_head; /* ARL List Head */ /* * TODO: we need a better mechanism to set the ARP hardware type since - * the DLPI mac type does not include enough prodefined values. + * the DLPI mac type does not include enough predefined values. */ static ar_m_t ar_m_tbl[] = { - { DL_CSMACD, 1, -2, 6}, /* 802.3 */ - { DL_TPB, 6, -2, 6}, /* 802.4 */ - { DL_TPR, 6, -2, 6}, /* 802.5 */ - { DL_METRO, 6, -2, 6}, /* 802.6 */ - { DL_ETHER, 1, -2, 6}, /* Ethernet */ - { DL_FDDI, 1, -2, 6}, /* FDDI */ - { DL_IB, 32, -2, 20}, /* Infiniband */ - { DL_OTHER, 1, -2, 6}, /* unknown */ + { DL_CSMACD, ARPHRD_ETHER, -2, 6}, /* 802.3 */ + { DL_TPB, ARPHRD_IEEE802, -2, 6}, /* 802.4 */ + { DL_TPR, ARPHRD_IEEE802, -2, 6}, /* 802.5 */ + { DL_METRO, ARPHRD_IEEE802, -2, 6}, /* 802.6 */ + { DL_ETHER, ARPHRD_ETHER, -2, 6}, /* Ethernet */ + { DL_FDDI, ARPHRD_ETHER, -2, 6}, /* FDDI */ + { DL_IB, ARPHRD_IB, -2, 20}, /* Infiniband */ + { DL_OTHER, ARPHRD_ETHER, -2, 6}, /* unknown */ }; /* ARP Cache Entry Hash Table */ -static ace_t *ar_ce_hash_tbl[256]; +static ace_t *ar_ce_hash_tbl[ARP_HASH_SIZE]; static ace_t *ar_ce_mask_entries; /* proto_mask not all ones */ -static mblk_t *ar_timer_mp; /* garbage collection timer */ -static queue_t *ar_timer_queue; /* queue for garbage collection */ - /* * Note that all routines which need to queue the message for later * processing have to be ioctl_aware to be able to queue the complete message. @@ -318,6 +334,16 @@ static queue_t *ar_timer_queue; /* queue for garbage collection */ #define ARF_IOCTL_AWARE 0x1 /* Arp command can come down as M_IOCTL */ #define ARF_ONLY_CMD 0x2 /* Command is exclusive to ARP */ +/* ARP Cmd Table entry */ +typedef struct arct_s { + int (*arct_pfi)(queue_t *, mblk_t *); + uint32_t arct_cmd; + int arct_min_len; + uint32_t arct_flags; + int arct_priv_req; /* Privilege required for this cmd */ + const char *arct_txt; +} arct_t; + static arct_t ar_cmd_tbl[] = { { ar_entry_add, AR_ENTRY_ADD, sizeof (area_t), ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_ENTRY_ADD" }, @@ -327,10 +353,6 @@ static arct_t ar_cmd_tbl[] = { ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_NP, "AR_ENTRY_QUERY" }, { ar_entry_squery, AR_ENTRY_SQUERY, sizeof (area_t), ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_NP, "AR_ENTRY_SQUERY" }, - { ar_xmit_request, AR_XMIT_REQUEST, sizeof (areq_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_XMIT_REQUEST" }, - { ar_xmit_response, AR_XMIT_RESPONSE, sizeof (areq_t), - ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_XMIT_RESPONSE" }, { ar_mapping_add, AR_MAPPING_ADD, sizeof (arma_t), ARF_IOCTL_AWARE | ARF_ONLY_CMD, OP_CONFIG, "AR_MAPPING_ADD" }, { ar_interface_up, AR_INTERFACE_UP, sizeof (arc_t), @@ -372,7 +394,7 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, if ((flags & ~ACE_EXTERNAL_FLAGS_MASK) || arl == NULL) return (EINVAL); if (flags & ACE_F_MYADDR) - flags |= ACE_F_PUBLISH; + flags |= ACE_F_PUBLISH | ACE_F_AUTHORITY; if (!hw_addr && hw_addr_len == 0) { if (flags == ACE_F_PERMANENT) { /* Not publish */ @@ -398,6 +420,17 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, return (EINVAL); if (!proto_extract_mask && (flags & ACE_F_MAPPING)) return (EINVAL); + + /* + * If the underlying link doesn't have reliable up/down notification or + * if we're working with the IPv4 169.254.0.0/16 Link Local Address + * space, then don't use the fast timers. Otherwise, use them. + */ + if (arl->arl_notifies && + !(proto == IP_ARP_PROTO_TYPE && IS_IPV4_LL_SPACE(proto_addr))) { + flags |= ACE_F_FAST; + } + /* * Allocate the timer block to hold the ace. * (ace + proto_addr + proto_addr_mask + proto_extract_mask + hw_addr) @@ -425,15 +458,15 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, * subnet structure, if, for example, there are BSD4.2 systems lurking. */ ace->ace_proto_mask = dst; - if (proto_mask) { + if (proto_mask != NULL) { bcopy(proto_mask, dst, proto_addr_len); dst += proto_addr_len; } else { - while (proto_addr_len--) + while (proto_addr_len-- > 0) *dst++ = (uchar_t)~0; } - if (proto_extract_mask) { + if (proto_extract_mask != NULL) { ace->ace_proto_extract_mask = dst; bcopy(proto_extract_mask, dst, ace->ace_proto_addr_length); dst += ace->ace_proto_addr_length; @@ -443,21 +476,22 @@ ar_ce_create(arl_t *arl, uint_t proto, uchar_t *hw_addr, uint_t hw_addr_len, ace->ace_hw_extract_start = hw_extract_start; ace->ace_hw_addr_length = hw_addr_len; ace->ace_hw_addr = dst; - if (hw_addr) { + if (hw_addr != NULL) { bcopy(hw_addr, dst, hw_addr_len); dst += hw_addr_len; } ace->ace_arl = arl; ace->ace_flags = flags; - ace->ace_publish_count = arp_publish_count; + if (ar_mask_all_ones(ace->ace_proto_mask, ace->ace_proto_addr_length)) { acep = ar_ce_hash(ace->ace_proto, ace->ace_proto_addr, ace->ace_proto_addr_length); - } else + } else { acep = &ar_ce_mask_entries; - if ((ace->ace_next = *acep) != 0) + } + if ((ace->ace_next = *acep) != NULL) ace->ace_next->ace_ptpn = &ace->ace_next; *acep = ace; ace->ace_ptpn = acep; @@ -488,9 +522,9 @@ ar_ce_delete(ace_t *ace) * that is going away. */ static void -ar_ce_delete_per_arl(ace_t *ace, arl_t *arl) +ar_ce_delete_per_arl(ace_t *ace, void *arl) { - if (ace != NULL && ace->ace_arl == arl) { + if (ace->ace_arl == arl) { ace->ace_flags &= ~ACE_F_PERMANENT; ar_ce_delete(ace); } @@ -498,9 +532,10 @@ ar_ce_delete_per_arl(ace_t *ace, arl_t *arl) /* Cache entry hash routine, based on protocol and protocol address. */ static ace_t ** -ar_ce_hash(uint32_t proto, uchar_t *proto_addr, uint32_t proto_addr_length) +ar_ce_hash(uint32_t proto, const uchar_t *proto_addr, + uint32_t proto_addr_length) { - uchar_t *up = proto_addr; + const uchar_t *up = proto_addr; unsigned int hval = proto; int len = proto_addr_length; @@ -647,194 +682,170 @@ ar_ce_lookup_permanent(uint32_t proto, uchar_t *proto_addr, } /* - * Pass a cache report back out via NDD. - * TODO: Right now this report assumes IP proto address formatting. - */ -/* ARGSUSED */ -static int -ar_ce_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *cr) -{ - (void) mi_mpprintf(mp, - "ifname proto addr proto mask hardware addr flags"); - /* abcdefgh xxx.xxx.xxx.xxx xxx.xxx.xxx.xxx xx:xx:xx:xx:xx:xx */ - ar_ce_walk((pfi_t)ar_ce_report1, mp); - return (0); -} - -/* - * Add a single line to the ARP Cache Entry Report. - * TODO: Right now this report assumes IP proto address formatting. + * ar_ce_resolve is called when a response comes in to an outstanding request. + * Returns 'true' if the address has changed and we need to tell the client. + * (We don't need to tell the client if there's still an outstanding query.) */ -static void -ar_ce_report1(ace_t *ace, uchar_t *mp_arg) +static boolean_t +ar_ce_resolve(ace_t *ace, const uchar_t *hw_addr, uint32_t hw_addr_length) { - static uchar_t zero_array[8]; - uint32_t flags = ace->ace_flags; - mblk_t *mp = (mblk_t *)mp_arg; - uchar_t *p = ace->ace_proto_addr; - uchar_t *h = ace->ace_hw_addr; - uchar_t *m = ace->ace_proto_mask; - const char *name = "unknown"; - - if (ace->ace_arl != NULL) - name = ace->ace_arl->arl_name; - if (p == NULL) - p = zero_array; - if (h == NULL) - h = zero_array; - if (m == NULL) - m = zero_array; - (void) mi_mpprintf(mp, - "%8s %03d.%03d.%03d.%03d " - "%03d.%03d.%03d.%03d %02x:%02x:%02x:%02x:%02x:%02x", - name, - p[0] & 0xFF, p[1] & 0xFF, p[2] & 0xFF, p[3] & 0xFF, - m[0] & 0xFF, m[1] & 0xFF, m[2] & 0xFF, m[3] & 0xFF, - h[0] & 0xFF, h[1] & 0xFF, h[2] & 0xFF, h[3] & 0xFF, - h[4] & 0xFF, h[5] & 0xFF); - if (flags & ACE_F_PERMANENT) - (void) mi_mpprintf_nr(mp, " PERM"); - if (flags & ACE_F_PUBLISH) - (void) mi_mpprintf_nr(mp, " PUBLISH"); - if (flags & ACE_F_DYING) - (void) mi_mpprintf_nr(mp, " DYING"); - if (!(flags & ACE_F_RESOLVED)) - (void) mi_mpprintf_nr(mp, " UNRESOLVED"); - if (flags & ACE_F_MAPPING) - (void) mi_mpprintf_nr(mp, " MAPPING"); - if (flags & ACE_F_MYADDR) - (void) mi_mpprintf_nr(mp, " MYADDR"); -} + boolean_t hwchanged; -/* - * ar_ce_resolve is called when a response comes in to an outstanding - * request. - */ -static void -ar_ce_resolve(ace_t *ace, uchar_t *hw_addr, uint32_t hw_addr_length) -{ if (hw_addr_length == ace->ace_hw_addr_length) { - if (ace->ace_hw_addr) + ASSERT(ace->ace_hw_addr != NULL); + hwchanged = bcmp(hw_addr, ace->ace_hw_addr, + hw_addr_length) != 0; + if (hwchanged) bcopy(hw_addr, ace->ace_hw_addr, hw_addr_length); /* - * ar_query_reply() blows away soft entries. - * Do not call it unless something is waiting. + * No need to bother with ar_query_reply if no queries are + * waiting. */ ace->ace_flags |= ACE_F_RESOLVED; - if (ace->ace_query_mp) + if (ace->ace_query_mp != NULL) ar_query_reply(ace, 0, NULL, (uint32_t)0); + else if (hwchanged) + return (B_TRUE); } + return (B_FALSE); } /* * There are 2 functions performed by this function. * 1. Resolution of unresolved entries and update of resolved entries. - * 2. Detection of hosts with (duplicate) our own IP address + * 2. Detection of nodes with our own IP address (duplicates). + * + * This is complicated by ill groups. We don't currently have knowledge of ill + * groups, so we can't distinguish between a packet that comes in on one of the + * arls that's part of the group versus one that's on an unrelated arl. Thus, + * we take a conservative approach. If the arls match, then we update resolved + * and unresolved entries alike. If they don't match, then we update only + * unresolved entries. * - * Resolution of unresolved entries and update of resolved entries. + * For all entries, we first check to see if this is a duplicate (probable + * loopback) message. If so, then just ignore it. * - * case A. The packet has been received on the same interface as this ace's - * arl. We blindly call ar_ce_resolve(). The relevant checks for duplicate - * detection (ACE_F_MYADDR) and trying to update published entries have - * already happened in ar_rput(). Both resolved and unresolved entries are - * updated now. This allows a published entry to be updated by an arp - * request, from the node for which we are a proxy arp server, as for eg. - * when a mobile node returns home. + * Next, check to see if the entry has completed DAD. If not, then we've + * failed, because someone is already using the address. Notify IP of the DAD + * failure and remove the broken ace. * - * case B. The interface on which the packet arrived does not match the - * ace's arl. In this case we update only unresolved entries. - * Look whether we have an unresolved entry for src_paddr and if so - * resolve it. We need to look at all the aces that matches the - * src_haddr because with ill groups we could have unresolved ace - * across the whole group. As we don't have knowledge of groups, - * look across all of them. Note that this logic does not update published - * arp entries, as for eg. when we proxy arp across 2 subnets with - * differing subnet masks. + * Next, we check if we're the authority for this address. If so, then it's + * time to defend it, because the other node is a duplicate. Report it as a + * 'bogon' and let IP decide how to defend. * - * Detection of hosts with (duplicate) our own IP address. + * Finally, if it's unresolved or if the arls match, we just update the MAC + * address. This allows a published 'static' entry to be updated by an ARP + * request from the node for which we're a proxy ARP server -- e.g., when a + * mobile node returns home. If the address has changed, then tell IP. * - * case A is handled in ar_rput(). case B is handled here. We return AR_BOGON, - * if we detect duplicate, and caller will send BOGON message to IP. - * If hme0 and hme1 are in a IPMP group. hme1 will receive broadcast arp - * packets sent from hme0. Both IP address and Hardware address of the - * packet match the ace. So we return AR_LOOPBACK. + * Note that this logic does not update published ARP entries for mismatched + * arls, as for example when we proxy arp across 2 subnets with differing + * subnet masks. * * Return Values below */ -#define AR_NORMAL 1 /* Usual return value. */ -#define AR_LOOPBACK 2 /* Our own broadcast arp packet was received */ -#define AR_BOGON 3 /* Another host has our IP addr. */ +#define AR_NOTFOUND 1 /* No matching ace found in cache */ +#define AR_MERGED 2 /* Matching ace updated (RFC 826 Merge_flag) */ +#define AR_LOOPBACK 3 /* Our own arp packet was received */ +#define AR_BOGON 4 /* Another host has our IP addr. */ +#define AR_FAILED 5 /* Duplicate Address Detection has failed */ +#define AR_CHANGED 6 /* Address has changed; tell IP (and merged) */ static int -ar_ce_resolve_all(arl_t *arl, uint32_t proto, uchar_t *src_haddr, - uint32_t hlen, uchar_t *src_paddr, uint32_t plen) +ar_ce_resolve_all(arl_t *arl, uint32_t proto, const uchar_t *src_haddr, + uint32_t hlen, const uchar_t *src_paddr, uint32_t plen) { ace_t *ace; ace_t *ace_next; + int i1; + const uchar_t *paddr; + uchar_t *ace_addr; + uchar_t *mask; + int retv = AR_NOTFOUND; ace = *ar_ce_hash(proto, src_paddr, plen); for (; ace != NULL; ace = ace_next) { + /* ar_ce_resolve may delete the ace; fetch next pointer now */ ace_next = ace->ace_next; - if (ace->ace_proto_addr_length == plen && - ace->ace_proto == proto) { - int i1 = plen; - uchar_t *ace_addr = ace->ace_proto_addr; - uchar_t *mask = ace->ace_proto_mask; + if (ace->ace_proto_addr_length != plen || + ace->ace_proto != proto) { + continue; + } - /* - * Note that the ace_proto_mask is applied to the - * proto_addr before comparing to the ace_addr. - */ - do { - if (--i1 < 0) { - /* - * Limit updating across other - * ills to unresolved entries only. - * We don't want to inadvertently - * update published entries or our - * own entries. - */ - if ((ace->ace_arl == arl) || - (!ACE_RESOLVED(ace))) { - ar_ce_resolve(ace, src_haddr, hlen); - } else { - /* - * If both IP addr and hardware - * address match our's then this - * is a broadcast packet emitted by - * one of our interfaces, reflected - * by the switch, and received on - * another interface. We return - * AR_LOOPBACK. If only IP addr. - * matches our's then some other node - * is using our IP addr, return - * AR_BOGON. - */ - if (ace->ace_flags & ACE_F_MYADDR) { - if (bcmp(ace->ace_hw_addr, - src_haddr, - ace->ace_hw_addr_length) != 0) { - return (AR_BOGON); - } else { - return (AR_LOOPBACK); - } - - } - } + /* + * Note that the ace_proto_mask is applied to the proto_addr + * before comparing to the ace_addr. + */ + paddr = src_paddr; + i1 = plen; + ace_addr = ace->ace_proto_addr; + mask = ace->ace_proto_mask; + while (--i1 >= 0) { + if ((*paddr++ & *mask++) != *ace_addr++) break; - } - } while ((src_paddr[i1] & mask[i1]) == ace_addr[i1]); + } + if (i1 >= 0) + continue; + + /* + * If both IP addr and hardware address match what we already + * have, then this is a broadcast packet emitted by one of our + * interfaces, reflected by the switch and received on another + * interface. We return AR_LOOPBACK. + */ + if ((ace->ace_flags & ACE_F_MYADDR) && + hlen == ace->ace_hw_addr_length && + bcmp(ace->ace_hw_addr, src_haddr, + ace->ace_hw_addr_length) == 0) { + return (AR_LOOPBACK); + } + + /* + * If the entry is unverified, then we've just verified that + * someone else already owns this address, because this is a + * message with the same protocol address but different + * hardware address. + */ + if (ace->ace_flags & ACE_F_UNVERIFIED) { + ar_ce_delete(ace); + return (AR_FAILED); + } + + /* + * If the IP address matches ours and we're authoritative for + * this entry, then some other node is using our IP addr, so + * return AR_BOGON. Also reset the transmit count to zero so + * that, if we're currently in initial announcement mode, we + * switch back to the lazier defense mode. Knowing that + * there's at least one duplicate out there, we ought not + * blindly announce. + */ + if (ace->ace_flags & ACE_F_AUTHORITY) { + ace->ace_xmit_count = 0; + return (AR_BOGON); + } + + /* + * Limit updating across other ills to unresolved + * entries only. We don't want to inadvertently update + * published entries. + */ + if (ace->ace_arl == arl || !ACE_RESOLVED(ace)) { + if (ar_ce_resolve(ace, src_haddr, hlen)) + retv = AR_CHANGED; + else if (retv == AR_NOTFOUND) + retv = AR_MERGED; } } - return (AR_NORMAL); + return (retv); } /* Pass arg1 to the pfi supplied, along with each ace in existence. */ static void -ar_ce_walk(pfi_t pfi, void *arg1) +ar_ce_walk(void (*pfi)(ace_t *, void *), void *arg1) { ace_t *ace; ace_t *ace1; @@ -870,7 +881,7 @@ ar_cleanup(void) * DEV (i.e. ARL). */ static void -ar_client_notify(arl_t *arl, mblk_t *mp, int code) +ar_client_notify(const arl_t *arl, mblk_t *mp, int code) { ar_t *ar = ((ar_t *)arl->arl_rq->q_ptr)->ar_arl_ip_assoc; arcn_t *arcn; @@ -904,6 +915,39 @@ ar_client_notify(arl_t *arl, mblk_t *mp, int code) putnext(ar->ar_wq, mp1); } +/* + * Send a delete-notify message down to IP. We've determined that IP doesn't + * have a cache entry for the IP address itself, but it may have other cache + * entries with the same hardware address, and we don't want to see those grow + * stale. (The alternative is sending down updates for every ARP message we + * get that doesn't match an existing ace. That's much more expensive than an + * occasional delete and reload.) + */ +static void +ar_delete_notify(const ace_t *ace) +{ + const arl_t *arl = ace->ace_arl; + mblk_t *mp; + size_t len; + arh_t *arh; + + len = sizeof (*arh) + 2 * ace->ace_proto_addr_length; + mp = allocb(len, BPRI_MED); + if (mp == NULL) + return; + arh = (arh_t *)mp->b_rptr; + mp->b_wptr = (uchar_t *)arh + len; + U16_TO_BE16(arl->arl_arp_hw_type, arh->arh_hardware); + U16_TO_BE16(ace->ace_proto, arh->arh_proto); + arh->arh_hlen = 0; + arh->arh_plen = ace->ace_proto_addr_length; + U16_TO_BE16(ARP_RESPONSE, arh->arh_operation); + bcopy(ace->ace_proto_addr, arh + 1, ace->ace_proto_addr_length); + bcopy(ace->ace_proto_addr, (uchar_t *)(arh + 1) + + ace->ace_proto_addr_length, ace->ace_proto_addr_length); + ar_client_notify(arl, mp, AR_CN_ANNOUNCE); +} + /* ARP module close routine. */ static int ar_close(queue_t *q) @@ -926,7 +970,7 @@ ar_close(queue_t *q) * an ack. This helps to make sure that messages * that are currently being sent up by IP are not lost. */ - if (MODULE_BELOW_IS_IP(q)) { + if (ar->ar_on_ill_stream) { mp1 = allocb(sizeof (arc_t), BPRI_MED); if (mp1 != NULL) { DB_TYPE(mp1) = M_CTL; @@ -963,7 +1007,7 @@ ar_close(queue_t *q) * If this is the control stream for an arl, delete anything * hanging off our arl. */ - ar_ce_walk((pfi_t)ar_ce_delete_per_arl, arl); + ar_ce_walk(ar_ce_delete_per_arl, arl); /* Free any messages waiting for a bind_ack */ /* Get the arl out of the chain. */ for (arlp = &arl_g_head; arlp[0]; arlp = &arlp[0]->arl_next) { @@ -984,21 +1028,6 @@ ar_close(queue_t *q) ar->ar_arl_ip_assoc->ar_arl_ip_assoc = NULL; ar->ar_arl_ip_assoc = NULL; } - if (WR(q) == ar_timer_queue) { - /* We were using this one for the garbage collection timer. */ - for (arl = arl_g_head; arl; arl = arl->arl_next) - if (arl->arl_rq != q) - break; - if (arl) { - ar_timer_queue = arl->arl_wq; - /* Ask mi_timer to switch to the new queue. */ - mi_timer(ar_timer_queue, ar_timer_mp, -2); - } else { - mi_timer_free(ar_timer_mp); - ar_timer_mp = NULL; - ar_timer_queue = NULL; - } - } cr = ar->ar_credp; /* mi_close_comm frees the instance data. */ (void) mi_close_comm(&ar_g_head, q); @@ -1067,7 +1096,8 @@ ar_cmd_dispatch(queue_t *q, mblk_t *mp_orig) if (arct->arct_flags & ARF_IOCTL_AWARE) mp = mp_orig; - arp2dbg(("ar_cmd_dispatch: %s\n", arct->arct_txt)); + DTRACE_PROBE3(cmd_dispatch, queue_t *, q, mblk_t *, mp, + arct_t *, arct); return (*arct->arct_pfi)(q, mp); } @@ -1104,31 +1134,25 @@ ar_dlpi_comm(t_uscalar_t prim, size_t size) static void ar_dlpi_send(arl_t *arl, mblk_t *mp) { - mblk_t **mpp; - union DL_primitives *dlp; - ASSERT(arl != NULL); - ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - dlp = (union DL_primitives *)mp->b_rptr; if (arl->arl_dlpi_pending != DL_PRIM_INVAL) { + mblk_t **mpp; + /* Must queue message. Tail insertion */ mpp = &arl->arl_dlpi_deferred; while (*mpp != NULL) mpp = &((*mpp)->b_next); - - arp1dbg(("ar_dlpi_send: deferring DLPI message arl %p %x\n", - (void *)arl, dlp->dl_primitive)); - *mpp = mp; + + DTRACE_PROBE2(dlpi_defer, arl_t *, arl, mblk_t *, mp); return; } - arp1dbg(("ar_dlpi_send: sending DLPI message arl %p %x\n", (void *)arl, - dlp->dl_primitive)); - - arl->arl_dlpi_pending = dlp->dl_primitive; + arl->arl_dlpi_pending = + ((union DL_primitives *)mp->b_rptr)->dl_primitive; + DTRACE_PROBE2(dlpi_send, arl_t *, arl, mblk_t *, mp); putnext(arl->arl_wq, mp); } @@ -1141,16 +1165,16 @@ ar_dlpi_send(arl_t *arl, mblk_t *mp) static void ar_dlpi_done(arl_t *arl, t_uscalar_t prim) { - mblk_t *mp; - union DL_primitives *dlp; + mblk_t *mp; if (arl->arl_dlpi_pending != prim) { - arp0dbg(("ar_dlpi_done: spurious response arl %p\n", - (void *)arl)); + DTRACE_PROBE2(dlpi_done_unexpected, arl_t *, arl, + t_uscalar_t, prim); return; } if ((mp = arl->arl_dlpi_deferred) == NULL) { + DTRACE_PROBE2(dlpi_done_idle, arl_t *, arl, t_uscalar_t, prim); arl->arl_dlpi_pending = DL_PRIM_INVAL; ar_cmd_done(arl); return; @@ -1160,12 +1184,10 @@ ar_dlpi_done(arl_t *arl, t_uscalar_t prim) mp->b_next = NULL; ASSERT(DB_TYPE(mp) == M_PROTO || DB_TYPE(mp) == M_PCPROTO); - dlp = (union DL_primitives *)mp->b_rptr; - arp1dbg(("ar_dlpi_done: sending DLPI message arl %p %x\n", - (void *)arl, dlp->dl_primitive)); - - arl->arl_dlpi_pending = dlp->dl_primitive; + arl->arl_dlpi_pending = + ((union DL_primitives *)mp->b_rptr)->dl_primitive; + DTRACE_PROBE2(dlpi_done_next, arl_t *, arl, mblk_t *, mp); putnext(arl->arl_wq, mp); } @@ -1268,8 +1290,8 @@ ar_cmd_done(arl_t *arl) done: if (dlpi_op_done_mp != NULL) { - arp1dbg(("ar_dlpi_done: ardlpiopdone arl %p to q %p err %d\n", - (void *)arl, (void *)dlpi_op_done_q, err)); + DTRACE_PROBE3(cmd_done_next, arl_t *, arl, + queue_t *, dlpi_op_done_q, mblk_t *, dlpi_op_done_mp); putnext(dlpi_op_done_q, dlpi_op_done_mp); } } @@ -1295,9 +1317,6 @@ static void ar_cmd_enqueue(arl_t *arl, mblk_t *mp, queue_t *q, ushort_t cmd, boolean_t tail_insert) { - arp1dbg(("ar_cmd_enqueue: arl %p from q %p cmd %d \n", (void *)arl, - (void *)q, cmd)); - mp->b_queue = q; if (arl->arl_queue == NULL) { ASSERT(arl->arl_queue_tail == NULL); @@ -1336,6 +1355,38 @@ ar_cmd_dequeue(arl_t *arl) } /* + * Standard ACE timer handling: compute 'fuzz' around a central value or from 0 + * up to a value, and then set the timer. The randomization is necessary to + * prevent groups of systems from falling into synchronization on the network + * and producing ARP packet storms. + */ +static void +ace_set_timer(ace_t *ace, boolean_t initial_time) +{ + clock_t intv, rnd, frac; + + (void) random_get_pseudo_bytes((uint8_t *)&rnd, sizeof (rnd)); + /* Note that clock_t is signed; must chop off bits */ + rnd &= (1ul << (NBBY * sizeof (rnd) - 1)) - 1; + intv = ace->ace_xmit_interval; + if (initial_time) { + /* Set intv to be anywhere in the [1 .. intv] range */ + if (intv <= 0) + intv = 1; + else + intv = (rnd % intv) + 1; + } else { + /* Compute 'frac' as 20% of the configured interval */ + if ((frac = intv / 5) <= 1) + frac = 2; + /* Set intv randomly in the range [intv-frac .. intv+frac] */ + if ((intv = intv - frac + rnd % (2 * frac + 1)) <= 0) + intv = 1; + } + mi_timer(ace->ace_arl->arl_wq, ace->ace_mp, intv); +} + +/* * Process entry add requests from external messages. * It is also called by ip_rput_dlpi_writer() through * ipif_resolver_up() to change hardware address when @@ -1355,6 +1406,8 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) arl_t *arl; mblk_t *mp = mp_orig; int err; + uint_t aflags; + boolean_t unverified; /* We handle both M_IOCTL and M_PROTO messages. */ if (DB_TYPE(mp) == M_IOCTL) @@ -1366,16 +1419,32 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_entry_add: enqueue cmd on q %p \n", (void *)q)); + DTRACE_PROBE3(eadd_enqueued, queue_t *, q, mblk_t *, mp_orig, + arl_t *, arl); ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_ADD, B_TRUE); return (EINPROGRESS); } mp_orig->b_prev = NULL; area = (area_t *)mp->b_rptr; - /* If this is a replacement, ditch the original. */ - if ((ace = ar_ce_lookup_from_area(mp, ar_ce_lookup_entry)) != 0) + aflags = area->area_flags; + + /* + * If this is a replacement, ditch the original, but remember the + * duplicate address detection state. If it's a new entry, then we're + * obligated to do duplicate address detection now. + */ + if ((ace = ar_ce_lookup_from_area(mp, ar_ce_lookup_entry)) != NULL) { + unverified = (ace->ace_flags & ACE_F_UNVERIFIED) != 0; ar_ce_delete(ace); + } else { + unverified = (aflags & ACE_F_PUBLISH) != 0; + } + + /* Allow client to request DAD restart */ + if (aflags & ACE_F_UNVERIFIED) + unverified = B_TRUE; + /* Extract parameters from the message. */ hw_addr_len = area->area_hw_addr_length; hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); @@ -1384,29 +1453,31 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) proto_addr_len); proto_mask = mi_offset_paramc(mp, area->area_proto_mask_offset, proto_addr_len); - if (!proto_mask) + if (proto_mask == NULL) { + DTRACE_PROBE2(eadd_bad_mask, arl_t *, arl, area_t *, area); return (EINVAL); + } err = ar_ce_create( arl, - area->area_proto, - hw_addr, - hw_addr_len, - proto_addr, - proto_addr_len, - proto_mask, - NULL, - (uint32_t)0, - area->area_flags & ~ACE_F_MAPPING); - if (err) + area->area_proto, + hw_addr, + hw_addr_len, + proto_addr, + proto_addr_len, + proto_mask, + NULL, + (uint32_t)0, + aflags & ~ACE_F_MAPPING & ~ACE_F_UNVERIFIED & ~ACE_F_DEFEND); + if (err != 0) { + DTRACE_PROBE3(eadd_create_failed, arl_t *, arl, area_t *, area, + int, err); return (err); - if (area->area_flags & ACE_F_PUBLISH) { - /* - * Transmit an arp request for this address to flush stale - * information froma arp caches. - */ + } + + if (aflags & ACE_F_PUBLISH) { if (hw_addr == NULL || hw_addr_len == 0) { hw_addr = arl->arl_hw_addr; - } else if (area->area_flags & ACE_F_MYADDR) { + } else if (aflags & ACE_F_MYADDR) { /* * If hardware address changes, then make sure * that the hardware address and hardware @@ -1422,23 +1493,79 @@ ar_entry_add(queue_t *q, mblk_t *mp_orig) ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len); ASSERT(ace != NULL); - ar_xmit(arl, ARP_REQUEST, area->area_proto, proto_addr_len, - hw_addr, proto_addr, arl->arl_arp_addr, - proto_addr); + + if (ace->ace_flags & ACE_F_FAST) { + ace->ace_xmit_count = arp_fastprobe_count; + ace->ace_xmit_interval = arp_fastprobe_delay; + } else { + ace->ace_xmit_count = arp_probe_count; + ace->ace_xmit_interval = arp_probe_delay; + } + + /* + * If the user has disabled duplicate address detection for + * this kind of interface (fast or slow) by setting the probe + * count to zero, then pretend as if we've verified the + * address, and go right to address defense mode. + */ + if (ace->ace_xmit_count == 0) + unverified = B_FALSE; /* - * If MYADDR is set - it is not a proxy arp entry. In that - * case we send more than one copy, so that if this is - * a case of failover, we send out multiple entries in case - * the switch is very slow. + * If we need to do duplicate address detection, then kick that + * off. Otherwise, send out a gratuitous ARP message in order + * to update everyone's caches with the new hardware address. */ - if ((area->area_flags & ACE_F_MYADDR) && - ace->ace_publish_count != 0 && arp_publish_interval != 0) { - /* Account for the xmit we just did */ - ace->ace_publish_count--; - if (ace->ace_publish_count != 0) { - mi_timer(arl->arl_wq, ace->ace_mp, - arp_publish_interval); + if (unverified) { + ace->ace_flags |= ACE_F_UNVERIFIED; + if (ace->ace_xmit_interval == 0) { + /* + * User has configured us to send the first + * probe right away. Do so, and set up for + * the subsequent probes. + */ + DTRACE_PROBE2(eadd_probe, ace_t *, ace, + area_t *, area); + ar_xmit(arl, ARP_REQUEST, area->area_proto, + proto_addr_len, hw_addr, NULL, NULL, + proto_addr, NULL); + ace->ace_xmit_count--; + ace->ace_xmit_interval = + (ace->ace_flags & ACE_F_FAST) ? + arp_fastprobe_interval : + arp_probe_interval; + ace_set_timer(ace, B_FALSE); + } else { + DTRACE_PROBE2(eadd_delay, ace_t *, ace, + area_t *, area); + /* Regular delay before initial probe */ + ace_set_timer(ace, B_TRUE); + } + } else { + DTRACE_PROBE2(eadd_announce, ace_t *, ace, + area_t *, area); + ar_xmit(arl, ARP_REQUEST, area->area_proto, + proto_addr_len, hw_addr, proto_addr, + arl->arl_arp_addr, proto_addr, NULL); + ace->ace_last_bcast = ddi_get_lbolt(); + + /* + * If AUTHORITY is set, it is not just a proxy arp + * entry; we believe we're the authority for this + * entry. In that case, and if we're not just doing + * one-off defense of the address, we send more than + * one copy, so that if this is an IPMP failover, we'll + * still have a good chance of updating everyone even + * when there's a packet loss or two. + */ + if ((aflags & ACE_F_AUTHORITY) && + !(aflags & ACE_F_DEFEND) && + arp_publish_count > 0) { + /* Account for the xmit we just did */ + ace->ace_xmit_count = arp_publish_count - 1; + ace->ace_xmit_interval = arp_publish_interval; + if (ace->ace_xmit_count > 0) + ace_set_timer(ace, B_FALSE); } } } @@ -1463,7 +1590,8 @@ ar_entry_delete(queue_t *q, mblk_t *mp_orig) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_entry_delete: enqueue on q %p\n", (void *)q)); + DTRACE_PROBE3(edel_enqueued, queue_t *, q, mblk_t *, mp_orig, + arl_t *, arl); ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_DELETE, B_TRUE); return (EINPROGRESS); } @@ -1474,7 +1602,13 @@ ar_entry_delete(queue_t *q, mblk_t *mp_orig) * match first. */ ace = ar_ce_lookup_from_area(mp, ar_ce_lookup); - if (ace) { + if (ace != NULL) { + /* + * If it's a permanent entry, then the client is the one who + * told us to delete it, so there's no reason to notify. + */ + if (ACE_NONPERM(ace)) + ar_delete_notify(ace); ar_ce_delete(ace); return (0); } @@ -1511,6 +1645,7 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) } arl = ar_ll_lookup_from_mp(mp); if (arl == NULL) { + DTRACE_PROBE2(query_no_arl, queue_t *, q, mblk_t *, mp); err = EINVAL; goto err_ret; } @@ -1518,7 +1653,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_entry_query: enqueue on q %p\n", (void *)q)); + DTRACE_PROBE3(query_enqueued, queue_t *, q, mblk_t *, mp_orig, + arl_t *, arl); ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_QUERY, B_TRUE); return (EINPROGRESS); } @@ -1528,7 +1664,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) proto_addr_len = areq->areq_target_addr_length; proto_addr = mi_offset_paramc(mp, areq->areq_target_addr_offset, proto_addr_len); - if (proto_addr == 0) { + if (proto_addr == NULL) { + DTRACE_PROBE1(query_illegal_address, areq_t *, areq); err = EINVAL; goto err_ret; } @@ -1538,9 +1675,22 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) if (areq->areq_xmit_interval == 0) areq->areq_xmit_interval = AR_DEF_XMIT_INTERVAL; ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len); - if (ace) { + if (ace != NULL && (ace->ace_flags & ACE_F_OLD)) { + /* + * This is a potentially stale entry that IP's asking about. + * Since IP is asking, it must not have an answer anymore, + * either due to periodic ARP flush or due to SO_DONTROUTE. + * Rather than go forward with what we've got, restart + * resolution. + */ + DTRACE_PROBE2(query_stale_ace, ace_t *, ace, areq_t *, areq); + ar_ce_delete(ace); + ace = NULL; + } + if (ace != NULL) { mblk_t **mpp; uint32_t count = 0; + /* * There is already a cache entry. This means there is either * a permanent entry, or address resolution is in progress. @@ -1550,6 +1700,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) */ for (mpp = &ace->ace_query_mp; mpp[0]; mpp = &mpp[0]->b_next) { if (++count > areq->areq_max_buffered) { + DTRACE_PROBE2(query_overflow, ace_t *, ace, + areq_t *, areq); mp->b_prev = NULL; err = EALREADY; goto err_ret; @@ -1562,6 +1714,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) * If a query was already queued up, then we must not * have an answer yet. */ + DTRACE_PROBE2(query_in_progress, ace_t *, ace, + areq_t *, areq); return (EINPROGRESS); } if (ACE_RESOLVED(ace)) { @@ -1572,6 +1726,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) */ mblk_t *mp1; + DTRACE_PROBE2(query_resolved, ace_t *, ace, + areq_t *, areq); mp1 = dupmsg(mp); ar_query_reply(ace, 0, proto_addr, proto_addr_len); freemsg(mp1); @@ -1579,22 +1735,28 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) } if (ace->ace_flags & ACE_F_MAPPING) { /* Should never happen */ - arp0dbg(("ar_entry_query: unresolved mapping\n")); + DTRACE_PROBE2(query_unresolved_mapping, ace_t *, ace, + areq_t *, areq); mpp[0] = mp->b_next; err = ENXIO; goto err_ret; } if (arl->arl_xmit_template == NULL) { /* Can't get help if we don't know how. */ + DTRACE_PROBE2(query_no_template, ace_t *, ace, + areq_t *, areq); mpp[0] = NULL; mp->b_prev = NULL; err = ENXIO; goto err_ret; } + DTRACE_PROBE2(query_unresolved, ace_t, ace, areq_t *, areq); } else { /* No ace yet. Make one now. (This is the common case.) */ if (areq->areq_xmit_count == 0 || arl->arl_xmit_template == NULL) { + DTRACE_PROBE2(query_template, arl_t *, arl, + areq_t *, areq); mp->b_prev = NULL; err = ENXIO; goto err_ret; @@ -1607,6 +1769,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) areq->areq_sender_addr_offset, areq->areq_sender_addr_length); if (sender_addr == NULL) { + DTRACE_PROBE2(query_no_sender, arl_t *, arl, + areq_t *, areq); mp->b_prev = NULL; err = EINVAL; goto err_ret; @@ -1615,14 +1779,18 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) proto_addr, proto_addr_len, NULL, NULL, (uint32_t)0, areq->areq_flags); - if (err) { + if (err != 0) { + DTRACE_PROBE3(query_create_failed, arl_t *, arl, + areq_t *, areq, int, err); mp->b_prev = NULL; goto err_ret; } ace = ar_ce_lookup(arl, areq->areq_proto, proto_addr, proto_addr_len); - if (!ace || ace->ace_query_mp) { + if (ace == NULL || ace->ace_query_mp != NULL) { /* Shouldn't happen! */ + DTRACE_PROBE3(query_lookup_failed, arl_t *, arl, + areq_t *, areq, ace_t *, ace); mp->b_prev = NULL; err = ENXIO; goto err_ret; @@ -1637,10 +1805,8 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) src_ace = ar_ce_lookup_permanent(areq->areq_proto, sender_addr, areq->areq_sender_addr_length); if (src_ace == NULL) { - printf("ar_entry_query: Could not find the ace for " - "source address %d.%d.%d.%d\n", - sender_addr[0], sender_addr[1], sender_addr[2], - sender_addr[3]); + DTRACE_PROBE3(query_source_missing, arl_t *, arl, + areq_t *, areq, ace_t *, ace); ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); /* * ar_query_reply has already freed the mp. @@ -1659,7 +1825,9 @@ ar_entry_query(queue_t *q, mblk_t *mp_orig) areq->areq_proto, proto_addr, proto_addr_len); if (dst_ace != NULL && ACE_RESOLVED(dst_ace)) { - ar_ce_resolve(ace, dst_ace->ace_hw_addr, + DTRACE_PROBE3(query_other_arl, arl_t *, arl, + areq_t *, areq, ace_t *, dst_ace); + (void) ar_ce_resolve(ace, dst_ace->ace_hw_addr, dst_ace->ace_hw_addr_length); return (EINPROGRESS); } @@ -1701,7 +1869,8 @@ ar_entry_squery(queue_t *q, mblk_t *mp_orig) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_entry_squery: enqueue on q %p\n", (void *)q)); + DTRACE_PROBE3(squery_enqueued, queue_t *, q, mblk_t *, mp_orig, + arl_t *, arl); ar_cmd_enqueue(arl, mp_orig, q, AR_ENTRY_SQUERY, B_TRUE); return (EINPROGRESS); } @@ -1714,13 +1883,17 @@ ar_entry_squery(queue_t *q, mblk_t *mp_orig) proto_addr_len); hw_addr_len = area->area_hw_addr_length; hw_addr = mi_offset_paramc(mp, area->area_hw_addr_offset, hw_addr_len); - if (!proto_addr || !hw_addr) + if (proto_addr == NULL || hw_addr == NULL) { + DTRACE_PROBE1(squery_illegal_address, area_t *, area); return (EINVAL); + } ace = ar_ce_lookup(arl, area->area_proto, proto_addr, proto_addr_len); - if (!ace) + if (ace == NULL) { return (ENXIO); - if (hw_addr_len < ace->ace_hw_addr_length) + } + if (hw_addr_len < ace->ace_hw_addr_length) { return (EINVAL); + } if (ACE_RESOLVED(ace)) { /* Got it, prepare the response. */ ASSERT(area->area_hw_addr_length == ace->ace_hw_addr_length); @@ -1736,8 +1909,9 @@ ar_entry_squery(queue_t *q, mblk_t *mp_orig) if (mp == mp_orig) { /* Non-ioctl case */ /* TODO: change message type? */ - arp1dbg(("ar_entry_squery: qreply\n")); DB_TYPE(mp) = M_CTL; /* Caught by ip_wput */ + DTRACE_PROBE3(squery_reply, queue_t *, q, mblk_t *, mp, + arl_t *, arl); qreply(q, mp); return (EINPROGRESS); } @@ -1751,10 +1925,9 @@ ar_interface_down(queue_t *q, mblk_t *mp) { arl_t *arl; - arp1dbg(("ar_interface_down q %p\n", (void *)q)); arl = ar_ll_lookup_from_mp(mp); - if ((arl == NULL) || (arl->arl_closing)) { - arp1dbg(("ar_interface_down: no arl q %p \n", (void *)q)); + if (arl == NULL || arl->arl_closing) { + DTRACE_PROBE2(down_no_arl, queue_t *, q, mblk_t *, mp); return (EINVAL); } @@ -1762,6 +1935,8 @@ ar_interface_down(queue_t *q, mblk_t *mp) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp, arl)) { + DTRACE_PROBE3(down_enqueued, queue_t *, q, mblk_t *, mp, + arl_t *, arl); ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_DOWN, B_TRUE); return (EINPROGRESS); } @@ -1784,7 +1959,7 @@ ar_interface_down(queue_t *q, mblk_t *mp) ASSERT(arl->arl_state == ARL_S_UP); /* Free all arp entries for this interface */ - ar_ce_walk((pfi_t)ar_ce_delete_per_arl, arl); + ar_ce_walk(ar_ce_delete_per_arl, arl); ar_ll_down(arl); /* Return EINPROGRESS so that ar_rput does not free the 'mp' */ @@ -1801,10 +1976,9 @@ ar_interface_up(queue_t *q, mblk_t *mp) int err; mblk_t *mp1; - arp1dbg(("ar_interface_up q %p\n", (void *)q)); arl = ar_ll_lookup_from_mp(mp); - if ((arl == NULL) || (arl->arl_closing)) { - arp1dbg(("ar_interface_up: no arl %p\n", (void *)q)); + if (arl == NULL || arl->arl_closing) { + DTRACE_PROBE2(up_no_arl, queue_t *, q, mblk_t *, mp); err = EINVAL; goto done; } @@ -1813,6 +1987,8 @@ ar_interface_up(queue_t *q, mblk_t *mp) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp, arl)) { + DTRACE_PROBE3(up_enqueued, queue_t *, q, mblk_t *, mp, + arl_t *, arl); ar_cmd_enqueue(arl, mp, q, AR_INTERFACE_UP, B_TRUE); return (EINPROGRESS); } @@ -1843,9 +2019,10 @@ done: mp1 = ar_alloc(AR_DLPIOP_DONE, err); if (mp1 != NULL) { - arp1dbg(("ar_interface_up: send resp err %d q %p\n", - err, (void *)q)); - putnext(WR(q), mp1); + q = WR(q); + DTRACE_PROBE3(up_send_err, queue_t *, q, mblk_t *, mp1, + int, err); + putnext(q, mp1); } return (err); } @@ -1860,13 +2037,13 @@ ar_interface_on(queue_t *q, mblk_t *mp) { arl_t *arl; - arp1dbg(("ar_interface_on\n")); arl = ar_ll_lookup_from_mp(mp); if (arl == NULL) { - arp1dbg(("ar_interface_on: no arl\n")); + DTRACE_PROBE2(on_no_arl, queue_t *, q, mblk_t *, mp); return (EINVAL); } /* Turn off the IFF_NOARP flag and activate ARP */ + DTRACE_PROBE3(on_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); arl->arl_flags = 0; return (0); } @@ -1881,13 +2058,13 @@ ar_interface_off(queue_t *q, mblk_t *mp) { arl_t *arl; - arp1dbg(("ar_interface_off\n")); arl = ar_ll_lookup_from_mp(mp); if (arl == NULL) { - arp1dbg(("ar_interface_off: no arl\n")); + DTRACE_PROBE2(off_no_arl, queue_t *, q, mblk_t *, mp); return (EINVAL); } /* Turn on the IFF_NOARP flag and deactivate ARP */ + DTRACE_PROBE3(off_intf, queue_t *, q, mblk_t *, mp, arl_t *, arl); arl->arl_flags = ARL_F_NOARP; return (0); } @@ -1978,6 +2155,7 @@ ar_ll_init(ar_t *ar, mblk_t *mp) arl->arl_wq = ar->ar_wq; arl->arl_dlpi_pending = DL_PRIM_INVAL; + arl->arl_link_up = B_TRUE; ar->ar_arl = arl; } @@ -2127,8 +2305,6 @@ ar_ll_down(arl_t *arl) mblk_t *mp; ar_t *ar; - arp1dbg(("ar_ll_down arl %p\n", (void *)arl)); - ASSERT(arl->arl_state == ARL_S_UP); /* Let's break the association between an ARL and IP instance */ @@ -2163,8 +2339,7 @@ ar_ll_up(arl_t *arl) mblk_t *detach_mp = NULL; mblk_t *unbind_mp = NULL; mblk_t *info_mp = NULL; - - arp1dbg(("ar_ll_up arl %p \n", (void *)arl)); + mblk_t *notify_mp = NULL; ASSERT(arl->arl_state == ARL_S_DOWN); @@ -2197,6 +2372,12 @@ ar_ll_up(arl_t *arl) if (unbind_mp == NULL) goto bad; + notify_mp = ar_dlpi_comm(DL_NOTIFY_REQ, sizeof (dl_notify_req_t)); + if (notify_mp == NULL) + goto bad; + ((dl_notify_req_t *)notify_mp->b_rptr)->dl_notifications = + DL_NOTE_LINK_UP | DL_NOTE_LINK_DOWN; + arl->arl_state = ARL_S_PENDING; if (arl->arl_provider_style == DL_STYLE2) { ar_dlpi_send(arl, attach_mp); @@ -2206,18 +2387,16 @@ ar_ll_up(arl_t *arl) ar_dlpi_send(arl, info_mp); ar_dlpi_send(arl, bind_mp); arl->arl_unbind_mp = unbind_mp; + ar_dlpi_send(arl, notify_mp); return (0); + bad: - if (attach_mp != NULL) - freemsg(attach_mp); - if (bind_mp != NULL) - freemsg(bind_mp); - if (detach_mp != NULL) - freemsg(detach_mp); - if (unbind_mp != NULL) - freemsg(unbind_mp); - if (info_mp != NULL) - freemsg(info_mp); + freemsg(attach_mp); + freemsg(bind_mp); + freemsg(detach_mp); + freemsg(unbind_mp); + freemsg(info_mp); + freemsg(notify_mp); return (ENOMEM); } @@ -2237,7 +2416,6 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) uint32_t hw_extract_start; arl_t *arl; - arp1dbg(("ar_mapping_add\n")); /* We handle both M_IOCTL and M_PROTO messages. */ if (DB_TYPE(mp) == M_IOCTL) mp = mp->b_cont; @@ -2248,14 +2426,15 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) * Newly received commands from clients go to the tail of the queue. */ if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_mapping_add: enqueue on %p q\n", (void *)q)); + DTRACE_PROBE3(madd_enqueued, queue_t *, q, mblk_t *, mp_orig, + arl_t *, arl); ar_cmd_enqueue(arl, mp_orig, q, AR_MAPPING_ADD, B_TRUE); return (EINPROGRESS); } mp_orig->b_prev = NULL; arma = (arma_t *)mp->b_rptr; - if ((ace = ar_ce_lookup_from_area(mp, ar_ce_lookup_mapping)) != 0) + if ((ace = ar_ce_lookup_from_area(mp, ar_ce_lookup_mapping)) != NULL) ar_ce_delete(ace); hw_addr_len = arma->arma_hw_addr_length; hw_addr = mi_offset_paramc(mp, arma->arma_hw_addr_offset, hw_addr_len); @@ -2267,8 +2446,8 @@ ar_mapping_add(queue_t *q, mblk_t *mp_orig) proto_extract_mask = mi_offset_paramc(mp, arma->arma_proto_extract_mask_offset, proto_addr_len); hw_extract_start = arma->arma_hw_mapping_start; - if (!proto_mask || !proto_extract_mask) { - arp0dbg(("ar_mapping_add: not masks\n")); + if (proto_mask == NULL || proto_extract_mask == NULL) { + DTRACE_PROBE2(madd_illegal_mask, arl_t *, arl, arpa_t *, arma); return (EINVAL); } return (ar_ce_create( @@ -2327,6 +2506,7 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) ar_t *ar; int err; queue_t *tmp_q; + mblk_t *mp; TRACE_1(TR_FAC_ARP, TR_ARP_OPEN, "arp_open: q %p", q); @@ -2335,10 +2515,8 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (0); } /* Load up the Named Dispatch tables, if not already done. */ - if (!ar_g_nd && - (!nd_load(&ar_g_nd, "arp_cache_report", ar_ce_report, NULL, - NULL) || - !ar_param_register(arp_param_arr, A_CNT(arp_param_arr)))) { + if (ar_g_nd == NULL && + !ar_param_register(arp_param_arr, A_CNT(arp_param_arr))) { ar_cleanup(); return (ENOMEM); } @@ -2362,8 +2540,6 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) crhold(credp); ar->ar_credp = credp; - if (!ar_timer_mp) - ar_timer_init(q); /* * Probe for the DLPI info if we are not pushed on IP. Wait for * the reply. In case of error call ar_close() which will take @@ -2371,6 +2547,8 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * as freeing the arl, restarting the timer on a different queue etc. */ if (strcmp(q->q_next->q_qinfo->qi_minfo->mi_idname, "ip") == 0) { + arc_t *arc; + /* * We are pushed directly on top of IP. There is no need to * send down a DL_INFO_REQ. Return success. This could @@ -2378,7 +2556,25 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * or a stream corresponding to an open of /dev/arp * (i.e. <arp-IP> stream). Note that we don't support * pushing some module in between arp and IP. + * + * Tell IP, though, that we're an extended implementation, so + * it knows to expect a DAD response after bringing an + * interface up. Old ATM drivers won't do this, and IP will + * just bring the interface up immediately. */ + ar->ar_on_ill_stream = (q->q_next->q_next != NULL); + if (!ar->ar_on_ill_stream) + return (0); + mp = allocb(sizeof (arc_t), BPRI_MED); + if (mp == NULL) { + (void) ar_close(RD(q)); + return (ENOMEM); + } + DB_TYPE(mp) = M_CTL; + arc = (arc_t *)mp->b_rptr; + mp->b_wptr = mp->b_rptr + sizeof (arc_t); + arc->arc_cmd = AR_ARP_EXTEND; + putnext(q, mp); return (0); } tmp_q = q; @@ -2390,8 +2586,8 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (strcmp(tmp_q->q_qinfo->qi_minfo->mi_idname, "ip") == 0) { /* - * We don't support pushing ARP arbitrarily on an - * IP driver stream. ARP has to be pushed directly above IP + * We don't support pushing ARP arbitrarily on an IP driver + * stream. ARP has to be pushed directly above IP. */ (void) ar_close(RD(q)); return (ENOTSUP); @@ -2400,8 +2596,8 @@ ar_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * Send down a DL_INFO_REQ so we can find out what we are * talking to. */ - mblk_t *mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); - if (!mp) { + mp = ar_dlpi_comm(DL_INFO_REQ, sizeof (dl_info_req_t)); + if (mp == NULL) { (void) ar_close(RD(q)); return (ENOMEM); } @@ -2547,19 +2743,18 @@ ar_plink_send(queue_t *q, mblk_t *mp) * ar_ce_walk routine to delete any outstanding queries for an ar that is * going away. */ -static int -ar_query_delete(ace_t *ace, uchar_t *ar) +static void +ar_query_delete(ace_t *ace, void *arg) { + ar_t *ar = arg; mblk_t **mpp = &ace->ace_query_mp; - mblk_t *mp = mpp[0]; + mblk_t *mp; - if (!mp) - return (0); - do { + while ((mp = *mpp) != NULL) { /* The response queue was stored in the query b_prev. */ - if ((queue_t *)mp->b_prev == ((ar_t *)ar)->ar_wq || - (queue_t *)mp->b_prev == ((ar_t *)ar)->ar_rq) { - mpp[0] = mp->b_next; + if ((queue_t *)mp->b_prev == ar->ar_wq || + (queue_t *)mp->b_prev == ar->ar_rq) { + *mpp = mp->b_next; if (DB_TYPE(mp) == M_PROTO && *(uint32_t *)mp->b_rptr == AR_ENTRY_QUERY) { BUMP_IRE_STATS(ire_stats_v4, ire_stats_freed); @@ -2568,8 +2763,7 @@ ar_query_delete(ace_t *ace, uchar_t *ar) } else { mpp = &mp->b_next; } - } while ((mp = mpp[0]) != 0); - return (0); + } } /* @@ -2614,11 +2808,11 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, } /* Complete the response based on how the request arrived. */ if (DB_TYPE(mp) == M_IOCTL) { - struct iocblk *ioc = - (struct iocblk *)mp->b_rptr; + struct iocblk *ioc = (struct iocblk *)mp->b_rptr; + ioc->ioc_error = ret_val; - DB_TYPE(mp) = M_IOCACK; if (ret_val != 0) { + DB_TYPE(mp) = M_IOCNAK; ioc->ioc_count = 0; putnext(q, mp); continue; @@ -2627,6 +2821,7 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, * Return the xmit template out with the successful * IOCTL. */ + DB_TYPE(mp) = M_IOCACK; ioc->ioc_count = template->b_wptr - template->b_rptr; /* Remove the areq mblk from the IOCTL. */ areq_mp = mp->b_cont; @@ -2680,12 +2875,23 @@ ar_query_reply(ace_t *ace, int ret_val, uchar_t *proto_addr, mp->b_cont = template; putnext(q, mp); } + /* - * Unless we are responding from a permanent cache entry, delete - * the ace. + * Unless we are responding from a permanent cache entry, start the + * cleanup timer or (on error) delete the entry. */ if (!(ace->ace_flags & (ACE_F_PERMANENT | ACE_F_DYING))) { - ar_ce_delete(ace); + if (!ACE_RESOLVED(ace) || arl->arl_xmit_template == NULL) { + /* + * No need to notify IP here, because the entry was + * never resolved, so IP can't have any cached copies + * of the address. + */ + ar_ce_delete(ace); + } else { + mi_timer(arl->arl_wq, ace->ace_mp, + arp_cleanup_interval); + } } } @@ -2726,10 +2932,26 @@ ar_query_xmit(ace_t *ace, ace_t *src_ace) src_ace = ar_ce_lookup_permanent(areq->areq_proto, sender_addr, areq->areq_sender_addr_length); if (src_ace == NULL) { - printf("ar_query_xmit: Could not find the ace\n"); + DTRACE_PROBE3(xmit_no_source, ace_t *, ace, + areq_t *, areq, uchar_t *, sender_addr); return (0); } } + + /* + * If we haven't yet finished duplicate address checking on this source + * address, then do *not* use it on the wire. Doing so will corrupt + * the world's caches. Just allow the timer to restart. Note that + * duplicate address checking will eventually complete one way or the + * other, so this cannot go on "forever." + */ + if (src_ace->ace_flags & ACE_F_UNVERIFIED) { + DTRACE_PROBE2(xmit_source_unverified, ace_t *, ace, + ace_t *, src_ace); + areq->areq_xmit_count++; + return (areq->areq_xmit_interval); + } + /* * Transmit on src_arl. We should transmit on src_arl. Otherwise * the switch will send back a copy on other interfaces of the @@ -2737,9 +2959,12 @@ ar_query_xmit(ace_t *ace, ace_t *src_ace) * address + hardware address, ARP will treat this as a bogon. */ src_arl = src_ace->ace_arl; + DTRACE_PROBE3(xmit_send, ace_t *, ace, ace_t *, src_ace, + areq_t *, areq); ar_xmit(src_arl, ARP_REQUEST, areq->areq_proto, areq->areq_sender_addr_length, src_arl->arl_hw_addr, sender_addr, - src_arl->arl_arp_addr, proto_addr); + src_arl->arl_arp_addr, proto_addr, NULL); + src_ace->ace_last_bcast = ddi_get_lbolt(); return (areq->areq_xmit_interval); } @@ -2758,11 +2983,10 @@ ar_rput(queue_t *q, mblk_t *mp) int op; uint32_t plen; uint32_t proto; - ace_t *src_ace; uchar_t *src_haddr; uchar_t *src_paddr; - dl_unitdata_ind_t *dlui; - boolean_t hwaddr_changed = B_TRUE; + boolean_t is_probe; + int i; TRACE_1(TR_FAC_ARP, TR_ARP_RPUT_START, "arp_rput_start: q %p", q); @@ -2817,34 +3041,36 @@ ar_rput(queue_t *q, mblk_t *mp) return; case M_PCPROTO: case M_PROTO: + if (MBLKL(mp) >= sizeof (dl_unitdata_ind_t) && + ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive == + DL_UNITDATA_IND) { + arl = ((ar_t *)q->q_ptr)->ar_arl; + if (arl != NULL) { + /* Real messages from the wire! */ + break; + } + putnext(q, mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "default"); + return; + } err = ar_cmd_dispatch(q, mp); switch (err) { case ENOENT: + /* Miscellaneous DLPI messages get shuffled off. */ + ar_rput_dlpi(q, mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "proto/dlpi"); break; case EINPROGRESS: TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "proto"); - return; + break; default: inet_freemsg(mp); - return; - } - if ((mp->b_wptr - mp->b_rptr) < sizeof (dl_unitdata_ind_t) || - ((dl_unitdata_ind_t *)mp->b_rptr)->dl_primitive - != DL_UNITDATA_IND) { - /* Miscellaneous DLPI messages get shuffled off. */ - ar_rput_dlpi(q, mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "proto/dlpi"); - return; - } - /* DL_UNITDATA_IND */ - arl = ((ar_t *)q->q_ptr)->ar_arl; - if (arl != NULL) { - /* Real messages from the wire! */ break; } - /* FALLTHRU */ + return; default: putnext(q, mp); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, @@ -2867,15 +3093,14 @@ ar_rput(queue_t *q, mblk_t *mp) * followed by an ARP packet. We do some initial checks and then * get to work. */ - dlui = (dl_unitdata_ind_t *)mp->b_rptr; mp1 = mp->b_cont; - if (!mp1) { + if (mp1 == NULL) { freemsg(mp); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "baddlpi"); return; } - if (!OK_32PTR(mp1->b_rptr) || mp1->b_cont) { + if (mp1->b_cont != NULL) { /* No fooling around with funny messages. */ if (!pullupmsg(mp1, -1)) { freemsg(mp); @@ -2885,22 +3110,33 @@ ar_rput(queue_t *q, mblk_t *mp) } } arh = (arh_t *)mp1->b_rptr; - hlen = (uint32_t)arh->arh_hlen & 0xFF; - plen = (uint32_t)arh->arh_plen & 0xFF; - if ((mp1->b_wptr - mp1->b_rptr) - < (ARH_FIXED_LEN + hlen + hlen + plen + plen)) { + hlen = arh->arh_hlen; + plen = arh->arh_plen; + if (MBLKL(mp1) < ARH_FIXED_LEN + 2 * hlen + 2 * plen) { freemsg(mp); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "short"); return; } - if (hlen == 0 || plen == 0) { - arp1dbg(("ar_rput: bogus arh\n")); + /* + * hlen 0 is used for RFC 1868 UnARP. + * + * Note that the rest of the code checks that hlen is what we expect + * for this hardware address type, so might as well discard packets + * here that don't match. + */ + if ((hlen > 0 && hlen != arl->arl_hw_addr_length) || plen == 0) { + DTRACE_PROBE2(rput_bogus, arl_t *, arl, mblk_t *, mp1); freemsg(mp); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, "arp_rput_end: q %p (%S)", q, "hlenzero/plenzero"); return; } + /* + * Historically, Solaris has been lenient about hardware type numbers. + * We should check here, but don't. + */ + DTRACE_PROBE2(rput_normal, arl_t *, arl, arh_t *, arh); proto = (uint32_t)BE16_TO_U16(arh->arh_proto); src_haddr = (uchar_t *)arh; src_haddr = &src_haddr[ARH_FIXED_LEN]; @@ -2908,191 +3144,255 @@ ar_rput(queue_t *q, mblk_t *mp) dst_paddr = &src_haddr[hlen + plen + hlen]; op = BE16_TO_U16(arh->arh_operation); - /* Now see if we have a cache entry for the source address. */ - src_ace = ar_ce_lookup_entry(arl, proto, src_paddr, plen); + /* Determine if this is just a probe */ + for (i = 0; i < plen; i++) + if (src_paddr[i] != 0) + break; + is_probe = i >= plen; + /* - * If so, and it is the entry for one of our IP addresses, - * we really don't expect to see this packet, so pretend we didn't. - * Tell IP that we received a bogon. - * - * If is a "published" (proxy arp) entry we can receive requests - * FROM the node but we should never see an ARP_RESPONSE. In this case - * we process the response but also inform IP. + * RFC 826: first check if the <protocol, sender protocol address> is + * in the cache, if there is a sender protocol address. Note that this + * step also handles resolutions based on source. */ - if (src_ace) { - if (src_ace->ace_flags & ACE_F_MYADDR) { - freeb(mp); - ar_client_notify(arl, mp1, AR_CN_BOGON); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "pubentry"); - return; - } - if ((src_ace->ace_flags & ACE_F_PUBLISH) && - op == ARP_RESPONSE) { - mblk_t *mp2; - - mp2 = copymsg(mp1); - if (mp2 != NULL) - ar_client_notify(arl, mp2, AR_CN_BOGON); - } - if (src_ace->ace_hw_addr_length == hlen && - bcmp(src_ace->ace_hw_addr, src_haddr, hlen) == 0) { - hwaddr_changed = B_FALSE; - } + if (is_probe) + err = AR_NOTFOUND; + else + err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, src_paddr, + plen); + switch (err) { + case AR_BOGON: + ar_client_notify(arl, mp1, AR_CN_BOGON); + mp1 = NULL; + break; + case AR_FAILED: + ar_client_notify(arl, mp1, AR_CN_FAILED); + mp1 = NULL; + break; + case AR_LOOPBACK: + DTRACE_PROBE2(rput_loopback, arl_t *, arl, arh_t *, arh); + freemsg(mp1); + mp1 = NULL; + break; } - switch (op) { - case ARP_REQUEST: - /* - * If we know the answer, and it is "published", send out - * the response. - */ - dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); - if (dst_ace && (dst_ace->ace_flags & ACE_F_PUBLISH) && - ACE_RESOLVED(dst_ace)) { - ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen, - dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, - src_haddr, src_paddr); - } + if (mp1 == NULL) { + freeb(mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "unneeded"); + return; + } + + /* + * Now look up the destination address. By RFC 826, we ignore the + * packet at this step if the target isn't one of our addresses. This + * is true even if the target is something we're trying to resolve and + * the packet is a response. + * + * Note that in order to do this correctly, we need to know when to + * notify IP of a change implied by the source address of the ARP + * message. That implies that the local ARP table has entries for all + * of the resolved entries cached in the client. This is why we must + * notify IP when we delete a resolved entry and we know that IP may + * have cached answers. + */ + dst_ace = ar_ce_lookup_entry(arl, proto, dst_paddr, plen); + if (dst_ace == NULL || !ACE_RESOLVED(dst_ace) || + !(dst_ace->ace_flags & ACE_F_PUBLISH)) { /* - * Now fall through to the response side, and add a cache entry - * for the sender so we will have it when we need it. + * Let the client know if the source mapping has changed, even + * if the destination provides no useful information for the + * client. */ - /* FALLTHRU */ - case ARP_RESPONSE: + if (err == AR_CHANGED) + ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + else + freemsg(mp1); + freeb(mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "nottarget"); + return; + } + + /* + * If the target is unverified by DAD, then one of two things is true: + * either it's someone else claiming this address (on a probe or an + * announcement) or it's just a regular request. The former is + * failure, but a regular request is not. + */ + if (dst_ace->ace_flags & ACE_F_UNVERIFIED) { /* - * With ill groups, we need to look for request across - * all the ills in the group. The request itself may - * not be queued on this arl. See ar_query_xmit() for - * details. + * Check for a reflection. Some misbehaving bridges will + * reflect our own transmitted packets back to us. */ - err = ar_ce_resolve_all(arl, proto, src_haddr, hlen, - src_paddr, plen); - if (err == AR_BOGON) { - /* - * Some other host has our IP address. Send a - * BOGON message to IP. - */ + if (hlen == dst_ace->ace_hw_addr_length && + bcmp(src_haddr, dst_ace->ace_hw_addr, hlen) == 0) { + DTRACE_PROBE3(rput_probe_reflected, arl_t *, arl, + arh_t *, arh, ace_t *, dst_ace); freeb(mp); - ar_client_notify(arl, mp1, AR_CN_BOGON); + freemsg(mp1); TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "pubentry"); + "arp_rput_end: q %p (%S)", q, "reflection"); return; } + if (is_probe || op == ARP_RESPONSE) { + ar_client_notify(arl, mp1, AR_CN_FAILED); + ar_ce_delete(dst_ace); + } else if (err == AR_CHANGED) { + ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + } else { + DTRACE_PROBE3(rput_request_unverified, arl_t *, arl, + arh_t *, arh, ace_t *, dst_ace); + freemsg(mp1); + } + freeb(mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "unverified"); + return; + } + + /* + * If it's a request, then we reply to this, and if we think the + * sender's unknown, then we create an entry to avoid unnecessary ARPs. + * The design assumption is that someone ARPing us is likely to send us + * a packet soon, and that we'll want to reply to it. + */ + if (op == ARP_REQUEST) { + const uchar_t *dstaddr = src_haddr; + clock_t now; - if ((err != AR_LOOPBACK) && (src_ace == NULL)) { + /* + * This implements periodic address defense based on a modified + * version of the RFC 3927 requirements. Instead of sending a + * broadcasted reply every time, as demanded by the RFC, we + * send at most one broadcast reply per arp_broadcast_interval. + */ + now = ddi_get_lbolt(); + if ((now - dst_ace->ace_last_bcast) > + MSEC_TO_TICK(arp_broadcast_interval)) { + DTRACE_PROBE3(rput_bcast_reply, arl_t *, arl, + arh_t *, arh, ace_t *, dst_ace); + dst_ace->ace_last_bcast = now; + dstaddr = arl->arl_arp_addr; /* - * We may need this one sooner or later. The AR_LOOPBACK - * check above ensures, that we don't create arp - * entries for our own IP address, on another arl. + * If this is one of the long-suffering entries, then + * pull it out now. It no longer needs separate + * defense, because we're doing now that with this + * broadcasted reply. */ - (void) ar_ce_create(arl, proto, src_haddr, hlen, - src_paddr, plen, NULL, - NULL, (uint32_t)0, - (uint32_t)0); + dst_ace->ace_flags &= ~ACE_F_DELAYED; } - /* Let's see if this is a system ARPing itself. */ - do { - if (*src_paddr++ != *dst_paddr++) - break; - } while (--plen); - if (plen == 0) { - /* - * An ARP message with identical src and dst - * protocol addresses. This guy is trying to - * tell us something that our clients might - * find interesting.Essentially such packets are - * sent when a m/c comes up or changes its h/w - * address, so before notifying our client check the - * h/w address if there is a cache entry and notify - * the client only if the addresses differ. - */ - if (hwaddr_changed) { - freeb(mp); - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); - } else { - /* Just discard it. */ - freemsg(mp); - } - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "duplicate"); - return; + ar_xmit(arl, ARP_RESPONSE, dst_ace->ace_proto, plen, + dst_ace->ace_hw_addr, dst_ace->ace_proto_addr, + src_haddr, src_paddr, dstaddr); + if (!is_probe && err == AR_NOTFOUND && + ar_ce_create(arl, proto, src_haddr, hlen, src_paddr, plen, + NULL, NULL, 0, 0) == 0) { + ace_t *ace; + + ace = ar_ce_lookup(arl, proto, src_paddr, plen); + ASSERT(ace != NULL); + mi_timer(arl->arl_wq, ace->ace_mp, + arp_cleanup_interval); } + } + if (err == AR_CHANGED) { + freeb(mp); + ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "reqchange"); + } else { + freemsg(mp); + TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, + "arp_rput_end: q %p (%S)", q, "end"); + } +} + +static void +ar_ce_restart_dad(ace_t *ace, void *arl) +{ + if ((ace->ace_arl == arl) && + (ace->ace_flags & (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) == + (ACE_F_UNVERIFIED|ACE_F_DAD_ABORTED)) { /* - * A broadcast response may also be interesting. + * Slight cheat here: we don't use the initial probe delay + * in this obscure case. */ - if (op == ARP_RESPONSE && dlui->dl_group_address) { - freeb(mp); - ar_client_notify(arl, mp1, AR_CN_ANNOUNCE); - return; + if (ace->ace_flags & ACE_F_FAST) { + ace->ace_xmit_count = arp_fastprobe_count; + ace->ace_xmit_interval = arp_fastprobe_interval; + } else { + ace->ace_xmit_count = arp_probe_count; + ace->ace_xmit_interval = arp_probe_interval; } - break; - default: - break; + ace->ace_flags &= ~ACE_F_DAD_ABORTED; + ace_set_timer(ace, B_FALSE); } - freemsg(mp); - TRACE_2(TR_FAC_ARP, TR_ARP_RPUT_END, - "arp_rput_end: q %p (%S)", q, "end"); } /* DLPI messages, other than DL_UNITDATA_IND are handled here. */ static void ar_rput_dlpi(queue_t *q, mblk_t *mp) { - ar_t *ar = (ar_t *)q->q_ptr; + ar_t *ar = q->q_ptr; arl_t *arl = ar->ar_arl; - dl_bind_ack_t *dlba; - dl_error_ack_t *dlea; - dl_ok_ack_t *dloa; - dl_uderror_ind_t *dluei; - char *err_str; + union DL_primitives *dlp; + const char *err_str; - if ((mp->b_wptr - mp->b_rptr) < sizeof (dloa->dl_primitive)) { + if (MBLKL(mp) < sizeof (dlp->dl_primitive)) { putnext(q, mp); return; } - dloa = (dl_ok_ack_t *)mp->b_rptr; - dlea = (dl_error_ack_t *)dloa; - switch (dloa->dl_primitive) { + dlp = (union DL_primitives *)mp->b_rptr; + switch (dlp->dl_primitive) { case DL_ERROR_ACK: - switch (dlea->dl_error_primitive) { + /* + * ce is confused about how DLPI works, so we have to interpret + * an "error" on DL_NOTIFY_ACK (which we never could have sent) + * as really meaning an error on DL_NOTIFY_REQ. + * + * Note that supporting DL_NOTIFY_REQ is optional, so printing + * out an error message on the console isn't warranted except + * for debug. + */ + if (dlp->error_ack.dl_error_primitive == DL_NOTIFY_ACK || + dlp->error_ack.dl_error_primitive == DL_NOTIFY_REQ) { + ar_dlpi_done(arl, DL_NOTIFY_REQ); + freemsg(mp); + return; + } + err_str = dlpi_prim_str(dlp->error_ack.dl_error_primitive); + DTRACE_PROBE2(rput_dl_error, arl_t *, arl, + dl_error_ack_t *, &dlp->error_ack); + switch (dlp->error_ack.dl_error_primitive) { case DL_UNBIND_REQ: if (arl->arl_provider_style == DL_STYLE1) arl->arl_state = ARL_S_DOWN; - ar_dlpi_done(arl, DL_UNBIND_REQ); - err_str = "DL_UNBIND_REQ"; break; case DL_DETACH_REQ: + case DL_BIND_REQ: arl->arl_state = ARL_S_DOWN; - ar_dlpi_done(arl, DL_DETACH_REQ); - err_str = "DL_DETACH_REQ"; break; case DL_ATTACH_REQ: - ar_dlpi_done(arl, DL_ATTACH_REQ); - err_str = "DL_ATTACH_REQ"; - break; - case DL_BIND_REQ: - arl->arl_state = ARL_S_DOWN; - ar_dlpi_done(arl, DL_BIND_REQ); - err_str = "DL_BIND_REQ"; break; default: - err_str = "?"; - break; + /* If it's anything else, we didn't send it. */ + putnext(q, mp); + return; } - arp0dbg(("ar_rput_dlpi: " - "%s (%d) failed, dl_errno %d, dl_unix_errno %d\n", - err_str, (int)dlea->dl_error_primitive, - (int)dlea->dl_errno, (int)dlea->dl_unix_errno)); + ar_dlpi_done(arl, dlp->error_ack.dl_error_primitive); (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "ar_rput_dlpi: %s failed, dl_errno %d, dl_unix_errno %d", - err_str, dlea->dl_errno, dlea->dl_unix_errno); + err_str, dlp->error_ack.dl_errno, + dlp->error_ack.dl_unix_errno); break; case DL_INFO_ACK: /* * We have a response back from the driver. Go set up transmit * defaults. */ + DTRACE_PROBE2(rput_dl_info, arl_t *, arl, + dl_info_ack_t *, &dlp->info_ack); if (arl != NULL) { ar_ll_set_defaults(arl, mp); ar_dlpi_done(arl, DL_INFO_REQ); @@ -3103,48 +3403,75 @@ ar_rput_dlpi(queue_t *q, mblk_t *mp) qenable(WR(q)); break; case DL_OK_ACK: - arp1dbg(("ar_rput_dlpi: arl %p DL_OK_ACK for %d\n", - (void *)arl, dloa->dl_correct_primitive)); - switch (dloa->dl_correct_primitive) { + DTRACE_PROBE2(rput_dl_ok, arl_t *, arl, + dl_ok_ack_t *, &dlp->ok_ack); + switch (dlp->ok_ack.dl_correct_primitive) { case DL_UNBIND_REQ: if (arl->arl_provider_style == DL_STYLE1) arl->arl_state = ARL_S_DOWN; - ar_dlpi_done(arl, DL_UNBIND_REQ); break; case DL_DETACH_REQ: arl->arl_state = ARL_S_DOWN; - ar_dlpi_done(arl, DL_DETACH_REQ); break; case DL_ATTACH_REQ: - ar_dlpi_done(arl, DL_ATTACH_REQ); break; + default: + putnext(q, mp); + return; } + ar_dlpi_done(arl, dlp->ok_ack.dl_correct_primitive); + break; + case DL_NOTIFY_ACK: + DTRACE_PROBE2(rput_dl_notify, arl_t *, arl, + dl_notify_ack_t *, &dlp->notify_ack); + /* + * We mostly care about interface-up transitions, as this is + * when we need to redo duplicate address detection. + */ + arl->arl_notifies = + (dlp->notify_ack.dl_notifications & DL_NOTE_LINK_UP) != 0; + ar_dlpi_done(arl, DL_NOTIFY_REQ); break; case DL_BIND_ACK: - arp1dbg(("ar_rput: DL_BIND_ACK arl %p\n", (void *)arl)); - dlba = (dl_bind_ack_t *)dloa; + DTRACE_PROBE2(rput_dl_bind, arl_t *, arl, + dl_bind_ack_t *, &dlp->bind_ack); if (arl->arl_sap_length < 0) - bcopy((char *)dlba + dlba->dl_addr_offset, + bcopy((char *)dlp + dlp->bind_ack.dl_addr_offset, arl->arl_hw_addr, arl->arl_hw_addr_length); else - bcopy((char *)dlba + dlba->dl_addr_offset + + bcopy((char *)dlp + dlp->bind_ack.dl_addr_offset + arl->arl_sap_length, arl->arl_hw_addr, arl->arl_hw_addr_length); arl->arl_state = ARL_S_UP; ar_dlpi_done(arl, DL_BIND_REQ); break; + case DL_NOTIFY_IND: + DTRACE_PROBE2(rput_dl_notify_ind, arl_t *, arl, + dl_notify_ind_t *, &dlp->notify_ind); + switch (dlp->notify_ind.dl_notification) { + case DL_NOTE_LINK_UP: + arl->arl_link_up = B_TRUE; + ar_ce_walk(ar_ce_restart_dad, arl); + break; + case DL_NOTE_LINK_DOWN: + arl->arl_link_up = B_FALSE; + break; + } + break; case DL_UDERROR_IND: - dluei = (dl_uderror_ind_t *)dloa; + DTRACE_PROBE2(rput_dl_uderror, arl_t *, arl, + dl_uderror_ind_t *, &dlp->uderror_ind); (void) mi_strlog(q, 1, SL_ERROR | SL_TRACE, "ar_rput_dlpi: " "DL_UDERROR_IND, dl_dest_addr_length %d dl_errno %d", - dluei->dl_dest_addr_length, dluei->dl_errno); + dlp->uderror_ind.dl_dest_addr_length, + dlp->uderror_ind.dl_errno); putnext(q, mp); return; default: - arp1dbg(("ar_rput_dlpi: default, primitive %d\n", - (int)dloa->dl_primitive)); + DTRACE_PROBE2(rput_dl_badprim, arl_t *, arl, + union DL_primitives *, dlp); putnext(q, mp); return; } @@ -3158,14 +3485,12 @@ ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr, uchar_t *mask, *to; int len; - if (!ace->ace_hw_addr) - return; + ASSERT(ace->ace_hw_addr != NULL); bcopy(ace->ace_hw_addr, addrpos, ace->ace_hw_addr_length); if (ace->ace_flags & ACE_F_MAPPING && proto_addr != NULL && ace->ace_proto_extract_mask) { /* careful */ - arp1dbg(("ar_set_address: MAPPING\n")); len = MIN((int)ace->ace_hw_addr_length - ace->ace_hw_extract_start, proto_addr_len); @@ -3179,14 +3504,15 @@ ar_set_address(ace_t *ace, uchar_t *addrpos, uchar_t *proto_addr, static int ar_slifname(queue_t *q, mblk_t *mp_orig) { - ar_t *ar = (ar_t *)q->q_ptr; + ar_t *ar = q->q_ptr; arl_t *arl = ar->ar_arl; struct lifreq *lifr; mblk_t *mp = mp_orig; + arl_t *old_arl; + mblk_t *ioccpy; + struct iocblk *iocp; - arp1dbg(("ar_slifname\n")); - - if (MODULE_BELOW_IS_IP(q)) { + if (ar->ar_on_ill_stream) { /* * This command is for IP, since it is coming down * the <arp-IP-driver> stream. Return ENOENT so that @@ -3197,37 +3523,71 @@ ar_slifname(queue_t *q, mblk_t *mp_orig) /* We handle both M_IOCTL and M_PROTO messages */ if (DB_TYPE(mp) == M_IOCTL) mp = mp->b_cont; - if (!q->q_next || arl == NULL) { + if (q->q_next == NULL || arl == NULL) { /* * If the interface was just opened and * the info ack has not yet come back from the driver */ - arp1dbg(("ar_slifname no arl - queued\n")); + DTRACE_PROBE2(slifname_no_arl, queue_t *, q, + mblk_t *, mp_orig); (void) putq(q, mp_orig); return (EINPROGRESS); } - if (arl->arl_name[0] != '\0') + + if (MBLKL(mp) < sizeof (struct lifreq)) { + DTRACE_PROBE2(slifname_malformed, queue_t *, q, + mblk_t *, mp); + } + + if (arl->arl_name[0] != '\0') { + DTRACE_PROBE1(slifname_already, arl_t *, arl); return (EALREADY); + } - lifr = (struct lifreq *)(mp->b_rptr); + lifr = (struct lifreq *)mp->b_rptr; - if (strlen(lifr->lifr_name) >= LIFNAMSIZ) + if (strlen(lifr->lifr_name) >= LIFNAMSIZ) { + DTRACE_PROBE2(slifname_bad_name, arl_t *, arl, + struct lifreq *, lifr); return (ENXIO); + } /* Check whether the name is already in use. */ - if (ar_ll_lookup_by_name(lifr->lifr_name)) { - arp1dbg(("ar_slifname: %s exists\n", lifr->lifr_name)); + + old_arl = ar_ll_lookup_by_name(lifr->lifr_name); + if (old_arl != NULL) { + DTRACE_PROBE2(slifname_exists, arl_t *, arl, arl_t *, old_arl); return (EEXIST); } + + /* Make a copy of the message so we can send it downstream. */ + if ((ioccpy = allocb(sizeof (struct iocblk), BPRI_MED)) == NULL || + (ioccpy->b_cont = copymsg(mp)) == NULL) { + if (ioccpy != NULL) + freeb(ioccpy); + return (ENOMEM); + } + (void) strlcpy(arl->arl_name, lifr->lifr_name, sizeof (arl->arl_name)); /* The ppa is sent down by ifconfig */ arl->arl_ppa = lifr->lifr_ppa; - arp1dbg(("ar_slifname: name is now %s, ppa %d\n", arl->arl_name, - arl->arl_ppa)); /* Chain in the new arl. */ arl->arl_next = arl_g_head; arl_g_head = arl; + DTRACE_PROBE1(slifname_set, arl_t *, arl); + + /* + * Send along a copy of the ioctl; this is just for hitbox. Use + * M_CTL to avoid confusing anyone else who might be listening. + */ + DB_TYPE(ioccpy) = M_CTL; + iocp = (struct iocblk *)ioccpy->b_rptr; + bzero(iocp, sizeof (*iocp)); + iocp->ioc_cmd = SIOCSLIFNAME; + iocp->ioc_count = msgsize(ioccpy->b_cont); + ioccpy->b_wptr = (uchar_t *)(iocp + 1); + putnext(arl->arl_wq, ioccpy); return (0); } @@ -3239,10 +3599,9 @@ ar_set_ppa(queue_t *q, mblk_t *mp_orig) int ppa; char *cp; mblk_t *mp = mp_orig; + arl_t *old_arl; - arp1dbg(("ar_set_ppa\n")); - - if (MODULE_BELOW_IS_IP(q)) { + if (ar->ar_on_ill_stream) { /* * This command is for IP, since it is coming down * the <arp-IP-driver> stream. Return ENOENT so that @@ -3254,35 +3613,40 @@ ar_set_ppa(queue_t *q, mblk_t *mp_orig) /* We handle both M_IOCTL and M_PROTO messages. */ if (DB_TYPE(mp) == M_IOCTL) mp = mp->b_cont; - if (!q->q_next || arl == NULL) { + if (q->q_next == NULL || arl == NULL) { /* * If the interface was just opened and * the info ack has not yet come back from the driver. */ - arp1dbg(("ar_set_ppa: no arl - queued\n")); + DTRACE_PROBE2(setppa_no_arl, queue_t *, q, + mblk_t *, mp_orig); (void) putq(q, mp_orig); return (EINPROGRESS); } - if (arl->arl_name[0] != '\0') + if (arl->arl_name[0] != '\0') { + DTRACE_PROBE1(setppa_already, arl_t *, arl); return (EALREADY); + } do { q = q->q_next; - } while (q->q_next); + } while (q->q_next != NULL); cp = q->q_qinfo->qi_minfo->mi_idname; ppa = *(int *)(mp->b_rptr); (void) snprintf(arl->arl_name, sizeof (arl->arl_name), "%s%d", cp, ppa); - if (ar_ll_lookup_by_name(arl->arl_name) != NULL) { - arp1dbg(("ar_set_ppa: %s busy\n", arl->arl_name)); + + old_arl = ar_ll_lookup_by_name(arl->arl_name); + if (old_arl != NULL) { + DTRACE_PROBE2(setppa_exists, arl_t *, arl, arl_t *, old_arl); /* Make it a null string again */ arl->arl_name[0] = '\0'; return (EBUSY); } - arp1dbg(("ar_set_ppa: %d\n", ppa)); arl->arl_ppa = ppa; + DTRACE_PROBE1(setppa_done, arl_t *, arl); /* Chain in the new arl. */ arl->arl_next = arl_g_head; arl_g_head = arl; @@ -3357,10 +3721,8 @@ ar_snmp_msg(queue_t *q, mblk_t *mp_orig) * this is an ipNetToMediaTable msg from IP that needs (unique) * arp cache entries appended... */ - if ((mpdata = mp->b_cont) == NULL) { - arp0dbg(("ar_snmp_msg: b_cont == NULL for MIB2_IP msg\n")); + if ((mpdata = mp->b_cont) == NULL) return (EINVAL); - } ar_snmp_hash_tbl = ar_create_snmp_hash(mpdata); @@ -3368,7 +3730,7 @@ ar_snmp_msg(queue_t *q, mblk_t *mp_orig) args.m2a_hashb = ar_snmp_hash_tbl; args.m2a_mpdata = NULL; args.m2a_mptail = NULL; - ar_ce_walk((pfi_t)ar_snmp_msg2, &args); + ar_ce_walk(ar_snmp_msg2, &args); mi_free(ar_snmp_hash_tbl); /* @@ -3478,7 +3840,7 @@ ar_snmp_msg2(ace_t *ace, void *arg) m2ap->m2a_mpdata = allocb(sizeof (mib2_ipNetToMediaEntry_t), BPRI_HI); if (m2ap->m2a_mpdata == NULL) { - arp1dbg(("ar_snmp_msg2:allocb failed\n")); + DTRACE_PROBE(snmp_allocb_failure); return; } } @@ -3498,30 +3860,6 @@ ar_snmp_msg2(ace_t *ace, void *arg) (char *)&ntme, sizeof (ntme)); } -/* Start up the garbage collection timer on the queue provided. */ -static void -ar_timer_init(queue_t *q) -{ - if (ar_timer_mp) - return; - ar_timer_mp = mi_timer_alloc(0); - if (!ar_timer_mp) - return; - ar_timer_queue = q; - mi_timer(ar_timer_queue, ar_timer_mp, arp_timer_interval); -} - -/* ar_ce_walk routine to trash all non-permanent resolved entries. */ -/* ARGSUSED */ -static int -ar_trash(ace_t *ace, uchar_t *arg) -{ - if ((ace->ace_flags & (ACE_F_RESOLVED|ACE_F_PERMANENT)) == - ACE_F_RESOLVED) - ar_ce_delete(ace); - return (0); -} - /* Write side put procedure. */ static void ar_wput(queue_t *q, mblk_t *mp) @@ -3579,11 +3917,14 @@ ar_wput(queue_t *q, mblk_t *mp) break; } ioc = (struct iocblk *)mp->b_rptr; - ioc->ioc_error = err; - if ((mp1 = mp->b_cont) != 0) - ioc->ioc_count = msgdsize(mp1); - else - ioc->ioc_count = 0; + if (err != 0) + ioc->ioc_error = err; + if (ioc->ioc_error != 0) { + DB_TYPE(mp) = M_IOCNAK; + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + ioc->ioc_count = msgdsize(mp->b_cont); qreply(q, mp); TRACE_2(TR_FAC_ARP, TR_ARP_WPUT_END, "arp_wput_end: q %p (%S)", q, "ioctl"); @@ -3660,6 +4001,117 @@ ar_wput(queue_t *q, mblk_t *mp) "arp_wput_end: q %p (%S)", q, "end"); } +static boolean_t +arp_say_ready(ace_t *ace) +{ + mblk_t *mp; + arl_t *arl; + arh_t *arh; + uchar_t *cp; + + arl = ace->ace_arl; + mp = allocb(sizeof (*arh) + 2 * (arl->arl_hw_addr_length + + ace->ace_proto_addr_length), BPRI_MED); + if (mp == NULL) { + /* skip a beat on allocation trouble */ + ace->ace_xmit_count = 1; + ace_set_timer(ace, B_FALSE); + return (B_FALSE); + } + /* Tell IP address is now usable */ + arh = (arh_t *)mp->b_rptr; + U16_TO_BE16(arl->arl_arp_hw_type, arh->arh_hardware); + U16_TO_BE16(ace->ace_proto, arh->arh_proto); + arh->arh_hlen = arl->arl_hw_addr_length; + arh->arh_plen = ace->ace_proto_addr_length; + U16_TO_BE16(ARP_REQUEST, arh->arh_operation); + cp = (uchar_t *)(arh + 1); + bcopy(ace->ace_hw_addr, cp, arl->arl_hw_addr_length); + cp += arl->arl_hw_addr_length; + bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); + cp += ace->ace_proto_addr_length; + bcopy(ace->ace_hw_addr, cp, arl->arl_hw_addr_length); + cp += arl->arl_hw_addr_length; + bcopy(ace->ace_proto_addr, cp, ace->ace_proto_addr_length); + cp += ace->ace_proto_addr_length; + mp->b_wptr = cp; + ar_client_notify(arl, mp, AR_CN_READY); + DTRACE_PROBE1(ready, ace_t *, ace); + return (B_TRUE); +} + +/* + * Pick the longest-waiting aces for defense. + */ +static void +ace_reschedule(ace_t *ace, void *arg) +{ + ace_resched_t *art = arg; + ace_t **aces; + ace_t **acemax; + ace_t *atemp; + + if (ace->ace_arl != art->art_arl) + return; + /* + * Only published entries that are ready for announcement are eligible. + */ + if ((ace->ace_flags & (ACE_F_PUBLISH | ACE_F_UNVERIFIED | ACE_F_DYING | + ACE_F_DELAYED)) != ACE_F_PUBLISH) { + return; + } + if (art->art_naces < ACE_RESCHED_LIST_LEN) { + art->art_aces[art->art_naces++] = ace; + } else { + aces = art->art_aces; + acemax = aces + ACE_RESCHED_LIST_LEN; + for (; aces < acemax; aces++) { + if ((*aces)->ace_last_bcast > ace->ace_last_bcast) { + atemp = *aces; + *aces = ace; + ace = atemp; + } + } + } +} + +/* + * Reschedule the ARP defense of any long-waiting ACEs. It's assumed that this + * doesn't happen very often (if at all), and thus it needn't be highly + * optimized. (Note, though, that it's actually O(N) complexity, because the + * outer loop is bounded by a constant rather than by the length of the list.) + */ +static void +arl_reschedule(arl_t *arl) +{ + ace_resched_t art; + int i; + ace_t *ace; + + i = arl->arl_defend_count; + arl->arl_defend_count = 0; + /* If none could be sitting around, then don't reschedule */ + if (i < arp_defend_rate) { + DTRACE_PROBE1(reschedule_none, arl_t *, arl); + return; + } + art.art_arl = arl; + while (arl->arl_defend_count < arp_defend_rate) { + art.art_naces = 0; + ar_ce_walk(ace_reschedule, &art); + for (i = 0; i < art.art_naces; i++) { + ace = art.art_aces[i]; + ace->ace_flags |= ACE_F_DELAYED; + ace_set_timer(ace, B_FALSE); + if (++arl->arl_defend_count >= arp_defend_rate) + break; + } + if (art.art_naces < ACE_RESCHED_LIST_LEN) + break; + } + DTRACE_PROBE1(reschedule, arl_t *, arl); +} + /* * Write side service routine. The only action here is delivery of transmit * timer events and delayed messages while waiting for the info_ack (ar_arl @@ -3668,8 +4120,9 @@ ar_wput(queue_t *q, mblk_t *mp) static void ar_wsrv(queue_t *q) { - ace_t *ace; - mblk_t *mp; + ace_t *ace; + arl_t *arl; + mblk_t *mp; clock_t ms; TRACE_1(TR_FAC_ARP, TR_ARP_WSRV_START, @@ -3680,39 +4133,135 @@ ar_wsrv(queue_t *q) case M_PCSIG: if (!mi_timer_valid(mp)) continue; - if (mp == ar_timer_mp) { - /* Garbage collection time. */ - ar_ce_walk(ar_trash, NULL); - mi_timer(ar_timer_queue, ar_timer_mp, - arp_timer_interval); + ace = (ace_t *)mp->b_rptr; + if (ace->ace_flags & ACE_F_DYING) continue; + arl = ace->ace_arl; + if (ace->ace_flags & ACE_F_UNVERIFIED) { + ASSERT(ace->ace_flags & ACE_F_PUBLISH); + ASSERT(ace->ace_query_mp == NULL); + /* + * If the link is down, give up for now. IP + * will give us the go-ahead to try again when + * the link restarts. + */ + if (!arl->arl_link_up) { + DTRACE_PROBE1(timer_link_down, + ace_t *, ace); + ace->ace_flags |= ACE_F_DAD_ABORTED; + continue; + } + if (ace->ace_xmit_count > 0) { + DTRACE_PROBE1(timer_probe, + ace_t *, ace); + ace->ace_xmit_count--; + ar_xmit(arl, ARP_REQUEST, + ace->ace_proto, + ace->ace_proto_addr_length, + ace->ace_hw_addr, NULL, NULL, + ace->ace_proto_addr, NULL); + ace_set_timer(ace, B_FALSE); + continue; + } + if (!arp_say_ready(ace)) + continue; + DTRACE_PROBE1(timer_ready, ace_t *, ace); + ace->ace_xmit_interval = arp_publish_interval; + ace->ace_xmit_count = arp_publish_count; + if (ace->ace_xmit_count == 0) + ace->ace_xmit_count++; + ace->ace_flags &= ~ACE_F_UNVERIFIED; } - ace = (ace_t *)mp->b_rptr; - if (ace->ace_flags & (ACE_F_PUBLISH | ACE_F_MYADDR)) { + if (ace->ace_flags & ACE_F_PUBLISH) { + clock_t now; + + /* + * If an hour has passed, then free up the + * entries that need defense by rescheduling + * them. + */ + now = ddi_get_lbolt(); + if (arp_defend_rate > 0 && + now - arl->arl_defend_start > + SEC_TO_TICK(arp_defend_period)) { + arl->arl_defend_start = now; + arl_reschedule(arl); + } /* * Finish the job that we started in - * ar_entry_add. + * ar_entry_add. When we get to zero + * announcement retransmits left, switch to + * address defense. */ ASSERT(ace->ace_query_mp == NULL); - ASSERT(ace->ace_publish_count != 0); - ace->ace_publish_count--; - ar_xmit(ace->ace_arl, ARP_REQUEST, + if (ace->ace_xmit_count > 0) { + ace->ace_xmit_count--; + DTRACE_PROBE1(timer_announce, + ace_t *, ace); + } else if (ace->ace_flags & ACE_F_DELAYED) { + /* + * This guy was rescheduled as one of + * the really old entries needing + * on-going defense. Let him through + * now. + */ + DTRACE_PROBE1(timer_send_delayed, + ace_t *, ace); + ace->ace_flags &= ~ACE_F_DELAYED; + } else if (arp_defend_rate > 0 && + (arl->arl_defend_count >= arp_defend_rate || + ++arl->arl_defend_count >= + arp_defend_rate)) { + /* + * If we're no longer allowed to send + * unbidden defense messages, then just + * wait for rescheduling. + */ + DTRACE_PROBE1(timer_excess_defense, + ace_t *, ace); + ace_set_timer(ace, B_FALSE); + continue; + } else { + DTRACE_PROBE1(timer_defend, + ace_t *, ace); + } + ar_xmit(arl, ARP_REQUEST, ace->ace_proto, ace->ace_proto_addr_length, ace->ace_hw_addr, ace->ace_proto_addr, - ace->ace_arl->arl_arp_addr, - ace->ace_proto_addr); - if (ace->ace_publish_count != 0 && - arp_publish_interval != 0) { - mi_timer(ace->ace_arl->arl_wq, - ace->ace_mp, - arp_publish_interval); - } + arl->arl_arp_addr, + ace->ace_proto_addr, NULL); + ace->ace_last_bcast = now; + if (ace->ace_xmit_count == 0) + ace->ace_xmit_interval = + arp_defend_interval; + if (ace->ace_xmit_interval != 0) + ace_set_timer(ace, B_FALSE); continue; } - if (!ace->ace_query_mp) + + /* + * If this is a non-permanent (regular) resolved ARP + * entry, then it's now time to check if it can be + * retired. As an optimization, we check with IP + * first, and just restart the timer if the address is + * still in use. + */ + if (ACE_NONPERM(ace)) { + if (ace->ace_proto == IP_ARP_PROTO_TYPE && + ndp_lookup_ipaddr(*(ipaddr_t *) + ace->ace_proto_addr)) { + ace->ace_flags |= ACE_F_OLD; + mi_timer(arl->arl_wq, ace->ace_mp, + arp_cleanup_interval); + } else { + ar_delete_notify(ace); + ar_ce_delete(ace); + } continue; + } + /* * ar_query_xmit returns the number of milliseconds to * wait following this transmit. If the number of @@ -3721,6 +4270,7 @@ ar_wsrv(queue_t *q) * we complete the operation with a failure indication. * Otherwise, we restart the timer. */ + ASSERT(ace->ace_query_mp != NULL); ms = ar_query_xmit(ace, NULL); if (ms == 0) ar_query_reply(ace, ENXIO, NULL, (uint32_t)0); @@ -3739,43 +4289,50 @@ ar_wsrv(queue_t *q) /* ar_xmit is called to transmit an ARP Request or Response. */ static void ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, - uchar_t *haddr1, uchar_t *paddr1, uchar_t *haddr2, uchar_t *paddr2) + const uchar_t *haddr1, const uchar_t *paddr1, const uchar_t *haddr2, + const uchar_t *paddr2, const uchar_t *dstaddr) { arh_t *arh; - char *cp; - uint32_t hlen = arl->arl_hw_addr_length; + uint8_t *cp; + uint_t hlen; mblk_t *mp; - if (arl->arl_flags & ARL_F_NOARP) { - /* IFF_NOARP flag is set. Do not send an arp request */ + /* IFF_NOARP flag is set or interface down: do not send arp messages */ + if ((arl->arl_flags & ARL_F_NOARP) || !arl->arl_link_up) return; - } mp = arl->arl_xmit_template; - if (!mp || !(mp = copyb(mp))) + if (mp == NULL || (mp = copyb(mp)) == NULL) return; + hlen = arl->arl_hw_addr_length; mp->b_cont = allocb(AR_LL_HDR_SLACK + ARH_FIXED_LEN + (hlen * 4) + plen + plen, BPRI_MED); - if (!mp->b_cont) { + if (mp->b_cont == NULL) { freeb(mp); return; } + + /* Get the L2 destination address for the message */ + if (haddr2 == NULL) + dstaddr = arl->arl_arp_addr; + else if (dstaddr == NULL) + dstaddr = haddr2; + /* * Figure out where the target hardware address goes in the * DL_UNITDATA_REQ header, and copy it in. */ - - cp = (char *)mi_offset_param(mp, arl->arl_xmit_template_addr_offset, - hlen); - if (!cp) { + cp = mi_offset_param(mp, arl->arl_xmit_template_addr_offset, hlen); + ASSERT(cp != NULL); + if (cp == NULL) { freemsg(mp); return; } - bcopy(haddr2, cp, hlen); + bcopy(dstaddr, cp, hlen); /* Fill in the ARP header. */ - cp = (char *)mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); - mp->b_cont->b_rptr = (uchar_t *)cp; + cp = mp->b_cont->b_rptr + (AR_LL_HDR_SLACK + hlen + hlen); + mp->b_cont->b_rptr = cp; arh = (arh_t *)cp; U16_TO_BE16(arl->arl_arp_hw_type, arh->arh_hardware); U16_TO_BE16(proto, arh->arh_proto); @@ -3785,13 +4342,19 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, cp += ARH_FIXED_LEN; bcopy(haddr1, cp, hlen); cp += hlen; - bcopy(paddr1, cp, plen); + if (paddr1 == NULL) + bzero(cp, plen); + else + bcopy(paddr1, cp, plen); cp += plen; - bcopy(haddr2, cp, hlen); + if (haddr2 == NULL) + bzero(cp, hlen); + else + bcopy(haddr2, cp, hlen); cp += hlen; bcopy(paddr2, cp, plen); cp += plen; - mp->b_cont->b_wptr = (uchar_t *)cp; + mp->b_cont->b_wptr = cp; /* Ship it out. */ if (canputnext(arl->arl_wq)) putnext(arl->arl_wq, mp); @@ -3799,209 +4362,6 @@ ar_xmit(arl_t *arl, uint32_t operation, uint32_t proto, uint32_t plen, freemsg(mp); } -/* - * Handle an external request to broadcast an ARP request. This is used - * by configuration programs to broadcast a request advertising our own - * hardware and protocol addresses. - */ -static int -ar_xmit_request(queue_t *q, mblk_t *mp_orig) -{ - areq_t *areq; - arl_t *arl; - uchar_t *sender; - uint32_t sender_length; - uchar_t *target; - uint32_t target_length; - mblk_t *mp = mp_orig; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_xmit_request: enqueue on q %p\n", (void *)q)); - ar_cmd_enqueue(arl, mp_orig, q, AR_XMIT_REQUEST, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - areq = (areq_t *)mp->b_rptr; - sender_length = areq->areq_sender_addr_length; - sender = mi_offset_param(mp, areq->areq_sender_addr_offset, - sender_length); - target_length = areq->areq_target_addr_length; - target = mi_offset_param(mp, areq->areq_target_addr_offset, - target_length); - if (!sender || !target) - return (EINVAL); - ar_xmit(arl, ARP_REQUEST, areq->areq_proto, sender_length, - arl->arl_hw_addr, sender, arl->arl_arp_addr, target); - return (0); -} - -/* - * Handle an external request to broadcast an ARP response. This is used - * by configuration programs to broadcast a response advertising our own - * hardware and protocol addresses. - */ -static int -ar_xmit_response(queue_t *q, mblk_t *mp_orig) -{ - areq_t *areq; - arl_t *arl; - uchar_t *sender; - uint32_t sender_length; - uchar_t *target; - uint32_t target_length; - mblk_t *mp = mp_orig; - - /* We handle both M_IOCTL and M_PROTO messages. */ - if (DB_TYPE(mp) == M_IOCTL) - mp = mp->b_cont; - arl = ar_ll_lookup_from_mp(mp); - if (arl == NULL) - return (EINVAL); - /* - * Newly received commands from clients go to the tail of the queue. - */ - if (CMD_NEEDS_QUEUEING(mp_orig, arl)) { - arp1dbg(("ar_xmit_response: enqueue on q %p \n", (void *)q)); - ar_cmd_enqueue(arl, mp_orig, q, AR_XMIT_RESPONSE, B_TRUE); - return (EINPROGRESS); - } - mp_orig->b_prev = NULL; - - areq = (areq_t *)mp->b_rptr; - sender_length = areq->areq_sender_addr_length; - sender = mi_offset_param(mp, areq->areq_sender_addr_offset, - sender_length); - target_length = areq->areq_target_addr_length; - target = mi_offset_param(mp, areq->areq_target_addr_offset, - target_length); - if (!sender || !target) - return (EINVAL); - ar_xmit(arl, ARP_RESPONSE, areq->areq_proto, sender_length, - arl->arl_hw_addr, sender, arl->arl_arp_addr, target); - return (0); -} - -#if 0 -/* - * Debug routine to display a particular ARP Cache Entry with an - * accompanying text message. - */ -static void -show_ace(char *msg, ace_t *ace) -{ - if (msg) - printf("%s", msg); - printf("ace 0x%p:\n", ace); - printf("\tace_next 0x%p, ace_ptpn 0x%p, ace_arl 0x%p\n", - ace->ace_next, ace->ace_ptpn, ace->ace_arl); - printf("\tace_proto %x, ace_flags %x\n", ace->ace_proto, - ace->ace_flags); - if (ace->ace_proto_addr && ace->ace_proto_addr_length) - printf("\tace_proto_addr %x %x %x %x, len %d\n", - ace->ace_proto_addr[0], ace->ace_proto_addr[1], - ace->ace_proto_addr[2], ace->ace_proto_addr[3], - ace->ace_proto_addr_length); - if (ace->ace_proto_mask) - printf("\tace_proto_mask %x %x %x %x\n", - ace->ace_proto_mask[0], ace->ace_proto_mask[1], - ace->ace_proto_mask[2], ace->ace_proto_mask[3]); - if (ace->ace_hw_addr && ace->ace_hw_addr_length) - printf("\tace_hw_addr %x %x %x %x %x %x, len %d\n", - ace->ace_hw_addr[0], ace->ace_hw_addr[1], - ace->ace_hw_addr[2], ace->ace_hw_addr[3], - ace->ace_hw_addr[4], ace->ace_hw_addr[5], - ace->ace_hw_addr_length); - printf("\tace_mp 0x%p\n", ace->ace_mp); - printf("\tace_query_count %d, ace_query_mp 0x%x\n", - ace->ace_query_count, ace->ace_query_mp); -} - -/* Debug routine to display an ARP packet with an accompanying text message. */ -static void -show_arp(char *msg, mblk_t *mp) -{ - uchar_t *up = mp->b_rptr; - int len; - int hlen = up[4] & 0xFF; - char fmt[64]; - char buf[128]; - char *op; - int plen = up[5] & 0xFF; - uint_t proto; - - if (msg && *msg) - printf("%s", msg); - len = mp->b_wptr - up; - if (len < 8) { - printf("ARP packet of %d bytes too small\n", len); - return; - } - switch (BE16_TO_U16(&up[6])) { - case ARP_REQUEST: - op = "ARP request"; - break; - case ARP_RESPONSE: - op = "ARP response"; - break; - case RARP_REQUEST: - op = "RARP request"; - break; - case RARP_RESPONSE: - op = "RARP response"; - break; - default: - op = "unknown"; - break; - } - proto = (uint_t)BE16_TO_U16(&up[2]); - printf("len %d, hardware %d, proto %d, hlen %d, plen %d, op %s\n", - len, (int)BE16_TO_U16(up), proto, hlen, plen, op); - if (len < (8 + hlen + hlen + plen + plen)) - printf("ARP packet of %d bytes too small!\n", len); - up += 8; - - (void) mi_sprintf(fmt, "sender hardware address %%%dM\n", hlen); - (void) mi_sprintf(buf, fmt, up); - printf(buf); - up += hlen; - if (proto == 0x800) { - printf("sender proto address %d.%d.%d.%d\n", - up[0] & 0xFF, up[1] & 0xFF, up[2] & 0xFF, - up[3] & 0xFF); - } else { - (void) mi_sprintf(fmt, "sender proto address %%%dM\n", plen); - (void) mi_sprintf(buf, fmt, up); - printf(buf); - } - up += plen; - - (void) mi_sprintf(fmt, "target hardware address %%%dM\n", hlen); - (void) mi_sprintf(buf, fmt, up); - printf(buf); - up += hlen; - if (proto == 0x800) { - printf("target proto address %d.%d.%d.%d\n", - up[0] & 0xFF, up[1] & 0xFF, up[2] & 0xFF, - up[3] & 0xFF); - } else { - (void) mi_sprintf(fmt, "target proto address %%%dM\n", plen); - (void) mi_sprintf(buf, fmt, up); - printf(buf); - } - up += plen; -} -#endif - static mblk_t * ar_alloc(uint32_t cmd, int err) { diff --git a/usr/src/uts/common/inet/arp_impl.h b/usr/src/uts/common/inet/arp_impl.h index 84756488f8..e87fc69ab3 100644 --- a/usr/src/uts/common/inet/arp_impl.h +++ b/usr/src/uts/common/inet/arp_impl.h @@ -36,6 +36,10 @@ extern "C" { #include <sys/types.h> #include <sys/stream.h> +#include <net/if.h> + +/* ARP kernel hash size; used for mdb support */ +#define ARP_HASH_SIZE 256 /* ARL Structure, one per link level device */ typedef struct arl_s { @@ -43,7 +47,6 @@ typedef struct arl_s { queue_t *arl_rq; /* Read queue pointer */ queue_t *arl_wq; /* Write queue pointer */ t_uscalar_t arl_ppa; /* DL_ATTACH parameter */ - t_scalar_t arl_mac_sap; uchar_t *arl_arp_addr; /* multicast address to use */ uchar_t *arl_hw_addr; /* Our hardware address */ uint32_t arl_hw_addr_length; @@ -56,8 +59,6 @@ typedef struct arl_s { mblk_t *arl_unbind_mp; mblk_t *arl_detach_mp; t_uscalar_t arl_provider_style; /* From DL_INFO_ACK */ - mblk_t *arl_dlpiop_done; /* DLPI opertion done */ - queue_t *arl_ip_pending_queue; /* Pending queue */ mblk_t *arl_queue; /* Queued commands head */ mblk_t *arl_queue_tail; /* Queued commands tail */ uint32_t arl_flags; /* Used for IFF_NOARP */ @@ -65,7 +66,12 @@ typedef struct arl_s { mblk_t *arl_dlpi_deferred; /* Deferred DLPI messages */ uint_t arl_state; /* lower interface state */ char *arl_data; /* address data pointer */ - uint32_t arl_closing : 1; + clock_t arl_defend_start; /* start of 1-hour period */ + uint_t arl_defend_count; /* # of unbidden broadcasts */ + uint_t + arl_closing : 1, /* stream is closing */ + arl_notifies : 1, /* handles DL_NOTE_LINK */ + arl_link_up : 1; /* DL_NOTE status */ } arl_t; #define ARL_F_NOARP 0x01 @@ -81,9 +87,32 @@ typedef struct ar_s { arl_t *ar_arl; /* Associated arl */ cred_t *ar_credp; /* Credentials associated w/ open */ struct ar_s *ar_arl_ip_assoc; /* ARL - IP association */ - uint32_t ar_ip_acked_close : 1; /* IP has acked the close */ + uint32_t + ar_ip_acked_close : 1, /* IP has acked the close */ + ar_on_ill_stream : 1; /* Module below is IP */ } ar_t; +/* ARP Cache Entry */ +typedef struct ace_s { + struct ace_s *ace_next; /* Hash chain next pointer */ + struct ace_s **ace_ptpn; /* Pointer to previous next */ + struct arl_s *ace_arl; /* Associated arl */ + uint32_t ace_proto; /* Protocol for this ace */ + uint32_t ace_flags; + uchar_t *ace_proto_addr; + uint32_t ace_proto_addr_length; + uchar_t *ace_proto_mask; /* Mask for matching addr */ + uchar_t *ace_proto_extract_mask; /* For mappings */ + uchar_t *ace_hw_addr; + uint32_t ace_hw_addr_length; + uint32_t ace_hw_extract_start; /* For mappings */ + mblk_t *ace_mp; /* mblk we are in */ + mblk_t *ace_query_mp; /* outstanding query chain */ + clock_t ace_last_bcast; /* last broadcast Response */ + clock_t ace_xmit_interval; + int ace_xmit_count; +} ace_t; + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index a732b92585..67af1bf688 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -39,11 +39,9 @@ extern "C" { #include <inet/mib2.h> #include <inet/nd.h> #include <sys/atomic.h> -#include <sys/socket.h> #include <net/if_dl.h> #include <net/if.h> #include <netinet/ip.h> -#include <sys/dlpi.h> #include <netinet/igmp.h> #ifdef _KERNEL @@ -1284,7 +1282,9 @@ typedef struct ipif_s { ipif_replace_zero : 1, /* Replacement for zero */ ipif_was_up : 1, /* ipif was up before */ - ipif_pad_to_31 : 28; + ipif_addr_ready : 1, /* DAD is done */ + ipif_was_dup : 1, /* DAD had failed */ + ipif_pad_to_31 : 26; int ipif_orig_ifindex; /* ifindex before SLIFFAILOVER */ uint_t ipif_seqid; /* unique index across all ills */ @@ -1295,6 +1295,7 @@ typedef struct ipif_s { uint_t ipif_saved_ire_cnt; zoneid_t ipif_zoneid; /* zone ID number */ + timeout_id_t ipif_recovery_id; /* Timer for DAD recovery */ #ifdef ILL_DEBUG #define IP_TR_HASH_MAX 64 th_trace_t *ipif_trace[IP_TR_HASH_MAX]; @@ -1740,6 +1741,7 @@ typedef struct ill_s { uint_t ill_ipif_up_count; /* Number of IPIFs currently up. */ uint_t ill_max_frag; /* Max IDU from DLPI. */ char *ill_name; /* Our name. */ + uint_t ill_ipif_dup_count; /* Number of duplicate addresses. */ uint_t ill_name_length; /* Name length, incl. terminator. */ char *ill_ndd_name; /* Name + ":ip?_forwarding" for NDD. */ uint_t ill_net_type; /* IRE_IF_RESOLVER/IRE_IF_NORESOLVER. */ @@ -1807,7 +1809,9 @@ typedef struct ill_s { ill_dl_up : 1, ill_up_ipifs : 1, - ill_pad_to_bit_31 : 20; + ill_note_link : 1, /* supports link-up notification */ + + ill_pad_to_bit_31 : 19; /* Following bit fields protected by ill_lock */ uint_t @@ -1818,7 +1822,8 @@ typedef struct ill_s { ill_arp_bringup_pending : 1, ill_mtu_userspecified : 1, /* SIOCSLNKINFO has set the mtu */ - ill_pad_bit_31 : 26; + ill_arp_extend : 1, /* ARP has DAD extensions */ + ill_pad_bit_31 : 25; /* * Used in SIOCSIFMUXID and SIOCGIFMUXID for 'ifconfig unplumb'. @@ -2501,12 +2506,8 @@ typedef struct ire_s { /* source ip-addr of incoming packet */ clock_t ire_last_used_time; /* Last used time */ struct ire_s *ire_fastpath; /* Pointer to next ire in fastpath */ - zoneid_t ire_zoneid; /* for local address discrimination */ tsol_ire_gw_secattr_t *ire_gw_secattr; /* gateway security attributes */ -#ifdef IRE_DEBUG - th_trace_t *ire_trace[IP_TR_HASH_MAX]; - boolean_t ire_trace_disable; /* True when alloc fails */ -#endif + zoneid_t ire_zoneid; /* for local address discrimination */ /* * ire's that are embedded inside mblk_t and sent to the external * resolver use the ire_stq_ifindex to track the ifindex of the @@ -2514,6 +2515,12 @@ typedef struct ire_s { * for cleanup in the esbfree routine when arp failure occurs */ uint_t ire_stq_ifindex; + uint_t ire_defense_count; /* number of ARP conflicts */ + uint_t ire_defense_time; /* last time defended (secs) */ +#ifdef IRE_DEBUG + th_trace_t *ire_trace[IP_TR_HASH_MAX]; + boolean_t ire_trace_disable; /* True when alloc fails */ +#endif } ire_t; /* IPv4 compatiblity macros */ @@ -2822,23 +2829,37 @@ extern int ipv6_forward; extern vmem_t *ip_minor_arena; #define ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value +#define ip_g_resp_to_echo_bcast ip_param_arr[1].ip_param_value +#define ip_g_resp_to_echo_mcast ip_param_arr[2].ip_param_value +#define ip_g_resp_to_timestamp ip_param_arr[3].ip_param_value +#define ip_g_resp_to_timestamp_bcast ip_param_arr[4].ip_param_value #define ip_g_send_redirects ip_param_arr[5].ip_param_value +#define ip_g_forward_directed_bcast ip_param_arr[6].ip_param_value #define ip_debug ip_param_arr[7].ip_param_value #define ip_mrtdebug ip_param_arr[8].ip_param_value #define ip_timer_interval ip_param_arr[9].ip_param_value #define ip_ire_arp_interval ip_param_arr[10].ip_param_value +#define ip_ire_redir_interval ip_param_arr[11].ip_param_value #define ip_def_ttl ip_param_arr[12].ip_param_value +#define ip_forward_src_routed ip_param_arr[13].ip_param_value #define ip_wroff_extra ip_param_arr[14].ip_param_value +#define ip_ire_pathmtu_interval ip_param_arr[15].ip_param_value +#define ip_icmp_return ip_param_arr[16].ip_param_value #define ip_path_mtu_discovery ip_param_arr[17].ip_param_value #define ip_ignore_delete_time ip_param_arr[18].ip_param_value +#define ip_ignore_redirect ip_param_arr[19].ip_param_value #define ip_output_queue ip_param_arr[20].ip_param_value #define ip_broadcast_ttl ip_param_arr[21].ip_param_value #define ip_icmp_err_interval ip_param_arr[22].ip_param_value #define ip_icmp_err_burst ip_param_arr[23].ip_param_value #define ip_reass_queue_bytes ip_param_arr[24].ip_param_value +#define ip_strict_dst_multihoming ip_param_arr[25].ip_param_value #define ip_addrs_per_if ip_param_arr[26].ip_param_value #define ipsec_override_persocket_policy ip_param_arr[27].ip_param_value #define icmp_accept_clear_messages ip_param_arr[28].ip_param_value +#define igmp_accept_clear_messages ip_param_arr[29].ip_param_value + +/* IPv6 configuration knobs */ #define delay_first_probe_time ip_param_arr[30].ip_param_value #define max_unicast_solicit ip_param_arr[31].ip_param_value #define ipv6_def_hops ip_param_arr[32].ip_param_value @@ -2850,6 +2871,7 @@ extern vmem_t *ip_minor_arena; #define ipv6_strict_dst_multihoming ip_param_arr[38].ip_param_value #define ip_ire_reclaim_fraction ip_param_arr[39].ip_param_value #define ipsec_policy_log_interval ip_param_arr[40].ip_param_value +#define pim_accept_clear_messages ip_param_arr[41].ip_param_value #define ip_ndp_unsolicit_interval ip_param_arr[42].ip_param_value #define ip_ndp_unsolicit_count ip_param_arr[43].ip_param_value #define ipv6_ignore_home_address_opt ip_param_arr[44].ip_param_value @@ -2857,8 +2879,14 @@ extern vmem_t *ip_minor_arena; #define ip_multirt_resolution_interval ip_param_arr[46].ip_param_value #define ip_multirt_ttl ip_param_arr[47].ip_param_value #define ip_multidata_outbound ip_param_arr[48].ip_param_value +#define ip_ndp_defense_interval ip_param_arr[49].ip_param_value +#define ip_max_temp_idle ip_param_arr[50].ip_param_value +#define ip_max_temp_defend ip_param_arr[51].ip_param_value +#define ip_max_defend ip_param_arr[52].ip_param_value +#define ip_defend_interval ip_param_arr[53].ip_param_value +#define ip_dup_recovery ip_param_arr[54].ip_param_value #ifdef DEBUG -#define ipv6_drop_inbound_icmpv6 ip_param_arr[49].ip_param_value +#define ipv6_drop_inbound_icmpv6 ip_param_arr[55].ip_param_value #else #define ipv6_drop_inbound_icmpv6 0 #endif @@ -2934,6 +2962,9 @@ extern uint32_t ipsechw_debug; #define ip3dbg(a) /* */ #endif /* IP_DEBUG */ +/* Default MAC-layer address string length for mac_colon_addr */ +#define MAC_STR_LEN 128 + struct ipsec_out_s; extern const char *dlpi_prim_str(int); @@ -2945,6 +2976,7 @@ extern void ill_frag_timer_start(ill_t *); extern mblk_t *ip_carve_mp(mblk_t **, ssize_t); extern mblk_t *ip_dlpi_alloc(size_t, t_uscalar_t); extern char *ip_dot_addr(ipaddr_t, char *); +extern const char *mac_colon_addr(const uint8_t *, size_t, char *, size_t); extern void ip_lwput(queue_t *, mblk_t *); extern boolean_t icmp_err_rate_limit(void); extern void icmp_time_exceeded(queue_t *, mblk_t *, uint8_t); diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 036748e78c..7872cfd212 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -956,6 +956,12 @@ static ipparam_t lcl_param_arr[] = { { 1000, 60000, 1000, "ip_multirt_resolution_interval" }, { 0, 255, 1, "ip_multirt_ttl" }, { 0, 1, 1, "ip_multidata_outbound" }, + { 0, 3600000, 300000, "ip_ndp_defense_interval" }, + { 0, 999999, 60*60*24, "ip_max_temp_idle" }, + { 0, 1000, 1, "ip_max_temp_defend" }, + { 0, 1000, 3, "ip_max_defend" }, + { 0, 999999, 30, "ip_defend_interval" }, + { 0, 3600000, 300000, "ip_dup_recovery" }, #ifdef DEBUG { 0, 1, 0, "ip6_drop_inbound_icmpv6" }, #endif @@ -1022,65 +1028,6 @@ int ip_g_forward = IP_FORWARD_DEFAULT; int ipv6_forward = IP_FORWARD_DEFAULT; -/* Following line is external, and in ip.h. Normally marked with * *. */ -#define ip_respond_to_address_mask_broadcast ip_param_arr[0].ip_param_value -#define ip_g_resp_to_echo_bcast ip_param_arr[1].ip_param_value -#define ip_g_resp_to_echo_mcast ip_param_arr[2].ip_param_value -#define ip_g_resp_to_timestamp ip_param_arr[3].ip_param_value -#define ip_g_resp_to_timestamp_bcast ip_param_arr[4].ip_param_value -#define ip_g_send_redirects ip_param_arr[5].ip_param_value -#define ip_g_forward_directed_bcast ip_param_arr[6].ip_param_value -#define ip_debug ip_param_arr[7].ip_param_value /* */ -#define ip_mrtdebug ip_param_arr[8].ip_param_value /* */ -#define ip_timer_interval ip_param_arr[9].ip_param_value /* */ -#define ip_ire_arp_interval ip_param_arr[10].ip_param_value /* */ -#define ip_ire_redir_interval ip_param_arr[11].ip_param_value -#define ip_def_ttl ip_param_arr[12].ip_param_value -#define ip_forward_src_routed ip_param_arr[13].ip_param_value -#define ip_wroff_extra ip_param_arr[14].ip_param_value -#define ip_ire_pathmtu_interval ip_param_arr[15].ip_param_value -#define ip_icmp_return ip_param_arr[16].ip_param_value -#define ip_path_mtu_discovery ip_param_arr[17].ip_param_value /* */ -#define ip_ignore_delete_time ip_param_arr[18].ip_param_value /* */ -#define ip_ignore_redirect ip_param_arr[19].ip_param_value -#define ip_output_queue ip_param_arr[20].ip_param_value -#define ip_broadcast_ttl ip_param_arr[21].ip_param_value -#define ip_icmp_err_interval ip_param_arr[22].ip_param_value -#define ip_icmp_err_burst ip_param_arr[23].ip_param_value -#define ip_reass_queue_bytes ip_param_arr[24].ip_param_value -#define ip_strict_dst_multihoming ip_param_arr[25].ip_param_value -#define ip_addrs_per_if ip_param_arr[26].ip_param_value -#define ipsec_override_persocket_policy ip_param_arr[27].ip_param_value /* */ -#define icmp_accept_clear_messages ip_param_arr[28].ip_param_value -#define igmp_accept_clear_messages ip_param_arr[29].ip_param_value - -/* IPv6 configuration knobs */ -#define delay_first_probe_time ip_param_arr[30].ip_param_value -#define max_unicast_solicit ip_param_arr[31].ip_param_value -#define ipv6_def_hops ip_param_arr[32].ip_param_value -#define ipv6_icmp_return ip_param_arr[33].ip_param_value -#define ipv6_forward_src_routed ip_param_arr[34].ip_param_value -#define ipv6_resp_echo_mcast ip_param_arr[35].ip_param_value -#define ipv6_send_redirects ip_param_arr[36].ip_param_value -#define ipv6_ignore_redirect ip_param_arr[37].ip_param_value -#define ipv6_strict_dst_multihoming ip_param_arr[38].ip_param_value -#define ip_ire_reclaim_fraction ip_param_arr[39].ip_param_value -#define ipsec_policy_log_interval ip_param_arr[40].ip_param_value -#define pim_accept_clear_messages ip_param_arr[41].ip_param_value -#define ip_ndp_unsolicit_interval ip_param_arr[42].ip_param_value -#define ip_ndp_unsolicit_count ip_param_arr[43].ip_param_value -#define ipv6_ignore_home_address_opt ip_param_arr[44].ip_param_value -#define ip_policy_mask ip_param_arr[45].ip_param_value -#define ip_multirt_resolution_interval ip_param_arr[46].ip_param_value -#define ip_multirt_ttl ip_param_arr[47].ip_param_value -#define ip_multidata_outbound ip_param_arr[48].ip_param_value -#ifdef DEBUG -#define ipv6_drop_inbound_icmpv6 ip_param_arr[49].ip_param_value -#else -#define ipv6_drop_inbound_icmpv6 0 -#endif - - /* * Table of IP ioctls encoding the various properties of the ioctl and * indexed based on the last byte of the ioctl command. Occasionally there @@ -1516,28 +1463,33 @@ struct module_info ip_mod_info = { IP_MOD_ID, IP_MOD_NAME, 1, INFPSZ, 65536, 1024 }; -static struct qinit rinit = { +/* + * Duplicate static symbols within a module confuses mdb; so we avoid the + * problem by making the symbols here distinct from those in udp.c. + */ + +static struct qinit iprinit = { (pfi_t)ip_rput, NULL, ip_open, ip_close, NULL, &ip_mod_info }; -static struct qinit winit = { +static struct qinit ipwinit = { (pfi_t)ip_wput, (pfi_t)ip_wsrv, ip_open, ip_close, NULL, &ip_mod_info }; -static struct qinit lrinit = { +static struct qinit iplrinit = { (pfi_t)ip_lrput, NULL, ip_open, ip_close, NULL, &ip_mod_info }; -static struct qinit lwinit = { +static struct qinit iplwinit = { (pfi_t)ip_lwput, NULL, ip_open, ip_close, NULL, &ip_mod_info }; struct streamtab ipinfo = { - &rinit, &winit, &lrinit, &lwinit + &iprinit, &ipwinit, &iplrinit, &iplwinit }; #ifdef DEBUG @@ -3782,6 +3734,204 @@ icmp_unreachable(queue_t *q, mblk_t *mp, uint8_t code) } /* + * Attempt to start recovery of an IPv4 interface that's been shut down as a + * duplicate. As long as someone else holds the address, the interface will + * stay down. When that conflict goes away, the interface is brought back up. + * This is done so that accidental shutdowns of addresses aren't made + * permanent. Your server will recover from a failure. + * + * For DHCP, recovery is not done in the kernel. Instead, it's handled by a + * user space process (dhcpagent). + * + * Recovery completes if ARP reports that the address is now ours (via + * AR_CN_READY). In that case, we go to ip_arp_excl to finish the operation. + * + * This function is entered on a timer expiry; the ID is in ipif_recovery_id. + */ +static void +ipif_dup_recovery(void *arg) +{ + ipif_t *ipif = arg; + ill_t *ill = ipif->ipif_ill; + mblk_t *arp_add_mp; + mblk_t *arp_del_mp; + area_t *area; + + ipif->ipif_recovery_id = 0; + + if (ill->ill_arp_closing || !(ipif->ipif_flags & IPIF_DUPLICATE) || + (ipif->ipif_flags & IPIF_POINTOPOINT)) { + /* No reason to try to bring this address back. */ + return; + } + + if ((arp_add_mp = ipif_area_alloc(ipif)) == NULL) + goto alloc_fail; + + if (ipif->ipif_arp_del_mp == NULL) { + if ((arp_del_mp = ipif_ared_alloc(ipif)) == NULL) + goto alloc_fail; + ipif->ipif_arp_del_mp = arp_del_mp; + } + + /* Setting the 'unverified' flag restarts DAD */ + area = (area_t *)arp_add_mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | + ACE_F_UNVERIFIED; + putnext(ill->ill_rq, arp_add_mp); + return; + +alloc_fail: + /* On allocation failure, just restart the timer */ + freemsg(arp_add_mp); + if (ip_dup_recovery > 0) { + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, ipif, + MSEC_TO_TICK(ip_dup_recovery)); + } +} + +/* + * This is for exclusive changes due to ARP. Either tear down an interface due + * to AR_CN_FAILED and AR_CN_BOGON, or bring one up for successful recovery. + */ +/* ARGSUSED */ +static void +ip_arp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + arh_t *arh; + ipaddr_t src; + ipif_t *ipif; + char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; + const char *failtype; + boolean_t bring_up; + + switch (((arcn_t *)mp->b_rptr)->arcn_code) { + case AR_CN_READY: + failtype = NULL; + bring_up = B_TRUE; + break; + case AR_CN_FAILED: + failtype = "in use"; + bring_up = B_FALSE; + break; + default: + failtype = "claimed"; + bring_up = B_FALSE; + break; + } + + arh = (arh_t *)mp->b_cont->b_rptr; + bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); + + /* Handle failures due to probes */ + if (src == 0) { + bcopy((char *)&arh[1] + 2 * arh->arh_hlen + IP_ADDR_LEN, &src, + IP_ADDR_LEN); + } + + (void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf)); + (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, hbuf, + sizeof (hbuf)); + (void) ip_dot_addr(src, sbuf); + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + + if ((ipif->ipif_flags & IPIF_POINTOPOINT) || + ipif->ipif_lcl_addr != src) { + continue; + } + + /* + * If we failed on a recovery probe, then restart the timer to + * try again later. + */ + if (!bring_up && (ipif->ipif_flags & IPIF_DUPLICATE) && + !(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + ip_dup_recovery > 0 && ipif->ipif_recovery_id == 0) { + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, + ipif, MSEC_TO_TICK(ip_dup_recovery)); + continue; + } + + /* + * If what we're trying to do has already been done, then do + * nothing. + */ + if (bring_up == ((ipif->ipif_flags & IPIF_UP) != 0)) + continue; + + if (ipif->ipif_id != 0) { + (void) snprintf(ibuf + ill->ill_name_length - 1, + sizeof (ibuf) - ill->ill_name_length + 1, ":%d", + ipif->ipif_id); + } + if (failtype == NULL) { + cmn_err(CE_NOTE, "recovered address %s on %s", sbuf, + ibuf); + } else { + cmn_err(CE_WARN, "%s has duplicate address %s (%s " + "by %s); disabled", ibuf, sbuf, failtype, hbuf); + } + + if (bring_up) { + ASSERT(ill->ill_dl_up); + /* + * Free up the ARP delete message so we can allocate + * a fresh one through the normal path. + */ + freemsg(ipif->ipif_arp_del_mp); + ipif->ipif_arp_del_mp = NULL; + if (ipif_resolver_up(ipif, Res_act_initial) != + EINPROGRESS) { + ipif->ipif_addr_ready = 1; + (void) ipif_up_done(ipif); + } + continue; + } + + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + /* + * Already exclusive on the ill; no need to handle deferred + * processing here. + */ + (void) ipif_down(ipif, NULL, NULL); + ipif_down_tail(ipif); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + ip_dup_recovery > 0) { + ipif->ipif_recovery_id = timeout(ipif_dup_recovery, + ipif, MSEC_TO_TICK(ip_dup_recovery)); + } + } + freemsg(mp); +} + +/* ARGSUSED */ +static void +ip_arp_defend(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + arh_t *arh; + ipaddr_t src; + ipif_t *ipif; + + arh = (arh_t *)mp->b_cont->b_rptr; + bcopy((char *)&arh[1] + arh->arh_hlen, &src, IP_ADDR_LEN); + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if ((ipif->ipif_flags & IPIF_UP) && ipif->ipif_lcl_addr == src) + (void) ipif_resolver_up(ipif, Res_act_defend); + } + freemsg(mp); +} + +/* * News from ARP. ARP sends notification of interesting events down * to its clients using M_CTL messages with the interesting ARP packet * attached via b_cont. @@ -3796,15 +3946,14 @@ ip_arp_news(queue_t *q, mblk_t *mp) { arcn_t *arcn; arh_t *arh; - char *cp1; - uchar_t *cp2; ire_t *ire = NULL; - int i1; - char hbuf[128]; - char sbuf[16]; + char hbuf[MAC_STR_LEN]; + char sbuf[INET_ADDRSTRLEN]; ipaddr_t src; in6_addr_t v6src; boolean_t isv6 = B_FALSE; + ipif_t *ipif; + ill_t *ill; if ((mp->b_wptr - mp->b_rptr) < sizeof (arcn_t) || !mp->b_cont) { if (q->q_next) { @@ -3827,25 +3976,37 @@ ip_arp_news(queue_t *q, mblk_t *mp) return; } + ill = q->q_ptr; + arcn = (arcn_t *)mp->b_rptr; switch (arcn->arcn_code) { case AR_CN_BOGON: /* * Someone is sending ARP packets with a source protocol - * address which we have published. Either they are - * pretending to be us, or we have been asked to proxy - * for a machine that can do fine for itself, or two - * different machines are providing proxy service for the - * same protocol address, or something. We try and do - * something appropriate here. - */ - cp2 = (uchar_t *)&arh[1]; - cp1 = hbuf; - *cp1 = '\0'; - for (i1 = arh->arh_hlen; i1--; cp1 += 3) - (void) sprintf(cp1, "%02x:", *cp2++ & 0xff); - if (cp1 != hbuf) - cp1[-1] = '\0'; + * address that we have published and for which we believe our + * entry is authoritative and (when ill_arp_extend is set) + * verified to be unique on the network. + * + * The ARP module internally handles the cases where the sender + * is just probing (for DAD) and where the hardware address of + * a non-authoritative entry has changed. Thus, these are the + * real conflicts, and we have to do resolution. + * + * We back away quickly from the address if it's from DHCP or + * otherwise temporary and hasn't been used recently (or at + * all). We'd like to include "deprecated" addresses here as + * well (as there's no real reason to defend something we're + * discarding), but IPMP "reuses" this flag to mean something + * other than the standard meaning. + * + * If the ARP module above is not extended (meaning that it + * doesn't know how to defend the address), then we just log + * the problem as we always did and continue on. It's not + * right, but there's little else we can do, and those old ATM + * users are going away anyway. + */ + (void) mac_colon_addr((uint8_t *)(arh + 1), arh->arh_hlen, + hbuf, sizeof (hbuf)); (void) ip_dot_addr(src, sbuf); if (isv6) ire = ire_cache_lookup_v6(&v6src, ALL_ZONES, NULL); @@ -3853,16 +4014,78 @@ ip_arp_news(queue_t *q, mblk_t *mp) ire = ire_cache_lookup(src, ALL_ZONES, NULL); if (ire != NULL && IRE_IS_LOCAL(ire)) { - cmn_err(CE_WARN, - "IP: Hardware address '%s' trying" - " to be our address %s!", - hbuf, sbuf); - } else { - cmn_err(CE_WARN, - "IP: Proxy ARP problem? " - "Hardware address '%s' thinks it is %s", - hbuf, sbuf); + uint32_t now; + uint32_t maxage; + clock_t lused; + uint_t maxdefense; + uint_t defs; + + /* + * First, figure out if this address hasn't been used + * in a while. If it hasn't, then it's a better + * candidate for abandoning. + */ + ipif = ire->ire_ipif; + ASSERT(ipif != NULL); + now = gethrestime_sec(); + maxage = now - ire->ire_create_time; + if (maxage > ip_max_temp_idle) + maxage = ip_max_temp_idle; + lused = drv_hztousec(ddi_get_lbolt() - + ire->ire_last_used_time) / MICROSEC + 1; + if (lused >= maxage && (ipif->ipif_flags & + (IPIF_DHCPRUNNING | IPIF_TEMPORARY))) + maxdefense = ip_max_temp_defend; + else + maxdefense = ip_max_defend; + + /* + * Now figure out how many times we've defended + * ourselves. Ignore defenses that happened long in + * the past. + */ + mutex_enter(&ire->ire_lock); + if ((defs = ire->ire_defense_count) > 0 && + now - ire->ire_defense_time > ip_defend_interval) { + ire->ire_defense_count = defs = 0; + } + ire->ire_defense_count++; + ire->ire_defense_time = now; + mutex_exit(&ire->ire_lock); + ill_refhold(ill); + ire_refrele(ire); + + /* + * If we've defended ourselves too many times already, + * then give up and tear down the interface(s) using + * this address. Otherwise, defend by sending out a + * gratuitous ARP. + */ + if (defs >= maxdefense && ill->ill_arp_extend) { + (void) qwriter_ip(NULL, ill, q, mp, + ip_arp_excl, CUR_OP, B_FALSE); + } else { + cmn_err(CE_WARN, + "node %s is using our IP address %s on %s", + hbuf, sbuf, ill->ill_name); + /* + * If this is an old (ATM) ARP module, then + * don't try to defend the address. Remain + * compatible with the old behavior. Defend + * only with new ARP. + */ + if (ill->ill_arp_extend) { + (void) qwriter_ip(NULL, ill, q, mp, + ip_arp_defend, CUR_OP, B_FALSE); + } else { + ill_refrele(ill); + } + } + return; } + cmn_err(CE_WARN, + "proxy ARP problem? Node '%s' is using %s on %s", + hbuf, sbuf, ill->ill_name); if (ire != NULL) ire_refrele(ire); break; @@ -3884,53 +4107,79 @@ ip_arp_news(queue_t *q, mblk_t *mp) ire_walk_v6(ire_delete_cache_gw_v6, (char *)&v6src, ALL_ZONES); } - break; + } else { + nce_hw_map_t hwm; + + /* + * ARP gives us a copy of any packet where it thinks + * the address has changed, so that we can update our + * caches. We're responsible for caching known answers + * in the current design. We check whether the + * hardware address really has changed in all of our + * entries that have cached this mapping, and if so, we + * blow them away. This way we will immediately pick + * up the rare case of a host changing hardware + * address. + */ + if (src == 0) + break; + hwm.hwm_addr = src; + hwm.hwm_hwlen = arh->arh_hlen; + hwm.hwm_hwaddr = (uchar_t *)(arh + 1); + ndp_walk_common(&ndp4, NULL, + (pfi_t)nce_delete_hw_changed, &hwm, ALL_ZONES); } - /* - * ARP gives us a copy of any broadcast packet with identical - * sender and receiver protocol address, in - * case we want to intuit something from it. Such a packet - * usually means that a machine has just come up on the net. - * If we have an IRE_CACHE, we blow it away. This way we will - * immediately pick up the rare case of a host changing - * hardware address. ip_ire_clookup_and_delete achieves this. - * - * The address in "src" may be an entry for a router. - * (Default router, or non-default router.) If - * that's true, then any off-net IRE_CACHE entries - * that go through the router with address "src" - * must be clobbered. Use ire_walk to achieve this - * goal. - * - * It should be possible to determine if the address - * in src is or is not for a router. This way, - * the ire_walk() isn't called all of the time here. - * Do not pass 'src' value of 0 to ire_delete_cache_gw, - * as it would remove all IRE_CACHE entries for onlink - * destinations. All onlink destinations have - * ire_gateway_addr == 0. - * - * - * The ip_ire_clookup_and_delete() call deletes - * the nce and all relevant ire cache entries that - * are associated with that nce. - * The ire_walk_v4->ire_delete_cache_gw() call - * will delete the appropriate redirect ires. - */ - if ((ip_ire_clookup_and_delete(src, NULL) || - (ire = ire_ftable_lookup(src, 0, 0, 0, NULL, NULL, NULL, - 0, NULL, MATCH_IRE_DSTONLY)) != NULL) && src != 0) { - ire_walk_v4(ire_delete_cache_gw, (char *)&src, - ALL_ZONES); - } - /* From ire_ftable_lookup */ - if (ire != NULL) - ire_refrele(ire); break; - default: - if (ire != NULL) + case AR_CN_READY: + /* No external v6 resolver has a contract to use this */ + if (isv6) + break; + /* If the link is down, we'll retry this later */ + if (!(ill->ill_phyint->phyint_flags & PHYI_RUNNING)) + break; + ipif = ipif_lookup_addr(src, ill, ALL_ZONES, NULL, NULL, + NULL, NULL); + if (ipif != NULL) { + /* + * If this is a duplicate recovery, then we now need to + * go exclusive to bring this thing back up. + */ + if ((ipif->ipif_flags & (IPIF_UP|IPIF_DUPLICATE)) == + IPIF_DUPLICATE) { + ipif_refrele(ipif); + ill_refhold(ill); + (void) qwriter_ip(NULL, ill, q, mp, + ip_arp_excl, CUR_OP, B_FALSE); + return; + } + /* + * If this is the first notice that this address is + * ready, then let the user know now. + */ + if ((ipif->ipif_flags & IPIF_UP) && + !ipif->ipif_addr_ready) { + ipif_mask_reply(ipif); + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + sctp_update_ipif(ipif, SCTP_IPIF_UP); + } + ipif->ipif_addr_ready = 1; + ipif_refrele(ipif); + } + ire = ire_cache_lookup(src, ALL_ZONES, MBLK_GETLABEL(mp)); + if (ire != NULL) { + ire->ire_defense_count = 0; ire_refrele(ire); + } break; + case AR_CN_FAILED: + /* No external v6 resolver has a contract to use this */ + if (isv6) + break; + ill_refhold(ill); + (void) qwriter_ip(NULL, ill, q, mp, ip_arp_excl, CUR_OP, + B_FALSE); + return; } freemsg(mp); } @@ -5598,25 +5847,57 @@ dlpi_err_str(int err) * Debug formatting routine. Returns a character string representation of the * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address * in the form of a ipaddr_t and calls ip_dot_saddr with a pointer. + * + * Once the ndd table-printing interfaces are removed, this can be changed to + * standard dotted-decimal form. */ char * ip_dot_addr(ipaddr_t addr, char *buf) { - return (ip_dot_saddr((uchar_t *)&addr, buf)); + uint8_t *ap = (uint8_t *)&addr; + + (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", + ap[0] & 0xFF, ap[1] & 0xFF, ap[2] & 0xFF, ap[3] & 0xFF); + return (buf); } /* - * Debug formatting routine. Returns a character string representation of the - * addr in buf, of the form xxx.xxx.xxx.xxx. This routine takes the address - * as a pointer. The "xxx" parts including left zero padding so the final - * string will fit easily in tables. It would be nice to take a padding - * length argument instead. + * Write the given MAC address as a printable string in the usual colon- + * separated format. */ -static char * -ip_dot_saddr(uchar_t *addr, char *buf) +const char * +mac_colon_addr(const uint8_t *addr, size_t alen, char *buf, size_t buflen) { - (void) mi_sprintf(buf, "%03d.%03d.%03d.%03d", - addr[0] & 0xFF, addr[1] & 0xFF, addr[2] & 0xFF, addr[3] & 0xFF); + char *bp; + + if (alen == 0 || buflen < 4) + return ("?"); + bp = buf; + for (;;) { + /* + * If there are more MAC address bytes available, but we won't + * have any room to print them, then add "..." to the string + * instead. See below for the 'magic number' explanation. + */ + if ((alen == 2 && buflen < 6) || (alen > 2 && buflen < 7)) { + (void) strcpy(bp, "..."); + break; + } + (void) sprintf(bp, "%02x", *addr++); + bp += 2; + if (--alen == 0) + break; + *bp++ = ':'; + buflen -= 3; + /* + * At this point, based on the first 'if' statement above, + * either alen == 1 and buflen >= 3, or alen > 1 and + * buflen >= 4. The first case leaves room for the final "xx" + * number and trailing NUL byte. The second leaves room for at + * least "...". Thus the apparently 'magic' numbers chosen for + * that statement. + */ + } return (buf); } @@ -9315,8 +9596,8 @@ ip_setqinfo(queue_t *q, minor_t minor, boolean_t bump_mib) } else { if (bump_mib) BUMP_MIB(&ip_mib, ipOutSwitchIPv6); - q->q_qinfo = &rinit; - WR(q)->q_qinfo = &winit; + q->q_qinfo = &iprinit; + WR(q)->q_qinfo = &ipwinit; (Q_TO_CONN(q))->conn_pkt_isv6 = B_FALSE; } @@ -14891,7 +15172,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) * v6 interfaces. * Unlike ARP which has to do another bind * and attach, once we get here we are - * done withh NDP. Except in the case of + * done with NDP. Except in the case of * ILLF_XRESOLV, in which case we send an * AR_INTERFACE_UP to the external resolver. * If all goes well, the ioctl will complete @@ -14910,7 +15191,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mutex_exit(&connp->conn_lock); if (success) { err = ipif_resolver_up(ipif, - B_FALSE); + Res_act_initial); if (err == EINPROGRESS) { freemsg(mp); return; @@ -14939,7 +15220,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); if (success) { - err = ipif_resolver_up(ipif, B_FALSE); + err = ipif_resolver_up(ipif, Res_act_initial); if (err == EINPROGRESS) { freemsg(mp); return; @@ -15061,13 +15342,13 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) /* * IPv4 ARP case * - * Set B_TRUE, as we only want + * Set Res_act_move, as we only want * ipif_resolver_up to send an * AR_ENTRY_ADD request up to * ARP. */ err = ipif_resolver_up(ipif, - B_TRUE); + Res_act_move); if (err) { ip1dbg(( "ip_rput_dlpi_writer: " @@ -15204,10 +15485,11 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) phyint_t *phyint = ill->ill_phyint; uint64_t new_phyint_flags; boolean_t changed = B_FALSE; + boolean_t went_up; + went_up = notify->dl_notification == DL_NOTE_LINK_UP; mutex_enter(&phyint->phyint_lock); - new_phyint_flags = - (notify->dl_notification == DL_NOTE_LINK_UP) ? + new_phyint_flags = went_up ? phyint->phyint_flags | PHYI_RUNNING : phyint->phyint_flags & ~PHYI_RUNNING; if (new_phyint_flags != phyint->phyint_flags) { @@ -15216,18 +15498,12 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } mutex_exit(&phyint->phyint_lock); /* - * If the flags have changed, send a message to - * the routing socket. + * ill_restart_dad handles the DAD restart and routing + * socket notification logic. */ if (changed) { - if (phyint->phyint_illv4 != NULL) { - ip_rts_ifmsg( - phyint->phyint_illv4->ill_ipif); - } - if (phyint->phyint_illv6 != NULL) { - ip_rts_ifmsg( - phyint->phyint_illv6->ill_ipif); - } + ill_restart_dad(phyint->phyint_illv4, went_up); + ill_restart_dad(phyint->phyint_illv6, went_up); } break; } @@ -15274,15 +15550,14 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) */ break; } - case DL_NOTIFY_ACK: - /* - * Don't really need to check for what notifications - * are supported; we'll process what gets sent upstream, - * and we know it'll be something we support changing - * based on our DL_NOTIFY_REQ. - */ + case DL_NOTIFY_ACK: { + dl_notify_ack_t *noteack = (dl_notify_ack_t *)mp->b_rptr; + + if (noteack->dl_notifications & DL_NOTE_LINK_UP) + ill->ill_note_link = 1; ill_dlpi_done(ill, DL_NOTIFY_REQ); break; + } case DL_PHYS_ADDR_ACK: { /* * We should have an IOCTL waiting on this when request @@ -16198,7 +16473,7 @@ ip_fanout_proto_again(mblk_t *ipsec_mp, ill_t *ill, ill_t *recv_ill, ire_t *ire) rput_flags |= (IN6_IS_ADDR_MULTICAST(v6dstp) ? IP6_IN_LLMCAST : 0); ip_rput_data_v6(ill->ill_rq, ill, ipsec_mp, ip6h, rput_flags, - NULL); + NULL, NULL); } if (ill_need_rele) ill_refrele(ill); @@ -25801,6 +26076,17 @@ nak: freemsg(mp); } return; + case AR_ARP_EXTEND: + /* + * The ARP module above us is capable of duplicate + * address detection. Old ATM drivers will not send + * this message. + */ + ASSERT(q->q_next != NULL); + ill = (ill_t *)q->q_ptr; + ill->ill_arp_extend = B_TRUE; + freemsg(mp); + return; default: break; } @@ -27308,7 +27594,7 @@ static void ip_multirt_bad_mtu(ire_t *ire, uint32_t max_frag) { hrtime_t current = gethrtime(); - char buf[16]; + char buf[INET_ADDRSTRLEN]; /* Convert interval in ms to hrtime in ns */ if (multirt_bad_mtu_last_time + diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index b96a6a24ef..29afe371f7 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -102,6 +102,9 @@ #include <rpc/pmap_prot.h> +/* Temporary; for CR 6451644 work-around */ +#include <sys/ethernet.h> + extern squeue_func_t ip_input_proc; /* @@ -326,7 +329,7 @@ struct qinit winit_ipv6 = { */ static void icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, - boolean_t mctl_present, uint_t flags, zoneid_t zoneid) + boolean_t mctl_present, uint_t flags, zoneid_t zoneid, mblk_t *dl_mp) { icmp6_t *icmp6; ip6_t *ip6h; @@ -603,7 +606,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp); + ndp_input(ill, mp, dl_mp); return; case ND_NEIGHBOR_ADVERT: @@ -612,7 +615,7 @@ icmp_inbound_v6(queue_t *q, mblk_t *mp, ill_t *ill, uint_t hdr_length, if (mctl_present) freeb(first_mp); /* XXX may wish to pass first_mp up to ndp_input someday. */ - ndp_input(ill, mp); + ndp_input(ill, mp, dl_mp); return; case ND_REDIRECT: { @@ -5910,26 +5913,6 @@ ip_newroute_ipif_v6(queue_t *q, mblk_t *mp, ipif_t *ipif, } goto err_ret; } - /* Use any ipif for source */ - for (src_ipif = dst_ill->ill_ipif; src_ipif != NULL; - src_ipif = src_ipif->ipif_next) { - if ((src_ipif->ipif_flags & IPIF_UP) && - IN6_IS_ADDR_UNSPECIFIED( - &src_ipif->ipif_v6src_addr)) - break; - } - if (src_ipif == NULL) { - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ip_newroute_ipif_v6: " - "no src for dst %s\n ", - AF_INET6, v6dstp); - printf("ip_newroute_ipif_v6: if %s" - "(UNSPEC_SRC)\n", - dst_ill->ill_name); - } - goto err_ret; - } src_ipif = ipif; ipif_refhold(src_ipif); } @@ -6602,7 +6585,7 @@ bad_opt: */ static void ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, - ill_t *ill, uint_t flags, mblk_t *hada_mp) + ill_t *ill, uint_t flags, mblk_t *hada_mp, mblk_t *dl_mp) { ip6_rthdr0_t *rthdr; uint_t ehdrlen; @@ -6678,7 +6661,7 @@ ip_process_rthdr(queue_t *q, mblk_t *mp, ip6_t *ip6h, ip6_rthdr_t *rth, B_FALSE, B_FALSE); return; } - ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp); + ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp, dl_mp); return; hada_drop: /* IPsec kstats: bean counter? */ @@ -6692,12 +6675,15 @@ hada_drop: static void ip_rput_v6(queue_t *q, mblk_t *mp) { - mblk_t *mp1, *first_mp, *hada_mp = NULL; + mblk_t *first_mp; + mblk_t *hada_mp = NULL; ip6_t *ip6h; - boolean_t ll_multicast = B_FALSE, mctl_present = B_FALSE; + boolean_t ll_multicast = B_FALSE; + boolean_t mctl_present = B_FALSE; ill_t *ill; struct iocblk *iocp; uint_t flags = 0; + mblk_t *dl_mp; ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags & ILL_CONDEMNED) { @@ -6719,9 +6705,59 @@ ip_rput_v6(queue_t *q, mblk_t *mp) } } + dl_mp = NULL; switch (mp->b_datap->db_type) { - case M_DATA: + case M_DATA: { + int hlen; + uchar_t *ucp; + struct ether_header *eh; + dl_unitdata_ind_t *dui; + + /* + * This is a work-around for CR 6451644, a bug in Nemo. It + * should be removed when that problem is fixed. + */ + if (ill->ill_mactype == DL_ETHER && + (hlen = MBLKHEAD(mp)) >= sizeof (struct ether_header) && + (ucp = mp->b_rptr)[-1] == (IP6_DL_SAP & 0xFF) && + ucp[-2] == (IP6_DL_SAP >> 8)) { + if (hlen >= sizeof (struct ether_vlan_header) && + ucp[-5] == 0 && ucp[-6] == 0x81) + ucp -= sizeof (struct ether_vlan_header); + else + ucp -= sizeof (struct ether_header); + /* + * If it's a group address, then fabricate a + * DL_UNITDATA_IND message. + */ + if ((ll_multicast = (ucp[0] & 1)) != 0 && + (dl_mp = allocb(DL_UNITDATA_IND_SIZE + 16, + BPRI_HI)) != NULL) { + eh = (struct ether_header *)ucp; + dui = (dl_unitdata_ind_t *)dl_mp->b_rptr; + DB_TYPE(dl_mp) = M_PROTO; + dl_mp->b_wptr = (uchar_t *)(dui + 1) + 16; + dui->dl_primitive = DL_UNITDATA_IND; + dui->dl_dest_addr_length = 8; + dui->dl_dest_addr_offset = DL_UNITDATA_IND_SIZE; + dui->dl_src_addr_length = 8; + dui->dl_src_addr_offset = DL_UNITDATA_IND_SIZE + + 8; + dui->dl_group_address = 1; + ucp = (uchar_t *)(dui + 1); + if (ill->ill_sap_length > 0) + ucp += ill->ill_sap_length; + bcopy(&eh->ether_dhost, ucp, 6); + bcopy(&eh->ether_shost, ucp + 8, 6); + ucp = (uchar_t *)(dui + 1); + if (ill->ill_sap_length < 0) + ucp += 8 + ill->ill_sap_length; + bcopy(&eh->ether_type, ucp, 2); + bcopy(&eh->ether_type, ucp + 8, 2); + } + } break; + } case M_PROTO: case M_PCPROTO: @@ -6734,10 +6770,10 @@ ip_rput_v6(queue_t *q, mblk_t *mp) #define dlur ((dl_unitdata_ind_t *)mp->b_rptr) ll_multicast = dlur->dl_group_address; #undef dlur - /* Ditch the DLPI header. */ - mp1 = mp; + /* Save the DLPI header. */ + dl_mp = mp; mp = mp->b_cont; - freeb(mp1); + dl_mp->b_cont = NULL; break; case M_BREAK: panic("ip_rput_v6: got an M_BREAK"); @@ -6772,7 +6808,7 @@ ip_rput_v6(queue_t *q, mblk_t *mp) mutex_exit(&ill->ill_lock); qwriter_ip(NULL, ill, q, mp, ip_rput_other, CUR_OP, B_FALSE); return; - case M_CTL: { + case M_CTL: if ((MBLKL(mp) > sizeof (int)) && ((da_ipsec_t *)mp->b_rptr)->da_type == IPHADA_M_CTL) { ASSERT(MBLKL(mp) >= sizeof (da_ipsec_t)); @@ -6781,7 +6817,6 @@ ip_rput_v6(queue_t *q, mblk_t *mp) } putnext(q, mp); return; - } case M_IOCNAK: iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { @@ -6824,8 +6859,8 @@ ip_rput_v6(queue_t *q, mblk_t *mp) mp1 = copymsg(mp); freemsg(mp); if (mp1 == NULL) { - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); - return; + first_mp = NULL; + goto discard; } mp = mp1; } @@ -6841,10 +6876,8 @@ ip_rput_v6(queue_t *q, mblk_t *mp) if (!OK_32PTR((uchar_t *)ip6h) || (mp->b_wptr - (uchar_t *)ip6h) < IPV6_HDR_LEN) { if (!pullupmsg(mp, IPV6_HDR_LEN)) { - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); ip1dbg(("ip_rput_v6: pullupmsg failed\n")); - freemsg(first_mp); - return; + goto discard; } ip6h = (ip6_t *)mp->b_rptr; } @@ -6857,31 +6890,32 @@ ip_rput_v6(queue_t *q, mblk_t *mp) * TODO: Avoid this check for e.g. connected TCP sockets */ if (IN6_IS_ADDR_V4MAPPED(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); ip1dbg(("ip_rput_v6: pkt with mapped src addr\n")); - freemsg(first_mp); - return; + goto discard; } if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_src)) { - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); ip1dbg(("ip_rput_v6: pkt with loopback src")); - freemsg(first_mp); - return; + goto discard; } else if (IN6_IS_ADDR_LOOPBACK(&ip6h->ip6_dst)) { - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); ip1dbg(("ip_rput_v6: pkt with loopback dst")); - freemsg(first_mp); - return; + goto discard; } flags |= (ll_multicast ? IP6_IN_LLMCAST : 0); - ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp); + ip_rput_data_v6(q, ill, mp, ip6h, flags, hada_mp, dl_mp); } else { BUMP_MIB(ill->ill_ip6_mib, ipv6InIPv4); - BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); - freemsg(first_mp); + goto discard; } + freemsg(dl_mp); + return; + +discard: + if (dl_mp != NULL) + freeb(dl_mp); + freemsg(first_mp); + BUMP_MIB(ill->ill_ip6_mib, ipv6InDiscards); } /* @@ -7080,10 +7114,14 @@ ipsec_early_ah_v6(queue_t *q, mblk_t *first_mp, boolean_t mctl_present, * actually arrived on. We need to remember this when saving the * input interface index into potential IPV6_PKTINFO data in * ip_add_info_v6(). + * + * This routine doesn't free dl_mp; that's the caller's responsibility on + * return. (Note that the callers are complex enough that there's no tail + * recursion here anyway.) */ void ip_rput_data_v6(queue_t *q, ill_t *inill, mblk_t *mp, ip6_t *ip6h, - uint_t flags, mblk_t *hada_mp) + uint_t flags, mblk_t *hada_mp, mblk_t *dl_mp) { ire_t *ire = NULL; queue_t *rq; @@ -7939,14 +7977,15 @@ tcp_fanout: continue; icmp_inbound_v6(q, first_mp1, ill, hdr_len, mctl_present, 0, - ilm->ilm_zoneid); + ilm->ilm_zoneid, dl_mp); } ILM_WALKER_RELE(ill); } else { first_mp1 = ip_copymsg(first_mp); if (first_mp1 != NULL) icmp_inbound_v6(q, first_mp1, ill, - hdr_len, mctl_present, 0, zoneid); + hdr_len, mctl_present, 0, zoneid, + dl_mp); } } /* FALLTHRU */ @@ -8181,7 +8220,7 @@ tcp_fanout: return; } ip_process_rthdr(q, mp, ip6h, rthdr, ill, - flags, hada_mp); + flags, hada_mp, dl_mp); return; } used = ehdrlen; @@ -10253,8 +10292,7 @@ send_from_ill: &ip6h->ip6_src, ill, zoneid); } } - if (ill != NULL) - ill_refrele(ill); + ill_refrele(ill); return; } if (need_decref) { @@ -10284,8 +10322,7 @@ send_from_ill: } if (mp == NULL) { BUMP_MIB(mibptr, ipv6OutDiscards); - if (ill != NULL) - ill_refrele(ill); + ill_refrele(ill); return; } ip6i = (ip6i_t *)mp->b_rptr; @@ -10333,8 +10370,7 @@ send_from_ill: ip_newroute_v6(q, first_mp, v6dstp, &ip6h->ip6_src, ill, zoneid); } - if (ill != NULL) - ill_refrele(ill); + ill_refrele(ill); return; notv6: @@ -10553,7 +10589,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, continue; icmp_inbound_v6(q, first_mp1, ill, hdr_length, mctl_present, - IP6_NO_IPPOLICY, ilm->ilm_zoneid); + IP6_NO_IPPOLICY, ilm->ilm_zoneid, + NULL); } ILM_WALKER_RELE(ill); } else { @@ -10561,7 +10598,8 @@ ip_wput_local_v6(queue_t *q, ill_t *ill, ip6_t *ip6h, mblk_t *first_mp, if (first_mp1 != NULL) icmp_inbound_v6(q, first_mp1, ill, hdr_length, mctl_present, - IP6_NO_IPPOLICY, ire->ire_zoneid); + IP6_NO_IPPOLICY, ire->ire_zoneid, + NULL); } } /* FALLTHRU */ diff --git a/usr/src/uts/common/inet/ip/ip6_if.c b/usr/src/uts/common/inet/ip/ip6_if.c index 2749b6b439..313d0bbdca 100644 --- a/usr/src/uts/common/inet/ip/ip6_if.c +++ b/usr/src/uts/common/inet/ip/ip6_if.c @@ -1317,12 +1317,12 @@ ipif_ndp_up(ipif_t *ipif, const in6_addr_t *addr, boolean_t macaddr_change) * ND not supported on XRESOLV interfaces. If ND support (multicast) * added later, take out this check. */ - if (ill->ill_flags & ILLF_XRESOLV) - return (0); - - if (IN6_IS_ADDR_UNSPECIFIED(addr) || - (!(ill->ill_net_type & IRE_INTERFACE))) + if ((ill->ill_flags & ILLF_XRESOLV) || + IN6_IS_ADDR_UNSPECIFIED(addr) || + (!(ill->ill_net_type & IRE_INTERFACE))) { + ipif->ipif_addr_ready = 1; return (0); + } /* * Need to setup multicast mapping only when the first @@ -1374,7 +1374,7 @@ ipif_ndp_up(ipif_t *ipif, const in6_addr_t *addr, boolean_t macaddr_change) &ipv6_all_zeros, 0, flags, - ND_REACHABLE, + ND_PROBE, /* Causes Duplicate Address Detection to run */ &nce, NULL, NULL); @@ -1382,6 +1382,11 @@ ipif_ndp_up(ipif_t *ipif, const in6_addr_t *addr, boolean_t macaddr_change) case 0: ip1dbg(("ipif_ndp_up: NCE created for %s\n", ill->ill_name)); + ipif->ipif_addr_ready = 1; + break; + case EINPROGRESS: + ip1dbg(("ipif_ndp_up: running DAD now for %s\n", + ill->ill_name)); break; case EEXIST: NCE_REFRELE(nce); @@ -1401,6 +1406,9 @@ ipif_ndp_up(ipif_t *ipif, const in6_addr_t *addr, boolean_t macaddr_change) } return (err); } + } else { + /* No local NCE for this entry */ + ipif->ipif_addr_ready = 1; } if (nce != NULL) NCE_REFRELE(nce); @@ -1625,7 +1633,8 @@ ip_addr_xor_v6(const in6_addr_t *a1, const in6_addr_t *a2, in6_addr_t *res) #define IPIF_VALID_IPV6_SOURCE(ipif) \ (((ipif)->ipif_flags & IPIF_UP) && \ - !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST))) + !((ipif)->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) && \ + (ipif)->ipif_addr_ready) /* source address candidate */ typedef struct candidate { @@ -3001,9 +3010,12 @@ ipif_up_done_v6(ipif_t *ipif) } } + if (ipif->ipif_addr_ready) { + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + sctp_update_ipif(ipif, SCTP_IPIF_UP); + } - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); if (ipif_saved_irep != NULL) { kmem_free(ipif_saved_irep, ipif_saved_ire_cnt * sizeof (ire_t *)); @@ -3011,7 +3023,6 @@ ipif_up_done_v6(ipif_t *ipif) if (src_ipif_held) ipif_refrele(src_ipif); - sctp_update_ipif(ipif, SCTP_IPIF_UP); return (0); bad: diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 179c1dd7ce..8edf1bb113 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -157,10 +157,8 @@ static void ipif_check_bcast_ires(ipif_t *test_ipif); static void ipif_down_delete_ire(ire_t *ire, char *ipif); static void ipif_delete_cache_ire(ire_t *, char *); static int ipif_logical_down(ipif_t *ipif, queue_t *q, mblk_t *mp); -static void ipif_down_tail(ipif_t *ipif); static void ipif_free(ipif_t *ipif); static void ipif_free_tail(ipif_t *ipif); -static void ipif_mask_reply(ipif_t *); static void ipif_mtu_change(ire_t *ire, char *ipif_arg); static void ipif_multicast_down(ipif_t *ipif); static void ipif_recreate_interface_routes(ipif_t *old_ipif, ipif_t *ipif); @@ -180,6 +178,7 @@ static int ill_arp_off(ill_t *ill); static int ill_arp_on(ill_t *ill); static void ill_delete_interface_type(ill_if_t *); static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_downi(ire_t *ire, char *ill_arg); static void ill_downi_mrtun_srcif(ire_t *ire, char *ill_arg); @@ -671,6 +670,20 @@ ill_arp_alloc(ill_t *ill, uchar_t *template, caddr_t addr) return (mp); } +mblk_t * +ipif_area_alloc(ipif_t *ipif) +{ + return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_area_template, + (char *)&ipif->ipif_lcl_addr)); +} + +mblk_t * +ipif_ared_alloc(ipif_t *ipif) +{ + return (ill_arp_alloc(ipif->ipif_ill, (uchar_t *)&ip_ared_template, + (char *)&ipif->ipif_lcl_addr)); +} + /* * Completely vaporize a lower level tap and all associated interfaces. * ill_delete is called only out of ip_close when the device control @@ -751,6 +764,19 @@ ill_delete(ill_t *ill) rw_exit(&ill_g_usesrc_lock); } +static void +ipif_non_duplicate(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + mutex_enter(&ill->ill_lock); + if (ipif->ipif_flags & IPIF_DUPLICATE) { + ipif->ipif_flags &= ~IPIF_DUPLICATE; + ASSERT(ill->ill_ipif_dup_count > 0); + ill->ill_ipif_dup_count--; + } + mutex_exit(&ill->ill_lock); +} + /* * ill_delete_tail is called from ip_modclose after all references * to the closing ill are gone. The wait is done in ip_modclose @@ -761,8 +787,14 @@ ill_delete_tail(ill_t *ill) mblk_t **mpp; ipif_t *ipif; - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + ipif_non_duplicate(ipif); ipif_down_tail(ipif); + } + + ASSERT(ill->ill_ipif_dup_count == 0 && + ill->ill_arp_down_mp == NULL && + ill->ill_arp_del_mapping_mp == NULL); /* * If polling capability is enabled (which signifies direct @@ -1489,8 +1521,10 @@ ipif_all_down_tail(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ipif_t *ipif; ASSERT(IAM_WRITER_IPSQ(ipsq)); - for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + ipif_non_duplicate(ipif); ipif_down_tail(ipif); + } ill_down_tail(ill); freemsg(mp); ipsq->ipsq_current_ipif = NULL; @@ -5645,8 +5679,10 @@ ipif_is_quiescent(ipif_t *ipif) } ill = ipif->ipif_ill; - if (ill->ill_ipif_up_count != 0 || ill->ill_logical_down) + if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || + ill->ill_logical_down) { return (B_TRUE); + } /* This is the last ipif going down or being deleted on this ill */ if (ill->ill_ire_cnt != 0 || ill->ill_refcnt != 0) { @@ -9144,6 +9180,8 @@ ip_sioctl_arp_common(ill_t *ill, queue_t *q, mblk_t *mp, sin_t *sin, area->area_flags |= ACE_F_PERMANENT; if (flags & ATF_PUBL) area->area_flags |= ACE_F_PUBLISH; + if (flags & ATF_AUTHORITY) + area->area_flags |= ACE_F_AUTHORITY; /* * Up to ARP it goes. The response will come @@ -10118,6 +10156,8 @@ errack: *flagsp |= ATF_PERM; if (area->area_flags & ACE_F_PUBLISH) *flagsp |= ATF_PUBL; + if (area->area_flags & ACE_F_AUTHORITY) + *flagsp |= ATF_AUTHORITY; if (area->area_hw_addr_length != 0) { *flagsp |= ATF_COM; /* @@ -10524,10 +10564,11 @@ ip_sioctl_removeif(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (ipif->ipif_refcnt == 0 && ipif->ipif_ire_cnt == 0) { mutex_exit(&ill->ill_lock); mutex_exit(&connp->conn_lock); + ipif_non_duplicate(ipif); ipif_down_tail(ipif); ipif_free_tail(ipif); return (0); - } + } success = ipsq_pending_mp_add(connp, ipif, CONNP_TO_WQ(connp), mp, IPIF_FREE); mutex_exit(&ill->ill_lock); @@ -10565,6 +10606,7 @@ ip_sioctl_removeif_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(ipif->ipif_state_flags & IPIF_CONDEMNED); + ipif_non_duplicate(ipif); ipif_down_tail(ipif); ipif_free_tail(ipif); @@ -10682,10 +10724,19 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ipaddr_t addr; sin6_t *sin6; int err = 0; + ill_t *ill = ipif->ipif_ill; + boolean_t need_dl_down; + boolean_t need_arp_down; ip1dbg(("ip_sioctl_addr_tail(%s:%u %p)\n", - ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); + ill->ill_name, ipif->ipif_id, (void *)ipif)); ASSERT(IAM_WRITER_IPIF(ipif)); + + /* Must cancel any pending timer before taking the ill_lock */ + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + if (ipif->ipif_isv6) { sin6 = (sin6_t *)sin; v6addr = sin6->sin6_addr; @@ -10693,17 +10744,37 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, addr = sin->sin_addr.s_addr; IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - mutex_enter(&ipif->ipif_ill->ill_lock); + mutex_enter(&ill->ill_lock); ipif->ipif_v6lcl_addr = v6addr; if (ipif->ipif_flags & (IPIF_ANYCAST | IPIF_NOLOCAL)) { ipif->ipif_v6src_addr = ipv6_all_zeros; } else { ipif->ipif_v6src_addr = v6addr; } + ipif->ipif_addr_ready = 0; + + /* + * If the interface was previously marked as a duplicate, then since + * we've now got a "new" address, it should no longer be considered a + * duplicate -- even if the "new" address is the same as the old one. + * Note that if all ipifs are down, we may have a pending ARP down + * event to handle. This is because we want to recover from duplicates + * and thus delay tearing down ARP until the duplicates have been + * removed or disabled. + */ + need_dl_down = need_arp_down = B_FALSE; + if (ipif->ipif_flags & IPIF_DUPLICATE) { + need_arp_down = !need_up; + ipif->ipif_flags &= ~IPIF_DUPLICATE; + if (--ill->ill_ipif_dup_count == 0 && !need_up && + ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { + need_dl_down = B_TRUE; + } + } - if ((ipif->ipif_isv6) && IN6_IS_ADDR_6TO4(&v6addr) && - (!ipif->ipif_ill->ill_is_6to4tun)) { - queue_t *wqp = ipif->ipif_ill->ill_wq; + if (ipif->ipif_isv6 && IN6_IS_ADDR_6TO4(&v6addr) && + !ill->ill_is_6to4tun) { + queue_t *wqp = ill->ill_wq; /* * The local address of this interface is a 6to4 address, @@ -10719,7 +10790,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, if (wqp->q_next->q_qinfo->qi_minfo->mi_idnum == TUN6TO4_MODID) { /* set for use in IP */ - ipif->ipif_ill->ill_is_6to4tun = 1; + ill->ill_is_6to4tun = 1; break; } wqp = wqp->q_next; @@ -10728,7 +10799,7 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, } ipif_set_default(ipif); - mutex_exit(&ipif->ipif_ill->ill_lock); + mutex_exit(&ill->ill_lock); if (need_up) { /* @@ -10748,6 +10819,11 @@ ip_sioctl_addr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, sctp_update_ipif(ipif, SCTP_IPIF_UPDATE); } + if (need_dl_down) + ill_dl_down(ill); + if (need_arp_down) + ipif_arp_down(ipif); + return (err); } @@ -10872,9 +10948,17 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; ill_t *ill = ipif->ipif_ill; int err = 0; + boolean_t need_dl_down; + boolean_t need_arp_down; + + ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", ill->ill_name, + ipif->ipif_id, (void *)ipif)); + + /* Must cancel any pending timer before taking the ill_lock */ + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; - ip1dbg(("ip_sioctl_dstaddr_tail(%s:%u %p)\n", - ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); if (ipif->ipif_isv6) { sin6_t *sin6; @@ -10898,7 +10982,24 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, ipif->ipif_flags |= IPIF_POINTOPOINT; ipif->ipif_flags &= ~IPIF_BROADCAST; if (ipif->ipif_isv6) - ipif->ipif_ill->ill_flags |= ILLF_NONUD; + ill->ill_flags |= ILLF_NONUD; + } + + /* + * If the interface was previously marked as a duplicate, then since + * we've now got a "new" address, it should no longer be considered a + * duplicate -- even if the "new" address is the same as the old one. + * Note that if all ipifs are down, we may have a pending ARP down + * event to handle. + */ + need_dl_down = need_arp_down = B_FALSE; + if (ipif->ipif_flags & IPIF_DUPLICATE) { + need_arp_down = !need_up; + ipif->ipif_flags &= ~IPIF_DUPLICATE; + if (--ill->ill_ipif_dup_count == 0 && !need_up && + ill->ill_ipif_up_count == 0 && ill->ill_dl_up) { + need_dl_down = B_TRUE; + } } /* Set the new address. */ @@ -10918,6 +11019,12 @@ ip_sioctl_dstaddr_tail(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, */ err = ipif_up(ipif, q, mp); } + + if (need_dl_down) + ill_dl_down(ill); + + if (need_arp_down) + ipif_arp_down(ipif); return (err); } @@ -12917,47 +13024,45 @@ void ipif_arp_down(ipif_t *ipif) { mblk_t *mp; + ill_t *ill = ipif->ipif_ill; - ip1dbg(("ipif_arp_down(%s:%u)\n", - ipif->ipif_ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_arp_down(%s:%u)\n", ill->ill_name, ipif->ipif_id)); ASSERT(IAM_WRITER_IPIF(ipif)); /* Delete the mapping for the local address */ mp = ipif->ipif_arp_del_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", - dlpi_prim_str(*(int *)mp->b_rptr), *(int *)mp->b_rptr, - ipif->ipif_ill->ill_name, ipif->ipif_id)); - putnext(ipif->ipif_ill->ill_rq, mp); + ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); + putnext(ill->ill_rq, mp); ipif->ipif_arp_del_mp = NULL; } /* - * If this is the last ipif that is going down, we need - * to clean up ARP completely. + * If this is the last ipif that is going down and there are no + * duplicate addresses we may yet attempt to re-probe, then we need to + * clean up ARP completely. */ - if (ipif->ipif_ill->ill_ipif_up_count == 0) { + if (ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0) { /* Send up AR_INTERFACE_DOWN message */ - mp = ipif->ipif_ill->ill_arp_down_mp; + mp = ill->ill_arp_down_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", - dlpi_prim_str(*(int *)mp->b_rptr), - *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, + ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ipif->ipif_ill->ill_rq, mp); - ipif->ipif_ill->ill_arp_down_mp = NULL; + putnext(ill->ill_rq, mp); + ill->ill_arp_down_mp = NULL; } /* Tell ARP to delete the multicast mappings */ - mp = ipif->ipif_ill->ill_arp_del_mapping_mp; + mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", - dlpi_prim_str(*(int *)mp->b_rptr), - *(int *)mp->b_rptr, ipif->ipif_ill->ill_name, + ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); - putnext(ipif->ipif_ill->ill_rq, mp); - ipif->ipif_ill->ill_arp_del_mapping_mp = NULL; + putnext(ill->ill_rq, mp); + ill->ill_arp_del_mapping_mp = NULL; } } } @@ -13000,9 +13105,8 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) */ mp = ill->ill_arp_del_mapping_mp; if (mp != NULL) { - ip1dbg(("ipif_arp_down: %s (%u) for %s:%u\n", - dlpi_prim_str(*(int *)mp->b_rptr), - *(int *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); + ip1dbg(("ipif_arp_down: arp cmd %x for %s:%u\n", + *(unsigned *)mp->b_rptr, ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, mp); ill->ill_arp_del_mapping_mp = NULL; } @@ -13077,6 +13181,7 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) return (0); } ASSERT(add_mp != NULL && del_mp != NULL); + ASSERT(ill->ill_arp_del_mapping_mp == NULL); ill->ill_arp_del_mapping_mp = del_mp; if (arp_add_mapping_mp != NULL) { /* The caller just wants the mblks allocated */ @@ -13095,15 +13200,18 @@ ipif_arp_setup_multicast(ipif_t *ipif, mblk_t **arp_add_mapping_mp) * though it only sets up the resolver for v6 * if it's an xresolv interface (one using an external resolver). * Honors ILLF_NOARP. - * The boolean value arp_just_publish, if B_TRUE, indicates that - * it only needs to send an AR_ENTRY_ADD message up to ARP for - * IPv4 interfaces. Currently, B_TRUE is only set when this - * function is called by ip_rput_dlpi_writer() to handle - * asynchronous hardware address change notification. + * The enumerated value res_act is used to tune the behavior. + * If set to Res_act_initial, then we set up all the resolver + * structures for a new interface. If set to Res_act_move, then + * we just send an AR_ENTRY_ADD message up to ARP for IPv4 + * interfaces; this is called by ip_rput_dlpi_writer() to handle + * asynchronous hardware address change notification. If set to + * Res_act_defend, then we tell ARP that it needs to send a single + * gratuitous message in defense of the address. * Returns error on failure. */ int -ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) +ipif_resolver_up(ipif_t *ipif, enum ip_resolver_action res_act) { caddr_t addr; mblk_t *arp_up_mp = NULL; @@ -13116,22 +13224,43 @@ ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) uchar_t *area_p = NULL; uchar_t *ared_p = NULL; int err = ENOMEM; + boolean_t was_dup; ip1dbg(("ipif_resolver_up(%s:%u) flags 0x%x\n", - ipif->ipif_ill->ill_name, ipif->ipif_id, - (uint_t)ipif->ipif_flags)); + ill->ill_name, ipif->ipif_id, (uint_t)ipif->ipif_flags)); ASSERT(IAM_WRITER_IPIF(ipif)); - if ((ill->ill_net_type != IRE_IF_RESOLVER) || - (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV))) { + was_dup = B_FALSE; + if (res_act == Res_act_initial) { + ipif->ipif_addr_ready = 0; + /* + * We're bringing an interface up here. There's no way that we + * should need to shut down ARP now. + */ + mutex_enter(&ill->ill_lock); + if (ipif->ipif_flags & IPIF_DUPLICATE) { + ipif->ipif_flags &= ~IPIF_DUPLICATE; + ill->ill_ipif_dup_count--; + was_dup = B_TRUE; + } + mutex_exit(&ill->ill_lock); + } + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + if (ill->ill_net_type != IRE_IF_RESOLVER) { + ipif->ipif_addr_ready = 1; return (0); } + /* NDP will set the ipif_addr_ready flag when it's ready */ + if (ill->ill_isv6 && !(ill->ill_flags & ILLF_XRESOLV)) + return (0); if (ill->ill_isv6) { /* * External resolver for IPv6 */ - ASSERT(!arp_just_publish); + ASSERT(res_act == Res_act_initial); if (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)) { addr = (caddr_t)&ipif->ipif_v6lcl_addr; area_p = (uchar_t *)&ip6_area_template; @@ -13149,7 +13278,8 @@ ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) err = EINVAL; goto failed; } else { - if (ill->ill_ipif_up_count == 0) + if (ill->ill_ipif_up_count == 0 && + ill->ill_ipif_dup_count == 0 && !was_dup) ill->ill_arp_bringup_pending = 1; mutex_exit(&ill->ill_lock); } @@ -13164,17 +13294,19 @@ ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) * Add an entry for the local address in ARP only if it * is not UNNUMBERED and the address is not INADDR_ANY. */ - if (((ipif->ipif_flags & IPIF_UNNUMBERED) == 0) && area_p != NULL) { + if (!(ipif->ipif_flags & IPIF_UNNUMBERED) && area_p != NULL) { + area_t *area; + /* Now ask ARP to publish our address. */ arp_add_mp = ill_arp_alloc(ill, area_p, addr); if (arp_add_mp == NULL) goto failed; - if (arp_just_publish) { + area = (area_t *)arp_add_mp->b_rptr; + if (res_act != Res_act_initial) { /* * Copy the new hardware address and length into * arp_add_mp to be sent to ARP. */ - area_t *area = (area_t *)arp_add_mp->b_rptr; area->area_hw_addr_length = ill->ill_phys_addr_length; bcopy((char *)ill->ill_phys_addr, @@ -13182,10 +13314,20 @@ ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) area->area_hw_addr_length); } - ((area_t *)arp_add_mp->b_rptr)->area_flags = - ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | + ACE_F_MYADDR; + + if (res_act == Res_act_defend) { + area->area_flags |= ACE_F_DEFEND; + /* + * If we're just defending our address now, then + * there's no need to set up ARP multicast mappings. + * The publish command is enough. + */ + goto done; + } - if (arp_just_publish) + if (res_act != Res_act_initial) goto arp_setup_multicast; /* @@ -13197,15 +13339,17 @@ ipif_resolver_up(ipif_t *ipif, boolean_t arp_just_publish) goto failed; } else { - if (arp_just_publish) + if (res_act != Res_act_initial) goto done; } /* * Need to bring up ARP or setup multicast mapping only * when the first interface is coming UP. */ - if (ill->ill_ipif_up_count != 0) + if (ill->ill_ipif_up_count != 0 || ill->ill_ipif_dup_count != 0 || + was_dup) { goto done; + } /* * Allocate an ARP down message (to be saved) and an ARP up @@ -13236,7 +13380,7 @@ arp_setup_multicast: ASSERT(arp_add_mapping_mp != NULL); } -done:; +done: if (arp_del_mp != NULL) { ASSERT(ipif->ipif_arp_del_mp == NULL); ipif->ipif_arp_del_mp = arp_del_mp; @@ -13251,41 +13395,48 @@ done:; } if (arp_up_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_UP for %s:%u\n", - ipif->ipif_ill->ill_name, ipif->ipif_id)); + ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_up_mp); } if (arp_add_mp != NULL) { ip1dbg(("ipif_resolver_up: ARP_ADD for %s:%u\n", - ipif->ipif_ill->ill_name, ipif->ipif_id)); + ill->ill_name, ipif->ipif_id)); + /* + * If it's an extended ARP implementation, then we'll wait to + * hear that DAD has finished before using the interface. + */ + if (!ill->ill_arp_extend) + ipif->ipif_addr_ready = 1; putnext(ill->ill_rq, arp_add_mp); + } else { + ipif->ipif_addr_ready = 1; } if (arp_add_mapping_mp != NULL) { ip1dbg(("ipif_resolver_up: MAPPING_ADD for %s:%u\n", - ipif->ipif_ill->ill_name, ipif->ipif_id)); + ill->ill_name, ipif->ipif_id)); putnext(ill->ill_rq, arp_add_mapping_mp); } - if (arp_just_publish) + if (res_act != Res_act_initial) return (0); if (ill->ill_flags & ILLF_NOARP) err = ill_arp_off(ill); else err = ill_arp_on(ill); - if (err) { + if (err != 0) { ip0dbg(("ipif_resolver_up: arp_on/off failed %d\n", err)); freemsg(ipif->ipif_arp_del_mp); - if (arp_down_mp != NULL) - freemsg(ill->ill_arp_down_mp); - if (ill->ill_arp_del_mapping_mp != NULL) - freemsg(ill->ill_arp_del_mapping_mp); + freemsg(ill->ill_arp_down_mp); + freemsg(ill->ill_arp_del_mapping_mp); ipif->ipif_arp_del_mp = NULL; ill->ill_arp_down_mp = NULL; ill->ill_arp_del_mapping_mp = NULL; return (err); } - return (ill->ill_ipif_up_count != 0 ? 0 : EINPROGRESS); + return ((ill->ill_ipif_up_count != 0 || was_dup || + ill->ill_ipif_dup_count != 0) ? 0 : EINPROGRESS); -failed:; +failed: ip1dbg(("ipif_resolver_up: FAILED\n")); freemsg(arp_add_mp); freemsg(arp_del_mp); @@ -13297,6 +13448,143 @@ failed:; } /* + * This routine restarts IPv4 duplicate address detection (DAD) when a link has + * just gone back up. + */ +static void +ipif_arp_start_dad(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + mblk_t *arp_add_mp; + area_t *area; + + if (ill->ill_net_type != IRE_IF_RESOLVER || ill->ill_arp_closing || + (ipif->ipif_flags & IPIF_UNNUMBERED) || + ipif->ipif_lcl_addr == INADDR_ANY || + (arp_add_mp = ill_arp_alloc(ill, (uchar_t *)&ip_area_template, + (char *)&ipif->ipif_lcl_addr)) == NULL) { + /* + * If we can't contact ARP for some reason, that's not really a + * problem. Just send out the routing socket notification that + * DAD completion would have done, and continue. + */ + ipif_mask_reply(ipif); + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + sctp_update_ipif(ipif, SCTP_IPIF_UP); + ipif->ipif_addr_ready = 1; + return; + } + + /* Setting the 'unverified' flag restarts DAD */ + area = (area_t *)arp_add_mp->b_rptr; + area->area_flags = ACE_F_PERMANENT | ACE_F_PUBLISH | ACE_F_MYADDR | + ACE_F_UNVERIFIED; + putnext(ill->ill_rq, arp_add_mp); +} + +static void +ipif_ndp_start_dad(ipif_t *ipif) +{ + nce_t *nce; + + nce = ndp_lookup_v6(ipif->ipif_ill, &ipif->ipif_v6lcl_addr, B_FALSE); + if (nce == NULL) + return; + + if (!ndp_restart_dad(nce)) { + /* + * If we can't restart DAD for some reason, that's not really a + * problem. Just send out the routing socket notification that + * DAD completion would have done, and continue. + */ + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + sctp_update_ipif(ipif, SCTP_IPIF_UP); + ipif->ipif_addr_ready = 1; + } + NCE_REFRELE(nce); +} + +/* + * Restart duplicate address detection on all interfaces on the given ill. + * + * This is called when an interface transitions from down to up + * (DL_NOTE_LINK_UP) or up to down (DL_NOTE_LINK_DOWN). + * + * Note that since the underlying physical link has transitioned, we must cause + * at least one routing socket message to be sent here, either via DAD + * completion or just by default on the first ipif. (If we don't do this, then + * in.mpathd will see long delays when doing link-based failure recovery.) + */ +void +ill_restart_dad(ill_t *ill, boolean_t went_up) +{ + ipif_t *ipif; + + if (ill == NULL) + return; + + /* + * If layer two doesn't support duplicate address detection, then just + * send the routing socket message now and be done with it. + */ + if ((ill->ill_isv6 && (ill->ill_flags & ILLF_XRESOLV)) || + (!ill->ill_isv6 && !ill->ill_arp_extend)) { + ip_rts_ifmsg(ill->ill_ipif); + return; + } + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + if (went_up) { + if (ipif->ipif_flags & IPIF_UP) { + if (ill->ill_isv6) + ipif_ndp_start_dad(ipif); + else + ipif_arp_start_dad(ipif); + } else if (ill->ill_isv6 && + (ipif->ipif_flags & IPIF_DUPLICATE)) { + /* + * For IPv4, the ARP module itself will + * automatically start the DAD process when it + * sees DL_NOTE_LINK_UP. We respond to the + * AR_CN_READY at the completion of that task. + * For IPv6, we must kick off the bring-up + * process now. + */ + ndp_do_recovery(ipif); + } else { + /* + * Unfortunately, the first ipif is "special" + * and represents the underlying ill in the + * routing socket messages. Thus, when this + * one ipif is down, we must still notify so + * that the user knows the IFF_RUNNING status + * change. (If the first ipif is up, then + * we'll handle eventual routing socket + * notification via DAD completion.) + */ + if (ipif == ill->ill_ipif) + ip_rts_ifmsg(ill->ill_ipif); + } + } else { + /* + * After link down, we'll need to send a new routing + * message when the link comes back, so clear + * ipif_addr_ready. + */ + ipif->ipif_addr_ready = 0; + } + } + + /* + * If we've torn down links, then notify the user right away. + */ + if (!went_up) + ip_rts_ifmsg(ill->ill_ipif); +} + +/* * Wakeup all threads waiting to enter the ipsq, and sleeping * on any of the ills in this ipsq. The ill_lock of the ill * must be held so that waiters don't miss wakeups @@ -13716,6 +14004,7 @@ ill_down_ipifs(ill_t *ill, mblk_t *mp, int index, boolean_t chk_nofailover) if (!ipif->ipif_isv6) ipif_check_bcast_ires(ipif); (void) ipif_logical_down(ipif, NULL, NULL); + ipif_non_duplicate(ipif); ipif_down_tail(ipif); /* * We don't do ipif_multicast_down for IPv4 in @@ -16658,7 +16947,7 @@ ipif_move(ipif_t *ipif, ill_t *to_ill, queue_t *q, mblk_t *mp, * move. */ rep_ipif->ipif_flags = ipif->ipif_flags | IPIF_NOFAILOVER; - rep_ipif->ipif_flags &= ~IPIF_UP; + rep_ipif->ipif_flags &= ~IPIF_UP & ~IPIF_DUPLICATE; rep_ipif->ipif_replace_zero = B_TRUE; mutex_init(&rep_ipif->ipif_saved_ire_lock, NULL, MUTEX_DEFAULT, NULL); @@ -17796,7 +18085,7 @@ ipif_down(ipif_t *ipif, queue_t *q, mblk_t *mp) return (EINPROGRESS); } -static void +void ipif_down_tail(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; @@ -17809,11 +18098,10 @@ ipif_down_tail(ipif_t *ipif) * there are other logical units that are up. * This occurs e.g. when we change a "significant" IFF_ flag. */ - if (ipif->ipif_ill->ill_wq != NULL) { - if (!ill->ill_logical_down && (ill->ill_ipif_up_count == 0) && - ill->ill_dl_up) { - ill_dl_down(ill); - } + if (ill->ill_wq != NULL && !ill->ill_logical_down && + ill->ill_ipif_up_count == 0 && ill->ill_ipif_dup_count == 0 && + ill->ill_dl_up) { + ill_dl_down(ill); } ill->ill_logical_down = 0; @@ -17821,7 +18109,7 @@ ipif_down_tail(ipif_t *ipif) * Have to be after removing the routes in ipif_down_delete_ire. */ if (ipif->ipif_isv6) { - if (ipif->ipif_ill->ill_flags & ILLF_XRESOLV) + if (ill->ill_flags & ILLF_XRESOLV) ipif_arp_down(ipif); } else { ipif_arp_down(ipif); @@ -18048,6 +18336,10 @@ ipif_free(ipif_t *ipif) { ASSERT(IAM_WRITER_IPIF(ipif)); + if (ipif->ipif_recovery_id != 0) + (void) untimeout(ipif->ipif_recovery_id); + ipif->ipif_recovery_id = 0; + /* Remove conn references */ reset_conn_ipif(ipif); @@ -18127,6 +18419,9 @@ ipif_free_tail(ipif_t *ipif) rw_exit(&ill_g_lock); mutex_destroy(&ipif->ipif_saved_ire_lock); + + ASSERT(!(ipif->ipif_flags & (IPIF_UP | IPIF_DUPLICATE))); + /* Free the memory. */ mi_free((char *)ipif); } @@ -18344,7 +18639,7 @@ ipif_lookup_on_name(char *name, size_t namelen, boolean_t do_alloc, * but might not make the system manager very popular. (May be called * as writer.) */ -static void +void ipif_mask_reply(ipif_t *ipif) { icmph_t *icmph; @@ -18900,13 +19195,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) err = ipif_ndp_up(ipif, &ipif->ipif_v6lcl_addr, B_FALSE); if (err != 0) { - mp = ipsq_pending_mp_get(ipsq, &connp); + if (err != EINPROGRESS) + mp = ipsq_pending_mp_get(ipsq, &connp); return (err); } } /* Now, ARP */ - if ((err = ipif_resolver_up(ipif, B_FALSE)) == - EINPROGRESS) { + err = ipif_resolver_up(ipif, Res_act_initial); + if (err == EINPROGRESS) { /* We will complete it in ip_arp_done */ return (err); } @@ -19455,7 +19751,6 @@ ipif_up_done(ipif_t *ipif) } - /* This is the first interface on this ill */ if (ipif->ipif_ipif_up_count == 1 && !loopback) { /* @@ -19496,14 +19791,7 @@ ipif_up_done(ipif_t *ipif) } } - /* - * This had to be deferred until we had bound. - * tell routing sockets that this interface is up - */ - ip_rts_ifmsg(ipif); - ip_rts_newaddrmsg(RTM_ADD, 0, ipif); - - if (!loopback) { + if (!loopback && ipif->ipif_addr_ready) { /* Broadcast an address mask reply. */ ipif_mask_reply(ipif); } @@ -19513,8 +19801,19 @@ ipif_up_done(ipif_t *ipif) } if (src_ipif_held) ipif_refrele(src_ipif); - /* Let SCTP update the status for this ipif */ - sctp_update_ipif(ipif, SCTP_IPIF_UP); + + /* + * This had to be deferred until we had bound. Tell routing sockets and + * others that this interface is up if it looks like the address has + * been validated. Otherwise, if it isn't ready yet, wait for + * duplicate address detection to do its thing. + */ + if (ipif->ipif_addr_ready) { + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + /* Let SCTP update the status for this ipif */ + sctp_update_ipif(ipif, SCTP_IPIF_UP); + } return (0); bad: @@ -19919,7 +20218,8 @@ retry: /* Always skip NOLOCAL and ANYCAST interfaces */ if (ipif->ipif_flags & (IPIF_NOLOCAL|IPIF_ANYCAST)) continue; - if (!(ipif->ipif_flags & IPIF_UP)) + if (!(ipif->ipif_flags & IPIF_UP) || + !ipif->ipif_addr_ready) continue; if (ipif->ipif_zoneid != zoneid && ipif->ipif_zoneid != ALL_ZONES) @@ -20700,7 +21000,8 @@ ip_sioctl_slifname(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, * This ill has not been inserted into the global list. * So we are still single threaded and don't need any lock */ - ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS; + ipif->ipif_flags = lifr->lifr_flags & IFF_LOGINT_FLAGS & + ~IFF_DUPLICATE; ill->ill_flags = lifr->lifr_flags & IFF_PHYINTINST_FLAGS; ill->ill_phyint->phyint_flags = lifr->lifr_flags & IFF_PHYINT_FLAGS; diff --git a/usr/src/uts/common/inet/ip/ip_ndp.c b/usr/src/uts/common/inet/ip/ip_ndp.c index efdb39b657..2b40b14d08 100644 --- a/usr/src/uts/common/inet/ip/ip_ndp.c +++ b/usr/src/uts/common/inet/ip/ip_ndp.c @@ -28,18 +28,23 @@ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> +#include <sys/strsun.h> #include <sys/sysmacros.h> #include <sys/errno.h> #include <sys/dlpi.h> #include <sys/socket.h> #include <sys/ddi.h> +#include <sys/sunddi.h> #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/vtrace.h> #include <sys/kmem.h> #include <sys/zone.h> +#include <sys/ethernet.h> +#include <sys/sdt.h> #include <net/if.h> +#include <net/if_types.h> #include <net/if_dl.h> #include <net/route.h> #include <netinet/in.h> @@ -58,13 +63,22 @@ #include <inet/ip_ndp.h> #include <inet/ipsec_impl.h> #include <inet/ipsec_info.h> +#include <inet/sctp_ip.h> /* * Function names with nce_ prefix are static while function * names with ndp_ prefix are used by rest of the IP. + * + * Lock ordering: + * + * ndp_g_lock -> ill_lock -> nce_lock + * + * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and + * nce_next. Nce_lock protects the contents of the NCE (particularly + * nce_refcnt). */ -static boolean_t nce_cmp_ll_addr(nce_t *nce, char *new_ll_addr, +static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, uint32_t ll_addr_len); static void nce_fastpath(nce_t *nce); static void nce_ire_delete(nce_t *nce); @@ -84,7 +98,6 @@ static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); static boolean_t nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, const in6_addr_t *target, int flag); -static void lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf); extern void th_trace_rrecord(th_trace_t *); static int ndp_lookup_then_add_v6(ill_t *, uchar_t *, const in6_addr_t *, const in6_addr_t *, const in6_addr_t *, @@ -131,6 +144,9 @@ ndp_add(ill_t *ill, uchar_t *hw_addr, const void *addr, return (status); } +/* Non-tunable probe interval, based on link capabilities */ +#define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) + /* * NDP Cache Entry creation routine. * Mapped entries will never do NUD . @@ -148,6 +164,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, mblk_t *mp; mblk_t *template; nce_t **ncep; + int err; boolean_t dropped = B_FALSE; ASSERT(MUTEX_HELD(&ndp6.ndp_g_lock)); @@ -237,6 +254,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, if (ill->ill_state_flags & ILL_CONDEMNED) { mutex_exit(&ill->ill_lock); freeb(mp); + freeb(template); return (EINVAL); } if ((nce->nce_next = *ncep) != NULL) @@ -251,13 +269,23 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, ill->ill_nce_cnt++; mutex_exit(&ill->ill_lock); - /* - * Before we insert the nce, honor the UNSOL_ADV flag. - * We cannot hold the ndp_g_lock and call nce_xmit - * which does a putnext. - */ - if (flags & NCE_F_UNSOL_ADV) { - flags |= NDP_ORIDE; + err = 0; + if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) { + mutex_enter(&nce->nce_lock); + mutex_exit(&ndp6.ndp_g_lock); + nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; + mutex_exit(&nce->nce_lock); + dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, + &ipv6_all_zeros, addr, NDP_PROBE); + if (dropped) { + mutex_enter(&nce->nce_lock); + nce->nce_pcnt++; + mutex_exit(&nce->nce_lock); + } + NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); + mutex_enter(&ndp6.ndp_g_lock); + err = EINPROGRESS; + } else if (flags & NCE_F_UNSOL_ADV) { /* * We account for the transmit below by assigning one * less than the ndd variable. Subsequent decrements @@ -273,7 +301,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, B_TRUE, /* use ill_nd_lla */ addr, /* Source and target of the advertisement pkt */ &ipv6_all_hosts_mcast, /* Destination of the packet */ - flags); + NDP_ORIDE); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_unsolicit_count++; @@ -292,7 +320,7 @@ ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, */ if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) nce_fastpath(nce); - return (0); + return (err); } int @@ -609,6 +637,41 @@ nce_ire_delete1(ire_t *ire, char *nce_arg) } /* + * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. + */ +boolean_t +ndp_restart_dad(nce_t *nce) +{ + boolean_t started; + boolean_t dropped; + + if (nce == NULL) + return (B_FALSE); + mutex_enter(&nce->nce_lock); + if (nce->nce_state == ND_PROBE) { + mutex_exit(&nce->nce_lock); + started = B_TRUE; + } else if (nce->nce_state == ND_REACHABLE) { + nce->nce_state = ND_PROBE; + nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; + mutex_exit(&nce->nce_lock); + dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, + B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); + if (dropped) { + mutex_enter(&nce->nce_lock); + nce->nce_pcnt++; + mutex_exit(&nce->nce_lock); + } + NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill)); + started = B_TRUE; + } else { + mutex_exit(&nce->nce_lock); + started = B_FALSE; + } + return (started); +} + +/* * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed. * If one is found, the refcnt on the nce will be incremented. */ @@ -804,7 +867,7 @@ ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) } return; } - ll_changed = nce_cmp_ll_addr(nce, (char *)hw_addr, hw_addr_len); + ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len); if (!is_adv) { /* If this is a SOLICITATION request only */ if (ll_changed) @@ -1381,11 +1444,16 @@ nce_solicit(nce_t *nce, mblk_t *mp) if (ipif != NULL) break; } - if (src_ill == NULL) { - /* May be a forwarding packet */ - src_ill = ill; + /* + * If no relevant ipif can be found, then it's not one of our + * addresses. Reset to :: and let nce_xmit. If an ipif can be + * found, but it's not yet done with DAD verification, then + * just postpone this transmission until later. + */ + if (src_ill == NULL) src = ipv6_all_zeros; - } + else if (!ipif->ipif_addr_ready) + return (ill->ill_reachable_retrans_time); } dst = nce->nce_addr; /* @@ -1394,7 +1462,7 @@ nce_solicit(nce_t *nce, mblk_t *mp) * appropriately. */ if (IN6_IS_ADDR_UNSPECIFIED(&src)) - src_ill = NULL; + src_ill = NULL; nce->nce_rcnt--; mutex_exit(&nce->nce_lock); rw_exit(&ill_g_lock); @@ -1407,8 +1475,350 @@ nce_solicit(nce_t *nce, mblk_t *mp) return (ill->ill_reachable_retrans_time); } +/* + * Attempt to recover an address on an interface that's been marked as a + * duplicate. Because NCEs are destroyed when the interface goes down, there's + * no easy way to just probe the address and have the right thing happen if + * it's no longer in use. Instead, we just bring it up normally and allow the + * regular interface start-up logic to probe for a remaining duplicate and take + * us back down if necessary. + * Neither DHCP nor temporary addresses arrive here; they're excluded by + * ip_ndp_excl. + */ +/* ARGSUSED */ +static void +ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + ipif_t *ipif; + in6_addr_t *addr = (in6_addr_t *)mp->b_rptr; + + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + /* + * We do not support recovery of proxy ARP'd interfaces, + * because the system lacks a complete proxy ARP mechanism. + */ + if ((ipif->ipif_flags & IPIF_POINTOPOINT) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) { + continue; + } + + /* + * If we have already recovered, then ignore. + */ + mutex_enter(&ill->ill_lock); + if (!(ipif->ipif_flags & IPIF_DUPLICATE)) { + mutex_exit(&ill->ill_lock); + continue; + } + + ipif->ipif_flags &= ~IPIF_DUPLICATE; + ill->ill_ipif_dup_count--; + mutex_exit(&ill->ill_lock); + ipif->ipif_was_dup = B_TRUE; + + if (ipif_ndp_up(ipif, addr, B_FALSE) != EINPROGRESS) + (void) ipif_up_done_v6(ipif); + } + freeb(mp); +} + +/* + * Attempt to recover an IPv6 interface that's been shut down as a duplicate. + * As long as someone else holds the address, the interface will stay down. + * When that conflict goes away, the interface is brought back up. This is + * done so that accidental shutdowns of addresses aren't made permanent. Your + * server will recover from a failure. + * + * For DHCP and temporary addresses, recovery is not done in the kernel. + * Instead, it's handled by user space processes (dhcpagent and in.ndpd). + * + * This function is entered on a timer expiry; the ID is in ipif_recovery_id. + */ +static void +ipif6_dup_recovery(void *arg) +{ + ipif_t *ipif = arg; + + ipif->ipif_recovery_id = 0; + if (!(ipif->ipif_flags & IPIF_DUPLICATE)) + return; + + /* If the link is down, we'll retry this later */ + if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) + return; + + ndp_do_recovery(ipif); +} + +/* + * Perform interface recovery by forcing the duplicate interfaces up and + * allowing the system to determine which ones should stay up. + * + * Called both by recovery timer expiry and link-up notification. + */ void -ndp_input_solicit(ill_t *ill, mblk_t *mp) +ndp_do_recovery(ipif_t *ipif) +{ + ill_t *ill = ipif->ipif_ill; + mblk_t *mp; + + mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED); + if (mp == NULL) { + ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif, MSEC_TO_TICK(ip_dup_recovery)); + } else { + bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, + sizeof (ipif->ipif_v6lcl_addr)); + ill_refhold(ill); + (void) qwriter_ip(NULL, ill, ill->ill_rq, mp, ip_ndp_recover, + CUR_OP, B_FALSE); + } +} + +/* + * Find the solicitation in the given message, and extract printable details + * (MAC and IP addresses) from it. + */ +static nd_neighbor_solicit_t * +ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, + size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) +{ + nd_neighbor_solicit_t *ns; + ip6_t *ip6h; + uchar_t *addr; + int alen; + + alen = 0; + ip6h = (ip6_t *)mp->b_rptr; + if (dl_mp == NULL) { + nd_opt_hdr_t *opt; + int nslen; + + /* + * If it's from the fast-path, then it can't be a probe + * message, and thus must include the source linkaddr option. + * Extract that here. + */ + ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); + nslen = mp->b_wptr - (uchar_t *)ns; + if ((nslen -= sizeof (*ns)) > 0) { + opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, + ND_OPT_SOURCE_LINKADDR); + if (opt != NULL && + opt->nd_opt_len * 8 - sizeof (*opt) >= + ill->ill_nd_lla_len) { + addr = (uchar_t *)(opt + 1); + alen = ill->ill_nd_lla_len; + } + } + /* + * We cheat a bit here for the sake of printing usable log + * messages in the rare case where the reply we got was unicast + * without a source linkaddr option, and the interface is in + * fastpath mode. (Sigh.) + */ + if (alen == 0 && ill->ill_type == IFT_ETHER && + MBLKHEAD(mp) >= sizeof (struct ether_header)) { + struct ether_header *pether; + + pether = (struct ether_header *)((char *)ip6h - + sizeof (*pether)); + addr = pether->ether_shost.ether_addr_octet; + alen = ETHERADDRL; + } + } else { + dl_unitdata_ind_t *dlu; + + dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr; + alen = dlu->dl_src_addr_length; + if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) && + dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) { + addr = dl_mp->b_rptr + dlu->dl_src_addr_offset; + if (ill->ill_sap_length < 0) { + alen += ill->ill_sap_length; + } else { + addr += ill->ill_sap_length; + alen -= ill->ill_sap_length; + } + } + } + if (alen > 0) { + *haddr = addr; + (void) mac_colon_addr(addr, alen, hbuf, hlen); + } else { + *haddr = NULL; + (void) strcpy(hbuf, "?"); + } + ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); + (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); + return (ns); +} + +/* + * This is for exclusive changes due to NDP duplicate address detection + * failure. + */ +/* ARGSUSED */ +static void +ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) +{ + ill_t *ill = rq->q_ptr; + ipif_t *ipif; + char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + nd_neighbor_solicit_t *ns; + mblk_t *dl_mp = NULL; + uchar_t *haddr; + + if (DB_TYPE(mp) != M_DATA) { + dl_mp = mp; + mp = mp->b_cont; + } + ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, + sizeof (sbuf), &haddr); + if (haddr != NULL && + bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { + /* + * Ignore conflicts generated by misbehaving switches that just + * reflect our own messages back to us. + */ + goto ignore_conflict; + } + (void) strlcpy(ibuf, ill->ill_name, sizeof (ibuf)); + for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { + + if ((ipif->ipif_flags & IPIF_POINTOPOINT) || + !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, + &ns->nd_ns_target)) { + continue; + } + + /* If it's already marked, then don't do anything. */ + if (ipif->ipif_flags & IPIF_DUPLICATE) + continue; + + /* + * If this is a failure during duplicate recovery, then don't + * complain. It may take a long time to recover. + */ + if (!ipif->ipif_was_dup) { + if (ipif->ipif_id != 0) { + (void) snprintf(ibuf + ill->ill_name_length - 1, + sizeof (ibuf) - ill->ill_name_length + 1, + ":%d", ipif->ipif_id); + } + cmn_err(CE_WARN, "%s has duplicate address %s (in " + "use by %s); disabled", ibuf, sbuf, hbuf); + } + mutex_enter(&ill->ill_lock); + ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); + ipif->ipif_flags |= IPIF_DUPLICATE; + ill->ill_ipif_dup_count++; + mutex_exit(&ill->ill_lock); + (void) ipif_down(ipif, NULL, NULL); + ipif_down_tail(ipif); + if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && + ill->ill_net_type == IRE_IF_RESOLVER && + ip_dup_recovery > 0) + ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, + ipif, MSEC_TO_TICK(ip_dup_recovery)); + } +ignore_conflict: + if (dl_mp != NULL) + freeb(dl_mp); + freemsg(mp); +} + +/* + * Handle failure by tearing down the ipifs with the specified address. Note + * that tearing down the ipif also means deleting the nce through ipif_down, so + * it's not possible to do recovery by just restarting the nce timer. Instead, + * we start a timer on the ipif. + */ +static void +ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +{ + if ((mp = copymsg(mp)) != NULL) { + if (dl_mp == NULL) + dl_mp = mp; + else if ((dl_mp = copyb(dl_mp)) != NULL) + dl_mp->b_cont = mp; + if (dl_mp == NULL) { + freemsg(mp); + } else { + ill_refhold(ill); + (void) qwriter_ip(NULL, ill, ill->ill_rq, dl_mp, + ip_ndp_excl, CUR_OP, B_FALSE); + } + } + ndp_delete(nce); +} + +/* + * Handle a discovered conflict: some other system is advertising that it owns + * one of our IP addresses. We need to defend ourselves, or just shut down the + * interface. + */ +static void +ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) +{ + ipif_t *ipif; + uint32_t now; + uint_t maxdefense; + uint_t defs; + + ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, + NULL, NULL); + if (ipif == NULL) + return; + /* + * First, figure out if this address is disposable. + */ + if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) + maxdefense = ip_max_temp_defend; + else + maxdefense = ip_max_defend; + + /* + * Now figure out how many times we've defended ourselves. Ignore + * defenses that happened long in the past. + */ + now = gethrestime_sec(); + mutex_enter(&nce->nce_lock); + if ((defs = nce->nce_defense_count) > 0 && + now - nce->nce_defense_time > ip_defend_interval) { + nce->nce_defense_count = defs = 0; + } + nce->nce_defense_count++; + nce->nce_defense_time = now; + mutex_exit(&nce->nce_lock); + ipif_refrele(ipif); + + /* + * If we've defended ourselves too many times already, then give up and + * tear down the interface(s) using this address. Otherwise, defend by + * sending out an unsolicited Neighbor Advertisement. + */ + if (defs >= maxdefense) { + ip_ndp_failure(ill, mp, dl_mp, nce); + } else { + char hbuf[MAC_STR_LEN]; + char sbuf[INET6_ADDRSTRLEN]; + uchar_t *haddr; + + (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, + sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); + cmn_err(CE_WARN, "node %s is using our IP address %s on %s", + hbuf, sbuf, ill->ill_name); + (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, + &nce->nce_addr, &ipv6_all_hosts_mcast, NDP_ORIDE); + } +} + +static void +ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { nd_neighbor_solicit_t *ns; uint32_t hlen = ill->ill_nd_lla_len; @@ -1485,53 +1895,16 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp) if (opt != NULL) { opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); if (opt != NULL) { - /* - * No source link layer address option should - * be present in a valid DAD request. - */ - if (IN6_IS_ADDR_UNSPECIFIED(&src)) { - ip1dbg(("ndp_input_solicit: source link-layer " - "address option present with an " - "unspecified source. \n")); - bad_solicit = B_TRUE; - goto done; - } haddr = (uchar_t *)&opt[1]; - if (hlen > opt->nd_opt_len * 8 || + if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { + ip1dbg(("ndp_input_advert: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } } } - /* - * haddr can be NULL if no options are present, - * or no Source link layer address is present in, - * recvd NDP options of solicitation message. - */ - if (haddr == NULL) { - nce_t *nnce; - mutex_enter(&ndp6.ndp_g_lock); - nnce = *((nce_t **)NCE_HASH_PTR_V6(src)); - nnce = nce_lookup_addr(ill, &src, nnce); - mutex_exit(&ndp6.ndp_g_lock); - if (nnce == NULL) { - in6_addr_t dst = ipv6_solicited_node_mcast; - - /* Form solicited node multicast address */ - dst.s6_addr32[3] |= src.s6_addr32[3]; - (void) nce_xmit(ill, - ND_NEIGHBOR_SOLICIT, - ill, - B_TRUE, - &target, - &dst, - flag); - bad_solicit = B_TRUE; - goto done; - } - } /* Set override flag, it will be reset later if need be. */ flag |= NDP_ORIDE; if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { @@ -1544,10 +1917,39 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp) * the source is unspecified address. */ if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { - int err = 0; + int err; nce_t *nnce; ASSERT(ill->ill_isv6); + /* + * Regular solicitations *must* include the Source Link-Layer + * Address option. Ignore messages that do not. + */ + if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { + ip1dbg(("ndp_input_solicit: source link-layer address " + "option missing with a specified source.\n")); + bad_solicit = B_TRUE; + goto done; + } + + /* + * This is a regular solicitation. If we're still in the + * process of verifying the address, then don't respond at all + * and don't keep track of the sender. + */ + if (our_nce->nce_state == ND_PROBE) + goto done; + + /* + * If the solicitation doesn't have sender hardware address + * (legal for unicast solicitation), then process without + * installing the return NCE. Either we already know it, or + * we'll be forced to look it up when (and if) we reply to the + * packet. + */ + if (haddr == NULL) + goto no_source; + err = ndp_lookup_then_add(ill, haddr, &src, /* Soliciting nodes address */ @@ -1577,11 +1979,38 @@ ndp_input_solicit(ill_t *ill, mblk_t *mp) err)); goto done; } +no_source: flag |= NDP_SOLICITED; } else { /* - * This is a DAD req, multicast the advertisement - * to the all-nodes address. + * No source link layer address option should be present in a + * valid DAD request. + */ + if (haddr != NULL) { + ip1dbg(("ndp_input_solicit: source link-layer address " + "option present with an unspecified source.\n")); + bad_solicit = B_TRUE; + goto done; + } + if (our_nce->nce_state == ND_PROBE) { + /* + * Internally looped-back probes won't have DLPI + * attached to them. External ones (which are sent by + * multicast) always will. Just ignore our own + * transmissions. + */ + if (dl_mp != NULL) { + /* + * If someone else is probing our address, then + * we've crossed wires. Declare failure. + */ + ip_ndp_failure(ill, mp, dl_mp, our_nce); + } + goto done; + } + /* + * This is a DAD probe. Multicast the advertisement to the + * all-nodes address. */ src = ipv6_all_hosts_mcast; } @@ -1605,7 +2034,7 @@ done: } void -ndp_input_advert(ill_t *ill, mblk_t *mp) +ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { nd_neighbor_advert_t *na; uint32_t hlen = ill->ill_nd_lla_len; @@ -1639,6 +2068,7 @@ ndp_input_advert(ill_t *ill, mblk_t *mp) opt = (nd_opt_hdr_t *)&na[1]; if (!ndp_verify_optlen(opt, len - sizeof (nd_neighbor_advert_t))) { + ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; } @@ -1647,8 +2077,9 @@ ndp_input_advert(ill_t *ill, mblk_t *mp) opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { haddr = (uchar_t *)&opt[1]; - if (hlen > opt->nd_opt_len * 8 || + if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { + ip1dbg(("ndp_input_advert: bad SLLA\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; @@ -1676,13 +2107,41 @@ ndp_input_advert(ill_t *ill, mblk_t *mp) /* We have to drop the lock since ndp_process calls put* */ rw_exit(&ill_g_lock); if (dst_nce != NULL) { - if (na->nd_na_flags_reserved & - ND_NA_FLAG_ROUTER) { - dst_nce->nce_flags |= NCE_F_ISROUTER; + if ((dst_nce->nce_flags & NCE_F_PERMANENT) && + dst_nce->nce_state == ND_PROBE) { + /* + * Someone else sent an advertisement for an + * address that we're trying to configure. + * Tear it down. Note that dl_mp might be NULL + * if we're getting a unicast reply. This + * isn't typically done (multicast is the norm + * in response to a probe), but ip_ndp_failure + * will handle the dl_mp == NULL case as well. + */ + ip_ndp_failure(ill, mp, dl_mp, dst_nce); + } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { + /* + * Someone just announced one of our local + * addresses. If it wasn't us, then this is a + * conflict. Defend the address or shut it + * down. + */ + if (dl_mp != NULL && + (haddr == NULL || + nce_cmp_ll_addr(dst_nce, haddr, + ill->ill_nd_lla_len))) { + ip_ndp_conflict(ill, mp, dl_mp, + dst_nce); + } + } else { + if (na->nd_na_flags_reserved & + ND_NA_FLAG_ROUTER) { + dst_nce->nce_flags |= NCE_F_ISROUTER; + } + /* B_TRUE indicates this an advertisement */ + ndp_process(dst_nce, haddr, + na->nd_na_flags_reserved, B_TRUE); } - /* B_TRUE indicates this an advertisement */ - ndp_process(dst_nce, haddr, - na->nd_na_flags_reserved, B_TRUE); NCE_REFRELE(dst_nce); } rw_enter(&ill_g_lock, RW_READER); @@ -1696,7 +2155,7 @@ ndp_input_advert(ill_t *ill, mblk_t *mp) * The checksum has already checked o.k before reaching here. */ void -ndp_input(ill_t *ill, mblk_t *mp) +ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { icmp6_t *icmp_nd; ip6_t *ip6h; @@ -1747,9 +2206,9 @@ ndp_input(ill_t *ill, mblk_t *mp) goto done; } if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { - ndp_input_solicit(ill, mp); + ndp_input_solicit(ill, mp, dl_mp); } else { - ndp_input_advert(ill, mp); + ndp_input_advert(ill, mp, dl_mp); } done: freemsg(mp); @@ -1758,9 +2217,13 @@ done: /* * nce_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. - * If source address is unspecified, appropriate source address - * and link layer address will be chosen here. This function - * *always* sends the link layer option. + * + * If the source address is unspecified and this isn't a probe (used for + * duplicate address detection), an appropriate source address and link layer + * address will be chosen here. The link layer address option is included if + * the source is specified (i.e., all non-probe packets), and omitted (per the + * specification) otherwise. + * * It returns B_FALSE only if it does a successful put() to the * corresponding ill's ill_wq otherwise returns B_TRUE. */ @@ -1792,7 +2255,7 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, */ ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); - if (IN6_IS_ADDR_UNSPECIFIED(sender)) { + if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { ASSERT(operation != ND_NEIGHBOR_ADVERT); /* * Pick a source address for this solicitation, but @@ -1816,7 +2279,10 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, hwaddr_ill = src_ipif->ipif_ill; } - plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8; + if (flag & NDP_PROBE) + plen = 0; + else + plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7)/8; /* * Always make sure that the NS/NA packets don't get load * spread. This is needed so that the probe packets sent @@ -1842,6 +2308,8 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; + if (flag & NDP_PROBE) + ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); @@ -1858,7 +2326,8 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, if (operation == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; - opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; + if (!(flag & NDP_PROBE)) + opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { @@ -1870,6 +2339,7 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, } else { nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; + ASSERT(!(flag & NDP_PROBE)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; ip6h->ip6_src = *sender; na->nd_na_target = *sender; @@ -1881,12 +2351,16 @@ nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; } - /* Fill in link layer address and option len */ - opt->nd_opt_len = (uint8_t)plen; - mutex_enter(&hwaddr_ill->ill_lock); - bcopy(use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr, - &opt[1], hwaddr_ill->ill_nd_lla_len); - mutex_exit(&hwaddr_ill->ill_lock); + + if (!(flag & NDP_PROBE)) { + /* Fill in link layer address and option len */ + opt->nd_opt_len = (uint8_t)plen; + mutex_enter(&hwaddr_ill->ill_lock); + bcopy(use_nd_lla ? hwaddr_ill->ill_nd_lla : + hwaddr_ill->ill_phys_addr, &opt[1], + hwaddr_ill->ill_nd_lla_len); + mutex_exit(&hwaddr_ill->ill_lock); + } icmp6->icmp6_type = (uint8_t)operation; icmp6->icmp6_code = 0; /* @@ -1950,30 +2424,6 @@ ndp_report(queue_t *q, mblk_t *mp, caddr_t arg, cred_t *ioc_cr) } /* - * convert a link level address of arbitrary length - * to an ascii string. - * The caller *must* have already verified that the string buffer - * is large enough to hold the entire string, including trailing NULL. - */ -static void -lla2ascii(uint8_t *lla, int addrlen, uchar_t *buf) -{ - uchar_t addrbyte[8]; /* needs to hold ascii for a byte plus a NULL */ - int i; - size_t len; - - buf[0] = '\0'; - for (i = 0; i < addrlen; i++) { - addrbyte[0] = '\0'; - (void) sprintf((char *)addrbyte, "%02x:", (lla[i] & 0xff)); - len = strlen((const char *)addrbyte); - bcopy(addrbyte, buf, len); - buf = buf + len; - } - *--buf = '\0'; -} - -/* * Add a single line to the NDP Cache Entry Report. */ static void @@ -2013,7 +2463,7 @@ nce_report1(nce_t *nce, uchar_t *mp_arg) if (ill->ill_net_type == IRE_IF_RESOLVER) { size_t addrlen; - uchar_t *addr_buf; + char *addr_buf; dl_unitdata_req_t *dl; mutex_enter(&nce->nce_lock); @@ -2042,12 +2492,10 @@ nce_report1(nce_t *nce, uchar_t *mp_arg) mutex_exit(&nce->nce_lock); return; } - if (ill->ill_flags & ILLF_XRESOLV) - lla2ascii((uint8_t *)h, dl->dl_dest_addr_length, - addr_buf); - else - lla2ascii((uint8_t *)h, ill->ill_nd_lla_len, - addr_buf); + (void) mac_colon_addr((uint8_t *)h, + (ill->ill_flags & ILLF_XRESOLV) ? + dl->dl_dest_addr_length : ill->ill_nd_lla_len, + addr_buf, addrlen); mutex_exit(&nce->nce_lock); (void) mi_mpprintf(mp, "%8s %17s %5s %s/%d", ill->ill_name, addr_buf, (uchar_t *)&flags_buf, @@ -2152,48 +2600,108 @@ ndp_timer(void *arg) nce->nce_pcnt--; ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && nce->nce_pcnt >= -1); - if (nce->nce_pcnt == 0) { + if (nce->nce_pcnt > 0) { + /* + * As per RFC2461, the nce gets deleted after + * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. + * Note that the first unicast solicitation is sent + * during the DELAY state. + */ + ip2dbg(("ndp_timer: pcount=%x dst %s\n", + nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, + addrbuf, sizeof (addrbuf)))); + mutex_exit(&nce->nce_lock); + dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, + B_FALSE, &ipv6_all_zeros, &nce->nce_addr, + (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE : + NDP_UNICAST); + if (dropped) { + mutex_enter(&nce->nce_lock); + nce->nce_pcnt++; + mutex_exit(&nce->nce_lock); + } + NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); + } else if (nce->nce_pcnt < 0) { + /* No hope, delete the nce */ + nce->nce_state = ND_UNREACHABLE; + mutex_exit(&nce->nce_lock); + if (ip_debug > 2) { + /* ip1dbg */ + pr_addr_dbg("ndp_timer: Delete IRE for" + " dst %s\n", AF_INET6, &nce->nce_addr); + } + ndp_delete(nce); + } else if (!(nce->nce_flags & NCE_F_PERMANENT)) { /* Wait RetransTimer, before deleting the entry */ ip2dbg(("ndp_timer: pcount=%x dst %s\n", nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); + /* Wait one interval before killing */ NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); - } else { + } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { + ipif_t *ipif; + /* - * As per RFC2461, the nce gets deleted after - * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. - * Note that the first unicast solicitation is sent - * during the DELAY state. + * We're done probing, and we can now declare this + * address to be usable. Let IP know that it's ok to + * use. */ - if (nce->nce_pcnt > 0) { - ip2dbg(("ndp_timer: pcount=%x dst %s\n", - nce->nce_pcnt, inet_ntop(AF_INET6, - &nce->nce_addr, - addrbuf, sizeof (addrbuf)))); - mutex_exit(&nce->nce_lock); - dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, - NULL, B_FALSE, &ipv6_all_zeros, - &nce->nce_addr, NDP_UNICAST); - if (dropped) { - mutex_enter(&nce->nce_lock); - nce->nce_pcnt++; - mutex_exit(&nce->nce_lock); + nce->nce_state = ND_REACHABLE; + mutex_exit(&nce->nce_lock); + ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, + ALL_ZONES, NULL, NULL, NULL, NULL); + if (ipif != NULL) { + if (ipif->ipif_was_dup) { + char ibuf[LIFNAMSIZ + 10]; + char sbuf[INET6_ADDRSTRLEN]; + + ipif->ipif_was_dup = B_FALSE; + (void) strlcpy(ibuf, ill->ill_name, + sizeof (ibuf)); + (void) inet_ntop(AF_INET6, + &ipif->ipif_v6lcl_addr, + sbuf, sizeof (sbuf)); + if (ipif->ipif_id != 0) { + (void) snprintf(ibuf + + ill->ill_name_length - 1, + sizeof (ibuf) - + ill->ill_name_length + 1, + ":%d", ipif->ipif_id); + } + cmn_err(CE_NOTE, "recovered address " + "%s on %s", sbuf, ibuf); } - NDP_RESTART_TIMER(nce, - ill->ill_reachable_retrans_time); - } else { - /* No hope, delete the nce */ - nce->nce_state = ND_UNREACHABLE; - mutex_exit(&nce->nce_lock); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("ndp_timer: Delete IRE for" - " dst %s\n", AF_INET6, - &nce->nce_addr); + if ((ipif->ipif_flags & IPIF_UP) && + !ipif->ipif_addr_ready) { + ip_rts_ifmsg(ipif); + ip_rts_newaddrmsg(RTM_ADD, 0, ipif); + sctp_update_ipif(ipif, SCTP_IPIF_UP); } - ndp_delete(nce); + ipif->ipif_addr_ready = 1; + ipif_refrele(ipif); + } + /* Begin defending our new address */ + nce->nce_unsolicit_count = 0; + dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, + B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, + NDP_ORIDE); + if (dropped) { + nce->nce_unsolicit_count = 1; + NDP_RESTART_TIMER(nce, + ip_ndp_unsolicit_interval); + } else if (ip_ndp_defense_interval != 0) { + NDP_RESTART_TIMER(nce, ip_ndp_defense_interval); } + } else { + /* + * This is an address we're probing to be our own, but + * the ill is down. Wait until it comes back before + * doing anything, but switch to reachable state so + * that the restart will work. + */ + nce->nce_state = ND_REACHABLE; + mutex_exit(&nce->nce_lock); } NCE_REFRELE(nce); return; @@ -2262,9 +2770,12 @@ ndp_timer(void *arg) break; case ND_REACHABLE : rw_exit(&ill_g_lock); - if (nce->nce_flags & NCE_F_UNSOL_ADV && - nce->nce_unsolicit_count != 0) { - nce->nce_unsolicit_count--; + if (((nce->nce_flags & NCE_F_UNSOL_ADV) && + nce->nce_unsolicit_count != 0) || + ((nce->nce_flags & NCE_F_PERMANENT) && + ip_ndp_defense_interval != 0)) { + if (nce->nce_unsolicit_count > 0) + nce->nce_unsolicit_count--; mutex_exit(&nce->nce_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, @@ -2272,7 +2783,7 @@ ndp_timer(void *arg) B_FALSE, /* use ill_phys_addr */ &nce->nce_addr, &ipv6_all_hosts_mcast, - nce->nce_flags | NDP_ORIDE); + NDP_ORIDE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_unsolicit_count++; @@ -2281,6 +2792,9 @@ ndp_timer(void *arg) if (nce->nce_unsolicit_count != 0) { NDP_RESTART_TIMER(nce, ip_ndp_unsolicit_interval); + } else { + NDP_RESTART_TIMER(nce, + ip_ndp_defense_interval); } } else { mutex_exit(&nce->nce_lock); @@ -2339,7 +2853,7 @@ nce_set_ll(nce_t *nce, uchar_t *ll_addr) } static boolean_t -nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len) +nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) { ill_t *ill = nce->nce_ill; uchar_t *ll_offset; @@ -2348,7 +2862,7 @@ nce_cmp_ll_addr(nce_t *nce, char *ll_addr, uint32_t ll_addr_len) if (ll_addr == NULL) return (B_FALSE); ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); - if (bcmp(ll_addr, (char *)ll_offset, ll_addr_len) != 0) + if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0) return (B_TRUE); return (B_FALSE); } @@ -3338,3 +3852,84 @@ nce_reinit(nce_t *nce) NCE_REFRELE(nce); return (newnce); } + +/* + * ndp_walk routine to delete all entries that have a given destination or + * gateway address and cached link layer (MAC) address. This is used when ARP + * informs us that a network-to-link-layer mapping may have changed. + */ +void +nce_delete_hw_changed(nce_t *nce, void *arg) +{ + nce_hw_map_t *hwm = arg; + mblk_t *mp; + dl_unitdata_req_t *dlu; + uchar_t *macaddr; + ill_t *ill; + int saplen; + ipaddr_t nce_addr; + + if (nce->nce_state != ND_REACHABLE) + return; + + IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); + if (nce_addr != hwm->hwm_addr) + return; + + mutex_enter(&nce->nce_lock); + if ((mp = nce->nce_res_mp) == NULL) { + mutex_exit(&nce->nce_lock); + return; + } + dlu = (dl_unitdata_req_t *)mp->b_rptr; + macaddr = (uchar_t *)(dlu + 1); + ill = nce->nce_ill; + if ((saplen = ill->ill_sap_length) > 0) + macaddr += saplen; + else + saplen = -saplen; + + /* + * If the hardware address is unchanged, then leave this one alone. + * Note that saplen == abs(saplen) now. + */ + if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen && + bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) { + mutex_exit(&nce->nce_lock); + return; + } + mutex_exit(&nce->nce_lock); + + DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce); + ndp_delete(nce); +} + +/* + * This function verifies whether a given IPv4 address is potentially known to + * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t, + * so that it can continue to look for hardware changes on that address. + */ +boolean_t +ndp_lookup_ipaddr(in_addr_t addr) +{ + nce_t *nce; + struct in_addr nceaddr; + + if (addr == INADDR_ANY) + return (B_FALSE); + + mutex_enter(&ndp4.ndp_g_lock); + nce = *(nce_t **)NCE_HASH_PTR_V4(addr); + for (; nce != NULL; nce = nce->nce_next) { + /* Note that only v4 mapped entries are in the table. */ + IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); + if (addr == nceaddr.s_addr && + IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { + /* Single flag check; no lock needed */ + if (!(nce->nce_flags & NCE_F_CONDEMNED)) + break; + } + } + mutex_exit(&ndp4.ndp_g_lock); + return (nce != NULL); +} diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 033b962b02..417b1580eb 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -110,16 +110,14 @@ #include <inet/common.h> #include <inet/ip.h> #include <inet/ip_if.h> -#include <inet/mi.h> #include <inet/nd.h> #include <inet/ipclassifier.h> #include <sys/types.h> #include <sys/conf.h> #include <sys/sunddi.h> -#include <sys/ddi.h> +#include <sys/dlpi.h> #include <sys/squeue_impl.h> - /* * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1 * mapping between squeue and NIC (or Rx ring) for performance reasons so diff --git a/usr/src/uts/common/inet/ip6.h b/usr/src/uts/common/inet/ip6.h index beae955d27..2cffc239b8 100644 --- a/usr/src/uts/common/inet/ip6.h +++ b/usr/src/uts/common/inet/ip6.h @@ -379,7 +379,7 @@ extern void ip_output_v6(void *, mblk_t *, void *, int); extern void ip_xmit_v6(mblk_t *, ire_t *, uint_t, conn_t *, int, struct ipsec_out_s *); extern void ip_rput_data_v6(queue_t *, ill_t *, mblk_t *, ip6_t *, - uint_t, mblk_t *); + uint_t, mblk_t *, mblk_t *); extern void mld_input(queue_t *, mblk_t *, ill_t *); extern void mld_joingroup(ilm_t *); extern void mld_leavegroup(ilm_t *); diff --git a/usr/src/uts/common/inet/ip_if.h b/usr/src/uts/common/inet/ip_if.h index 00b631b9e1..173930e3ee 100644 --- a/usr/src/uts/common/inet/ip_if.h +++ b/usr/src/uts/common/inet/ip_if.h @@ -91,7 +91,7 @@ extern "C" { #define IFF_LOGINT_FLAGS (IFF_UP|IFF_BROADCAST|IFF_POINTOPOINT| \ IFF_UNNUMBERED|IFF_DHCPRUNNING|IFF_PRIVATE|IFF_NOXMIT|IFF_NOLOCAL| \ IFF_DEPRECATED|IFF_ADDRCONF|IFF_ANYCAST|IFF_MIPRUNNING|IFF_NOFAILOVER| \ - IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU) + IFF_PREFERRED|IFF_TEMPORARY|IFF_FIXEDMTU|IFF_DUPLICATE) #define IPIF_REPL_CHECK(to_ipif, failback_cmd) \ (((to_ipif)->ipif_replace_zero) || ((failback_cmd) && \ @@ -138,14 +138,24 @@ extern "C" { #define IPIF_PREFERRED IFF_PREFERRED /* Prefer as source address */ #define IPIF_TEMPORARY IFF_TEMPORARY /* RFC3041 */ #define IPIF_FIXEDMTU IFF_FIXEDMTU /* set with SIOCSLIFMTU */ +#define IPIF_DUPLICATE IFF_DUPLICATE /* address is in use */ /* Source selection values for ipif_select_source_v6 */ #define RESTRICT_TO_NONE 0x0 /* No restriction in source selection */ #define RESTRICT_TO_GROUP 0x1 /* Restrict to IPMP group */ #define RESTRICT_TO_ILL 0x2 /* Restrict to ILL */ +/* for ipif_resolver_up */ +enum ip_resolver_action { + Res_act_initial, /* initial address establishment */ + Res_act_move, /* address move (IPMP, new DL addr) */ + Res_act_defend /* address defense */ +}; + extern ill_t *illgrp_scheduler(ill_t *); extern mblk_t *ill_arp_alloc(ill_t *, uchar_t *, caddr_t); +extern mblk_t *ipif_area_alloc(ipif_t *); +extern mblk_t *ipif_ared_alloc(ipif_t *); extern void ill_dlpi_done(ill_t *, t_uscalar_t); extern void ill_dlpi_send(ill_t *, mblk_t *); extern mblk_t *ill_dlur_gen(uchar_t *, uint_t, t_uscalar_t, t_scalar_t); @@ -167,6 +177,7 @@ extern time_t ill_frag_timeout(ill_t *, time_t); extern int ill_init(queue_t *, ill_t *); extern int ill_nominate_mcast_rcv(ill_group_t *); extern boolean_t ill_setdefaulttoken(ill_t *); +extern void ill_restart_dad(ill_t *, boolean_t); extern void ill_lock_ills(ill_t **, int); extern mblk_t *ill_pending_mp_get(ill_t *, conn_t **, uint_t); @@ -216,9 +227,10 @@ extern void ipif_refhold_locked(ipif_t *); extern void ipif_refrele(ipif_t *); extern boolean_t ipif_ire_active(ipif_t *); extern void ipif_all_down_tail(ipsq_t *, queue_t *, mblk_t *, void *); -extern int ipif_resolver_up(ipif_t *, boolean_t); +extern int ipif_resolver_up(ipif_t *, enum ip_resolver_action); extern int ipif_arp_setup_multicast(ipif_t *, mblk_t **); extern int ipif_down(ipif_t *, queue_t *, mblk_t *); +extern void ipif_down_tail(ipif_t *); extern void ipif_multicast_up(ipif_t *); extern void ipif_ndp_down(ipif_t *); extern int ipif_ndp_up(ipif_t *, const in6_addr_t *, boolean_t); @@ -238,6 +250,7 @@ extern ipif_t *ipif_lookup_on_ifindex(uint_t, boolean_t, zoneid_t, queue_t *, extern ipif_t *ipif_get_next_ipif(ipif_t *curr, ill_t *ill); extern void ipif_ill_refrele_tail(ill_t *ill); extern void ipif_arp_down(ipif_t *ipif); +extern void ipif_mask_reply(ipif_t *); extern int illgrp_insert(ill_group_t **, ill_t *, char *, ill_group_t *, boolean_t); diff --git a/usr/src/uts/common/inet/ip_ndp.h b/usr/src/uts/common/inet/ip_ndp.h index 05edcd3225..4d0dbd5428 100644 --- a/usr/src/uts/common/inet/ip_ndp.h +++ b/usr/src/uts/common/inet/ip_ndp.h @@ -28,6 +28,12 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/mutex.h> +#include <sys/stream.h> +#include <netinet/in.h> +#include <netinet/icmp6.h> +#include <inet/ip.h> + /* * Internal definitions for the kernel implementation of the IPv6 * Neighbor Discovery Protocol (NDP). @@ -69,6 +75,8 @@ typedef struct nce_s { struct nce_s *nce_fastpath; /* for fastpath list */ timeout_id_t nce_timeout_id; uchar_t nce_ipversion; /* IPv4(ARP)/IPv6(NDP) version */ + uint_t nce_defense_count; /* number of NDP conflicts */ + uint_t nce_defense_time; /* last time defended (secs) */ #ifdef NCE_DEBUG th_trace_t *nce_trace[IP_TR_HASH_MAX]; boolean_t nce_trace_disable; /* True when alloc fails */ @@ -135,6 +143,7 @@ extern ndp_g_t ndp4, ndp6; #define NDP_ISROUTER 0x2 #define NDP_SOLICITED 0x4 #define NDP_ORIDE 0x8 +#define NDP_PROBE 0x10 /* Number of packets queued in NDP for a neighbor */ #define ND_MAX_Q 4 @@ -219,6 +228,17 @@ typedef struct { int ncr_host; /* Fraction for host entries */ } nce_cache_reclaim_t; +/* + * Structure for nce_delete_hw_changed; specifies an IPv4 address to link-layer + * address mapping. Any route that has a cached copy of a mapping for that + * IPv4 address that doesn't match the given mapping must be purged. + */ +typedef struct { + ipaddr_t hwm_addr; /* IPv4 address */ + uint_t hwm_hwlen; /* Length of hardware address (may be 0) */ + uchar_t *hwm_hwaddr; /* Pointer to new hardware address, if any */ +} nce_hw_map_t; + /* When SAP is greater than zero address appears before SAP */ #define NCE_LL_ADDR_OFFSET(ill) (((ill)->ill_sap_length) < 0 ? \ (sizeof (dl_unitdata_req_t)) : \ @@ -276,7 +296,8 @@ extern void ndp_fastpath_flush(nce_t *, char *); extern boolean_t ndp_fastpath_update(nce_t *, void *); extern nd_opt_hdr_t *ndp_get_option(nd_opt_hdr_t *, int, int); extern void ndp_inactive(nce_t *); -extern void ndp_input(ill_t *, mblk_t *); +extern void ndp_input(ill_t *, mblk_t *, mblk_t *); +extern boolean_t ndp_lookup_ipaddr(in_addr_t); extern nce_t *ndp_lookup_v6(ill_t *, const in6_addr_t *, boolean_t); extern nce_t *ndp_lookup_v4(ill_t *, const in_addr_t *, boolean_t); extern int ndp_lookup_then_add(ill_t *, uchar_t *, const void *, @@ -298,6 +319,8 @@ extern void ndp_walk_common(ndp_g_t *, ill_t *, pfi_t, extern int ndp_add(ill_t *, uchar_t *, const void *, const void *, const void *, uint32_t, uint16_t, uint16_t, nce_t **, mblk_t *, mblk_t *); +extern boolean_t ndp_restart_dad(nce_t *); +extern void ndp_do_recovery(ipif_t *); extern void nce_resolv_failed(nce_t *); extern void arp_resolv_failed(nce_t *); extern void nce_fastpath_list_add(nce_t *); @@ -307,6 +330,7 @@ extern void nce_fastpath_list_dispatch(ill_t *, extern void nce_queue_mp_common(nce_t *, mblk_t *, boolean_t); extern void ndp_flush_qd_mp(nce_t *); extern nce_t *nce_reinit(nce_t *); +extern void nce_delete_hw_changed(nce_t *, void *); #ifdef NCE_DEBUG extern void nce_trace_inactive(nce_t *); diff --git a/usr/src/uts/common/net/if.h b/usr/src/uts/common/net/if.h index 8351c9b33a..f2be9114c5 100644 --- a/usr/src/uts/common/net/if.h +++ b/usr/src/uts/common/net/if.h @@ -163,6 +163,7 @@ struct ifnet { #define IFF_FIXEDMTU 0x1000000000ll /* MTU manually set with SIOCSLIFMTU */ #define IFF_VIRTUAL 0x2000000000ll /* Does not send or receive packets */ +#define IFF_DUPLICATE 0x4000000000ll /* Local address already in use */ /* * The IFF_MULTICAST flag indicates that the network can support the @@ -177,7 +178,7 @@ struct ifnet { (IFF_BROADCAST | IFF_POINTOPOINT | IFF_RUNNING | IFF_PROMISC | \ IFF_MULTICAST | IFF_MULTI_BCAST | IFF_UNNUMBERED | IFF_IPV4 | \ IFF_IPV6 | IFF_INACTIVE | IFF_FIXEDMTU | IFF_VIRTUAL | \ - IFF_LOOPBACK | IFF_ALLMULTI) + IFF_LOOPBACK | IFF_ALLMULTI | IFF_DUPLICATE) /* * Output queues (ifp->if_snd) and internetwork datagram level (pup level 1) diff --git a/usr/src/uts/common/net/if_arp.h b/usr/src/uts/common/net/if_arp.h index 7df505c710..9103b1d0b5 100644 --- a/usr/src/uts/common/net/if_arp.h +++ b/usr/src/uts/common/net/if_arp.h @@ -1,5 +1,5 @@ /* - * Copyright 1997-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -15,6 +15,9 @@ #pragma ident "%Z%%M% %I% %E% SMI" /* if_arp.h 1.5 88/08/19 SMI; from UCB 7.1 1/24/86 */ +#include <sys/types.h> +#include <sys/socket.h> + #ifdef __cplusplus extern "C" { #endif @@ -32,7 +35,8 @@ extern "C" { struct arphdr { ushort_t ar_hrd; /* format of hardware address */ #define ARPHRD_ETHER 1 /* ethernet hardware address */ -#define ARPHRD_IB 32 /* IPoIB hardware address */ +#define ARPHRD_IEEE802 6 /* IEEE 802 hardware address */ +#define ARPHRD_IB 32 /* IPoIB hardware address */ ushort_t ar_pro; /* format of protocol address */ uchar_t ar_hln; /* length of hardware address */ uchar_t ar_pln; /* length of protocol address */ @@ -55,6 +59,9 @@ struct arphdr { #endif /* notdef */ }; +/* Maximum hardware and protocol address length */ +#define ARP_MAX_ADDR_LEN 255 + /* * Extended ARP ioctl request */ @@ -72,12 +79,13 @@ struct arpreq { struct sockaddr arp_ha; /* hardware address */ int arp_flags; /* flags */ }; -/* arp_flags and at_flags field values */ +/* arp_flags field values */ #define ATF_INUSE 0x01 /* entry in use */ #define ATF_COM 0x02 /* completed entry (enaddr valid) */ #define ATF_PERM 0x04 /* permanent entry */ #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ +#define ATF_AUTHORITY 0x20 /* hardware address is authoritative */ /* * This data structure is used by kernel protocol modules to register diff --git a/usr/src/uts/common/netinet/arp.h b/usr/src/uts/common/netinet/arp.h index 523f111c00..a3bf0e7761 100644 --- a/usr/src/uts/common/netinet/arp.h +++ b/usr/src/uts/common/netinet/arp.h @@ -1,5 +1,5 @@ /* - * Copyright 1986-2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -14,6 +14,10 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <sys/types.h> +#include <sys/ethernet.h> +#include <sys/socket.h> + #ifdef __cplusplus extern "C" { #endif @@ -31,6 +35,7 @@ extern "C" { struct arphdr { ushort_t ar_hrd; /* format of hardware address */ #define ARPHRD_ETHER 1 /* ethernet hardware address */ +#define ARPHRD_IEEE802 6 /* IEEE 802 hardware address */ #define ARPHRD_IB 32 /* IPoIB hardware address */ ushort_t ar_pro; /* format of protocol address */ uchar_t ar_hln; /* length of hardware address */ @@ -54,6 +59,9 @@ struct arphdr { #endif /* notdef */ }; +/* Maximum hardware and protocol address length */ +#define ARP_MAX_ADDR_LEN 255 + /* * Ethernet Address Resolution Protocol. * @@ -82,12 +90,13 @@ struct arpreq { struct sockaddr arp_ha; /* hardware address */ int arp_flags; /* flags */ }; -/* arp_flags and at_flags field values */ +/* arp_flags field values */ #define ATF_INUSE 0x01 /* entry in use */ #define ATF_COM 0x02 /* completed entry (enaddr valid) */ #define ATF_PERM 0x04 /* permanent entry */ #define ATF_PUBL 0x08 /* publish entry (respond for other host) */ #define ATF_USETRAILERS 0x10 /* has requested trailers */ +#define ATF_AUTHORITY 0x20 /* hardware address is authoritative */ #ifdef __cplusplus } diff --git a/usr/src/uts/common/os/subr.c b/usr/src/uts/common/os/subr.c index 9c9942ec8c..4753f1152a 100644 --- a/usr/src/uts/common/os/subr.c +++ b/usr/src/uts/common/os/subr.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -218,25 +217,6 @@ umax(uint_t a, uint_t b) #endif /* !_LP64 */ /* - * Return bit position of least significant bit set in mask, - * starting numbering from 1. - */ -int -ffs(long mask) -{ - int i; - - if (mask == 0) - return (0); - for (i = 1; i <= NBBY * sizeof (mask); i++) { - if (mask & 1) - return (i); - mask >>= 1; - } - return (0); -} - -/* * Parse suboptions from a string. * Same as getsubopt(3C). */ diff --git a/usr/src/uts/common/os/sunddi.c b/usr/src/uts/common/os/sunddi.c index 1709760d74..f16ae44426 100644 --- a/usr/src/uts/common/os/sunddi.c +++ b/usr/src/uts/common/os/sunddi.c @@ -5955,7 +5955,6 @@ ddi_in_panic() int ddi_ffs(long mask) { - extern int ffs(long mask); return (ffs(mask)); } @@ -5970,8 +5969,6 @@ ddi_ffs(long mask) int ddi_fls(long mask) { - extern int ffs(long); - while (mask) { long nx; diff --git a/usr/src/uts/common/sys/systm.h b/usr/src/uts/common/sys/systm.h index 9c34c3f895..c96ea5b4ac 100644 --- a/usr/src/uts/common/sys/systm.h +++ b/usr/src/uts/common/sys/systm.h @@ -230,7 +230,7 @@ int strident_valid(const char *); void strident_canon(char *, size_t); int getsubopt(char **optionsp, char * const *tokens, char **valuep); char *append_subopt(const char *, size_t, char *, const char *); -int ffs(long); +int ffs(uintmax_t); int copyin(const void *, void *, size_t); void copyin_noerr(const void *, void *, size_t); int xcopyin(const void *, void *, size_t); |