diff options
author | Erik Nordmark <Erik.Nordmark@Sun.COM> | 2009-11-11 11:49:49 -0800 |
---|---|---|
committer | Erik Nordmark <Erik.Nordmark@Sun.COM> | 2009-11-11 11:49:49 -0800 |
commit | bd670b35a010421b6e1a5536c34453a827007c81 (patch) | |
tree | 97c2057b6771dd40411a12eb89d2db2e2b2cce31 /usr/src/uts/common/inet/ip/ip_attr.c | |
parent | b3388e4fc5f5c24c8a39fbe132a00b02dae5b717 (diff) | |
download | illumos-joyent-bd670b35a010421b6e1a5536c34453a827007c81.tar.gz |
PSARC/2009/331 IP Datapath Refactoring
PSARC/2008/522 EOF of 2001/070 IPsec HW Acceleration support
PSARC/2009/495 netstat -r flags for blackhole and reject routes
PSARC 2009/496 EOF of XRESOLV
PSARC/2009/494 IP_DONTFRAG socket option
PSARC/2009/515 fragmentation controls for ping and traceroute
6798716 ip_newroute delenda est
6798739 ARP and IP are too separate
6807265 IPv4 ip2mac() support
6756382 Please remove Venus IPsec HWACCEL code
6880632 sendto/sendmsg never returns EHOSTUNREACH in Solaris
6748582 sendmsg() return OK, but doesn't send message using IPv4-mapped x IPv6 addr
1119790 TCP and path mtu discovery
4637227 should support equal-cost multi-path (ECMP)
5078568 getsockopt() for IPV6_PATHMTU on a non-connected socket should not succeed
6419648 "AR* contract private note" should be removed as part of ATM SW EOL
6274715 Arp could keep the old entry in the cache while it waits for an arp response
6605615 Remove duplicated TCP/IP opt_set/opt_get code; use conn_t
6874677 IP_TTL can be used to send with ttl zero
4034090 arp should not let you delete your own entry
6882140 Implement IP_DONTFRAG socket option
6883858 Implement ping -D option; traceroute -F should work for IPv6 and shared-IP zones
1119792 TCP/IP black hole detection is broken on receiver
4078796 Directed broadcast forwarding code has problems
4104337 restrict the IPPROTO_IP and IPPROTO_IPV6 options based on the socket family
4203747 Source address selection for source routed packets
4230259 pmtu is increased every ip_ire_pathmtu_interval timer value.
4300533 When sticky option ipv6_pktinfo set to bogus address subsequent connect time out
4471035 ire_delete_cache_gw is called through ire_walk unnecessarily
4514572 SO_DONTROUTE socket option doesn't work with IPv6
4524980 tcp_lookup_ipv4() should compare the ifindex against tcpb->tcpb_bound_if
4532714 machine fails to switch quickly among failed default routes
4634219 IPv6 path mtu discovery is broken when using routing header
4691581 udp broadcast handling causes too many replicas
4708405 mcast is broken on machines when all interfaces are IFF_POINTOPOINT
4770457 netstat/route: source address of interface routes pretends to be gateway address
4786974 use routing table to determine routes/interface for multicast
4792619 An ip_fanout_udp_ipc_v6() routine might lead to some simpler code
4816115 Nuke ipsec_out_use_global_policy
4862844 ipsec offload corner case
4867533 tcp_rq and tcp_wq are redundant
4868589 NCEs should be shared across an IPMP group
4872093 unplumbing an improper virtual interface panics in ip_newroute_get_dst_ill()
4901671 FireEngine needs some cleanup
4907617 IPsec identity latching should be done before sending SYN-ACK
4941461 scopeid and IPV6_PKTINFO with UDP/ICMP connect() does not work properly
4944981 ip does nothing with IP6I_NEXTHOP
4963353 IPv4 and IPv6 proto fanout codes could be brought closer
4963360 consider passing zoneid using ip6i_t instead of ipsec_out_t in NDP
4963734 new ip6_asp locking is used incorrectly in ip_newroute_v6()
5008315 IPv6 code passes ip6i_t to IPsec code instead of ip6_t
5009636 memory leak in ip_fanout_proto_v6()
5092337 tcp/udp option handling can use some cleanup
5035841 Solaris can fail to create a valid broadcast ire
5043747 ar_query_xmit: Could not find the ace
5051574 tcp_check_policy is missing some checks
6305037 full hardware checksum is discarded when there're more than 2 mblks in the chain
6311149 ip.c needs to be put through a woodchipper
4708860 Unable to reassemble CGTP fragmented multicast packets
6224628 Large IPv6 packets with IPsec protection sometimes have length mismatch.
6213243 Solaris does not currently support Dead Gateway Detection
5029091 duplicate code in IP's input path for TCP/UDP/SCTP
4674643 through IPv6 CGTP routes, the very first packet is sent only after a while
6207318 Multiple default routes do not round robin connections to routers.
4823410 IP has an inconsistent view of link mtu
5105520 adding interface route to down interface causes ifconfig hang
5105707 advanced sockets API introduced some dead code
6318399 IP option handling for icmp and udp is too complicated
6321434 Every dropped packet in IP should use ip_drop_packet()
6341693 ifconfig mtu should operate on the physical interface, not individual ipif's
6352430 The credentials attached to an mblk are not particularly useful
6357894 uninitialised ipp_hoplimit needs to be cleaned up.
6363568 ip_xmit_v6() may be missing IRE releases in error cases
6364828 ip_rput_forward needs a makeover
6384416 System panics when running as multicast forwarder using multicast tunnels
6402382 TX: UDP v6 slowpath is not modified to handle mac_exempt conns
6418413 assertion failed ipha->ipha_ident == 0||ipha->ipha_ident == 0xFFFF
6420916 assertion failures in ipv6 wput path
6430851 use of b_prev to store ifindex is not 100% safe
6446106 IPv6 packets stored in nce->nce_qd_mp will be sent with incorrect tcp/udp checksums
6453711 SCTP OOTB sent as if genetated by global zone
6465212 ARP/IP merge should remove ire_freemblk.esballoc
6490163 ip_input() could misbehave if the first mblk's size is not big enough
6496664 missing ipif_refrele leads to reference leak and deferred crash in ip_wput_ipsec_out_v6
6504856 memory leak in ip_fanout_proto_v6() when using link local outer tunnel addresses
6507765 IRE cache hash function performs badly
6510186 IP_FORWARD_PROG bit is easily overlooked
6514727 cgtp ipv6 failure on snv54
6528286 MULTIRT (CGTP) should offload checksum to hardware
6533904 SCTP: doesn't support traffic class for IPv6
6539415 TX: ipif source selection is flawed for unlabeled gateways
6539851 plumbed unworking nic blocks sending broadcast packets
6564468 non-solaris SCTP stack over rawip socket: netstat command counts rawipInData not rawipOutDatagrams
6568511 ipIfStatsOutDiscards not bumped when discarding an ipsec packet on the wrong NIC
6584162 tcp_g_q_inactive() makes incorrect use of taskq_dispatch()
6603974 round-robin default with many interfaces causes infinite temporary IRE thrashing
6611750 ilm_lookup_ill_index_v4 was born an orphan
6618423 ip_wput_frag_mdt sends out packets that void pfhooks
6620964 IRE max bucket count calculations performed in ip_ire_init() are flawed
6626266 various _broadcasts seem redundant
6638182 IP_PKTINFO + SO_DONTROUTE + CIPSO IP option == panic
6647710 IPv6 possible DoS vulnerability
6657357 nce should be kmem_cache alloc'ed from an nce_cache.
6685131 ilg_add -> conn_ilg_alloc interacting with conn_ilg[] walkers can cause panic.
6730298 adding 0.0.0.0 key with mask != 0 causes 'route delete default' to fail
6730976 vni and ipv6 doesn't quite work.
6740956 assertion failed: mp->b_next == 0L && mp->b_prev == 0L in nce_queue_mp_common()
6748515 BUMP_MIB() is occasionally done on the wrong ill
6753250 ip_output_v6() `notv6' error path has an errant ill_refrele()
6756411 NULL-pointer dereference in ip_wput_local()
6769582 IP must forward packet returned from FW-HOOK
6781525 bogus usesrc usage leads directly to panic
6422839 System paniced in ip_multicast_loopback due to NULL pointer dereference
6785521 initial IPv6 DAD solicitation is dropped in ip_newroute_ipif_v6()
6787370 ipnet devices not seeing forwarded IP packets on outgoing interface
6791187 ip*dbg() calls in ip_output_options() claim to originate from ip_wput()
6794047 nce_fp_mp prevents sharing of NCEs across an IPMP group
6797926 many unnecessary ip0dbg() in ip_rput_data_v6
6846919 Packet queued for ND gets sent in the clear.
6856591 ping doesn't send packets with DF set
6861113 arp module has incorrect dependency path for hook module
6865664 IPV6_NEXTHOP does not work with TCP socket
6874681 No ICMP time exceeded when a router receives packet with ttl = 0
6880977 ip_wput_ire() uses over 1k of stack
6595433 IPsec performance could be significantly better when calling hw crypto provider synchronously
6848397 ifconfig down of an interface can hang.
6849602 IPV6_PATHMTU size issue for UDP
6885359 Add compile-time option for testing pure IPsec overhead
6889268 Odd loopback source address selection with IPMP
6895420 assertion failed: connp->conn_helper_info == NULL
6851189 Routing-related panic occurred during reboot on T2000 system running snv_117
6896174 Post-async-encryption, AH+ESP packets may have misinitialized ipha/ip6
6896687 iptun presents IPv6 with an MTU < 1280
6897006 assertion failed: ipif->ipif_id != 0 in ip_sioctl_slifzone_restart
Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_attr.c')
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_attr.c | 1338 |
1 files changed, 1338 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c new file mode 100644 index 0000000000..a46a82c85f --- /dev/null +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -0,0 +1,1338 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1990 Mentat Inc. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/zone.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/atomic.h> + +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/sdt.h> +#include <sys/socket.h> +#include <sys/mac.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/route.h> +#include <sys/sockio.h> +#include <netinet/in.h> +#include <net/if_dl.h> + +#include <inet/common.h> +#include <inet/mi.h> +#include <inet/mib2.h> +#include <inet/nd.h> +#include <inet/arp.h> +#include <inet/snmpcom.h> +#include <inet/kstatcom.h> + +#include <netinet/igmp_var.h> +#include <netinet/ip6.h> +#include <netinet/icmp6.h> +#include <netinet/sctp.h> + +#include <inet/ip.h> +#include <inet/ip_impl.h> +#include <inet/ip6.h> +#include <inet/ip6_asp.h> +#include <inet/tcp.h> +#include <inet/ip_multi.h> +#include <inet/ip_if.h> +#include <inet/ip_ire.h> +#include <inet/ip_ftable.h> +#include <inet/ip_rts.h> +#include <inet/optcom.h> +#include <inet/ip_ndp.h> +#include <inet/ip_listutils.h> +#include <netinet/igmp.h> +#include <netinet/ip_mroute.h> +#include <inet/ipp_common.h> + +#include <net/pfkeyv2.h> +#include <inet/sadb.h> +#include <inet/ipsec_impl.h> +#include <inet/ipdrop.h> +#include <inet/ip_netinfo.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <inet/ipclassifier.h> +#include <inet/sctp_ip.h> +#include <inet/sctp/sctp_impl.h> +#include <inet/udp_impl.h> +#include <sys/sunddi.h> + +#include <sys/tsol/label.h> +#include <sys/tsol/tnet.h> + +/* + * Release a reference on ip_xmit_attr. + * The reference is acquired by conn_get_ixa() + */ +#define IXA_REFRELE(ixa) \ +{ \ + if (atomic_add_32_nv(&(ixa)->ixa_refcnt, -1) == 0) \ + ixa_inactive(ixa); \ +} + +#define IXA_REFHOLD(ixa) \ +{ \ + ASSERT((ixa)->ixa_refcnt != 0); \ + atomic_add_32(&(ixa)->ixa_refcnt, 1); \ +} + +/* + * When we need to handle a transmit side asynchronous operation, then we need + * to save sufficient information so that we can call the fragment and postfrag + * functions. That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. + */ +typedef struct ixamblk_s { + boolean_t ixm_inbound; /* B_FALSE */ + iaflags_t ixm_flags; /* ixa_flags */ + netstackid_t ixm_stackid; /* Verify it didn't go away */ + uint_t ixm_ifindex; /* Used to find the nce */ + in6_addr_t ixm_nceaddr_v6; /* Used to find nce */ +#define ixm_nceaddr_v4 V4_PART_OF_V6(ixm_nceaddr_v6) + uint32_t ixm_fragsize; + uint_t ixm_pktlen; + uint16_t ixm_ip_hdr_length; /* Points to ULP header */ + uint8_t ixm_protocol; /* Protocol number for ULP cksum */ + pfirepostfrag_t ixm_postfragfn; + + zoneid_t ixm_zoneid; /* Needed for ipobs */ + zoneid_t ixm_no_loop_zoneid; /* IXAF_NO_LOOP_ZONEID_SET */ + + uint_t ixm_scopeid; /* For IPv6 link-locals */ + + uint32_t ixm_ident; /* For IPv6 fragment header */ + uint32_t ixm_xmit_hint; + + cred_t *ixm_cred; /* For getpeerucred - refhold if set */ + pid_t ixm_cpid; /* For getpeerucred */ + + ts_label_t *ixm_tsl; /* Refhold if set. */ + + /* + * When the pointers below are set they have a refhold on the struct. + */ + ipsec_latch_t *ixm_ipsec_latch; + struct ipsa_s *ixm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *ixm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_policy_s *ixm_ipsec_policy; /* why are we here? */ + struct ipsec_action_s *ixm_ipsec_action; /* For reflected packets */ + + ipsa_ref_t ixm_ipsec_ref[2]; /* Soft reference to SA */ + + /* Need these while waiting for SA */ + uint16_t ixm_ipsec_src_port; /* Source port number of d-gram. */ + uint16_t ixm_ipsec_dst_port; /* Destination port number of d-gram. */ + uint8_t ixm_ipsec_icmp_type; /* ICMP type of d-gram */ + uint8_t ixm_ipsec_icmp_code; /* ICMP code of d-gram */ + + sa_family_t ixm_ipsec_inaf; /* Inner address family */ + uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN]; /* Inner src address */ + uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN]; /* Inner dest address */ + uint8_t ixm_ipsec_insrcpfx; /* Inner source prefix */ + uint8_t ixm_ipsec_indstpfx; /* Inner destination prefix */ + + uint8_t ixm_ipsec_proto; /* IP protocol number for d-gram. */ +} ixamblk_t; + + +/* + * When we need to handle a receive side asynchronous operation, then we need + * to save sufficient information so that we can call ip_fanout. + * That information is captured in an mblk containing this structure. + * + * Since this is currently only used for IPsec, we include information for + * the kernel crypto framework. + */ +typedef struct iramblk_s { + boolean_t irm_inbound; /* B_TRUE */ + iaflags_t irm_flags; /* ira_flags */ + netstackid_t irm_stackid; /* Verify it didn't go away */ + uint_t irm_ifindex; /* To find ira_ill */ + + uint_t irm_rifindex; /* ira_rifindex */ + uint_t irm_ruifindex; /* ira_ruifindex */ + uint_t irm_pktlen; + uint16_t irm_ip_hdr_length; /* Points to ULP header */ + uint8_t irm_protocol; /* Protocol number for ULP cksum */ + zoneid_t irm_zoneid; /* ALL_ZONES unless local delivery */ + + squeue_t *irm_sqp; + ill_rx_ring_t *irm_ring; + + ipaddr_t irm_mroute_tunnel; /* IRAF_MROUTE_TUNNEL_SET */ + zoneid_t irm_no_loop_zoneid; /* IRAF_NO_LOOP_ZONEID_SET */ + uint32_t irm_esp_udp_ports; /* IRAF_ESP_UDP_PORTS */ + + char irm_l2src[IRA_L2SRC_SIZE]; /* If IRAF_L2SRC_SET */ + + cred_t *irm_cred; /* For getpeerucred - refhold if set */ + pid_t irm_cpid; /* For getpeerucred */ + + ts_label_t *irm_tsl; /* Refhold if set. */ + + /* + * When set these correspond to a refhold on the object. + */ + struct ipsa_s *irm_ipsec_ah_sa; /* SA for AH */ + struct ipsa_s *irm_ipsec_esp_sa; /* SA for ESP */ + struct ipsec_action_s *irm_ipsec_action; /* For reflected packets */ +} iramblk_t; + + +/* + * Take the information in ip_xmit_attr_t and stick it in an mblk + * that can later be passed to ip_xmit_attr_from_mblk to recreate the + * ip_xmit_attr_t. + * + * Returns NULL on memory allocation failure. + */ +mblk_t * +ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa) +{ + mblk_t *ixamp; + ixamblk_t *ixm; + nce_t *nce = ixa->ixa_nce; + + ASSERT(nce != NULL); + ixamp = allocb(sizeof (*ixm), BPRI_MED); + if (ixamp == NULL) + return (NULL); + + ixamp->b_datap->db_type = M_BREAK; + ixamp->b_wptr += sizeof (*ixm); + ixm = (ixamblk_t *)ixamp->b_rptr; + + bzero(ixm, sizeof (*ixm)); + ixm->ixm_inbound = B_FALSE; + ixm->ixm_flags = ixa->ixa_flags; + ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid; + ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex; + ixm->ixm_nceaddr_v6 = nce->nce_addr; + ixm->ixm_fragsize = ixa->ixa_fragsize; + ixm->ixm_pktlen = ixa->ixa_pktlen; + ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length; + ixm->ixm_protocol = ixa->ixa_protocol; + ixm->ixm_postfragfn = ixa->ixa_postfragfn; + ixm->ixm_zoneid = ixa->ixa_zoneid; + ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid; + ixm->ixm_scopeid = ixa->ixa_scopeid; + ixm->ixm_ident = ixa->ixa_ident; + ixm->ixm_xmit_hint = ixa->ixa_xmit_hint; + + if (ixa->ixa_tsl != NULL) { + ixm->ixm_tsl = ixa->ixa_tsl; + label_hold(ixm->ixm_tsl); + } + if (ixa->ixa_cred != NULL) { + ixm->ixm_cred = ixa->ixa_cred; + crhold(ixa->ixa_cred); + } + ixm->ixm_cpid = ixa->ixa_cpid; + + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { + if (ixa->ixa_ipsec_ah_sa != NULL) { + ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); + } + if (ixa->ixa_ipsec_esp_sa != NULL) { + ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa; + IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); + } + if (ixa->ixa_ipsec_policy != NULL) { + ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy; + IPPOL_REFHOLD(ixa->ixa_ipsec_policy); + } + if (ixa->ixa_ipsec_action != NULL) { + ixm->ixm_ipsec_action = ixa->ixa_ipsec_action; + IPACT_REFHOLD(ixa->ixa_ipsec_action); + } + if (ixa->ixa_ipsec_latch != NULL) { + ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch; + IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); + } + ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0]; + ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1]; + ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port; + ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port; + ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type; + ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code; + ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf; + ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0]; + ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1]; + ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2]; + ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3]; + ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0]; + ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1]; + ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2]; + ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3]; + ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx; + ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx; + ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto; + } + return (ixamp); +} + +/* + * Extract the ip_xmit_attr_t from the mblk, checking that the + * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is + * not the case. + * + * Otherwise ixa is updated. + * Caller needs to release references on the ixa by calling ixa_refrele() + * which will imediately call ixa_inactive to release the references. + */ +boolean_t +ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa) +{ + ixamblk_t *ixm; + netstack_t *ns; + ip_stack_t *ipst; + ill_t *ill; + nce_t *nce; + + /* We assume the caller hasn't initialized ixa */ + bzero(ixa, sizeof (*ixa)); + + ASSERT(DB_TYPE(ixamp) == M_BREAK); + ASSERT(ixamp->b_cont == NULL); + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(ixm->ixm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(ixm->ixm_ifindex, + !(ixm->ixm_flags & IXAF_IS_IPV4), ipst); + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL) { + /* Disappeared on us */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + /* + * Find the nce. We don't load-spread (only lookup nce's on the ill) + * because we want to find the same nce as the one we had when + * ip_xmit_attr_to_mblk was called. + */ + if (ixm->ixm_flags & IXAF_IS_IPV4) { + nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4); + } else { + nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6); + } + + /* We have the nce, hence the ill can't go away */ + ill_refrele(ill); + if (nce == NULL) { + /* + * Since this is unusual and we don't know what type of + * nce it was, we drop the packet. + */ + (void) ip_xmit_attr_free_mblk(ixamp); + return (B_FALSE); + } + + ixa->ixa_flags = ixm->ixm_flags; + ixa->ixa_refcnt = 1; + ixa->ixa_ipst = ipst; + ixa->ixa_fragsize = ixm->ixm_fragsize; + ixa->ixa_pktlen = ixm->ixm_pktlen; + ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length; + ixa->ixa_protocol = ixm->ixm_protocol; + ixa->ixa_nce = nce; + ixa->ixa_postfragfn = ixm->ixm_postfragfn; + ixa->ixa_zoneid = ixm->ixm_zoneid; + ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid; + ixa->ixa_scopeid = ixm->ixm_scopeid; + ixa->ixa_ident = ixm->ixm_ident; + ixa->ixa_xmit_hint = ixm->ixm_xmit_hint; + + if (ixm->ixm_tsl != NULL) { + ixa->ixa_tsl = ixm->ixm_tsl; + ixa->ixa_free_flags |= IXA_FREE_TSL; + } + if (ixm->ixm_cred != NULL) { + ixa->ixa_cred = ixm->ixm_cred; + ixa->ixa_free_flags |= IXA_FREE_CRED; + } + ixa->ixa_cpid = ixm->ixm_cpid; + + ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa; + ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa; + ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy; + ixa->ixa_ipsec_action = ixm->ixm_ipsec_action; + ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch; + + ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0]; + ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1]; + ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port; + ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port; + ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type; + ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code; + ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf; + ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0]; + ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1]; + ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2]; + ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3]; + ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0]; + ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1]; + ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2]; + ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3]; + ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx; + ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx; + ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto; + + freeb(ixamp); + return (B_TRUE); +} + +/* + * Free the ixm mblk and any references it holds + * Returns b_cont. + */ +mblk_t * +ip_xmit_attr_free_mblk(mblk_t *ixamp) +{ + ixamblk_t *ixm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(ixamp) == M_BREAK); + mp = ixamp->b_cont; + + ixm = (ixamblk_t *)ixamp->b_rptr; + ASSERT(!ixm->ixm_inbound); + + if (ixm->ixm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_ah_sa); + ixm->ixm_ipsec_ah_sa = NULL; + } + if (ixm->ixm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(ixm->ixm_ipsec_esp_sa); + ixm->ixm_ipsec_esp_sa = NULL; + } + if (ixm->ixm_ipsec_policy != NULL) { + IPPOL_REFRELE(ixm->ixm_ipsec_policy); + ixm->ixm_ipsec_policy = NULL; + } + if (ixm->ixm_ipsec_action != NULL) { + IPACT_REFRELE(ixm->ixm_ipsec_action); + ixm->ixm_ipsec_action = NULL; + } + if (ixm->ixm_ipsec_latch) { + IPLATCH_REFRELE(ixm->ixm_ipsec_latch); + ixm->ixm_ipsec_latch = NULL; + } + + if (ixm->ixm_tsl != NULL) { + label_rele(ixm->ixm_tsl); + ixm->ixm_tsl = NULL; + } + if (ixm->ixm_cred != NULL) { + crfree(ixm->ixm_cred); + ixm->ixm_cred = NULL; + } + freeb(ixamp); + return (mp); +} + +/* + * Take the information in ip_recv_attr_t and stick it in an mblk + * that can later be passed to ip_recv_attr_from_mblk to recreate the + * ip_recv_attr_t. + * + * Returns NULL on memory allocation failure. + */ +mblk_t * +ip_recv_attr_to_mblk(ip_recv_attr_t *ira) +{ + mblk_t *iramp; + iramblk_t *irm; + ill_t *ill = ira->ira_ill; + + ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0); + + iramp = allocb(sizeof (*irm), BPRI_MED); + if (iramp == NULL) + return (NULL); + + iramp->b_datap->db_type = M_BREAK; + iramp->b_wptr += sizeof (*irm); + irm = (iramblk_t *)iramp->b_rptr; + + bzero(irm, sizeof (*irm)); + irm->irm_inbound = B_TRUE; + irm->irm_flags = ira->ira_flags; + if (ill != NULL) { + /* Internal to IP - preserve ip_stack_t, ill and rill */ + irm->irm_stackid = + ill->ill_ipst->ips_netstack->netstack_stackid; + irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex; + ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex == + ira->ira_rifindex); + } else { + /* Let ip_recv_attr_from_stackid know there isn't one */ + irm->irm_stackid = -1; + } + irm->irm_rifindex = ira->ira_rifindex; + irm->irm_ruifindex = ira->ira_ruifindex; + irm->irm_pktlen = ira->ira_pktlen; + irm->irm_ip_hdr_length = ira->ira_ip_hdr_length; + irm->irm_protocol = ira->ira_protocol; + + irm->irm_sqp = ira->ira_sqp; + irm->irm_ring = ira->ira_ring; + + irm->irm_zoneid = ira->ira_zoneid; + irm->irm_mroute_tunnel = ira->ira_mroute_tunnel; + irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid; + irm->irm_esp_udp_ports = ira->ira_esp_udp_ports; + + if (ira->ira_tsl != NULL) { + irm->irm_tsl = ira->ira_tsl; + label_hold(irm->irm_tsl); + } + if (ira->ira_cred != NULL) { + irm->irm_cred = ira->ira_cred; + crhold(ira->ira_cred); + } + irm->irm_cpid = ira->ira_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE); + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (ira->ira_ipsec_ah_sa != NULL) { + irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa; + IPSA_REFHOLD(ira->ira_ipsec_ah_sa); + } + if (ira->ira_ipsec_esp_sa != NULL) { + irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa; + IPSA_REFHOLD(ira->ira_ipsec_esp_sa); + } + if (ira->ira_ipsec_action != NULL) { + irm->irm_ipsec_action = ira->ira_ipsec_action; + IPACT_REFHOLD(ira->ira_ipsec_action); + } + } + return (iramp); +} + +/* + * Extract the ip_recv_attr_t from the mblk. If we are used inside IP + * then irm_stackid is not -1, in which case we check that the + * ip_stack_t and ill_t still exist. Returns B_FALSE if that is + * not the case. + * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter) + * and we just proceed with ira_ill and ira_rill as NULL. + * + * The caller needs to release any references on the pointers inside the ire + * by calling ira_cleanup. + */ +boolean_t +ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira) +{ + iramblk_t *irm; + netstack_t *ns; + ip_stack_t *ipst = NULL; + ill_t *ill = NULL, *rill = NULL; + + /* We assume the caller hasn't initialized ira */ + bzero(ira, sizeof (*ira)); + + ASSERT(DB_TYPE(iramp) == M_BREAK); + ASSERT(iramp->b_cont == NULL); + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_stackid != -1) { + /* Verify the netstack is still around */ + ns = netstack_find_by_stackid(irm->irm_stackid); + if (ns == NULL) { + /* Disappeared on us */ + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + ipst = ns->netstack_ip; + + /* Verify the ill is still around */ + ill = ill_lookup_on_ifindex(irm->irm_ifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + + if (irm->irm_ifindex == irm->irm_rifindex) { + rill = ill; + } else { + rill = ill_lookup_on_ifindex(irm->irm_rifindex, + !(irm->irm_flags & IRAF_IS_IPV4), ipst); + } + + /* We have the ill, hence the netstack can't go away */ + netstack_rele(ns); + if (ill == NULL || rill == NULL) { + /* Disappeared on us */ + if (ill != NULL) + ill_refrele(ill); + if (rill != NULL && rill != ill) + ill_refrele(rill); + (void) ip_recv_attr_free_mblk(iramp); + return (B_FALSE); + } + } + + ira->ira_flags = irm->irm_flags; + /* Caller must ill_refele(ira_ill) by using ira_cleanup() */ + ira->ira_ill = ill; + ira->ira_rill = rill; + + ira->ira_rifindex = irm->irm_rifindex; + ira->ira_ruifindex = irm->irm_ruifindex; + ira->ira_pktlen = irm->irm_pktlen; + ira->ira_ip_hdr_length = irm->irm_ip_hdr_length; + ira->ira_protocol = irm->irm_protocol; + + ira->ira_sqp = irm->irm_sqp; + /* The rest of IP assumes that the rings never go away. */ + ira->ira_ring = irm->irm_ring; + + ira->ira_zoneid = irm->irm_zoneid; + ira->ira_mroute_tunnel = irm->irm_mroute_tunnel; + ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid; + ira->ira_esp_udp_ports = irm->irm_esp_udp_ports; + + if (irm->irm_tsl != NULL) { + ira->ira_tsl = irm->irm_tsl; + ira->ira_free_flags |= IRA_FREE_TSL; + } + if (irm->irm_cred != NULL) { + ira->ira_cred = irm->irm_cred; + ira->ira_free_flags |= IRA_FREE_CRED; + } + ira->ira_cpid = irm->irm_cpid; + + if (ira->ira_flags & IRAF_L2SRC_SET) + bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE); + + ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa; + ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa; + ira->ira_ipsec_action = irm->irm_ipsec_action; + + freeb(iramp); + return (B_TRUE); +} + +/* + * Free the irm mblk and any references it holds + * Returns b_cont. + */ +mblk_t * +ip_recv_attr_free_mblk(mblk_t *iramp) +{ + iramblk_t *irm; + mblk_t *mp; + + /* Consume mp */ + ASSERT(DB_TYPE(iramp) == M_BREAK); + mp = iramp->b_cont; + + irm = (iramblk_t *)iramp->b_rptr; + ASSERT(irm->irm_inbound); + + if (irm->irm_ipsec_ah_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_ah_sa); + irm->irm_ipsec_ah_sa = NULL; + } + if (irm->irm_ipsec_esp_sa != NULL) { + IPSA_REFRELE(irm->irm_ipsec_esp_sa); + irm->irm_ipsec_esp_sa = NULL; + } + if (irm->irm_ipsec_action != NULL) { + IPACT_REFRELE(irm->irm_ipsec_action); + irm->irm_ipsec_action = NULL; + } + if (irm->irm_tsl != NULL) { + label_rele(irm->irm_tsl); + irm->irm_tsl = NULL; + } + if (irm->irm_cred != NULL) { + crfree(irm->irm_cred); + irm->irm_cred = NULL; + } + + freeb(iramp); + return (mp); +} + +/* + * Returns true if the mblk contains an ip_recv_attr_t + * For now we just check db_type. + */ +boolean_t +ip_recv_attr_is_mblk(mblk_t *mp) +{ + /* + * Need to handle the various forms of tcp_timermp which are tagged + * with b_wptr and might have a NULL b_datap. + */ + if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1) + return (B_FALSE); + +#ifdef DEBUG + iramblk_t *irm; + + if (DB_TYPE(mp) != M_BREAK) + return (B_FALSE); + + irm = (iramblk_t *)mp->b_rptr; + ASSERT(irm->irm_inbound); + return (B_TRUE); +#else + return (DB_TYPE(mp) == M_BREAK); +#endif +} + +static ip_xmit_attr_t * +conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag) +{ + ip_xmit_attr_t *ixa; + ip_xmit_attr_t *oldixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + if (atomic_add_32_nv(&ixa->ixa_refcnt, 1) == 2) { + /* No other thread using conn_ixa */ + mutex_exit(&connp->conn_lock); + return (ixa); + } + ixa = kmem_alloc(sizeof (*ixa), kmflag); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + + /* Make sure we drop conn_lock before any refrele */ + if (replace) { + ixa->ixa_refcnt++; /* No atomic needed - not visible */ + oldixa = connp->conn_ixa; + connp->conn_ixa = ixa; + mutex_exit(&connp->conn_lock); + IXA_REFRELE(oldixa); /* Undo refcnt from conn_t */ + } else { + oldixa = connp->conn_ixa; + mutex_exit(&connp->conn_lock); + } + IXA_REFRELE(oldixa); /* Undo above atomic_add_32_nv */ + + return (ixa); +} + +/* + * Return an ip_xmit_attr_t to use with a conn_t that ensures that only + * the caller can access the ip_xmit_attr_t. + * + * If nobody else is using conn_ixa we return it. + * Otherwise we make a "safe" copy of conn_ixa + * and return it. The "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. + * + * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t. + * That is used when we connect() the ULP. + */ +ip_xmit_attr_t * +conn_get_ixa(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP)); +} + +/* + * Used only when the option is to have the kernel hang due to not + * cleaning up ixa references on ills etc. + */ +ip_xmit_attr_t * +conn_get_ixa_tryhard(conn_t *connp, boolean_t replace) +{ + return (conn_get_ixa_impl(connp, replace, KM_SLEEP)); +} + +/* + * Replace conn_ixa with the ixa argument. + * + * The caller must hold conn_lock. + * + * We return the old ixa; the caller must ixa_refrele that after conn_lock + * has been dropped. + */ +ip_xmit_attr_t * +conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa) +{ + ip_xmit_attr_t *oldixa; + + ASSERT(MUTEX_HELD(&connp->conn_lock)); + + oldixa = connp->conn_ixa; + IXA_REFHOLD(ixa); + connp->conn_ixa = ixa; + return (oldixa); +} + +/* + * Return a ip_xmit_attr_t to use with a conn_t that is based on but + * separate from conn_ixa. + * + * This "safe" copy has the pointers set to NULL + * (since the pointers might be changed by another thread using + * conn_ixa). The caller needs to check for NULL pointers to see + * if ip_set_destination needs to be called to re-establish the pointers. + */ +ip_xmit_attr_t * +conn_get_ixa_exclusive(conn_t *connp) +{ + ip_xmit_attr_t *ixa; + + mutex_enter(&connp->conn_lock); + ixa = connp->conn_ixa; + + /* At least one references for the conn_t */ + ASSERT(ixa->ixa_refcnt >= 1); + + /* Make sure conn_ixa doesn't disappear while we copy it */ + atomic_add_32(&ixa->ixa_refcnt, 1); + + ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); + if (ixa == NULL) { + mutex_exit(&connp->conn_lock); + ixa_refrele(connp->conn_ixa); + return (NULL); + } + ixa_safe_copy(connp->conn_ixa, ixa); + mutex_exit(&connp->conn_lock); + IXA_REFRELE(connp->conn_ixa); + return (ixa); +} + +void +ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) +{ + bcopy(src, ixa, sizeof (*ixa)); + ixa->ixa_refcnt = 1; + /* + * Clear any pointers that have references and might be changed + * by ip_set_destination or the ULP + */ + ixa->ixa_ire = NULL; + ixa->ixa_nce = NULL; + ixa->ixa_dce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; +#ifdef DEBUG + ixa->ixa_curthread = NULL; +#endif + /* Clear all the IPsec pointers and the flag as well. */ + ixa->ixa_flags &= ~IXAF_IPSEC_SECURE; + + ixa->ixa_ipsec_latch = NULL; + ixa->ixa_ipsec_ah_sa = NULL; + ixa->ixa_ipsec_esp_sa = NULL; + ixa->ixa_ipsec_policy = NULL; + ixa->ixa_ipsec_action = NULL; + + /* + * We leave ixa_tsl unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) + label_hold(ixa->ixa_tsl); + + /* + * We leave ixa_cred unchanged, but if it has a refhold we need + * to get an extra refhold. + */ + if (ixa->ixa_free_flags & IXA_FREE_CRED) + crhold(ixa->ixa_cred); +} + +/* + * Duplicate an ip_xmit_attr_t. + * Assumes that the caller controls the ixa, hence we do not need to use + * a safe copy. We just have to increase the refcnt on any pointers. + */ +ip_xmit_attr_t * +ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa) +{ + ip_xmit_attr_t *ixa; + + ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP); + if (ixa == NULL) + return (NULL); + bcopy(src_ixa, ixa, sizeof (*ixa)); + ixa->ixa_refcnt = 1; + + if (ixa->ixa_ire != NULL) + ire_refhold_notr(ixa->ixa_ire); + if (ixa->ixa_nce != NULL) + nce_refhold(ixa->ixa_nce); + if (ixa->ixa_dce != NULL) + dce_refhold_notr(ixa->ixa_dce); + +#ifdef DEBUG + ixa->ixa_curthread = NULL; +#endif + + if (ixa->ixa_ipsec_latch != NULL) + IPLATCH_REFHOLD(ixa->ixa_ipsec_latch); + if (ixa->ixa_ipsec_ah_sa != NULL) + IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa); + if (ixa->ixa_ipsec_esp_sa != NULL) + IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa); + if (ixa->ixa_ipsec_policy != NULL) + IPPOL_REFHOLD(ixa->ixa_ipsec_policy); + if (ixa->ixa_ipsec_action != NULL) + IPACT_REFHOLD(ixa->ixa_ipsec_action); + + if (ixa->ixa_tsl != NULL) { + label_hold(ixa->ixa_tsl); + ixa->ixa_free_flags |= IXA_FREE_TSL; + } + if (ixa->ixa_cred != NULL) { + crhold(ixa->ixa_cred); + ixa->ixa_free_flags |= IXA_FREE_CRED; + } + return (ixa); +} + +/* + * Used to replace the ixa_label field. + * The caller should have a reference on the label, which we transfer to + * the attributes so that when the attribute is freed/cleaned up + * we will release that reference. + */ +void +ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl) +{ + ASSERT(tsl != NULL); + + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + } else { + ixa->ixa_free_flags |= IXA_FREE_TSL; + } + ixa->ixa_tsl = tsl; +} + +/* + * Replace the ip_recv_attr_t's label. + * Due to kernel RPC's use of db_credp we also need to replace ira_cred; + * TCP/UDP uses ira_cred to set db_credp for non-socket users. + * This can fail (and return B_FALSE) due to lack of memory. + */ +boolean_t +ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl) +{ + cred_t *newcr; + + if (ira->ira_free_flags & IRA_FREE_TSL) { + ASSERT(ira->ira_tsl != NULL); + label_rele(ira->ira_tsl); + } + label_hold(tsl); + ira->ira_tsl = tsl; + ira->ira_free_flags |= IRA_FREE_TSL; + + /* + * Reset zoneid if we have a shared address. That allows + * ip_fanout_tx_v4/v6 to determine the zoneid again. + */ + if (ira->ira_flags & IRAF_TX_SHARED_ADDR) + ira->ira_zoneid = ALL_ZONES; + + /* We update ira_cred for RPC */ + newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP); + if (newcr == NULL) + return (B_FALSE); + if (ira->ira_free_flags & IRA_FREE_CRED) + crfree(ira->ira_cred); + ira->ira_cred = newcr; + ira->ira_free_flags |= IRA_FREE_CRED; + return (B_TRUE); +} + +/* + * This needs to be called after ip_set_destination/tsol_check_dest might + * have changed ixa_tsl to be specific for a destination, and we now want to + * send to a different destination. + * We have to restart with crgetlabel() since ip_set_destination/ + * tsol_check_dest will start with ixa_tsl. + */ +void +ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr) +{ + if (!is_system_labeled()) + return; + + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + } + ixa->ixa_tsl = crgetlabel(cr); +} + +void +ixa_refrele(ip_xmit_attr_t *ixa) +{ + IXA_REFRELE(ixa); +} + +void +ixa_inactive(ip_xmit_attr_t *ixa) +{ + ASSERT(ixa->ixa_refcnt == 0); + + ixa_cleanup(ixa); + kmem_free(ixa, sizeof (*ixa)); +} + +/* + * Release any references contained in the ixa. + * Also clear any fields that are not controlled by ixa_flags. + */ +void +ixa_cleanup(ip_xmit_attr_t *ixa) +{ + if (ixa->ixa_ire != NULL) { + ire_refrele_notr(ixa->ixa_ire); + ixa->ixa_ire = NULL; + } + if (ixa->ixa_dce != NULL) { + dce_refrele_notr(ixa->ixa_dce); + ixa->ixa_dce = NULL; + } + if (ixa->ixa_nce != NULL) { + nce_refrele(ixa->ixa_nce); + ixa->ixa_nce = NULL; + } + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + if (ixa->ixa_flags & IXAF_IPSEC_SECURE) { + ipsec_out_release_refs(ixa); + } + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_tsl = NULL; + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + } + if (ixa->ixa_free_flags & IXA_FREE_CRED) { + ASSERT(ixa->ixa_cred != NULL); + crfree(ixa->ixa_cred); + ixa->ixa_cred = NULL; + ixa->ixa_free_flags &= ~IXA_FREE_CRED; + } + ixa->ixa_src_preferences = 0; + ixa->ixa_ifindex = 0; + ixa->ixa_multicast_ifindex = 0; + ixa->ixa_multicast_ifaddr = INADDR_ANY; +} + +/* + * Release any references contained in the ira. + * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second + * argument. + */ +void +ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill) +{ + if (ira->ira_ill != NULL) { + if (ira->ira_rill != ira->ira_ill) { + /* Caused by async processing */ + ill_refrele(ira->ira_rill); + } + if (refrele_ill) + ill_refrele(ira->ira_ill); + } + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + ipsec_in_release_refs(ira); + } + if (ira->ira_free_flags & IRA_FREE_TSL) { + ASSERT(ira->ira_tsl != NULL); + label_rele(ira->ira_tsl); + ira->ira_tsl = NULL; + ira->ira_free_flags &= ~IRA_FREE_TSL; + } + if (ira->ira_free_flags & IRA_FREE_CRED) { + ASSERT(ira->ira_cred != NULL); + crfree(ira->ira_cred); + ira->ira_cred = NULL; + ira->ira_free_flags &= ~IRA_FREE_CRED; + } +} + +/* + * Function to help release any IRE, NCE, or DCEs that + * have been deleted and are marked as condemned. + * The caller is responsible for any serialization which is different + * for TCP, SCTP, and others. + */ +static void +ixa_cleanup_stale(ip_xmit_attr_t *ixa) +{ + ire_t *ire; + nce_t *nce; + dce_t *dce; + + ire = ixa->ixa_ire; + nce = ixa->ixa_nce; + dce = ixa->ixa_dce; + + if (ire != NULL && IRE_IS_CONDEMNED(ire)) { + ire_refrele_notr(ire); + ire = ire_blackhole(ixa->ixa_ipst, + !(ixa->ixa_flags & IXAF_IS_IPV4)); + ASSERT(ire != NULL); +#ifdef DEBUG + ire_refhold_notr(ire); + ire_refrele(ire); +#endif + ixa->ixa_ire = ire; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } + if (nce != NULL && nce->nce_is_condemned) { + /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */ + nce_refrele(nce); + ixa->ixa_nce = NULL; + ixa->ixa_ire_generation = IRE_GENERATION_VERIFY; + } + if (dce != NULL && DCE_IS_CONDEMNED(dce)) { + dce_refrele_notr(dce); + dce = dce_get_default(ixa->ixa_ipst); + ASSERT(dce != NULL); +#ifdef DEBUG + dce_refhold_notr(dce); + dce_refrele(dce); +#endif + ixa->ixa_dce = dce; + ixa->ixa_dce_generation = DCE_GENERATION_VERIFY; + } +} + +/* + * Used to run ixa_cleanup_stale inside the tcp squeue. + * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp + * and waking up the caller. + */ +/* ARGSUSED2 */ +static void +tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_stack_t *tcps; + + tcps = connp->conn_netstack->netstack_tcp; + + ixa_cleanup_stale(connp->conn_ixa); + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); + tcps->tcps_ixa_cleanup_mp = mp; + cv_signal(&tcps->tcps_ixa_cleanup_cv); + mutex_exit(&tcps->tcps_ixa_cleanup_lock); +} + + +/* + * ipcl_walk() function to help release any IRE, NCE, or DCEs that + * have been deleted and are marked as condemned. + * Note that we can't cleanup the pointers since there can be threads + * in conn_ip_output() sending while we are called. + */ +void +conn_ixa_cleanup(conn_t *connp, void *arg) +{ + boolean_t tryhard = (boolean_t)arg; + + if (IPCL_IS_TCP(connp)) { + mblk_t *mp; + tcp_stack_t *tcps; + + tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + if (connp->conn_sqp->sq_run == curthread) { + /* Already on squeue */ + tcp_ixa_cleanup(connp, mp, NULL, NULL); + } else { + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, + connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); + + /* Wait until tcp_ixa_cleanup has run */ + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + while (tcps->tcps_ixa_cleanup_mp == NULL) { + cv_wait(&tcps->tcps_ixa_cleanup_cv, + &tcps->tcps_ixa_cleanup_lock); + } + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + } + } else if (IPCL_IS_SCTP(connp)) { + sctp_t *sctp; + sctp_faddr_t *fp; + + sctp = CONN2SCTP(connp); + RUN_SCTP(sctp); + ixa_cleanup_stale(connp->conn_ixa); + for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->next) + ixa_cleanup_stale(fp->ixa); + WAKE_SCTP(sctp); + } else { + ip_xmit_attr_t *ixa; + + /* + * If there is a different thread using conn_ixa then we get a + * new copy and cut the old one loose from conn_ixa. Otherwise + * we use conn_ixa and prevent any other thread from + * using/changing it. Anybody using conn_ixa (e.g., a thread in + * conn_ip_output) will do an ixa_refrele which will remove any + * references on the ire etc. + * + * Once we are done other threads can use conn_ixa since the + * refcnt will be back at one. + * + * We are called either because an ill is going away, or + * due to memory reclaim. In the former case we wait for + * memory since we must remove the refcnts on the ill. + */ + if (tryhard) { + ixa = conn_get_ixa_tryhard(connp, B_TRUE); + ASSERT(ixa != NULL); + } else { + ixa = conn_get_ixa(connp, B_TRUE); + if (ixa == NULL) { + /* + * Somebody else was using it and kmem_alloc + * failed! Next memory reclaim will try to + * clean up. + */ + DTRACE_PROBE1(conn__ixa__cleanup__bail, + conn_t *, connp); + return; + } + } + ixa_cleanup_stale(ixa); + ixa_refrele(ixa); + } +} + +/* + * ixa needs to be an exclusive copy so that no one changes the cookie + * or the ixa_nce. + */ +boolean_t +ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa) +{ + uintptr_t cookie = ixa->ixa_cookie; + ill_dld_direct_t *idd; + idl_tx_list_t *idl_txl; + ill_t *ill = ixa->ixa_nce->nce_ill; + boolean_t inserted = B_FALSE; + + idd = &(ill)->ill_dld_capab->idc_direct; + idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + if (cookie == 0) { + /* + * ip_xmit failed the canputnext check + */ + connp->conn_did_putbq = 1; + ASSERT(cookie == 0); + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + return (B_TRUE); + } + ASSERT(ILL_DIRECT_CAPABLE(ill)); + mutex_enter(&idl_txl->txl_lock); + if (connp->conn_direct_blocked || + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0)) { + DTRACE_PROBE1(ill__tx__not__blocked, boolean, + connp->conn_direct_blocked); + } else if (idl_txl->txl_cookie != NULL && + idl_txl->txl_cookie != ixa->ixa_cookie) { + DTRACE_PROBE2(ill__send__tx__collision, uintptr_t, cookie, + uintptr_t, idl_txl->txl_cookie); + /* bump kstat for cookie collision */ + } else { + connp->conn_direct_blocked = B_TRUE; + idl_txl->txl_cookie = cookie; + conn_drain_insert(connp, idl_txl); + if (!IPCL_IS_NONSTR(connp)) + noenable(connp->conn_wq); + inserted = B_TRUE; + } + mutex_exit(&idl_txl->txl_lock); + return (inserted); +} |