Diffstat (limited to 'usr/src/uts/common/inet/udp/udp.c')
-rw-r--r-- | usr/src/uts/common/inet/udp/udp.c | 2643 |
1 file changed, 955 insertions, 1688 deletions
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 301c397cf6..91c3cd6772 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -98,101 +98,39 @@ const char udp_version[] = "%Z%%M% %I% %E% SMI"; /* * Synchronization notes: * - * UDP uses a combination of its internal perimeter, a global lock and - * a set of bind hash locks to protect its data structures. Please see - * the note above udp_mode_assertions for details about the internal - * perimeter. + * UDP is MT and uses the usual kernel synchronization primitives. There are 2 + * locks, the fanout lock (uf_lock) and the udp endpoint lock udp_rwlock. + * We also use conn_lock when updating things that affect the IP classifier + * lookup. + * The lock order is udp_rwlock -> uf_lock and is udp_rwlock -> conn_lock. * + * The fanout lock uf_lock: * When a UDP endpoint is bound to a local port, it is inserted into * a bind hash list. The list consists of an array of udp_fanout_t buckets. * The size of the array is controlled by the udp_bind_fanout_size variable. * This variable can be changed in /etc/system if the default value is * not large enough. Each bind hash bucket is protected by a per bucket * lock. It protects the udp_bind_hash and udp_ptpbhn fields in the udp_t - * structure. An UDP endpoint is removed from the bind hash list only - * when it is being unbound or being closed. The per bucket lock also - * protects a UDP endpoint's state changes. + * structure and a few other fields in the udp_t. A UDP endpoint is removed + * from the bind hash list only when it is being unbound or being closed. + * The per bucket lock also protects a UDP endpoint's state changes. * - * Plumbing notes: - * - * Both udp and ip are merged, but the streams plumbing is kept unchanged - * in that udp is always pushed atop /dev/ip. This is done to preserve - * backwards compatibility for certain applications which rely on such - * plumbing geometry to do things such as issuing I_POP on the stream - * in order to obtain direct access to /dev/ip, etc. - * - * All UDP processings happen in the /dev/ip instance; the udp module - * instance does not possess any state about the endpoint, and merely - * acts as a dummy module whose presence is to keep the streams plumbing - * appearance unchanged. At open time /dev/ip allocates a conn_t that - * happens to embed a udp_t. This stays dormant until the time udp is - * pushed, which indicates to /dev/ip that it must convert itself from - * an IP to a UDP endpoint. - * - * We only allow for the following plumbing cases: + * The udp_rwlock: + * This protects most of the other fields in the udp_t. The exact list of + * fields which are protected by each of the above locks is documented in + * the udp_t structure definition. * - * Normal: - * /dev/ip is first opened and later udp is pushed directly on top. - * This is the default action that happens when a udp socket or - * /dev/udp is opened. The conn_t created by /dev/ip instance is - * now shared and is marked with IPCL_UDP. - * - * SNMP-only: - * udp is pushed on top of a module other than /dev/ip. When this - * happens it will support only SNMP semantics. A new conn_t is - * allocated and marked with IPCL_UDPMOD. + * Plumbing notes: + * UDP is always a device driver. For compatibility with mibopen() code + * it is possible to I_PUSH "udp", but that results in pushing a passthrough + * dummy module. 
* - * The above cases imply that we don't support any intermediate module to + * The above implies that we don't support any intermediate module to * reside in between /dev/ip and udp -- in fact, we never supported such * scenario in the past as the inter-layer communication semantics have - * always been private. Also note that the normal case allows for SNMP - * requests to be processed in addition to the rest of UDP operations. - * - * The normal case plumbing is depicted by the following diagram: - * - * +---------------+---------------+ - * | | | udp - * | udp_wq | udp_rq | - * | | UDP_RD | - * | | | - * +---------------+---------------+ - * | ^ - * v | - * +---------------+---------------+ - * | | | /dev/ip - * | ip_wq | ip_rq | conn_t - * | UDP_WR | | - * | | | - * +---------------+---------------+ - * - * Messages arriving at udp_wq from above will end up in ip_wq before - * it gets processed, i.e. udp write entry points will advance udp_wq - * and use its q_next value as ip_wq in order to use the conn_t that - * is stored in its q_ptr. Likewise, messages generated by ip to the - * module above udp will appear as if they are originated from udp_rq, - * i.e. putnext() calls to the module above udp is done using the - * udp_rq instead of ip_rq in order to avoid udp_rput() which does - * nothing more than calling putnext(). - * - * The above implies the following rule of thumb: - * - * 1. udp_t is obtained from conn_t, which is created by the /dev/ip - * instance and is stored in q_ptr of both ip_wq and ip_rq. There - * is no direct reference to conn_t from either udp_wq or udp_rq. - * - * 2. Write-side entry points of udp can obtain the conn_t via the - * Q_TO_CONN() macro, using the queue value obtain from UDP_WR(). - * - * 3. While in /dev/ip context, putnext() to the module above udp can - * be done by supplying the queue value obtained from UDP_RD(). - * + * always been private. 
*/ -static queue_t *UDP_WR(queue_t *); -static queue_t *UDP_RD(queue_t *); - -struct kmem_cache *udp_cache; - /* For /etc/system control */ uint_t udp_bind_fanout_size = UDP_BIND_FANOUT_SIZE; @@ -217,7 +155,10 @@ static void udp_addr_req(queue_t *q, mblk_t *mp); static void udp_bind(queue_t *q, mblk_t *mp); static void udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp); static void udp_bind_hash_remove(udp_t *udp, boolean_t caller_holds_lock); -static int udp_build_hdrs(queue_t *q, udp_t *udp); +static void udp_bind_result(conn_t *, mblk_t *); +static void udp_bind_ack(conn_t *, mblk_t *mp); +static void udp_bind_error(conn_t *, mblk_t *mp); +static int udp_build_hdrs(udp_t *udp); static void udp_capability_req(queue_t *q, mblk_t *mp); static int udp_close(queue_t *q); static void udp_connect(queue_t *q, mblk_t *mp); @@ -235,9 +176,16 @@ static int udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, static void udp_icmp_error(queue_t *q, mblk_t *mp); static void udp_icmp_error_ipv6(queue_t *q, mblk_t *mp); static void udp_info_req(queue_t *q, mblk_t *mp); +static void udp_input(void *, mblk_t *, void *); static mblk_t *udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, t_scalar_t addr_length); +static void udp_lrput(queue_t *, mblk_t *); +static void udp_lwput(queue_t *, mblk_t *); static int udp_open(queue_t *q, dev_t *devp, int flag, int sflag, + cred_t *credp, boolean_t isv6); +static int udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, + cred_t *credp); +static int udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp); static int udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, udpattrs_t *udpattrs); @@ -247,11 +195,8 @@ static boolean_t udp_param_register(IDP *ndp, udpparam_t *udppa, int cnt); static int udp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static void udp_report_item(mblk_t *mp, udp_t *udp); -static void udp_rput(queue_t *q, mblk_t *mp); -static void udp_rput_other(queue_t *, mblk_t *); static int udp_rinfop(queue_t *q, infod_t *dp); static int udp_rrw(queue_t *q, struiod_t *dp); -static void udp_rput_bind_ack(queue_t *q, mblk_t *mp); static int udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static void udp_send_data(udp_t *, queue_t *, mblk_t *, ipha_t *); @@ -260,15 +205,12 @@ static void udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, static void udp_unbind(queue_t *q, mblk_t *mp); static in_port_t udp_update_next_port(udp_t *udp, in_port_t port, boolean_t random); -static void udp_wput(queue_t *q, mblk_t *mp); static mblk_t *udp_output_v4(conn_t *, mblk_t *, ipaddr_t, uint16_t, uint_t, int *, boolean_t); static mblk_t *udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error); static void udp_wput_other(queue_t *q, mblk_t *mp); static void udp_wput_iocdata(queue_t *q, mblk_t *mp); -static void udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, - socklen_t addrlen); static size_t udp_set_rcv_hiwat(udp_t *udp, size_t size); static void *udp_stack_init(netstackid_t stackid, netstack_t *ns); @@ -279,56 +221,62 @@ static void udp_kstat_fini(netstackid_t stackid, kstat_t *ksp); static void *udp_kstat2_init(netstackid_t, udp_stat_t *); static void udp_kstat2_fini(netstackid_t, kstat_t *); static int udp_kstat_update(kstat_t *kp, int rw); -static void udp_input_wrapper(void *arg, mblk_t *mp, void *arg2); -static void udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2); -static void udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2); -static void 
udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2); static void udp_rcv_enqueue(queue_t *q, udp_t *udp, mblk_t *mp, uint_t pkt_len); static void udp_rcv_drain(queue_t *q, udp_t *udp, boolean_t closing); -static void udp_enter(conn_t *, mblk_t *, sqproc_t, uint8_t); -static void udp_exit(conn_t *); -static void udp_become_writer(conn_t *, mblk_t *, sqproc_t, uint8_t); -#ifdef DEBUG -static void udp_mode_assertions(udp_t *, int); -#endif /* DEBUG */ - -major_t UDP6_MAJ; -#define UDP6 "udp6" +static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t); #define UDP_RECV_HIWATER (56 * 1024) #define UDP_RECV_LOWATER 128 #define UDP_XMIT_HIWATER (56 * 1024) #define UDP_XMIT_LOWATER 1024 -static struct module_info udp_info = { +static struct module_info udp_mod_info = { UDP_MOD_ID, UDP_MOD_NAME, 1, INFPSZ, UDP_RECV_HIWATER, UDP_RECV_LOWATER }; -static struct qinit udp_rinit = { - (pfi_t)udp_rput, NULL, udp_open, udp_close, NULL, - &udp_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD +/* + * Entry points for UDP as a device. + * We have separate open functions for the /dev/udp and /dev/udp6 devices. + */ +static struct qinit udp_rinitv4 = { + NULL, NULL, udp_openv4, udp_close, NULL, + &udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD +}; + +static struct qinit udp_rinitv6 = { + NULL, NULL, udp_openv6, udp_close, NULL, + &udp_mod_info, NULL, udp_rrw, udp_rinfop, STRUIOT_STANDARD }; static struct qinit udp_winit = { (pfi_t)udp_wput, NULL, NULL, NULL, NULL, - &udp_info, NULL, NULL, NULL, STRUIOT_NONE + &udp_mod_info, NULL, NULL, NULL, STRUIOT_NONE }; -/* Support for just SNMP if UDP is not pushed directly over device IP */ -struct qinit udp_snmp_rinit = { - (pfi_t)putnext, NULL, udp_open, ip_snmpmod_close, NULL, - &udp_info, NULL, NULL, NULL, STRUIOT_NONE +/* + * UDP needs to handle I_LINK and I_PLINK since ifconfig + * likes to use it as a place to hang the various streams. 
+ */ +static struct qinit udp_lrinit = { + (pfi_t)udp_lrput, NULL, udp_openv4, udp_close, NULL, + &udp_mod_info }; -struct qinit udp_snmp_winit = { - (pfi_t)ip_snmpmod_wput, NULL, udp_open, ip_snmpmod_close, NULL, - &udp_info, NULL, NULL, NULL, STRUIOT_NONE +static struct qinit udp_lwinit = { + (pfi_t)udp_lwput, NULL, udp_openv4, udp_close, NULL, + &udp_mod_info }; -struct streamtab udpinfo = { - &udp_rinit, &udp_winit +/* For AF_INET aka /dev/udp */ +struct streamtab udpinfov4 = { + &udp_rinitv4, &udp_winit, &udp_lrinit, &udp_lwinit +}; + +/* For AF_INET6 aka /dev/udp6 */ +struct streamtab udpinfov6 = { + &udp_rinitv6, &udp_winit, &udp_lrinit, &udp_lwinit }; static sin_t sin_null; /* Zero address for quick clears */ @@ -409,429 +357,6 @@ void (*cl_inet_unbind)(uint8_t protocol, sa_family_t addr_family, typedef union T_primitives *t_primp_t; -#define UDP_ENQUEUE_MP(udp, mp, proc, tag) { \ - ASSERT((mp)->b_prev == NULL && (mp)->b_queue == NULL); \ - ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ - (mp)->b_queue = (queue_t *)((uintptr_t)tag); \ - (mp)->b_prev = (mblk_t *)proc; \ - if ((udp)->udp_mphead == NULL) \ - (udp)->udp_mphead = (mp); \ - else \ - (udp)->udp_mptail->b_next = (mp); \ - (udp)->udp_mptail = (mp); \ - (udp)->udp_mpcount++; \ -} - -#define UDP_READERS_INCREF(udp) { \ - ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ - (udp)->udp_reader_count++; \ -} - -#define UDP_READERS_DECREF(udp) { \ - ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ - (udp)->udp_reader_count--; \ - if ((udp)->udp_reader_count == 0) \ - cv_broadcast(&(udp)->udp_connp->conn_cv); \ -} - -#define UDP_SQUEUE_DECREF(udp) { \ - ASSERT(MUTEX_HELD(&(udp)->udp_connp->conn_lock)); \ - (udp)->udp_squeue_count--; \ - if ((udp)->udp_squeue_count == 0) \ - cv_broadcast(&(udp)->udp_connp->conn_cv); \ -} - -/* - * Notes on UDP endpoint synchronization: - * - * UDP needs exclusive operation on a per endpoint basis, when executing - * functions that modify the endpoint state. udp_rput_other() deals with - * packets with IP options, and processing these packets end up having - * to update the endpoint's option related state. udp_wput_other() deals - * with control operations from the top, e.g. connect() that needs to - * update the endpoint state. These could be synchronized using locks, - * but the current version uses squeues for this purpose. squeues may - * give performance improvement for certain cases such as connected UDP - * sockets; thus the framework allows for using squeues. - * - * The perimeter routines are described as follows: - * - * udp_enter(): - * Enter the UDP endpoint perimeter. - * - * udp_become_writer(): - * Become exclusive on the UDP endpoint. Specifies a function - * that will be called exclusively either immediately or later - * when the perimeter is available exclusively. - * - * udp_exit(): - * Exit the UDP perimeter. - * - * Entering UDP from the top or from the bottom must be done using - * udp_enter(). No lock must be held while attempting to enter the UDP - * perimeter. When finished, udp_exit() must be called to get out of - * the perimeter. - * - * UDP operates in either MT_HOT mode or in SQUEUE mode. In MT_HOT mode, - * multiple threads may enter a UDP endpoint concurrently. This is used - * for sending and/or receiving normal data. Control operations and other - * special cases call udp_become_writer() to become exclusive on a per - * endpoint basis and this results in transitioning to SQUEUE mode. squeue - * by definition serializes access to the conn_t. 
When there are no more - * pending messages on the squeue for the UDP connection, the endpoint - * reverts to MT_HOT mode. During the interregnum when not all MT threads - * of an endpoint have finished, messages are queued in the UDP endpoint - * and the UDP is in UDP_MT_QUEUED mode or UDP_QUEUED_SQUEUE mode. - * - * These modes have the following analogs: - * - * UDP_MT_HOT/udp_reader_count==0 none - * UDP_MT_HOT/udp_reader_count>0 RW_READ_LOCK - * UDP_MT_QUEUED RW_WRITE_WANTED - * UDP_SQUEUE or UDP_QUEUED_SQUEUE RW_WRITE_LOCKED - * - * Stable modes: UDP_MT_HOT, UDP_SQUEUE - * Transient modes: UDP_MT_QUEUED, UDP_QUEUED_SQUEUE - * - * While in stable modes, UDP keeps track of the number of threads - * operating on the endpoint. The udp_reader_count variable represents - * the number of threads entering the endpoint as readers while it is - * in UDP_MT_HOT mode. Transitioning to UDP_SQUEUE happens when there - * is only a single reader, i.e. when this counter drops to 1. Likewise, - * udp_squeue_count represents the number of threads operating on the - * endpoint's squeue while it is in UDP_SQUEUE mode. The mode transition - * to UDP_MT_HOT happens after the last thread exits the endpoint, i.e. - * when this counter drops to 0. - * - * The default mode is set to UDP_MT_HOT and UDP alternates between - * UDP_MT_HOT and UDP_SQUEUE as shown in the state transition below. - * - * Mode transition: - * ---------------------------------------------------------------- - * old mode Event New mode - * ---------------------------------------------------------------- - * UDP_MT_HOT Call to udp_become_writer() UDP_SQUEUE - * and udp_reader_count == 1 - * - * UDP_MT_HOT Call to udp_become_writer() UDP_MT_QUEUED - * and udp_reader_count > 1 - * - * UDP_MT_QUEUED udp_reader_count drops to zero UDP_QUEUED_SQUEUE - * - * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_SQUEUE - * internal UDP queue successfully - * moved to squeue AND udp_squeue_count != 0 - * - * UDP_QUEUED_SQUEUE All messages enqueued on the UDP_MT_HOT - * internal UDP queue successfully - * moved to squeue AND udp_squeue_count - * drops to zero - * - * UDP_SQUEUE udp_squeue_count drops to zero UDP_MT_HOT - * ---------------------------------------------------------------- - */ - -static queue_t * -UDP_WR(queue_t *q) -{ - ASSERT(q->q_ptr == NULL && _OTHERQ(q)->q_ptr == NULL); - ASSERT(WR(q)->q_next != NULL && WR(q)->q_next->q_ptr != NULL); - ASSERT(IPCL_IS_UDP(Q_TO_CONN(WR(q)->q_next))); - - return (_WR(q)->q_next); -} - -static queue_t * -UDP_RD(queue_t *q) -{ - ASSERT(q->q_ptr != NULL && _OTHERQ(q)->q_ptr != NULL); - ASSERT(IPCL_IS_UDP(Q_TO_CONN(q))); - ASSERT(RD(q)->q_next != NULL && RD(q)->q_next->q_ptr == NULL); - - return (_RD(q)->q_next); -} - -#ifdef DEBUG -#define UDP_MODE_ASSERTIONS(udp, caller) udp_mode_assertions(udp, caller) -#else -#define UDP_MODE_ASSERTIONS(udp, caller) -#endif - -/* Invariants */ -#ifdef DEBUG - -uint32_t udp_count[4]; - -/* Context of udp_mode_assertions */ -#define UDP_ENTER 1 -#define UDP_BECOME_WRITER 2 -#define UDP_EXIT 3 - -static void -udp_mode_assertions(udp_t *udp, int caller) -{ - ASSERT(MUTEX_HELD(&udp->udp_connp->conn_lock)); - - switch (udp->udp_mode) { - case UDP_MT_HOT: - /* - * Messages have not yet been enqueued on the internal queue, - * otherwise we would have switched to UDP_MT_QUEUED. Likewise - * by definition, there can't be any messages enqueued on the - * squeue. The UDP could be quiescent, so udp_reader_count - * could be zero at entry. 
- */ - ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0 && - udp->udp_squeue_count == 0); - ASSERT(caller == UDP_ENTER || udp->udp_reader_count != 0); - udp_count[0]++; - break; - - case UDP_MT_QUEUED: - /* - * The last MT thread to exit the udp perimeter empties the - * internal queue and then switches the UDP to - * UDP_QUEUED_SQUEUE mode. Since we are still in UDP_MT_QUEUED - * mode, it means there must be at least 1 MT thread still in - * the perimeter and at least 1 message on the internal queue. - */ - ASSERT(udp->udp_reader_count >= 1 && udp->udp_mphead != NULL && - udp->udp_mpcount != 0 && udp->udp_squeue_count == 0); - udp_count[1]++; - break; - - case UDP_QUEUED_SQUEUE: - /* - * The switch has happened from MT to SQUEUE. So there can't - * any MT threads. Messages could still pile up on the internal - * queue until the transition is complete and we move to - * UDP_SQUEUE mode. We can't assert on nonzero udp_squeue_count - * since the squeue could drain any time. - */ - ASSERT(udp->udp_reader_count == 0); - udp_count[2]++; - break; - - case UDP_SQUEUE: - /* - * The transition is complete. Thre can't be any messages on - * the internal queue. The udp could be quiescent or the squeue - * could drain any time, so we can't assert on nonzero - * udp_squeue_count during entry. Nor can we assert that - * udp_reader_count is zero, since, a reader thread could have - * directly become writer in line by calling udp_become_writer - * without going through the queued states. - */ - ASSERT(udp->udp_mphead == NULL && udp->udp_mpcount == 0); - ASSERT(caller == UDP_ENTER || udp->udp_squeue_count != 0); - udp_count[3]++; - break; - } -} -#endif - -#define _UDP_ENTER(connp, mp, proc, tag) { \ - udp_t *_udp = (connp)->conn_udp; \ - \ - mutex_enter(&(connp)->conn_lock); \ - if ((connp)->conn_state_flags & CONN_CLOSING) { \ - mutex_exit(&(connp)->conn_lock); \ - freemsg(mp); \ - } else { \ - UDP_MODE_ASSERTIONS(_udp, UDP_ENTER); \ - \ - switch (_udp->udp_mode) { \ - case UDP_MT_HOT: \ - /* We can execute as reader right away. */ \ - UDP_READERS_INCREF(_udp); \ - mutex_exit(&(connp)->conn_lock); \ - (*(proc))(connp, mp, (connp)->conn_sqp); \ - break; \ - \ - case UDP_SQUEUE: \ - /* \ - * We are in squeue mode, send the \ - * packet to the squeue \ - */ \ - _udp->udp_squeue_count++; \ - CONN_INC_REF_LOCKED(connp); \ - mutex_exit(&(connp)->conn_lock); \ - squeue_enter((connp)->conn_sqp, mp, proc, \ - connp, tag); \ - break; \ - \ - case UDP_MT_QUEUED: \ - case UDP_QUEUED_SQUEUE: \ - /* \ - * Some messages may have been enqueued \ - * ahead of us. Enqueue the new message \ - * at the tail of the internal queue to \ - * preserve message ordering. \ - */ \ - UDP_ENQUEUE_MP(_udp, mp, proc, tag); \ - mutex_exit(&(connp)->conn_lock); \ - break; \ - } \ - } \ -} - -static void -udp_enter(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag) -{ - _UDP_ENTER(connp, mp, proc, tag); -} - -static void -udp_become_writer(conn_t *connp, mblk_t *mp, sqproc_t proc, uint8_t tag) -{ - udp_t *udp; - - udp = connp->conn_udp; - - mutex_enter(&connp->conn_lock); - - UDP_MODE_ASSERTIONS(udp, UDP_BECOME_WRITER); - - switch (udp->udp_mode) { - case UDP_MT_HOT: - if (udp->udp_reader_count == 1) { - /* - * We are the only MT thread. Switch to squeue mode - * immediately. 
- */ - udp->udp_mode = UDP_SQUEUE; - udp->udp_squeue_count = 1; - CONN_INC_REF_LOCKED(connp); - mutex_exit(&connp->conn_lock); - squeue_enter(connp->conn_sqp, mp, proc, connp, tag); - return; - } - /* FALLTHRU */ - - case UDP_MT_QUEUED: - /* Enqueue the packet internally in UDP */ - udp->udp_mode = UDP_MT_QUEUED; - UDP_ENQUEUE_MP(udp, mp, proc, tag); - mutex_exit(&connp->conn_lock); - return; - - case UDP_SQUEUE: - case UDP_QUEUED_SQUEUE: - /* - * We are already exclusive. i.e. we are already - * writer. Simply call the desired function. - */ - udp->udp_squeue_count++; - mutex_exit(&connp->conn_lock); - (*proc)(connp, mp, connp->conn_sqp); - return; - } -} - -/* - * Transition from MT mode to SQUEUE mode, when the last MT thread - * is exiting the UDP perimeter. Move all messages from the internal - * udp queue to the squeue. A better way would be to move all the - * messages in one shot, this needs more support from the squeue framework - */ -static void -udp_switch_to_squeue(udp_t *udp) -{ - mblk_t *mp; - mblk_t *mp_next; - sqproc_t proc; - uint8_t tag; - conn_t *connp = udp->udp_connp; - - ASSERT(MUTEX_HELD(&connp->conn_lock)); - ASSERT(udp->udp_mode == UDP_MT_QUEUED); - while (udp->udp_mphead != NULL) { - mp = udp->udp_mphead; - udp->udp_mphead = NULL; - udp->udp_mptail = NULL; - udp->udp_mpcount = 0; - udp->udp_mode = UDP_QUEUED_SQUEUE; - mutex_exit(&connp->conn_lock); - /* - * It is best not to hold any locks across the calls - * to squeue functions. Since we drop the lock we - * need to go back and check the udp_mphead once again - * after the squeue_fill and hence the while loop at - * the top of this function - */ - for (; mp != NULL; mp = mp_next) { - mp_next = mp->b_next; - proc = (sqproc_t)mp->b_prev; - tag = (uint8_t)((uintptr_t)mp->b_queue); - mp->b_next = NULL; - mp->b_prev = NULL; - mp->b_queue = NULL; - CONN_INC_REF(connp); - udp->udp_squeue_count++; - squeue_fill(connp->conn_sqp, mp, proc, connp, - tag); - } - mutex_enter(&connp->conn_lock); - } - /* - * udp_squeue_count of zero implies that the squeue has drained - * even before we arrived here (i.e. after the squeue_fill above) - */ - udp->udp_mode = (udp->udp_squeue_count != 0) ? - UDP_SQUEUE : UDP_MT_HOT; -} - -#define _UDP_EXIT(connp) { \ - udp_t *_udp = (connp)->conn_udp; \ - \ - mutex_enter(&(connp)->conn_lock); \ - UDP_MODE_ASSERTIONS(_udp, UDP_EXIT); \ - \ - switch (_udp->udp_mode) { \ - case UDP_MT_HOT: \ - UDP_READERS_DECREF(_udp); \ - mutex_exit(&(connp)->conn_lock); \ - break; \ - \ - case UDP_SQUEUE: \ - UDP_SQUEUE_DECREF(_udp); \ - if (_udp->udp_squeue_count == 0) \ - _udp->udp_mode = UDP_MT_HOT; \ - mutex_exit(&(connp)->conn_lock); \ - break; \ - \ - case UDP_MT_QUEUED: \ - /* \ - * If this is the last MT thread, we need to \ - * switch to squeue mode \ - */ \ - UDP_READERS_DECREF(_udp); \ - if (_udp->udp_reader_count == 0) \ - udp_switch_to_squeue(_udp); \ - mutex_exit(&(connp)->conn_lock); \ - break; \ - \ - case UDP_QUEUED_SQUEUE: \ - UDP_SQUEUE_DECREF(_udp); \ - /* \ - * Even if the udp_squeue_count drops to zero, we \ - * don't want to change udp_mode to UDP_MT_HOT here. \ - * The thread in udp_switch_to_squeue will take care \ - * of the transition to UDP_MT_HOT, after emptying \ - * any more new messages that have been enqueued in \ - * udp_mphead. \ - */ \ - mutex_exit(&(connp)->conn_lock); \ - break; \ - } \ -} - -static void -udp_exit(conn_t *connp) -{ - _UDP_EXIT(connp); -} - /* * Return the next anonymous port in the privileged port range for * bind checking. 
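The hunk above deletes the old per-endpoint perimeter machinery (the UDP_MT_HOT/UDP_MT_QUEUED/UDP_QUEUED_SQUEUE/UDP_SQUEUE mode state machine together with udp_enter(), udp_become_writer(), udp_exit() and UDP_RD()/UDP_WR()), which the rewritten synchronization notes replace with a plain reader/writer lock: data paths take udp_rwlock as a reader, while state-changing TPI operations take it as a writer and use udp_pending_op to enforce one outstanding primitive at a time. The following is a minimal userland sketch of that discipline, assuming POSIX threads; endpoint_t, lookup_path() and control_path() are invented illustration names, and the kernel code instead uses rw_enter()/rw_exit() on udp_rwlock.

/* Minimal userland analogy of the udp_rwlock discipline (hypothetical names). */
#include <pthread.h>
#include <stdio.h>

typedef struct endpoint {
	pthread_rwlock_t ep_rwlock;	/* stands in for udp_rwlock */
	int		ep_state;	/* stands in for udp_state */
	int		ep_pending_op;	/* stands in for udp_pending_op */
} endpoint_t;

/* Data path: any number of threads may read endpoint state concurrently. */
static int
lookup_path(endpoint_t *ep)
{
	int state;

	(void) pthread_rwlock_rdlock(&ep->ep_rwlock);
	state = ep->ep_state;
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
	return (state);
}

/* Control path (bind/connect/disconnect): exclusive, one primitive at a time. */
static int
control_path(endpoint_t *ep, int prim, int new_state)
{
	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	if (ep->ep_pending_op != -1) {
		/* Mirrors the TOUTSTATE checks on udp_pending_op. */
		(void) pthread_rwlock_unlock(&ep->ep_rwlock);
		return (-1);
	}
	ep->ep_pending_op = prim;
	ep->ep_state = new_state;
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);

	/* ...hand the request downstream (ip_bind_v4() in the real code)... */

	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	ep->ep_pending_op = -1;		/* cleared once the ack comes back */
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
	return (0);
}

int
main(void)
{
	endpoint_t ep = { PTHREAD_RWLOCK_INITIALIZER, 0, -1 };

	(void) control_path(&ep, 1, 2);
	(void) printf("state %d\n", lookup_path(&ep));
	return (0);
}

Compared with the removed perimeter, the writer lock provides the same per-endpoint exclusion without the queued-message transition states, which is why the mode assertions and the UDP_ENQUEUE_MP bookkeeping above could be dropped.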
@@ -988,9 +513,7 @@ udp_bind_hash_insert(udp_fanout_t *uf, udp_t *udp) udp_t *udpnext; ASSERT(MUTEX_HELD(&uf->uf_lock)); - if (udp->udp_ptpbhn != NULL) { - udp_bind_hash_remove(udp, B_TRUE); - } + ASSERT(udp->udp_ptpbhn == NULL); udpp = &uf->uf_udp; udpnext = udpp[0]; if (udpnext != NULL) { @@ -1068,7 +591,6 @@ udp_bind(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TPROTO, 0); return; } - if (udp->udp_state != TS_UNBND) { (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_bind: bad state, %u", udp->udp_state); @@ -1198,9 +720,25 @@ udp_bind(queue_t *q, mblk_t *mp) } /* + * The state must be TS_UNBND. TPI mandates that users must send + * TPI primitives only 1 at a time and wait for the response before + * sending the next primitive. + */ + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state != TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, + "udp_bind: bad state, %u", udp->udp_state); + udp_err_ack(q, mp, TOUTSTATE, 0); + return; + } + udp->udp_pending_op = tbr->PRIM_type; + /* * Copy the source address into our udp structure. This address * may still be zero; if so, IP will fill in the correct address - * each time an outbound packet is passed to it. + * each time an outbound packet is passed to it. Since the udp is + * not yet in the bind hash list, we don't grab the uf_lock to + * change udp_ipversion */ if (udp->udp_family == AF_INET) { ASSERT(sin != NULL); @@ -1212,6 +750,10 @@ udp_bind(queue_t *q, mblk_t *mp) ASSERT(sin6 != NULL); v6src = sin6->sin6_addr; if (IN6_IS_ADDR_V4MAPPED(&v6src)) { + /* + * no need to hold the uf_lock to set the udp_ipversion + * since we are not yet in the fanout list + */ udp->udp_ipversion = IPV4_VERSION; udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + udp->udp_ip_snd_options_len; @@ -1383,6 +925,8 @@ udp_bind(queue_t *q, mblk_t *mp) * the routine (and exit the loop). * */ + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TADDRBUSY, 0); return; } @@ -1412,6 +956,8 @@ udp_bind(queue_t *q, mblk_t *mp) * there are none available, so send an error * to the user. */ + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TNOADDR, 0); return; } @@ -1421,8 +967,9 @@ udp_bind(queue_t *q, mblk_t *mp) * Copy the source address into our udp structure. This address * may still be zero; if so, ip will fill in the correct address * each time an outbound packet is passed to it. - * If we are binding to a broadcast or multicast address udp_rput - * will clear the source address when it receives the T_BIND_ACK. + * If we are binding to a broadcast or multicast address then + * udp_bind_ack will clear the source address when it receives + * the T_BIND_ACK. */ udp->udp_v6src = udp->udp_bound_v6src = v6src; udp->udp_port = lport; @@ -1442,8 +989,10 @@ udp_bind(queue_t *q, mblk_t *mp) sin6->sin6_port = udp->udp_port; /* Rebuild the header template */ - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) { + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); mutex_exit(&udpf->uf_lock); udp_err_ack(q, mp, TSYSERR, error); return; @@ -1452,6 +1001,7 @@ udp_bind(queue_t *q, mblk_t *mp) udp->udp_state = TS_IDLE; udp_bind_hash_insert(udpf, udp); mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); if (cl_inet_bind) { /* @@ -1480,8 +1030,11 @@ udp_bind(queue_t *q, mblk_t *mp) connp->conn_mlp_type = udp->udp_recvucred ? 
mlptBoth : mlptSingle; addrtype = tsol_mlp_addr_type(zone->zone_id, IPV6_VERSION, - &v6src, udp->udp_us->us_netstack->netstack_ip); + &v6src, us->us_netstack->netstack_ip); if (addrtype == mlptSingle) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TNOADDR, 0); connp->conn_anon_port = B_FALSE; connp->conn_mlp_type = mlptSingle; @@ -1499,6 +1052,9 @@ udp_bind(queue_t *q, mblk_t *mp) "udp_bind: no priv for multilevel port %d", mlpport); } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TACCES, 0); connp->conn_anon_port = B_FALSE; connp->conn_mlp_type = mlptSingle; @@ -1529,6 +1085,9 @@ udp_bind(queue_t *q, mblk_t *mp) mlpport, connp->conn_zoneid, mlpzone); } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TACCES, 0); connp->conn_anon_port = B_FALSE; connp->conn_mlp_type = mlptSingle; @@ -1547,6 +1106,9 @@ udp_bind(queue_t *q, mblk_t *mp) "udp_bind: cannot establish anon " "MLP for port %d", port); } + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TACCES, 0); connp->conn_anon_port = B_FALSE; connp->conn_mlp_type = mlptSingle; @@ -1565,6 +1127,9 @@ udp_bind(queue_t *q, mblk_t *mp) */ mp->b_cont = allocb(sizeof (ire_t), BPRI_HI); if (!mp->b_cont) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } @@ -1576,34 +1141,25 @@ udp_bind(queue_t *q, mblk_t *mp) else mp = ip_bind_v4(q, mp, connp); + /* The above return NULL if the bind needs to be deferred */ if (mp != NULL) - udp_rput_other(_RD(q), mp); + udp_bind_result(connp, mp); else CONN_INC_REF(connp); } - -void -udp_resume_bind(conn_t *connp, mblk_t *mp) -{ - udp_enter(connp, mp, udp_resume_bind_cb, SQTAG_BIND_RETRY); -} - /* - * This is called from ip_wput_nondata to resume a deferred UDP bind. + * This is called from ip_wput_nondata to handle the results of a + * deferred UDP bind. It is called once the bind has been completed. */ -/* ARGSUSED */ -static void -udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2) +void +udp_resume_bind(conn_t *connp, mblk_t *mp) { - conn_t *connp = arg; - ASSERT(connp != NULL && IPCL_IS_UDP(connp)); - udp_rput_other(connp->conn_rq, mp); + udp_bind_result(connp, mp); CONN_OPER_PENDING_DONE(connp); - udp_exit(connp); } /* @@ -1616,11 +1172,11 @@ udp_resume_bind_cb(void *arg, mblk_t *mp, void *arg2) * T_OK_ACK - for the T_CONN_REQ * T_CONN_CON - to keep the TPI user happy * - * The connect completes in udp_rput. + * The connect completes in udp_bind_result. * When a T_BIND_ACK is received information is extracted from the IRE * and the two appended messages are sent to the TPI user. - * Should udp_rput receive T_ERROR_ACK for the T_BIND_REQ it will convert - * it to an error ack for the appropriate primitive. + * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will + * convert it to an error ack for the appropriate primitive. 
*/ static void udp_connect(queue_t *q, mblk_t *mp) @@ -1635,10 +1191,11 @@ udp_connect(queue_t *q, mblk_t *mp) mblk_t *mp1, *mp2; udp_fanout_t *udpf; udp_t *udp, *udp1; + ushort_t ipversion; udp_stack_t *us; + conn_t *connp = Q_TO_CONN(q); - udp = Q_TO_UDP(q); - + udp = connp->conn_udp; tcr = (struct T_conn_req *)mp->b_rptr; us = udp->udp_us; @@ -1647,28 +1204,6 @@ udp_connect(queue_t *q, mblk_t *mp) udp_err_ack(q, mp, TPROTO, 0); return; } - /* - * This UDP must have bound to a port already before doing - * a connect. - */ - if (udp->udp_state == TS_UNBND) { - (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, - "udp_connect: bad state, %u", udp->udp_state); - udp_err_ack(q, mp, TOUTSTATE, 0); - return; - } - ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); - - udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, - us->us_bind_fanout_size)]; - - if (udp->udp_state == TS_DATA_XFER) { - /* Already connected - clear out state */ - mutex_enter(&udpf->uf_lock); - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - mutex_exit(&udpf->uf_lock); - } if (tcr->OPT_length != 0) { udp_err_ack(q, mp, TBADOPT, 0); @@ -1702,8 +1237,7 @@ udp_connect(queue_t *q, mblk_t *mp) dstport = sin->sin_port; IN6_IPADDR_TO_V4MAPPED(v4dst, &v6dst); ASSERT(udp->udp_ipversion == IPV4_VERSION); - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + - udp->udp_ip_snd_options_len; + ipversion = IPV4_VERSION; break; case sizeof (sin6_t): @@ -1719,18 +1253,15 @@ udp_connect(queue_t *q, mblk_t *mp) return; } v6dst = sin6->sin6_addr; + dstport = sin6->sin6_port; if (IN6_IS_ADDR_V4MAPPED(&v6dst)) { IN6_V4MAPPED_TO_IPADDR(&v6dst, v4dst); - udp->udp_ipversion = IPV4_VERSION; - udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + - UDPH_SIZE + udp->udp_ip_snd_options_len; + ipversion = IPV4_VERSION; flowinfo = 0; } else { - udp->udp_ipversion = IPV6_VERSION; - udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + ipversion = IPV6_VERSION; flowinfo = sin6->sin6_flowinfo; } - dstport = sin6->sin6_port; break; } if (dstport == 0) { @@ -1738,11 +1269,46 @@ udp_connect(queue_t *q, mblk_t *mp) return; } + rw_enter(&udp->udp_rwlock, RW_WRITER); + + /* + * This UDP must have bound to a port already before doing a connect. + * TPI mandates that users must send TPI primitives only 1 at a time + * and wait for the response before sending the next primitive. + */ + if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, + "udp_connect: bad state, %u", udp->udp_state); + udp_err_ack(q, mp, TOUTSTATE, 0); + return; + } + udp->udp_pending_op = T_CONN_REQ; + ASSERT(udp->udp_port != 0 && udp->udp_ptpbhn != NULL); + + if (ipversion == IPV4_VERSION) { + udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + + udp->udp_ip_snd_options_len; + } else { + udp->udp_max_hdr_len = udp->udp_sticky_hdrs_len; + } + + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + + mutex_enter(&udpf->uf_lock); + if (udp->udp_state == TS_DATA_XFER) { + /* Already connected - clear out state */ + udp->udp_v6src = udp->udp_bound_v6src; + udp->udp_state = TS_IDLE; + } + /* * Create a default IP header with no IP options. */ udp->udp_dstport = dstport; - if (udp->udp_ipversion == IPV4_VERSION) { + udp->udp_ipversion = ipversion; + if (ipversion == IPV4_VERSION) { /* * Interpret a zero destination to mean loopback. 
* Update the T_CONN_REQ (sin/sin6) since it is used to @@ -1794,10 +1360,9 @@ udp_connect(queue_t *q, mblk_t *mp) } /* - * Verify that the src/port/dst/port and zoneid is unique for all + * Verify that the src/port/dst/port is unique for all * connections in TS_DATA_XFER */ - mutex_enter(&udpf->uf_lock); for (udp1 = udpf->uf_udp; udp1 != NULL; udp1 = udp1->udp_bind_hash) { if (udp1->udp_state != TS_DATA_XFER) continue; @@ -1812,6 +1377,8 @@ udp_connect(queue_t *q, mblk_t *mp) udp->udp_connp->conn_zoneid))) continue; mutex_exit(&udpf->uf_lock); + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TBADADDR, 0); return; } @@ -1828,17 +1395,20 @@ udp_connect(queue_t *q, mblk_t *mp) else mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (ipa6_conn_t)); if (mp1 == NULL) { - udp_err_ack(q, mp, TSYSERR, ENOMEM); bind_failed: mutex_enter(&udpf->uf_lock); udp->udp_state = TS_IDLE; + udp->udp_pending_op = -1; mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); + udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } + rw_exit(&udp->udp_rwlock); /* * We also have to send a connection confirmation to - * keep TLI happy. Prepare it for udp_rput. + * keep TLI happy. Prepare it for udp_bind_result. */ if (udp->udp_family == AF_INET) mp2 = mi_tpi_conn_con(NULL, (char *)sin, @@ -1848,7 +1418,7 @@ bind_failed: sizeof (*sin6), NULL, 0); if (mp2 == NULL) { freemsg(mp1); - udp_err_ack(q, mp, TSYSERR, ENOMEM); + rw_enter(&udp->udp_rwlock, RW_WRITER); goto bind_failed; } @@ -1856,36 +1426,43 @@ bind_failed: if (mp == NULL) { /* Unable to reuse the T_CONN_REQ for the ack. */ freemsg(mp2); + rw_enter(&udp->udp_rwlock, RW_WRITER); + mutex_enter(&udpf->uf_lock); + udp->udp_state = TS_IDLE; + udp->udp_pending_op = -1; + mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); udp_err_ack_prim(q, mp1, T_CONN_REQ, TSYSERR, ENOMEM); - goto bind_failed; + return; } /* Hang onto the T_OK_ACK and T_CONN_CON for later. */ linkb(mp1, mp); linkb(mp1, mp2); - mblk_setcred(mp1, udp->udp_connp->conn_cred); + mblk_setcred(mp1, connp->conn_cred); if (udp->udp_family == AF_INET) - mp1 = ip_bind_v4(q, mp1, udp->udp_connp); + mp1 = ip_bind_v4(q, mp1, connp); else - mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL); + mp1 = ip_bind_v6(q, mp1, connp, NULL); + /* The above return NULL if the bind needs to be deferred */ if (mp1 != NULL) - udp_rput_other(_RD(q), mp1); + udp_bind_result(connp, mp1); else - CONN_INC_REF(udp->udp_connp); + CONN_INC_REF(connp); } static int udp_close(queue_t *q) { - conn_t *connp = Q_TO_CONN(UDP_WR(q)); + conn_t *connp = (conn_t *)q->q_ptr; udp_t *udp; - queue_t *ip_rq = RD(UDP_WR(q)); ASSERT(connp != NULL && IPCL_IS_UDP(connp)); udp = connp->conn_udp; + udp_quiesce_conn(connp); ip_quiesce_conn(connp); /* * Disable read-side synchronous stream @@ -1896,11 +1473,6 @@ udp_close(queue_t *q) qprocsoff(q); - /* restore IP module's high and low water marks to default values */ - ip_rq->q_hiwat = ip_rq->q_qinfo->qi_minfo->mi_hiwat; - WR(ip_rq)->q_hiwat = WR(ip_rq)->q_qinfo->qi_minfo->mi_hiwat; - WR(ip_rq)->q_lowat = WR(ip_rq)->q_qinfo->qi_minfo->mi_lowat; - ASSERT(udp->udp_rcv_cnt == 0); ASSERT(udp->udp_rcv_msgcnt == 0); ASSERT(udp->udp_rcv_list_head == NULL); @@ -1909,23 +1481,28 @@ udp_close(queue_t *q) udp_close_free(connp); /* - * Restore connp as an IP endpoint. - * Locking required to prevent a race with udp_snmp_get()/ - * ipcl_get_next_conn(), which selects conn_t which are - * IPCL_UDP and not CONN_CONDEMNED. 
+ * Now we are truly single threaded on this stream, and can + * delete the things hanging off the connp, and finally the connp. + * We removed this connp from the fanout list, it cannot be + * accessed thru the fanouts, and we already waited for the + * conn_ref to drop to 0. We are already in close, so + * there cannot be any other thread from the top. qprocsoff + * has completed, and service has completed or won't run in + * future. */ - mutex_enter(&connp->conn_lock); - connp->conn_flags &= ~IPCL_UDP; - connp->conn_state_flags &= - ~(CONN_CLOSING | CONN_CONDEMNED | CONN_QUIESCED); - connp->conn_ulp_labeled = B_FALSE; - mutex_exit(&connp->conn_lock); + ASSERT(connp->conn_ref == 1); + + inet_minor_free(ip_minor_arena, connp->conn_dev); + connp->conn_ref--; + ipcl_conn_destroy(connp); + + q->q_ptr = WR(q)->q_ptr = NULL; return (0); } /* - * Called in the close path from IP (ip_quiesce_conn) to quiesce the conn + * Called in the close path to quiesce the conn */ void udp_quiesce_conn(conn_t *connp) @@ -1949,12 +1526,6 @@ udp_quiesce_conn(conn_t *connp) udp_bind_hash_remove(udp, B_FALSE); - mutex_enter(&connp->conn_lock); - while (udp->udp_reader_count != 0 || udp->udp_squeue_count != 0 || - udp->udp_mode != UDP_MT_HOT) { - cv_wait(&connp->conn_cv, &connp->conn_lock); - } - mutex_exit(&connp->conn_lock); } void @@ -1982,12 +1553,6 @@ udp_close_free(conn_t *connp) } ip6_pkt_free(&udp->udp_sticky_ipp); - - udp->udp_connp = NULL; - netstack_rele(udp->udp_us->us_netstack); - - connp->conn_udp = NULL; - kmem_cache_free(udp_cache, udp); } /* @@ -2000,26 +1565,31 @@ udp_close_free(conn_t *connp) * T_BIND_REQ - specifying just the local address/port * T_OK_ACK - for the T_DISCON_REQ * - * The disconnect completes in udp_rput. + * The disconnect completes in udp_bind_result. * When a T_BIND_ACK is received the appended T_OK_ACK is sent to the TPI user. - * Should udp_rput receive T_ERROR_ACK for the T_BIND_REQ it will convert - * it to an error ack for the appropriate primitive. + * Should udp_bind_result receive T_ERROR_ACK for the T_BIND_REQ it will + * convert it to an error ack for the appropriate primitive. */ static void udp_disconnect(queue_t *q, mblk_t *mp) { - udp_t *udp = Q_TO_UDP(q); + udp_t *udp; mblk_t *mp1; udp_fanout_t *udpf; udp_stack_t *us; + conn_t *connp = Q_TO_CONN(q); + udp = connp->conn_udp; us = udp->udp_us; - if (udp->udp_state != TS_DATA_XFER) { + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state != TS_DATA_XFER || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); (void) mi_strlog(q, 1, SL_ERROR|SL_TRACE, "udp_disconnect: bad state, %u", udp->udp_state); udp_err_ack(q, mp, TOUTSTATE, 0); return; } + udp->udp_pending_op = T_DISCON_REQ; udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, us->us_bind_fanout_size)]; mutex_enter(&udpf->uf_lock); @@ -2036,12 +1606,16 @@ udp_disconnect(queue_t *q, mblk_t *mp) else mp1 = udp_ip_bind_mp(udp, O_T_BIND_REQ, sizeof (sin6_t)); if (mp1 == NULL) { + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack(q, mp, TSYSERR, ENOMEM); return; } mp = mi_tpi_ok_ack_alloc(mp); if (mp == NULL) { /* Unable to reuse the T_DISCON_REQ for the ack. 
*/ + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack_prim(q, mp1, T_DISCON_REQ, TSYSERR, ENOMEM); return; } @@ -2050,29 +1624,30 @@ udp_disconnect(queue_t *q, mblk_t *mp) int error; /* Rebuild the header template */ - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) { + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); udp_err_ack_prim(q, mp, T_DISCON_REQ, TSYSERR, error); freemsg(mp1); return; } } - mutex_enter(&udpf->uf_lock); - udp->udp_discon_pending = 1; - mutex_exit(&udpf->uf_lock); - /* Append the T_OK_ACK to the T_BIND_REQ for udp_rput */ + rw_exit(&udp->udp_rwlock); + /* Append the T_OK_ACK to the T_BIND_REQ for udp_bind_ack */ linkb(mp1, mp); if (udp->udp_family == AF_INET6) - mp1 = ip_bind_v6(q, mp1, udp->udp_connp, NULL); + mp1 = ip_bind_v6(q, mp1, connp, NULL); else - mp1 = ip_bind_v4(q, mp1, udp->udp_connp); + mp1 = ip_bind_v4(q, mp1, connp); + /* The above return NULL if the bind needs to be deferred */ if (mp1 != NULL) - udp_rput_other(_RD(q), mp1); + udp_bind_result(connp, mp1); else - CONN_INC_REF(udp->udp_connp); + CONN_INC_REF(connp); } /* This routine creates a T_ERROR_ACK message and passes it upstream. */ @@ -2080,7 +1655,7 @@ static void udp_err_ack(queue_t *q, mblk_t *mp, t_scalar_t t_error, int sys_error) { if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - putnext(UDP_RD(q), mp); + qreply(q, mp); } /* Shorthand to generate and send TPI error acks to our client */ @@ -2096,7 +1671,7 @@ udp_err_ack_prim(queue_t *q, mblk_t *mp, int primitive, t_scalar_t t_error, teackp->ERROR_prim = primitive; teackp->TLI_error = t_error; teackp->UNIX_error = sys_error; - putnext(UDP_RD(q), mp); + qreply(q, mp); } } @@ -2191,13 +1766,9 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, #define ICMP_MIN_UDP_HDR 4 /* - * udp_icmp_error is called by udp_rput to process ICMP msgs. passed up by IP. + * udp_icmp_error is called by udp_input to process ICMP msgs. passed up by IP. * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. * Assumes that IP has pulled up everything up to and including the ICMP header. - * An M_CTL could potentially come here from some other module (i.e. if UDP - * is pushed on some module other than IP). Thus, if we find that the M_CTL - * does not have enough ICMP information , following STREAMS conventions, - * we send it upstream assuming it is an M_CTL we don't understand. */ static void udp_icmp_error(queue_t *q, mblk_t *mp) @@ -2210,70 +1781,27 @@ udp_icmp_error(queue_t *q, mblk_t *mp) sin6_t sin6; mblk_t *mp1; int error = 0; - size_t mp_size = MBLKL(mp); udp_t *udp = Q_TO_UDP(q); - /* - * Assume IP provides aligned packets - otherwise toss - */ - if (!OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } + ipha = (ipha_t *)mp->b_rptr; - /* - * Verify that we have a complete IP header and the application has - * asked for errors. If not, send it upstream. - */ - if (!udp->udp_dgram_errind || mp_size < sizeof (ipha_t)) { -noticmpv4: - putnext(UDP_RD(q), mp); - return; - } + ASSERT(OK_32PTR(mp->b_rptr)); - ipha = (ipha_t *)mp->b_rptr; - /* - * Verify IP version. Anything other than IPv4 or IPv6 packet is sent - * upstream. ICMPv6 is handled in udp_icmp_error_ipv6. 
- */ - switch (IPH_HDR_VERSION(ipha)) { - case IPV6_VERSION: + if (IPH_HDR_VERSION(ipha) != IPV4_VERSION) { + ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION); udp_icmp_error_ipv6(q, mp); return; - case IPV4_VERSION: - break; - default: - goto noticmpv4; } + ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); /* Skip past the outer IP and ICMP headers */ iph_hdr_length = IPH_HDR_LENGTH(ipha); icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - /* - * If we don't have the correct outer IP header length or if the ULP - * is not IPPROTO_ICMP or if we don't have a complete inner IP header - * send the packet upstream. - */ - if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_ICMP || - (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { - goto noticmpv4; - } ipha = (ipha_t *)&icmph[1]; /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); udpha = (udpha_t *)((char *)ipha + iph_hdr_length); - /* - * If we don't have the correct inner IP header length or if the ULP - * is not IPPROTO_UDP or if we don't have at least ICMP_MIN_UDP_HDR - * bytes of UDP header, send it upstream. - */ - if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_UDP || - (uchar_t *)udpha + ICMP_MIN_UDP_HDR > mp->b_wptr) { - goto noticmpv4; - } switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: @@ -2281,7 +1809,6 @@ noticmpv4: case ICMP_FRAGMENTATION_NEEDED: /* * IP has already adjusted the path MTU. - * XXX Somehow pass MTU indication to application? */ break; case ICMP_PORT_UNREACHABLE: @@ -2302,6 +1829,15 @@ noticmpv4: return; } + /* + * Deliver T_UDERROR_IND when the application has asked for it. + * The socket layer enables this automatically when connected. + */ + if (!udp->udp_dgram_errind) { + freemsg(mp); + return; + } + switch (udp->udp_family) { case AF_INET: sin = sin_null; @@ -2322,7 +1858,7 @@ noticmpv4: break; } if (mp1) - putnext(UDP_RD(q), mp1); + putnext(q, mp1); freemsg(mp); } @@ -2331,67 +1867,33 @@ noticmpv4: * Generates the appropriate T_UDERROR_IND for permanent (non-transient) errors. * Assumes that IP has pulled up all the extension headers as well as the * ICMPv6 header. - * An M_CTL could potentially come here from some other module (i.e. if UDP - * is pushed on some module other than IP). Thus, if we find that the M_CTL - * does not have enough ICMP information , following STREAMS conventions, - * we send it upstream assuming it is an M_CTL we don't understand. The reason - * it might get here is if the non-ICMP M_CTL accidently has 6 in the version - * field (when cast to ipha_t in udp_icmp_error). */ static void udp_icmp_error_ipv6(queue_t *q, mblk_t *mp) { icmp6_t *icmp6; ip6_t *ip6h, *outer_ip6h; - uint16_t hdr_length; + uint16_t iph_hdr_length; uint8_t *nexthdrp; udpha_t *udpha; sin6_t sin6; mblk_t *mp1; int error = 0; - size_t mp_size = MBLKL(mp); udp_t *udp = Q_TO_UDP(q); - - /* - * Verify that we have a complete IP header. If not, send it upstream. 
- */ - if (mp_size < sizeof (ip6_t)) { -noticmpv6: - putnext(UDP_RD(q), mp); - return; - } + udp_stack_t *us = udp->udp_us; outer_ip6h = (ip6_t *)mp->b_rptr; - /* - * Verify this is an ICMPV6 packet, else send it upstream - */ - if (outer_ip6h->ip6_nxt == IPPROTO_ICMPV6) { - hdr_length = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, outer_ip6h, &hdr_length, - &nexthdrp) || - *nexthdrp != IPPROTO_ICMPV6) { - goto noticmpv6; - } - icmp6 = (icmp6_t *)&mp->b_rptr[hdr_length]; + if (outer_ip6h->ip6_nxt != IPPROTO_ICMPV6) + iph_hdr_length = ip_hdr_length_v6(mp, outer_ip6h); + else + iph_hdr_length = IPV6_HDR_LEN; + icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; ip6h = (ip6_t *)&icmp6[1]; - /* - * Verify we have a complete ICMP and inner IP header. - */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto noticmpv6; - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &hdr_length, &nexthdrp)) - goto noticmpv6; - udpha = (udpha_t *)((char *)ip6h + hdr_length); - /* - * Validate inner header. If the ULP is not IPPROTO_UDP or if we don't - * have at least ICMP_MIN_UDP_HDR bytes of UDP header send the - * packet upstream. - */ - if ((*nexthdrp != IPPROTO_UDP) || - ((uchar_t *)udpha + ICMP_MIN_UDP_HDR) > mp->b_wptr) { - goto noticmpv6; + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) { + freemsg(mp); + return; } + udpha = (udpha_t *)((char *)ip6h + iph_hdr_length); switch (icmp6->icmp6_type) { case ICMP6_DST_UNREACH: @@ -2430,7 +1932,7 @@ noticmpv6: udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t) + opt_length; if ((newmp = allocb(udi_size, BPRI_MED)) == NULL) { - BUMP_MIB(&udp->udp_mib, udpInErrors); + BUMP_MIB(&us->us_udp_mib, udpInErrors); break; } @@ -2468,7 +1970,7 @@ noticmpv6: * message. Free it, then send our empty message. */ freemsg(mp); - putnext(UDP_RD(q), newmp); + putnext(q, newmp); return; } case ICMP6_TIME_EXCEEDED: @@ -2489,6 +1991,15 @@ noticmpv6: return; } + /* + * Deliver T_UDERROR_IND when the application has asked for it. + * The socket layer enables this automatically when connected. + */ + if (!udp->udp_dgram_errind) { + freemsg(mp); + return; + } + sin6 = sin6_null; sin6.sin6_family = AF_INET6; sin6.sin6_addr = ip6h->ip6_dst; @@ -2498,7 +2009,7 @@ noticmpv6: mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); if (mp1) - putnext(UDP_RD(q), mp1); + putnext(q, mp1); freemsg(mp); } @@ -2532,6 +2043,7 @@ udp_addr_req(queue_t *q, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; + rw_enter(&udp->udp_rwlock, RW_READER); /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
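The two ICMP hunks above drop the old "not enough ICMP information, pass the M_CTL upstream" checks, since UDP is now handed aligned, pulled-up ICMP messages directly by IP, and they build a T_UDERROR_IND only after checking udp_dgram_errind. The header walk they still perform is: outer IP header, ICMP header, embedded IP header, then at least ICMP_MIN_UDP_HDR (4) bytes of the embedded UDP header to recover the ports. A hedged, userland-only sketch of that walk for the IPv4 case follows; the byte-offset parsing and the icmp_error_ports() helper are simplified stand-ins for illustration, not the kernel's ipha_t/icmph_t/udpha_t handling.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	ICMP_DEST_UNREACHABLE	3
#define	ICMP_PORT_UNREACHABLE	3
#define	ICMP_MIN_UDP_HDR	4	/* src + dst port of the embedded UDP header */

/*
 * Walk outer IPv4 + ICMP + embedded IPv4 and return the embedded UDP
 * source/destination ports of a port-unreachable error.  Returns 0 on
 * success, -1 if the message is not a port unreachable or is truncated.
 */
static int
icmp_error_ports(const uint8_t *pkt, size_t len, uint16_t *sport,
    uint16_t *dport)
{
	size_t outer_hl, inner_hl, off;
	const uint8_t *icmp, *inner, *udph;

	if (len < 20)
		return (-1);
	outer_hl = (size_t)(pkt[0] & 0x0f) * 4;		/* outer IHL in bytes */
	if (outer_hl + 8 > len)				/* ICMP header is 8 bytes */
		return (-1);
	icmp = pkt + outer_hl;
	if (icmp[0] != ICMP_DEST_UNREACHABLE || icmp[1] != ICMP_PORT_UNREACHABLE)
		return (-1);
	inner = icmp + 8;
	off = outer_hl + 8;
	if (off + 20 > len)
		return (-1);
	inner_hl = (size_t)(inner[0] & 0x0f) * 4;	/* embedded IHL in bytes */
	if (inner[9] != 17)				/* embedded proto must be UDP */
		return (-1);
	udph = inner + inner_hl;
	if (off + inner_hl + ICMP_MIN_UDP_HDR > len)
		return (-1);
	*sport = (uint16_t)((udph[0] << 8) | udph[1]);
	*dport = (uint16_t)((udph[2] << 8) | udph[3]);
	return (0);
}

int
main(void)
{
	uint8_t pkt[52];
	uint16_t sport, dport;

	(void) memset(pkt, 0, sizeof (pkt));
	pkt[0] = 0x45;				/* outer IPv4, IHL = 5 */
	pkt[9] = 1;				/* outer protocol = ICMP */
	pkt[20] = ICMP_DEST_UNREACHABLE;
	pkt[21] = ICMP_PORT_UNREACHABLE;
	pkt[28] = 0x45;				/* embedded IPv4, IHL = 5 */
	pkt[37] = 17;				/* embedded protocol = UDP */
	pkt[48] = 0x30;				/* embedded src port 12345 */
	pkt[49] = 0x39;
	pkt[50] = 0x00;				/* embedded dst port 53 */
	pkt[51] = 0x35;

	if (icmp_error_ports(pkt, sizeof (pkt), &sport, &dport) == 0)
		(void) printf("port unreachable for %u -> %u\n", sport, dport);
	return (0);
}

In the kernel the same walk ends in mi_tpi_uderror_ind() only when udp_dgram_errind is set, a check the new code performs after the ICMP message has been parsed rather than up front.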
@@ -2625,8 +2137,9 @@ udp_addr_req(queue_t *q, mblk_t *mp) ackmp->b_wptr = (uchar_t *)&sin6[1]; } } + rw_exit(&udp->udp_rwlock); ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); - putnext(UDP_RD(q), ackmp); + qreply(q, ackmp); } static void @@ -2669,7 +2182,7 @@ udp_capability_req(queue_t *q, mblk_t *mp) tcap->CAP_bits1 |= TC1_INFO; } - putnext(UDP_RD(q), mp); + qreply(q, mp); } /* @@ -2688,7 +2201,7 @@ udp_info_req(queue_t *q, mblk_t *mp) if (!mp) return; udp_copy_info((struct T_info_ack *)mp->b_rptr, udp); - putnext(UDP_RD(q), mp); + qreply(q, mp); } /* @@ -2738,7 +2251,7 @@ udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, t_scalar_t addr_length) sin6_t *sin6; ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ); - + ASSERT(RW_LOCK_HELD(&udp->udp_rwlock)); mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI); if (!mp) return (mp); @@ -2830,18 +2343,33 @@ udp_ip_bind_mp(udp_t *udp, t_scalar_t bind_prim, t_scalar_t addr_length) return (mp); } +/* For /dev/udp aka AF_INET open */ +static int +udp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + return (udp_open(q, devp, flag, sflag, credp, B_FALSE)); +} + +/* For /dev/udp6 aka AF_INET6 open */ +static int +udp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +{ + return (udp_open(q, devp, flag, sflag, credp, B_TRUE)); +} + /* * This is the open routine for udp. It allocates a udp_t structure for * the stream and, on the first open of the module, creates an ND table. */ -/* ARGSUSED */ +/*ARGSUSED2*/ static int -udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) +udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, + boolean_t isv6) { int err; udp_t *udp; conn_t *connp; - queue_t *ip_wq; + dev_t conn_dev; zoneid_t zoneid; netstack_t *ns; udp_stack_t *us; @@ -2852,8 +2380,7 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (q->q_ptr != NULL) return (0); - /* If this is not a push of udp as a module, fail. */ - if (sflag != MODOPEN) + if (sflag == MODOPEN) return (EINVAL); ns = netstack_find_by_cred(credp); @@ -2865,63 +2392,43 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) * For exclusive stacks we set the zoneid to zero * to make UDP operate as if in the global zone. */ - if (us->us_netstack->netstack_stackid != GLOBAL_NETSTACKID) + if (ns->netstack_stackid != GLOBAL_NETSTACKID) zoneid = GLOBAL_ZONEID; else zoneid = crgetzoneid(credp); - q->q_hiwat = us->us_recv_hiwat; - WR(q)->q_hiwat = us->us_xmit_hiwat; - WR(q)->q_lowat = us->us_xmit_lowat; - - /* Insert ourselves in the stream since we're about to walk q_next */ - qprocson(q); + if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { + netstack_rele(ns); + return (EBUSY); + } + *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); - udp = kmem_cache_alloc(udp_cache, KM_SLEEP); - bzero(udp, sizeof (*udp)); + connp = ipcl_conn_create(IPCL_UDPCONN, KM_SLEEP, ns); + connp->conn_dev = conn_dev; + udp = connp->conn_udp; /* - * UDP is supported only as a module and it has to be pushed directly - * above the device instance of IP. If UDP is pushed anywhere else - * on a stream, it will support just T_SVR4_OPTMGMT_REQ for the - * sake of MIB browsers and fail everything else. + * ipcl_conn_create did a netstack_hold. 
Undo the hold that was + * done by netstack_find_by_cred() */ - ip_wq = WR(q)->q_next; - if (NOT_OVER_IP(ip_wq)) { - /* Support just SNMP for MIB browsers */ - connp = ipcl_conn_create(IPCL_IPCCONN, KM_SLEEP, - us->us_netstack); - connp->conn_rq = q; - connp->conn_wq = WR(q); - connp->conn_flags |= IPCL_UDPMOD; - connp->conn_cred = credp; - connp->conn_zoneid = zoneid; - connp->conn_udp = udp; - udp->udp_us = us; - udp->udp_connp = connp; - q->q_ptr = WR(q)->q_ptr = connp; - crhold(credp); - q->q_qinfo = &udp_snmp_rinit; - WR(q)->q_qinfo = &udp_snmp_winit; - return (0); - } + netstack_rele(ns); /* * Initialize the udp_t structure for this stream. */ - q = RD(ip_wq); - connp = Q_TO_CONN(q); - mutex_enter(&connp->conn_lock); - connp->conn_proto = IPPROTO_UDP; - connp->conn_flags |= IPCL_UDP; - connp->conn_sqp = IP_SQUEUE_GET(lbolt); - connp->conn_udp = udp; + q->q_ptr = connp; + WR(q)->q_ptr = connp; + connp->conn_rq = q; + connp->conn_wq = WR(q); + + rw_enter(&udp->udp_rwlock, RW_WRITER); + ASSERT(connp->conn_ulp == IPPROTO_UDP); + ASSERT(connp->conn_udp == udp); + ASSERT(udp->udp_connp == connp); /* Set the initial state of the stream and the privilege status. */ - udp->udp_connp = connp; udp->udp_state = TS_UNBND; - udp->udp_mode = UDP_MT_HOT; - if (getmajor(*devp) == (major_t)UDP6_MAJ) { + if (isv6) { udp->udp_family = AF_INET6; udp->udp_ipversion = IPV6_VERSION; udp->udp_max_hdr_len = IPV6_HDR_LEN + UDPH_SIZE; @@ -2938,6 +2445,7 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) } udp->udp_multicast_ttl = IP_DEFAULT_MULTICAST_TTL; + udp->udp_pending_op = -1; connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; connp->conn_zoneid = zoneid; @@ -2951,41 +2459,45 @@ udp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) if (getpflags(NET_MAC_AWARE, credp) != 0) udp->udp_mac_exempt = B_TRUE; - if (connp->conn_flags & IPCL_SOCKET) { + if (flag & SO_SOCKSTR) { + connp->conn_flags |= IPCL_SOCKET; udp->udp_issocket = B_TRUE; udp->udp_direct_sockfs = B_TRUE; } connp->conn_ulp_labeled = is_system_labeled(); - mutex_exit(&connp->conn_lock); udp->udp_us = us; - /* - * The transmit hiwat/lowat is only looked at on IP's queue. - * Store in q_hiwat in order to return on SO_SNDBUF/SO_RCVBUF - * getsockopts. - */ q->q_hiwat = us->us_recv_hiwat; WR(q)->q_hiwat = us->us_xmit_hiwat; WR(q)->q_lowat = us->us_xmit_lowat; + connp->conn_recv = udp_input; + crhold(credp); + connp->conn_cred = credp; + + mutex_enter(&connp->conn_lock); + connp->conn_state_flags &= ~CONN_INCIPIENT; + mutex_exit(&connp->conn_lock); + + qprocson(q); + if (udp->udp_family == AF_INET6) { /* Build initial header template for transmit */ - if ((err = udp_build_hdrs(q, udp)) != 0) { - /* XXX missing free of connp? crfree? netstack_rele? */ - qprocsoff(UDP_RD(q)); - udp->udp_connp = NULL; - connp->conn_udp = NULL; - kmem_cache_free(udp_cache, udp); + if ((err = udp_build_hdrs(udp)) != 0) { + rw_exit(&udp->udp_rwlock); + qprocsoff(q); + ipcl_conn_destroy(connp); return (err); } } + rw_exit(&udp->udp_rwlock); /* Set the Stream head write offset and high watermark. 
*/ - (void) mi_set_sth_wroff(UDP_RD(q), + (void) mi_set_sth_wroff(q, udp->udp_max_hdr_len + us->us_wroff_extra); - (void) mi_set_sth_hiwat(UDP_RD(q), udp_set_rcv_hiwat(udp, q->q_hiwat)); + (void) mi_set_sth_hiwat(q, udp_set_rcv_hiwat(udp, q->q_hiwat)); return (0); } @@ -3006,7 +2518,7 @@ udp_opt_allow_udr_set(t_scalar_t level, t_scalar_t name) */ /* ARGSUSED */ int -udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { udp_t *udp = Q_TO_UDP(q); udp_stack_t *us = udp->udp_us; @@ -3041,12 +2553,11 @@ udp_opt_default(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) } /* - * This routine retrieves the current status of socket options - * and expects the caller to pass in the queue pointer of the - * upper instance. It returns the size of the option retrieved. + * This routine retrieves the current status of socket options. + * It returns the size of the option retrieved. */ int -udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +udp_opt_get_locked(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) { int *i1 = (int *)ptr; conn_t *connp; @@ -3055,7 +2566,6 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) int len; udp_stack_t *us; - q = UDP_WR(q); connp = Q_TO_CONN(q); udp = connp->conn_udp; ipp = &udp->udp_sticky_ipp; @@ -3368,13 +2878,26 @@ udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) return (sizeof (int)); } +int +udp_opt_get(queue_t *q, t_scalar_t level, t_scalar_t name, uchar_t *ptr) +{ + udp_t *udp; + int err; + + udp = Q_TO_UDP(q); + + rw_enter(&udp->udp_rwlock, RW_READER); + err = udp_opt_get_locked(q, level, name, ptr); + rw_exit(&udp->udp_rwlock); + return (err); +} + /* - * This routine sets socket options; it expects the caller - * to pass in the queue pointer of the upper instance. + * This routine sets socket options. */ /* ARGSUSED */ int -udp_opt_set(queue_t *q, uint_t optset_context, int level, +udp_opt_set_locked(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) { @@ -3387,8 +2910,8 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, udp_t *udp; uint_t newlen; udp_stack_t *us; + size_t sth_wroff; - q = UDP_WR(q); connp = Q_TO_CONN(q); udp = connp->conn_udp; us = udp->udp_us; @@ -3479,7 +3002,6 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { q->q_hiwat = *i1; - WR(UDP_RD(q))->q_hiwat = *i1; } break; case SO_RCVBUF: @@ -3489,9 +3011,10 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, } if (!checkonly) { RD(q)->q_hiwat = *i1; - UDP_RD(q)->q_hiwat = *i1; - (void) mi_set_sth_hiwat(UDP_RD(q), + rw_exit(&udp->udp_rwlock); + (void) mi_set_sth_hiwat(RD(q), udp_set_rcv_hiwat(udp, *i1)); + rw_enter(&udp->udp_rwlock, RW_WRITER); } break; case SO_DGRAM_ERRIND: @@ -3588,6 +3111,10 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, if (checkonly) break; + /* + * Update the stored options taking into account + * any CIPSO option which we should not overwrite. 
+ */ if (!tsol_option_set(&udp->udp_ip_snd_options, &udp->udp_ip_snd_options_len, udp->udp_label_len, invalp, inlen)) { @@ -3597,8 +3124,10 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, udp->udp_max_hdr_len = IP_SIMPLE_HDR_LENGTH + UDPH_SIZE + udp->udp_ip_snd_options_len; - (void) mi_set_sth_wroff(RD(q), udp->udp_max_hdr_len + - us->us_wroff_extra); + sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + rw_exit(&udp->udp_rwlock); + (void) mi_set_sth_wroff(RD(q), sth_wroff); + rw_enter(&udp->udp_rwlock, RW_WRITER); break; case IP_TTL: @@ -3784,7 +3313,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_UNICAST_HOPS; } /* Rebuild the header template */ - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) { *outlenp = 0; return (error); @@ -3921,7 +3450,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields &= ~IPPF_ADDR; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -3967,7 +3496,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_TCLASS; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4001,7 +3530,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields &= ~IPPF_NEXTHOP; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4032,7 +3561,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_HOPOPTS; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4072,7 +3601,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_RTDSTOPTS; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4111,7 +3640,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_DSTOPTS; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4150,7 +3679,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, ipp->ipp_fields |= IPPF_RTHDR; } if (sticky) { - error = udp_build_hdrs(q, udp); + error = udp_build_hdrs(udp); if (error != 0) return (error); } @@ -4265,6 +3794,23 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, return (0); } +int +udp_opt_set(queue_t *q, uint_t optset_context, int level, + int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, + uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, mblk_t *mblk) +{ + udp_t *udp; + int err; + + udp = Q_TO_UDP(q); + + rw_enter(&udp->udp_rwlock, RW_WRITER); + err = udp_opt_set_locked(q, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr, mblk); + rw_exit(&udp->udp_rwlock); + return (err); +} + /* * Update udp_sticky_hdrs based on udp_sticky_ipp, udp_v6src, and udp_ttl. * The headers include ip6i_t (if needed), ip6_t, any sticky extension @@ -4272,7 +3818,7 @@ udp_opt_set(queue_t *q, uint_t optset_context, int level, * Returns failure if can't allocate memory. 
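Both option entry points now follow the same split: a thin exported wrapper takes udp_rwlock and calls a *_locked worker, and the worker briefly drops the lock around calls such as mi_set_sth_wroff()/mi_set_sth_hiwat() that reach back into the stream head. A minimal user-land model of that shape, with pthread_rwlock_t standing in for udp_rwlock and invented names throughout:

#include <pthread.h>

/* Only the pieces the sketch needs. */
struct endpoint {
	pthread_rwlock_t	ep_rwlock;	/* models udp_rwlock */
	int			ep_rcvbuf;	/* models RD(q)->q_hiwat */
};

/* Stands in for mi_set_sth_hiwat(); must not be called with the lock held. */
static void
set_stream_hiwat(struct endpoint *ep, int hiwat)
{
	(void) ep;
	(void) hiwat;
}

/*
 * Worker: the caller holds ep_rwlock as writer, except where it is
 * dropped around the call that reaches back into the stream head.
 */
static int
opt_set_locked(struct endpoint *ep, int name, int val)
{
	if (name == 1) {			/* models SO_RCVBUF */
		ep->ep_rcvbuf = val;
		(void) pthread_rwlock_unlock(&ep->ep_rwlock);
		set_stream_hiwat(ep, val);
		(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	}
	return (0);
}

/* Exported entry point: take the lock, run the worker, drop the lock. */
int
opt_set(struct endpoint *ep, int name, int val)
{
	int err;

	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	err = opt_set_locked(ep, name, val);
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
	return (err);
}

The same wrapper shape is used for udp_opt_get()/udp_opt_get_locked(), only with the reader lock instead of the writer lock.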
*/ static int -udp_build_hdrs(queue_t *q, udp_t *udp) +udp_build_hdrs(udp_t *udp) { udp_stack_t *us = udp->udp_us; uchar_t *hdrs; @@ -4281,7 +3827,9 @@ udp_build_hdrs(queue_t *q, udp_t *udp) ip6i_t *ip6i; udpha_t *udpha; ip6_pkt_t *ipp = &udp->udp_sticky_ipp; + size_t sth_wroff; + ASSERT(RW_WRITE_HELD(&udp->udp_rwlock)); hdrs_len = ip_total_hdrs_len_v6(ipp) + UDPH_SIZE; ASSERT(hdrs_len != 0); if (hdrs_len != udp->udp_sticky_hdrs_len) { @@ -4317,8 +3865,10 @@ udp_build_hdrs(queue_t *q, udp_t *udp) /* Try to get everything in a single mblk */ if (hdrs_len > udp->udp_max_hdr_len) { udp->udp_max_hdr_len = hdrs_len; - (void) mi_set_sth_wroff(RD(q), udp->udp_max_hdr_len + - us->us_wroff_extra); + sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; + rw_exit(&udp->udp_rwlock); + (void) mi_set_sth_wroff(udp->udp_connp->conn_rq, sth_wroff); + rw_enter(&udp->udp_rwlock, RW_WRITER); } return (0); } @@ -4556,12 +4106,48 @@ copy_hop_opts(const ip6_pkt_t *ipp, uchar_t *dbuf) return (tlen); } +/* + * Update udp_rcv_opt_len from the packet. + * Called when options received, and when no options received but + * udp_ip_recv_opt_len has previously recorded options. + */ +static void +udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len) +{ + /* Save the options if any */ + if (opt_len > 0) { + if (opt_len > udp->udp_ip_rcv_options_len) { + /* Need to allocate larger buffer */ + if (udp->udp_ip_rcv_options_len != 0) + mi_free((char *)udp->udp_ip_rcv_options); + udp->udp_ip_rcv_options_len = 0; + udp->udp_ip_rcv_options = + (uchar_t *)mi_alloc(opt_len, BPRI_HI); + if (udp->udp_ip_rcv_options != NULL) + udp->udp_ip_rcv_options_len = opt_len; + } + if (udp->udp_ip_rcv_options_len != 0) { + bcopy(opt, udp->udp_ip_rcv_options, opt_len); + /* Adjust length if we are resusing the space */ + udp->udp_ip_rcv_options_len = opt_len; + } + } else if (udp->udp_ip_rcv_options_len != 0) { + /* Clear out previously recorded options */ + mi_free((char *)udp->udp_ip_rcv_options); + udp->udp_ip_rcv_options = NULL; + udp->udp_ip_rcv_options_len = 0; + } +} + +/* ARGSUSED2 */ static void -udp_input(conn_t *connp, mblk_t *mp) +udp_input(void *arg1, mblk_t *mp, void *arg2) { + conn_t *connp = (conn_t *)arg1; struct T_unitdata_ind *tudi; uchar_t *rptr; /* Pointer to IP header */ int hdr_length; /* Length of IP+UDP headers */ + int opt_len; int udi_size; /* Size of T_unitdata_ind */ int mp_len; udp_t *udp; @@ -4574,13 +4160,13 @@ udp_input(conn_t *connp, mblk_t *mp) mblk_t *options_mp = NULL; ip_pktinfo_t *pinfo = NULL; cred_t *cr = NULL; - queue_t *q = connp->conn_rq; pid_t cpid; + uint32_t udp_ip_rcv_options_len; + udp_bits_t udp_bits; cred_t *rcr = connp->conn_cred; udp_stack_t *us; - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START, - "udp_rput_start: q %p mp %p", q, mp); + ASSERT(connp->conn_flags & IPCL_UDPCONN); udp = connp->conn_udp; us = udp->udp_us; @@ -4599,7 +4185,7 @@ udp_input(conn_t *connp, mblk_t *mp) IN_PKTINFO) { /* * IP_RECVIF or IP_RECVSLLA or IPF_RECVADDR information - * has been appended to the packet by IP. We need to + * has been prepended to the packet by IP. We need to * extract the mblk and adjust the rptr */ pinfo = (ip_pktinfo_t *)mp->b_rptr; @@ -4611,9 +4197,7 @@ udp_input(conn_t *connp, mblk_t *mp) /* * ICMP messages. */ - udp_icmp_error(q, mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "m_ctl"); + udp_icmp_error(connp->conn_rq, mp); return; } } @@ -4623,53 +4207,37 @@ udp_input(conn_t *connp, mblk_t *mp) * This is the inbound data path. 
* First, we check to make sure the IP version number is correct, * and then pull the IP and UDP headers into the first mblk. - * Assume IP provides aligned packets - otherwise toss. - * Also, check if we have a complete IP header. */ /* Initialize regardless if ipversion is IPv4 or IPv6 */ ipp.ipp_fields = 0; ipversion = IPH_HDR_VERSION(rptr); + + rw_enter(&udp->udp_rwlock, RW_READER); + udp_ip_rcv_options_len = udp->udp_ip_rcv_options_len; + udp_bits = udp->udp_bits; + rw_exit(&udp->udp_rwlock); + switch (ipversion) { case IPV4_VERSION: ASSERT(MBLKL(mp) >= sizeof (ipha_t)); ASSERT(((ipha_t *)rptr)->ipha_protocol == IPPROTO_UDP); hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE; - if ((hdr_length > IP_SIMPLE_HDR_LENGTH + UDPH_SIZE) || - (udp->udp_ip_rcv_options_len)) { + opt_len = hdr_length - (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE); + if ((opt_len > 0 || udp_ip_rcv_options_len > 0) && + udp->udp_family == AF_INET) { /* - * Handle IPv4 packets with options outside of the - * main data path. Not needed for AF_INET6 sockets + * Record/update udp_ip_rcv_options with the lock + * held. Not needed for AF_INET6 sockets * since they don't support a getsockopt of IP_OPTIONS. */ - if (udp->udp_family == AF_INET6) - break; - /* - * UDP length check performed for IPv4 packets with - * options to check whether UDP length specified in - * the header is the same as the physical length of - * the packet. - */ - udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); - if (mp_len != (ntohs(udpha->uha_length) + - hdr_length - UDPH_SIZE)) { - goto tossit; - } - /* - * Handle the case where the packet has IP options - * and the IP_RECVSLLA & IP_RECVIF are set - */ - if (pinfo != NULL) - mp = options_mp; - udp_become_writer(connp, mp, udp_rput_other_wrapper, - SQTAG_UDP_INPUT); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "end"); - return; + rw_enter(&udp->udp_rwlock, RW_WRITER); + udp_save_ip_rcv_opt(udp, rptr + IP_SIMPLE_HDR_LENGTH, + opt_len); + rw_exit(&udp->udp_rwlock); } - - /* Handle IPV6_RECVHOPLIMIT. */ + /* Handle IPV6_RECVPKTINFO even for IPv4 packet. */ if ((udp->udp_family == AF_INET6) && (pinfo != NULL) && udp->udp_ip_recvpktinfo) { if (pinfo->ip_pkt_flags & IPF_RECVIF) { @@ -4735,8 +4303,9 @@ udp_input(conn_t *connp, mblk_t *mp) /* * IP inspected the UDP header thus all of it must be in the mblk. * UDP length check is performed for IPv6 packets and IPv4 packets - * without options to check if the size of the packet as specified + * to check if the size of the packet as specified * by the header is the same as the physical size of the packet. + * FIXME? Didn't IP already check this? */ udpha = (udpha_t *)(rptr + (hdr_length - UDPH_SIZE)); if ((MBLKL(mp) < hdr_length) || @@ -4744,8 +4313,9 @@ udp_input(conn_t *connp, mblk_t *mp) goto tossit; } - /* Walk past the headers. */ - if (!udp->udp_rcvhdr) { + + /* Walk past the headers unless IP_RECVHDR was set. */ + if (!udp_bits.udpb_rcvhdr) { mp->b_rptr = rptr + hdr_length; mp_len -= hdr_length; } @@ -4760,56 +4330,62 @@ udp_input(conn_t *connp, mblk_t *mp) ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION); /* - * Normally only send up the address. + * Normally only send up the source address. * If IP_RECVDSTADDR is set we include the destination IP * address as an option. With IP_RECVOPTS we include all - * the IP options. Only ip_rput_other() handles packets - * that contain IP options. + * the IP options. 
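The receive path now touches udp_rwlock only twice: a short reader section to snapshot udp_ip_rcv_options_len and the udp_bits flag word, and, for IPv4 packets that carry options (or when stale recorded options must be cleared), a writer section around udp_save_ip_rcv_opt(). Everything else works from the snapshot. Roughly, in user-land terms with invented names and pthreads in place of the kernel rwlock:

#include <pthread.h>
#include <stddef.h>
#include <stdint.h>

struct recv_flags {			/* models the udp_bits word */
	unsigned int	rf_recvttl : 1;
	unsigned int	rf_recvdstaddr : 1;
};

struct endpoint {
	pthread_rwlock_t	ep_rwlock;	 /* models udp_rwlock */
	struct recv_flags	ep_flags;
	size_t			ep_rcv_opts_len; /* models udp_ip_rcv_options_len */
};

/* Writer-side helper; the caller holds ep_rwlock as writer. */
static void
save_rcv_opts(struct endpoint *ep, const uint8_t *opt, size_t len)
{
	(void) opt;
	ep->ep_rcv_opts_len = len;	/* grow/free handling omitted here */
}

size_t
input_packet(struct endpoint *ep, const uint8_t *opts, size_t opt_len)
{
	struct recv_flags flags;
	size_t old_len, udi_size = 0;

	/* Snapshot per-endpoint state once; the lock is not held later. */
	(void) pthread_rwlock_rdlock(&ep->ep_rwlock);
	flags = ep->ep_flags;
	old_len = ep->ep_rcv_opts_len;
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);

	if (opt_len > 0 || old_len > 0) {
		/* Only the options-recording path takes the writer lock. */
		(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
		save_rcv_opts(ep, opts, opt_len);
		(void) pthread_rwlock_unlock(&ep->ep_rwlock);
	}

	/* Size the ancillary data purely from the snapshot. */
	if (flags.rf_recvttl)
		udi_size += sizeof (uint8_t);
	if (flags.rf_recvdstaddr)
		udi_size += sizeof (uint32_t);
	return (udi_size);
}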
*/ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (udp->udp_recvdstaddr) { + if (udp_bits.udpb_recvdstaddr) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr); UDP_STAT(us, udp_in_recvdstaddr); } - if (udp->udp_ip_recvpktinfo && (pinfo != NULL) && + if (udp_bits.udpb_ip_recvpktinfo && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVADDR)) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in_pktinfo); - UDP_STAT(us, udp_ip_recvpktinfo); + UDP_STAT(us, udp_ip_rcvpktinfo); + } + + if ((udp_bits.udpb_recvopts) && opt_len > 0) { + udi_size += sizeof (struct T_opthdr) + opt_len; + UDP_STAT(us, udp_in_recvopts); } /* * If the IP_RECVSLLA or the IP_RECVIF is set then allocate * space accordingly */ - if (udp->udp_recvif && (pinfo != NULL) && + if ((udp_bits.udpb_recvif) && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVIF)) { udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); UDP_STAT(us, udp_in_recvif); } - if (udp->udp_recvslla && (pinfo != NULL) && + if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { udi_size += sizeof (struct T_opthdr) + sizeof (struct sockaddr_dl); UDP_STAT(us, udp_in_recvslla); } - if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { + if ((udp_bits.udpb_recvucred) && + (cr = DB_CRED(mp)) != NULL) { udi_size += sizeof (struct T_opthdr) + ucredsize; cpid = DB_CPID(mp); UDP_STAT(us, udp_in_recvucred); } + /* XXX FIXME: apply to AF_INET6 as well */ /* * If SO_TIMESTAMP is set allocate the appropriate sized * buffer. Since gethrestime() expects a pointer aligned * argument, we allocate space necessary for extra * alignment (even though it might not be used). */ - if (udp->udp_timestamp) { + if (udp_bits.udpb_timestamp) { udi_size += sizeof (struct T_opthdr) + sizeof (timestruc_t) + _POINTER_ALIGNMENT; UDP_STAT(us, udp_in_timestamp); @@ -4818,11 +4394,10 @@ udp_input(conn_t *connp, mblk_t *mp) /* * If IP_RECVTTL is set allocate the appropriate sized buffer */ - if (udp->udp_recvttl) { + if (udp_bits.udpb_recvttl) { udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); UDP_STAT(us, udp_in_recvttl); } - ASSERT(IPH_HDR_LENGTH((ipha_t *)rptr) == IP_SIMPLE_HDR_LENGTH); /* Allocate a message block for the T_UNITDATA_IND structure. 
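The T_UNITDATA_IND sizing above and the fill code that follows are two passes over the same list: each enabled item adds sizeof (struct T_opthdr) plus its payload to udi_size before the allocb(), and the fill pass subtracts the same amount per option, so udi_size should come back to exactly zero once every option is appended. A compact stand-alone model of that accounting (the opthdr struct below is a stand-in, not the real <sys/tihdr.h> layout):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct opthdr_sketch {		/* stand-in for struct T_opthdr */
	uint32_t	len;	/* header plus payload */
	uint32_t	level;
	uint32_t	name;
	uint32_t	status;
};

static size_t
append_opt(uint8_t *dst, uint32_t level, uint32_t name,
    const void *val, size_t vlen)
{
	struct opthdr_sketch oh;

	oh.len = (uint32_t)(sizeof (oh) + vlen);
	oh.level = level;
	oh.name = name;
	oh.status = 0;
	(void) memcpy(dst, &oh, sizeof (oh));
	(void) memcpy(dst + sizeof (oh), val, vlen);
	return (sizeof (oh) + vlen);
}

int
main(void)
{
	uint8_t ttl = 64;
	uint32_t ifindex = 2;
	size_t udi_size = 0, n;
	uint8_t *buf, *p;

	/* Pass 1: reserve a header + payload for every enabled item. */
	udi_size += sizeof (struct opthdr_sketch) + sizeof (ttl);
	udi_size += sizeof (struct opthdr_sketch) + sizeof (ifindex);

	if ((buf = p = malloc(udi_size)) == NULL)
		return (1);

	/* Pass 2: append the same items, subtracting as we go. */
	n = append_opt(p, 0, 1, &ttl, sizeof (ttl));
	p += n;
	udi_size -= n;
	n = append_opt(p, 0, 2, &ifindex, sizeof (ifindex));
	p += n;
	udi_size -= n;

	assert(udi_size == 0);	/* every reserved byte was consumed */
	free(buf);
	return (0);
}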
*/ mp1 = allocb(udi_size, BPRI_MED); @@ -4830,9 +4405,7 @@ udp_input(conn_t *connp, mblk_t *mp) freemsg(mp); if (options_mp != NULL) freeb(options_mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "allocbfail"); - BUMP_MIB(&udp->udp_mib, udpInErrors); + BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; @@ -4866,7 +4439,7 @@ udp_input(conn_t *connp, mblk_t *mp) char *dstopt; dstopt = (char *)&sin[1]; - if (udp->udp_recvdstaddr) { + if (udp_bits.udpb_recvdstaddr) { struct T_opthdr *toh; ipaddr_t *dstptr; @@ -4879,11 +4452,26 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (struct T_opthdr); dstptr = (ipaddr_t *)dstopt; *dstptr = ((ipha_t *)rptr)->ipha_dst; - dstopt = (char *)toh + toh->len; + dstopt += sizeof (ipaddr_t); + udi_size -= toh->len; + } + + if (udp_bits.udpb_recvopts && opt_len > 0) { + struct T_opthdr *toh; + + toh = (struct T_opthdr *)dstopt; + toh->level = IPPROTO_IP; + toh->name = IP_RECVOPTS; + toh->len = sizeof (struct T_opthdr) + opt_len; + toh->status = 0; + dstopt += sizeof (struct T_opthdr); + bcopy(rptr + IP_SIMPLE_HDR_LENGTH, dstopt, + opt_len); + dstopt += opt_len; udi_size -= toh->len; } - if (udp->udp_ip_recvpktinfo && (pinfo != NULL) && + if ((udp_bits.udpb_ip_recvpktinfo) && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVADDR)) { struct T_opthdr *toh; struct in_pktinfo *pktinfop; @@ -4906,7 +4494,7 @@ udp_input(conn_t *connp, mblk_t *mp) udi_size -= toh->len; } - if (udp->udp_recvslla && (pinfo != NULL) && + if ((udp_bits.udpb_recvslla) && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { struct T_opthdr *toh; @@ -4922,11 +4510,11 @@ udp_input(conn_t *connp, mblk_t *mp) dstptr = (struct sockaddr_dl *)dstopt; bcopy(&pinfo->ip_pkt_slla, dstptr, sizeof (struct sockaddr_dl)); - dstopt = (char *)toh + toh->len; + dstopt += sizeof (struct sockaddr_dl); udi_size -= toh->len; } - if (udp->udp_recvif && (pinfo != NULL) && + if ((udp_bits.udpb_recvif) && (pinfo != NULL) && (pinfo->ip_pkt_flags & IPF_RECVIF)) { struct T_opthdr *toh; @@ -4941,7 +4529,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (struct T_opthdr); dstptr = (uint_t *)dstopt; *dstptr = pinfo->ip_pkt_ifindex; - dstopt = (char *)toh + toh->len; + dstopt += sizeof (uint_t); udi_size -= toh->len; } @@ -4953,12 +4541,13 @@ udp_input(conn_t *connp, mblk_t *mp) toh->name = SCM_UCRED; toh->len = sizeof (struct T_opthdr) + ucredsize; toh->status = 0; - (void) cred2ucred(cr, cpid, &toh[1], rcr); - dstopt = (char *)toh + toh->len; + dstopt += sizeof (struct T_opthdr); + (void) cred2ucred(cr, cpid, dstopt, rcr); + dstopt += ucredsize; udi_size -= toh->len; } - if (udp->udp_timestamp) { + if (udp_bits.udpb_timestamp) { struct T_opthdr *toh; toh = (struct T_opthdr *)dstopt; @@ -4984,7 +4573,7 @@ udp_input(conn_t *connp, mblk_t *mp) * any option processing after this will * cause alignment panic. */ - if (udp->udp_recvttl) { + if (udp_bits.udpb_recvttl) { struct T_opthdr *toh; uint8_t *dstptr; @@ -4997,7 +4586,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (struct T_opthdr); dstptr = (uint8_t *)dstopt; *dstptr = ((ipha_t *)rptr)->ipha_ttl; - dstopt = (char *)toh + toh->len; + dstopt += sizeof (uint8_t); udi_size -= toh->len; } @@ -5013,15 +4602,12 @@ udp_input(conn_t *connp, mblk_t *mp) * Normally we only send up the address. If receiving of any * optional receive side information is enabled, we also send * that up as options. 
- * [ Only udp_rput_other() handles packets that contain IP - * options so code to account for does not appear immediately - * below but elsewhere ] */ udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin6_t); if (ipp.ipp_fields & (IPPF_HOPOPTS|IPPF_DSTOPTS|IPPF_RTDSTOPTS| IPPF_RTHDR|IPPF_IFINDEX)) { - if (udp->udp_ipv6_recvhopopts && + if ((udp_bits.udpb_ipv6_recvhopopts) && (ipp.ipp_fields & IPPF_HOPOPTS)) { size_t hlen; @@ -5031,29 +4617,29 @@ udp_input(conn_t *connp, mblk_t *mp) ipp.ipp_fields &= ~IPPF_HOPOPTS; udi_size += hlen; } - if ((udp->udp_ipv6_recvdstopts || - udp->udp_old_ipv6_recvdstopts) && + if (((udp_bits.udpb_ipv6_recvdstopts) || + udp_bits.udpb_old_ipv6_recvdstopts) && (ipp.ipp_fields & IPPF_DSTOPTS)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_dstoptslen; UDP_STAT(us, udp_in_recvdstopts); } - if (((udp->udp_ipv6_recvdstopts && - udp->udp_ipv6_recvrthdr && + if ((((udp_bits.udpb_ipv6_recvdstopts) && + udp_bits.udpb_ipv6_recvrthdr && (ipp.ipp_fields & IPPF_RTHDR)) || - udp->udp_ipv6_recvrthdrdstopts) && + (udp_bits.udpb_ipv6_recvrthdrdstopts)) && (ipp.ipp_fields & IPPF_RTDSTOPTS)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_rtdstoptslen; UDP_STAT(us, udp_in_recvrtdstopts); } - if (udp->udp_ipv6_recvrthdr && + if ((udp_bits.udpb_ipv6_recvrthdr) && (ipp.ipp_fields & IPPF_RTHDR)) { udi_size += sizeof (struct T_opthdr) + ipp.ipp_rthdrlen; UDP_STAT(us, udp_in_recvrthdr); } - if (udp->udp_ip_recvpktinfo && + if ((udp_bits.udpb_ip_recvpktinfo) && (ipp.ipp_fields & IPPF_IFINDEX)) { udi_size += sizeof (struct T_opthdr) + sizeof (struct in6_pktinfo); @@ -5061,18 +4647,19 @@ udp_input(conn_t *connp, mblk_t *mp) } } - if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { + if ((udp_bits.udpb_recvucred) && + (cr = DB_CRED(mp)) != NULL) { udi_size += sizeof (struct T_opthdr) + ucredsize; cpid = DB_CPID(mp); UDP_STAT(us, udp_in_recvucred); } - if (udp->udp_ipv6_recvhoplimit) { + if (udp_bits.udpb_ipv6_recvhoplimit) { udi_size += sizeof (struct T_opthdr) + sizeof (int); UDP_STAT(us, udp_in_recvhoplimit); } - if (udp->udp_ipv6_recvtclass) { + if (udp_bits.udpb_ipv6_recvtclass) { udi_size += sizeof (struct T_opthdr) + sizeof (int); UDP_STAT(us, udp_in_recvtclass); } @@ -5082,9 +4669,7 @@ udp_input(conn_t *connp, mblk_t *mp) freemsg(mp); if (options_mp != NULL) freeb(options_mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "allocbfail"); - BUMP_MIB(&udp->udp_mib, udpInErrors); + BUMP_MIB(&us->us_udp_mib, udpInErrors); return; } mp1->b_cont = mp; @@ -5132,7 +4717,7 @@ udp_input(conn_t *connp, mblk_t *mp) uchar_t *dstopt; dstopt = (uchar_t *)&sin6[1]; - if (udp->udp_ip_recvpktinfo && + if ((udp_bits.udpb_ip_recvpktinfo) && (ipp.ipp_fields & IPPF_IFINDEX)) { struct T_opthdr *toh; struct in6_pktinfo *pkti; @@ -5155,7 +4740,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (*pkti); udi_size -= toh->len; } - if (udp->udp_ipv6_recvhoplimit) { + if (udp_bits.udpb_ipv6_recvhoplimit) { struct T_opthdr *toh; toh = (struct T_opthdr *)dstopt; @@ -5173,7 +4758,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (uint_t); udi_size -= toh->len; } - if (udp->udp_ipv6_recvtclass) { + if (udp_bits.udpb_ipv6_recvtclass) { struct T_opthdr *toh; toh = (struct T_opthdr *)dstopt; @@ -5194,7 +4779,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += sizeof (uint_t); udi_size -= toh->len; } - if (udp->udp_ipv6_recvhopopts && + if ((udp_bits.udpb_ipv6_recvhopopts) && (ipp.ipp_fields & IPPF_HOPOPTS)) { size_t hlen; @@ -5202,8 +4787,8 @@ udp_input(conn_t *connp, 
mblk_t *mp) dstopt += hlen; udi_size -= hlen; } - if (udp->udp_ipv6_recvdstopts && - udp->udp_ipv6_recvrthdr && + if ((udp_bits.udpb_ipv6_recvdstopts) && + (udp_bits.udpb_ipv6_recvrthdr) && (ipp.ipp_fields & IPPF_RTHDR) && (ipp.ipp_fields & IPPF_RTDSTOPTS)) { struct T_opthdr *toh; @@ -5220,7 +4805,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += ipp.ipp_rtdstoptslen; udi_size -= toh->len; } - if (udp->udp_ipv6_recvrthdr && + if ((udp_bits.udpb_ipv6_recvrthdr) && (ipp.ipp_fields & IPPF_RTHDR)) { struct T_opthdr *toh; @@ -5235,7 +4820,7 @@ udp_input(conn_t *connp, mblk_t *mp) dstopt += ipp.ipp_rthdrlen; udi_size -= toh->len; } - if (udp->udp_ipv6_recvdstopts && + if ((udp_bits.udpb_ipv6_recvdstopts) && (ipp.ipp_fields & IPPF_DSTOPTS)) { struct T_opthdr *toh; @@ -5271,20 +4856,18 @@ udp_input(conn_t *connp, mblk_t *mp) /* No IP_RECVDSTADDR for IPv6. */ } - BUMP_MIB(&udp->udp_mib, udpHCInDatagrams); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_end: q %p (%S)", q, "end"); + BUMP_MIB(&us->us_udp_mib, udpHCInDatagrams); if (options_mp != NULL) freeb(options_mp); - if (udp->udp_direct_sockfs) { + if (udp_bits.udpb_direct_sockfs) { /* * There is nothing above us except for the stream head; * use the read-side synchronous stream interface in * order to reduce the time spent in interrupt thread. */ ASSERT(udp->udp_issocket); - udp_rcv_enqueue(UDP_RD(q), udp, mp, mp_len); + udp_rcv_enqueue(connp->conn_rq, udp, mp, mp_len); } else { /* * Use regular STREAMS interface to pass data upstream @@ -5292,7 +4875,7 @@ udp_input(conn_t *connp, mblk_t *mp) * switched over to the slow mode due to sockmod being * popped or a module being pushed on top of us. */ - putnext(UDP_RD(q), mp); + putnext(connp->conn_rq, mp); } return; @@ -5300,472 +4883,79 @@ tossit: freemsg(mp); if (options_mp != NULL) freeb(options_mp); - BUMP_MIB(&udp->udp_mib, udpInErrors); -} - -void -udp_conn_recv(conn_t *connp, mblk_t *mp) -{ - _UDP_ENTER(connp, mp, udp_input_wrapper, SQTAG_UDP_FANOUT); -} - -/* ARGSUSED */ -static void -udp_input_wrapper(void *arg, mblk_t *mp, void *arg2) -{ - udp_input((conn_t *)arg, mp); - _UDP_EXIT((conn_t *)arg); + BUMP_MIB(&us->us_udp_mib, udpInErrors); } /* - * Process non-M_DATA messages as well as M_DATA messages that requires - * modifications to udp_ip_rcv_options i.e. IPv4 packets with IP options. + * Handle the results of a T_BIND_REQ whether deferred by IP or handled + * immediately. */ static void -udp_rput_other(queue_t *q, mblk_t *mp) +udp_bind_result(conn_t *connp, mblk_t *mp) { - struct T_unitdata_ind *tudi; - mblk_t *mp1; - uchar_t *rptr; - uchar_t *new_rptr; - int hdr_length; - int udi_size; /* Size of T_unitdata_ind */ - int opt_len; /* Length of IP options */ - sin_t *sin; struct T_error_ack *tea; - mblk_t *options_mp = NULL; - ip_pktinfo_t *pinfo; - boolean_t recv_on = B_FALSE; - cred_t *cr = NULL; - udp_t *udp = Q_TO_UDP(q); - pid_t cpid; - cred_t *rcr = udp->udp_connp->conn_cred; - udp_stack_t *us = udp->udp_us; - - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_START, - "udp_rput_other: q %p mp %p", q, mp); - - ASSERT(OK_32PTR(mp->b_rptr)); - rptr = mp->b_rptr; switch (mp->b_datap->db_type) { - case M_CTL: - /* - * We are here only if IP_RECVSLLA and/or IP_RECVIF are set - */ - recv_on = B_TRUE; - options_mp = mp; - pinfo = (ip_pktinfo_t *)options_mp->b_rptr; - - /* - * The actual data is in mp->b_cont - */ - mp = mp->b_cont; - ASSERT(OK_32PTR(mp->b_rptr)); - rptr = mp->b_rptr; - break; - case M_DATA: - /* - * M_DATA messages contain IPv4 datagrams. They are handled - * after this switch. 
- */ - break; case M_PROTO: case M_PCPROTO: /* M_PROTO messages contain some type of TPI message. */ - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if (mp->b_wptr - rptr < sizeof (t_scalar_t)) { + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= + (uintptr_t)INT_MAX); + if (mp->b_wptr - mp->b_rptr < sizeof (t_scalar_t)) { freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_other_end: q %p (%S)", q, "protoshort"); return; } - tea = (struct T_error_ack *)rptr; + tea = (struct T_error_ack *)mp->b_rptr; switch (tea->PRIM_type) { case T_ERROR_ACK: switch (tea->ERROR_prim) { case O_T_BIND_REQ: - case T_BIND_REQ: { - /* - * If our O_T_BIND_REQ/T_BIND_REQ fails, - * clear out the associated port and source - * address before passing the message - * upstream. If this was caused by a T_CONN_REQ - * revert back to bound state. - */ - udp_fanout_t *udpf; - - udpf = &us->us_bind_fanout[UDP_BIND_HASH( - udp->udp_port, us->us_bind_fanout_size)]; - mutex_enter(&udpf->uf_lock); - if (udp->udp_state == TS_DATA_XFER) { - /* Connect failed */ - tea->ERROR_prim = T_CONN_REQ; - /* Revert back to the bound source */ - udp->udp_v6src = udp->udp_bound_v6src; - udp->udp_state = TS_IDLE; - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(q, udp); - break; - } - - if (udp->udp_discon_pending) { - tea->ERROR_prim = T_DISCON_REQ; - udp->udp_discon_pending = 0; - } - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_state = TS_UNBND; - udp_bind_hash_remove(udp, B_TRUE); - udp->udp_port = 0; - mutex_exit(&udpf->uf_lock); - if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(q, udp); - break; - } + case T_BIND_REQ: + udp_bind_error(connp, mp); + return; default: break; } - break; - case T_BIND_ACK: - udp_rput_bind_ack(q, mp); - return; - - case T_OPTMGMT_ACK: - case T_OK_ACK: - break; - default: + ASSERT(0); freemsg(mp); return; - } - putnext(UDP_RD(q), mp); - return; - } - /* - * This is the inbound data path. - * First, we make sure the data contains both IP and UDP headers. - * - * This handle IPv4 packets for only AF_INET sockets. - * AF_INET6 sockets can never access udp_ip_rcv_options thus there - * is no need saving the options. - */ - ASSERT(IPH_HDR_VERSION((ipha_t *)rptr) == IPV4_VERSION); - hdr_length = IPH_HDR_LENGTH(rptr) + UDPH_SIZE; - if (mp->b_wptr - rptr < hdr_length) { - if (!pullupmsg(mp, hdr_length)) { - freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); - BUMP_MIB(&udp->udp_mib, udpInErrors); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_other_end: q %p (%S)", q, "hdrshort"); + case T_BIND_ACK: + udp_bind_ack(connp, mp); return; - } - rptr = mp->b_rptr; - } - /* Walk past the headers. 
*/ - new_rptr = rptr + hdr_length; - if (!udp->udp_rcvhdr) - mp->b_rptr = new_rptr; - /* Save the options if any */ - opt_len = hdr_length - (IP_SIMPLE_HDR_LENGTH + UDPH_SIZE); - if (opt_len > 0) { - if (opt_len > udp->udp_ip_rcv_options_len) { - if (udp->udp_ip_rcv_options_len) - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options_len = 0; - udp->udp_ip_rcv_options = - (uchar_t *)mi_alloc(opt_len, BPRI_HI); - if (udp->udp_ip_rcv_options) - udp->udp_ip_rcv_options_len = opt_len; - } - if (udp->udp_ip_rcv_options_len) { - bcopy(rptr + IP_SIMPLE_HDR_LENGTH, - udp->udp_ip_rcv_options, opt_len); - /* Adjust length if we are resusing the space */ - udp->udp_ip_rcv_options_len = opt_len; + default: + break; } - } else if (udp->udp_ip_rcv_options_len) { - mi_free((char *)udp->udp_ip_rcv_options); - udp->udp_ip_rcv_options = NULL; - udp->udp_ip_rcv_options_len = 0; - } - - /* - * Normally only send up the address. - * If IP_RECVDSTADDR is set we include the destination IP - * address as an option. With IP_RECVOPTS we include all - * the IP options. - */ - udi_size = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - if (udp->udp_recvdstaddr) { - udi_size += sizeof (struct T_opthdr) + sizeof (struct in_addr); - UDP_STAT(us, udp_in_recvdstaddr); - } - - if (udp->udp_ip_recvpktinfo && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct in_pktinfo); - UDP_STAT(us, udp_ip_recvpktinfo); - } - - if (udp->udp_recvopts && opt_len > 0) { - udi_size += sizeof (struct T_opthdr) + opt_len; - UDP_STAT(us, udp_in_recvopts); - } - - /* - * If the IP_RECVSLLA or the IP_RECVIF is set then allocate - * space accordingly - */ - if (udp->udp_recvif && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint_t); - UDP_STAT(us, udp_in_recvif); - } - - if (udp->udp_recvslla && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - udi_size += sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - UDP_STAT(us, udp_in_recvslla); - } - - if (udp->udp_recvucred && (cr = DB_CRED(mp)) != NULL) { - udi_size += sizeof (struct T_opthdr) + ucredsize; - cpid = DB_CPID(mp); - UDP_STAT(us, udp_in_recvucred); - } - /* - * If IP_RECVTTL is set allocate the appropriate sized buffer - */ - if (udp->udp_recvttl) { - udi_size += sizeof (struct T_opthdr) + sizeof (uint8_t); - UDP_STAT(us, udp_in_recvttl); - } - - /* Allocate a message block for the T_UNITDATA_IND structure. */ - mp1 = allocb(udi_size, BPRI_MED); - if (mp1 == NULL) { freemsg(mp); - if (options_mp != NULL) - freeb(options_mp); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_other_end: q %p (%S)", q, "allocbfail"); - BUMP_MIB(&udp->udp_mib, udpInErrors); + return; + default: + /* FIXME: other cases? 
*/ + ASSERT(0); + freemsg(mp); return; } - mp1->b_cont = mp; - mp = mp1; - mp->b_datap->db_type = M_PROTO; - tudi = (struct T_unitdata_ind *)mp->b_rptr; - mp->b_wptr = (uchar_t *)tudi + udi_size; - tudi->PRIM_type = T_UNITDATA_IND; - tudi->SRC_length = sizeof (sin_t); - tudi->SRC_offset = sizeof (struct T_unitdata_ind); - tudi->OPT_offset = sizeof (struct T_unitdata_ind) + sizeof (sin_t); - udi_size -= (sizeof (struct T_unitdata_ind) + sizeof (sin_t)); - tudi->OPT_length = udi_size; - - sin = (sin_t *)&tudi[1]; - sin->sin_addr.s_addr = ((ipha_t *)rptr)->ipha_src; - sin->sin_port = ((in_port_t *) - new_rptr)[-(UDPH_SIZE/sizeof (in_port_t))]; - sin->sin_family = AF_INET; - *(uint32_t *)&sin->sin_zero[0] = 0; - *(uint32_t *)&sin->sin_zero[4] = 0; - - /* - * Add options if IP_RECVDSTADDR, IP_RECVIF, IP_RECVSLLA or - * IP_RECVTTL has been set. - */ - if (udi_size != 0) { - /* - * Copy in destination address before options to avoid any - * padding issues. - */ - char *dstopt; - - dstopt = (char *)&sin[1]; - if (udp->udp_recvdstaddr) { - struct T_opthdr *toh; - ipaddr_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVDSTADDR; - toh->len = sizeof (struct T_opthdr) + sizeof (ipaddr_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (ipaddr_t *)dstopt; - *dstptr = (((ipaddr_t *)rptr)[4]); - dstopt += sizeof (ipaddr_t); - udi_size -= toh->len; - } - if (udp->udp_recvopts && udi_size != 0) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVOPTS; - toh->len = sizeof (struct T_opthdr) + opt_len; - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - bcopy(rptr + IP_SIMPLE_HDR_LENGTH, dstopt, opt_len); - dstopt += opt_len; - udi_size -= toh->len; - } - if (udp->udp_ip_recvpktinfo && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVADDR)) { - - struct T_opthdr *toh; - struct in_pktinfo *pktinfop; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_PKTINFO; - toh->len = sizeof (struct T_opthdr) + - sizeof (*pktinfop); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - pktinfop = (struct in_pktinfo *)dstopt; - pktinfop->ipi_ifindex = pinfo->ip_pkt_ifindex; - pktinfop->ipi_spec_dst = pinfo->ip_pkt_match_addr; - - pktinfop->ipi_addr.s_addr = ((ipha_t *)rptr)->ipha_dst; - - dstopt += sizeof (struct in_pktinfo); - udi_size -= toh->len; - } - - if (udp->udp_recvslla && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVSLLA)) { - - struct T_opthdr *toh; - struct sockaddr_dl *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVSLLA; - toh->len = sizeof (struct T_opthdr) + - sizeof (struct sockaddr_dl); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (struct sockaddr_dl *)dstopt; - bcopy(&pinfo->ip_pkt_slla, dstptr, - sizeof (struct sockaddr_dl)); - dstopt += sizeof (struct sockaddr_dl); - udi_size -= toh->len; - } - - if (udp->udp_recvif && recv_on && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - - struct T_opthdr *toh; - uint_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVIF; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint_t *)dstopt; - *dstptr = pinfo->ip_pkt_ifindex; - dstopt += sizeof (uint_t); - udi_size -= toh->len; - } - - if (cr != NULL) { - struct T_opthdr *toh; - - toh = (struct T_opthdr *)dstopt; - toh->level = SOL_SOCKET; - toh->name = SCM_UCRED; - toh->len = 
sizeof (struct T_opthdr) + ucredsize; - toh->status = 0; - (void) cred2ucred(cr, cpid, &toh[1], rcr); - dstopt += toh->len; - udi_size -= toh->len; - } - - if (udp->udp_recvttl) { - struct T_opthdr *toh; - uint8_t *dstptr; - - toh = (struct T_opthdr *)dstopt; - toh->level = IPPROTO_IP; - toh->name = IP_RECVTTL; - toh->len = sizeof (struct T_opthdr) + - sizeof (uint8_t); - toh->status = 0; - dstopt += sizeof (struct T_opthdr); - dstptr = (uint8_t *)dstopt; - *dstptr = ((ipha_t *)rptr)->ipha_ttl; - dstopt += sizeof (uint8_t); - udi_size -= toh->len; - } - - ASSERT(udi_size == 0); /* "Consumed" all of allocated space */ - } - BUMP_MIB(&udp->udp_mib, udpHCInDatagrams); - TRACE_2(TR_FAC_UDP, TR_UDP_RPUT_END, - "udp_rput_other_end: q %p (%S)", q, "end"); - if (options_mp != NULL) - freeb(options_mp); - - if (udp->udp_direct_sockfs) { - /* - * There is nothing above us except for the stream head; - * use the read-side synchronous stream interface in - * order to reduce the time spent in interrupt thread. - */ - ASSERT(udp->udp_issocket); - udp_rcv_enqueue(UDP_RD(q), udp, mp, msgdsize(mp)); - } else { - /* - * Use regular STREAMS interface to pass data upstream - * if this is not a socket endpoint, or if we have - * switched over to the slow mode due to sockmod being - * popped or a module being pushed on top of us. - */ - putnext(UDP_RD(q), mp); - } -} - -/* ARGSUSED */ -static void -udp_rput_other_wrapper(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = arg; - - udp_rput_other(connp->conn_rq, mp); - udp_exit(connp); } /* * Process a T_BIND_ACK */ static void -udp_rput_bind_ack(queue_t *q, mblk_t *mp) +udp_bind_ack(conn_t *connp, mblk_t *mp) { - udp_t *udp = Q_TO_UDP(q); + udp_t *udp = connp->conn_udp; mblk_t *mp1; ire_t *ire; struct T_bind_ack *tba; uchar_t *addrp; ipa_conn_t *ac; ipa6_conn_t *ac6; + udp_fanout_t *udpf; + udp_stack_t *us = udp->udp_us; - if (udp->udp_discon_pending) - udp->udp_discon_pending = 0; - + ASSERT(udp->udp_pending_op != -1); + rw_enter(&udp->udp_rwlock, RW_WRITER); /* * If a broadcast/multicast address was bound set * the source address to 0. @@ -5786,12 +4976,18 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp) * Note: we get IRE_BROADCAST for IPv6 to "mark" a multicast * local address. 
*/ + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; if (ire->ire_type == IRE_BROADCAST && udp->udp_state != TS_DATA_XFER) { + ASSERT(udp->udp_pending_op == T_BIND_REQ || + udp->udp_pending_op == O_T_BIND_REQ); /* This was just a local bind to a broadcast addr */ + mutex_enter(&udpf->uf_lock); V6_SET_ZERO(udp->udp_v6src); + mutex_exit(&udpf->uf_lock); if (udp->udp_family == AF_INET6) - (void) udp_build_hdrs(q, udp); + (void) udp_build_hdrs(udp); } else if (V6_OR_V4_INADDR_ANY(udp->udp_v6src)) { /* * Local address not yet set - pick it from the @@ -5808,8 +5004,10 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp) sizeof (ipa_conn_x_t)); ac = &((ipa_conn_x_t *)addrp)->acx_conn; } + mutex_enter(&udpf->uf_lock); IN6_IPADDR_TO_V4MAPPED(ac->ac_laddr, &udp->udp_v6src); + mutex_exit(&udpf->uf_lock); break; case AF_INET6: if (tba->ADDR_length == sizeof (ipa6_conn_t)) { @@ -5820,13 +5018,17 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp) ac6 = &((ipa6_conn_x_t *) addrp)->ac6x_conn; } + mutex_enter(&udpf->uf_lock); udp->udp_v6src = ac6->ac6_laddr; - (void) udp_build_hdrs(q, udp); + mutex_exit(&udpf->uf_lock); + (void) udp_build_hdrs(udp); break; } } mp1 = mp1->b_cont; } + udp->udp_pending_op = -1; + rw_exit(&udp->udp_rwlock); /* * Look for one or more appended ACK message added by * udp_connect or udp_disconnect. @@ -5846,20 +5048,86 @@ udp_rput_bind_ack(queue_t *q, mblk_t *mp) while (mp != NULL) { mp1 = mp->b_cont; mp->b_cont = NULL; - putnext(UDP_RD(q), mp); + putnext(connp->conn_rq, mp); mp = mp1; } return; } freemsg(mp->b_cont); mp->b_cont = NULL; - putnext(UDP_RD(q), mp); + putnext(connp->conn_rq, mp); +} + +static void +udp_bind_error(conn_t *connp, mblk_t *mp) +{ + udp_t *udp = connp->conn_udp; + struct T_error_ack *tea; + udp_fanout_t *udpf; + udp_stack_t *us = udp->udp_us; + + tea = (struct T_error_ack *)mp->b_rptr; + + /* + * If our O_T_BIND_REQ/T_BIND_REQ fails, + * clear out the associated port and source + * address before passing the message + * upstream. If this was caused by a T_CONN_REQ + * revert back to bound state. + */ + + rw_enter(&udp->udp_rwlock, RW_WRITER); + ASSERT(udp->udp_pending_op != -1); + tea->ERROR_prim = udp->udp_pending_op; + udp->udp_pending_op = -1; + udpf = &us->us_bind_fanout[ + UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + + switch (tea->ERROR_prim) { + case T_CONN_REQ: + ASSERT(udp->udp_state == TS_DATA_XFER); + /* Connect failed */ + /* Revert back to the bound source */ + udp->udp_v6src = udp->udp_bound_v6src; + udp->udp_state = TS_IDLE; + mutex_exit(&udpf->uf_lock); + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + rw_exit(&udp->udp_rwlock); + break; + + case T_DISCON_REQ: + case T_BIND_REQ: + case O_T_BIND_REQ: + V6_SET_ZERO(udp->udp_v6src); + V6_SET_ZERO(udp->udp_bound_v6src); + udp->udp_state = TS_UNBND; + udp_bind_hash_remove(udp, B_TRUE); + udp->udp_port = 0; + mutex_exit(&udpf->uf_lock); + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + rw_exit(&udp->udp_rwlock); + break; + + default: + mutex_exit(&udpf->uf_lock); + rw_exit(&udp->udp_rwlock); + (void) mi_strlog(connp->conn_rq, 1, + SL_ERROR|SL_TRACE, + "udp_input_other: bad ERROR_prim, " + "len %d", tea->ERROR_prim); + } + putnext(connp->conn_rq, mp); } /* - * return SNMP stuff in buffer in mpdata + * return SNMP stuff in buffer in mpdata. We don't hold any lock and report + * information that can be changing beneath us. 
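The bind-ack and bind-error paths above update udp_v6src and the bind hash state with udp_rwlock held as writer and the bucket's uf_lock taken inside it, matching the documented udp_rwlock -> uf_lock order. Reduced to a two-lock user-land model with invented names:

#include <pthread.h>
#include <netinet/in.h>

struct bucket {
	pthread_mutex_t		b_lock;		/* models uf_lock */
};

struct endpoint {
	pthread_rwlock_t	ep_rwlock;	/* models udp_rwlock */
	struct bucket		*ep_bucket;	/* bind hash bucket */
	struct in6_addr		ep_src;		/* models udp_v6src */
};

/*
 * Endpoint lock first, bucket lock second, never the reverse, so hash
 * walkers holding only the bucket lock still see a consistent address.
 */
void
set_bound_src(struct endpoint *ep, const struct in6_addr *src)
{
	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	(void) pthread_mutex_lock(&ep->ep_bucket->b_lock);
	ep->ep_src = *src;
	(void) pthread_mutex_unlock(&ep->ep_bucket->b_lock);
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
}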
*/ -int +mblk_t * udp_snmp_get(queue_t *q, mblk_t *mpctl) { mblk_t *mpdata; @@ -5880,11 +5148,18 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) int i; connf_t *connfp; conn_t *connp = Q_TO_CONN(q); - udp_t *udp = connp->conn_udp; int v4_conn_idx; int v6_conn_idx; boolean_t needattr; + udp_t *udp; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + udp_stack_t *us = connp->conn_netstack->netstack_udp; + mblk_t *mp2ctl; + + /* + * make a copy of the original message + */ + mp2ctl = copymsg(mpctl); mp_conn_ctl = mp_attr_ctl = mp6_conn_ctl = NULL; if (mpctl == NULL || @@ -5896,23 +5171,25 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) freemsg(mp_conn_ctl); freemsg(mp_attr_ctl); freemsg(mp6_conn_ctl); + freemsg(mpctl); + freemsg(mp2ctl); return (0); } zoneid = connp->conn_zoneid; /* fixed length structure for IPv4 and IPv6 counters */ - SET_MIB(udp->udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t)); - SET_MIB(udp->udp_mib.udp6EntrySize, sizeof (mib2_udp6Entry_t)); + SET_MIB(us->us_udp_mib.udpEntrySize, sizeof (mib2_udpEntry_t)); + SET_MIB(us->us_udp_mib.udp6EntrySize, sizeof (mib2_udp6Entry_t)); /* synchronize 64- and 32-bit counters */ - SYNC32_MIB(&udp->udp_mib, udpInDatagrams, udpHCInDatagrams); - SYNC32_MIB(&udp->udp_mib, udpOutDatagrams, udpHCOutDatagrams); + SYNC32_MIB(&us->us_udp_mib, udpInDatagrams, udpHCInDatagrams); + SYNC32_MIB(&us->us_udp_mib, udpOutDatagrams, udpHCOutDatagrams); optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_UDP; optp->name = 0; - (void) snmp_append_data(mpdata, (char *)&udp->udp_mib, - sizeof (udp->udp_mib)); + (void) snmp_append_data(mpdata, (char *)&us->us_udp_mib, + sizeof (us->us_udp_mib)); optp->len = msgdsize(mpdata); qreply(q, mpctl); @@ -5924,7 +5201,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) connp = NULL; while ((connp = ipcl_get_next_conn(connfp, connp, - IPCL_UDP))) { + IPCL_UDPCONN))) { udp = connp->conn_udp; if (zoneid != connp->conn_zoneid) continue; @@ -6088,7 +5365,7 @@ udp_snmp_get(queue_t *q, mblk_t *mpctl) else qreply(q, mp6_attr_ctl); - return (1); + return (mp2ctl); } /* @@ -6190,7 +5467,7 @@ udp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) connp = NULL; while ((connp = ipcl_get_next_conn(connfp, connp, - IPCL_UDP))) { + IPCL_UDPCONN))) { udp = connp->conn_udp; if (zoneid != GLOBAL_ZONEID && zoneid != connp->conn_zoneid) @@ -6246,7 +5523,7 @@ udp_ud_err(queue_t *q, mblk_t *mp, uchar_t *destaddr, t_scalar_t destlen, mp1 = mi_tpi_uderror_ind((char *)destaddr, destlen, (char *)optaddr, optlen, err); if (mp1 != NULL) - putnext(UDP_RD(q), mp1); + qreply(q, mp1); done: freemsg(mp); @@ -6260,12 +5537,9 @@ static void udp_unbind(queue_t *q, mblk_t *mp) { udp_t *udp = Q_TO_UDP(q); + udp_fanout_t *udpf; + udp_stack_t *us = udp->udp_us; - /* If a bind has not been done, we can't unbind. 
*/ - if (udp->udp_state == TS_UNBND) { - udp_err_ack(q, mp, TOUTSTATE, 0); - return; - } if (cl_inet_unbind != NULL) { /* * Running in cluster mode - register unbind information @@ -6281,29 +5555,44 @@ udp_unbind(queue_t *q, mblk_t *mp) } } - udp_bind_hash_remove(udp, B_FALSE); - V6_SET_ZERO(udp->udp_v6src); - V6_SET_ZERO(udp->udp_bound_v6src); - udp->udp_port = 0; - udp->udp_state = TS_UNBND; - - if (udp->udp_family == AF_INET6) { - int error; - - /* Rebuild the header template */ - error = udp_build_hdrs(q, udp); - if (error != 0) { - udp_err_ack(q, mp, TSYSERR, error); - return; - } + rw_enter(&udp->udp_rwlock, RW_WRITER); + if (udp->udp_state == TS_UNBND || udp->udp_pending_op != -1) { + rw_exit(&udp->udp_rwlock); + udp_err_ack(q, mp, TOUTSTATE, 0); + return; } + udp->udp_pending_op = T_UNBIND_REQ; + rw_exit(&udp->udp_rwlock); + /* * Pass the unbind to IP; T_UNBIND_REQ is larger than T_OK_ACK * and therefore ip_unbind must never return NULL. */ mp = ip_unbind(q, mp); ASSERT(mp != NULL); - putnext(UDP_RD(q), mp); + ASSERT(((struct T_ok_ack *)mp->b_rptr)->PRIM_type == T_OK_ACK); + + /* + * Once we're unbound from IP, the pending operation may be cleared + * here. + */ + rw_enter(&udp->udp_rwlock, RW_WRITER); + udpf = &us->us_bind_fanout[UDP_BIND_HASH(udp->udp_port, + us->us_bind_fanout_size)]; + mutex_enter(&udpf->uf_lock); + udp_bind_hash_remove(udp, B_TRUE); + V6_SET_ZERO(udp->udp_v6src); + V6_SET_ZERO(udp->udp_bound_v6src); + udp->udp_port = 0; + mutex_exit(&udpf->uf_lock); + + udp->udp_pending_op = -1; + udp->udp_state = TS_UNBND; + if (udp->udp_family == AF_INET6) + (void) udp_build_hdrs(udp); + rw_exit(&udp->udp_rwlock); + + qreply(q, mp); } /* @@ -6381,10 +5670,11 @@ udp_update_label(queue_t *wq, mblk_t *mp, ipaddr_t dst) int err; uchar_t opt_storage[IP_MAX_OPT_LENGTH]; udp_t *udp = Q_TO_UDP(wq); + udp_stack_t *us = udp->udp_us; err = tsol_compute_label(DB_CREDDEF(mp, udp->udp_connp->conn_cred), dst, opt_storage, udp->udp_mac_exempt, - udp->udp_us->us_netstack->netstack_ip); + us->us_netstack->netstack_ip); if (err == 0) { err = tsol_update_options(&udp->udp_ip_snd_options, &udp->udp_ip_snd_options_len, &udp->udp_label_len, @@ -6413,6 +5703,8 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, int ip_hdr_length; uint32_t ip_len; udpha_t *udpha; + boolean_t lock_held = B_FALSE; + in_port_t uha_src_port; udpattrs_t attrs; uchar_t ip_snd_opt[IP_MAX_OPT_LENGTH]; uint32_t ip_snd_opt_len = 0; @@ -6457,6 +5749,8 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, /* mp1 points to the M_DATA mblk carrying the packet */ ASSERT(mp1 != NULL && DB_TYPE(mp1) == M_DATA); + rw_enter(&udp->udp_rwlock, RW_READER); + lock_held = B_TRUE; /* * Check if our saved options are valid; update if not. 
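The unbind path above relies on udp_pending_op: it is set under the writer lock when a bind/unbind/connect style request is handed to IP, the request is refused with TOUTSTATE if another one is still outstanding, and the field goes back to -1 once the operation completes or fails. A small user-land model of that single-outstanding-operation guard, with invented names and an errno value in place of the TPI error ack:

#include <errno.h>
#include <pthread.h>

#define	OP_NONE		(-1)
#define	OP_UNBIND	1		/* stands in for T_UNBIND_REQ */

struct endpoint {
	pthread_rwlock_t	ep_rwlock;	/* models udp_rwlock */
	int			ep_pending_op;	/* models udp_pending_op */
	int			ep_bound;	/* models state != TS_UNBND */
};

/* Start an unbind; refuse if unbound or if another request is in flight. */
int
unbind_start(struct endpoint *ep)
{
	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	if (!ep->ep_bound || ep->ep_pending_op != OP_NONE) {
		(void) pthread_rwlock_unlock(&ep->ep_rwlock);
		return (EINVAL);	/* models the TOUTSTATE error ack */
	}
	ep->ep_pending_op = OP_UNBIND;
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
	/* ... hand the request to the lower layer without the lock ... */
	return (0);
}

/* Completion side: tear down the binding and clear the pending marker. */
void
unbind_done(struct endpoint *ep)
{
	(void) pthread_rwlock_wrlock(&ep->ep_rwlock);
	ep->ep_bound = 0;
	ep->ep_pending_op = OP_NONE;
	(void) pthread_rwlock_unlock(&ep->ep_rwlock);
}

udp_bind_ack() and udp_bind_error() above clear the marker the same way once IP answers.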
* TSOL Note: Since we are not in WRITER mode, UDP packets @@ -6557,6 +5851,11 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, IN6_V4MAPPED_TO_IPADDR(&v6src, ipha->ipha_src); } } + uha_src_port = udp->udp_port; + if (ip_hdr_length == IP_SIMPLE_HDR_LENGTH) { + rw_exit(&udp->udp_rwlock); + lock_held = B_FALSE; + } if (pktinfop->ip4_ill_index != 0) { optinfo.ip_opt_ill_index = pktinfop->ip4_ill_index; @@ -6610,12 +5909,14 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, ipha->ipha_ttl = udp->udp_multicast_ttl; udpha->uha_dst_port = port; - udpha->uha_src_port = udp->udp_port; + udpha->uha_src_port = uha_src_port; if (ip_snd_opt_len > 0) { uint32_t cksum; bcopy(ip_snd_opt, &ipha[1], ip_snd_opt_len); + lock_held = B_FALSE; + rw_exit(&udp->udp_rwlock); /* * Massage source route putting first source route in ipha_dst. * Ignore the destination in T_unitdata_req. @@ -6659,7 +5960,7 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, ip_len <<= 16; #endif } - + ASSERT(!lock_held); /* Set UDP length and checksum */ *((uint32_t *)&udpha->uha_length) = ip_len; if (DB_CRED(mp) != NULL) @@ -6675,7 +5976,7 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, mp = NULL; /* We're done. Pass the packet to ip. */ - BUMP_MIB(&udp->udp_mib, udpHCOutDatagrams); + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: q %p (%S)", q, "end"); @@ -6696,9 +5997,11 @@ udp_output_v4(conn_t *connp, mblk_t *mp, ipaddr_t v4dst, uint16_t port, } done: + if (lock_held) + rw_exit(&udp->udp_rwlock); if (*error != 0) { ASSERT(mp != NULL); - BUMP_MIB(&udp->udp_mib, udpOutErrors); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); } return (mp); } @@ -6708,14 +6011,9 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) { conn_t *connp = udp->udp_connp; ipaddr_t src, dst; - ill_t *ill; ire_t *ire; ipif_t *ipif = NULL; mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint16_t *up; - uint32_t cksum, hcksum_txflags; - queue_t *dev_q; boolean_t retry_caching; udp_stack_t *us = udp->udp_us; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; @@ -6824,10 +6122,9 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) if ((ire->ire_type & (IRE_BROADCAST|IRE_LOCAL|IRE_LOOPBACK)) || (ire->ire_flags & RTF_MULTIRT) || (ire->ire_stq == NULL) || (ire->ire_max_frag < ntohs(ipha->ipha_length)) || - (connp->conn_nexthop_set) || - (ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || - ((ire_fp_mp_len = MBLKL(ire_fp_mp)) > MBLKHEAD(mp))) { + ((ire->ire_nce == NULL) || + ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL)) || + connp->conn_nexthop_set || (MBLKL(ire_fp_mp) > MBLKHEAD(mp))) { if (ipif != NULL) ipif_refrele(ipif); UDP_STAT(us, udp_ip_ire_send); @@ -6836,43 +6133,62 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) return; } - ill = ire_to_ill(ire); - ASSERT(ill != NULL); + if (src == INADDR_ANY && !connp->conn_unspec_src) { + if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC)) + ipha->ipha_src = ipif->ipif_src_addr; + else + ipha->ipha_src = ire->ire_src_addr; + } - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + if (ipif != NULL) + ipif_refrele(ipif); + + udp_xmit(connp->conn_wq, mp, ire, connp, connp->conn_zoneid); +} + +static void +udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) +{ + ipaddr_t src, dst; + ill_t *ill; + mblk_t *ire_fp_mp; + uint_t ire_fp_mp_len; + uint16_t *up; + uint32_t cksum, 
hcksum_txflags; + queue_t *dev_q; + udp_t *udp = connp->conn_udp; + ipha_t *ipha = (ipha_t *)mp->b_rptr; + udp_stack_t *us = udp->udp_us; + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); - /* - * If the service thread is already running, or if the driver - * queue is currently flow-controlled, queue this packet. - */ - if ((q->q_first != NULL || connp->conn_draining) || - ((dev_q->q_next || dev_q->q_first) && !canput(dev_q))) { - if (ipst->ips_ip_output_queue) { - (void) putq(q, mp); - } else { - BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards); - freemsg(mp); - } - if (ipif != NULL) - ipif_refrele(ipif); - IRE_REFRELE(ire); + + + if (DEV_Q_IS_FLOW_CTLED(dev_q)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + ire_refrele(ire); return; } + ire_fp_mp = ire->ire_nce->nce_fp_mp; + ire_fp_mp_len = MBLKL(ire_fp_mp); + ASSERT(MBLKHEAD(mp) >= ire_fp_mp_len); + + dst = ipha->ipha_dst; + src = ipha->ipha_src; + + ill = ire_to_ill(ire); + ASSERT(ill != NULL); + + BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); + ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); #ifndef _BIG_ENDIAN ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); #endif - if (src == INADDR_ANY && !connp->conn_unspec_src) { - if (CLASSD(dst) && !(ire->ire_flags & RTF_SETSRC)) - src = ipha->ipha_src = ipif->ipif_src_addr; - else - src = ipha->ipha_src = ire->ire_src_addr; - } - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { ASSERT(ill->ill_hcksum_capab != NULL); hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; @@ -6918,15 +6234,13 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) if (ilm != NULL) { ip_multicast_loopback(q, ill, mp, connp->conn_multicast_loop ? 
0 : - IP_FF_NO_MCAST_LOOP, connp->conn_zoneid); + IP_FF_NO_MCAST_LOOP, zoneid); } /* If multicast TTL is 0 then we are done */ if (ipha->ipha_ttl == 0) { - if (ipif != NULL) - ipif_refrele(ipif); freemsg(mp); - IRE_REFRELE(ire); + ire_refrele(ire); return; } } @@ -6961,8 +6275,6 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) putnext(ire->ire_stq, mp); } - if (ipif != NULL) - ipif_refrele(ipif); IRE_REFRELE(ire); } @@ -6972,10 +6284,11 @@ udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst) udp_t *udp = Q_TO_UDP(wq); int err; uchar_t opt_storage[TSOL_MAX_IPV6_OPTION]; + udp_stack_t *us = udp->udp_us; err = tsol_compute_label_v6(DB_CREDDEF(mp, udp->udp_connp->conn_cred), dst, opt_storage, udp->udp_mac_exempt, - udp->udp_us->us_netstack->netstack_ip); + us->us_netstack->netstack_ip); if (err == 0) { err = tsol_update_sticky(&udp->udp_sticky_ipp, &udp->udp_label_len_v6, opt_storage); @@ -6991,97 +6304,145 @@ udp_update_label_v6(queue_t *wq, mblk_t *mp, in6_addr_t *dst) return (err); } +void +udp_output_connected(void *arg, mblk_t *mp) +{ + conn_t *connp = (conn_t *)arg; + udp_t *udp = connp->conn_udp; + udp_stack_t *us = udp->udp_us; + ipaddr_t v4dst; + in_port_t dstport; + boolean_t mapped_addr; + struct sockaddr_storage ss; + sin_t *sin; + sin6_t *sin6; + struct sockaddr *addr; + socklen_t addrlen; + int error; + boolean_t insert_spi = udp->udp_nat_t_endpoint; + + /* M_DATA for connected socket */ + + ASSERT(udp->udp_issocket); + UDP_DBGSTAT(us, udp_data_conn); + + mutex_enter(&connp->conn_lock); + if (udp->udp_state != TS_DATA_XFER) { + mutex_exit(&connp->conn_lock); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_STAT(us, udp_out_err_notconn); + freemsg(mp); + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: connp %p (%S)", connp, + "not-connected; address required"); + return; + } + + mapped_addr = IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst); + if (mapped_addr) + IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst); + + /* Initialize addr and addrlen as if they're passed in */ + if (udp->udp_family == AF_INET) { + sin = (sin_t *)&ss; + sin->sin_family = AF_INET; + dstport = sin->sin_port = udp->udp_dstport; + ASSERT(mapped_addr); + sin->sin_addr.s_addr = v4dst; + addr = (struct sockaddr *)sin; + addrlen = sizeof (*sin); + } else { + sin6 = (sin6_t *)&ss; + sin6->sin6_family = AF_INET6; + dstport = sin6->sin6_port = udp->udp_dstport; + sin6->sin6_flowinfo = udp->udp_flowinfo; + sin6->sin6_addr = udp->udp_v6dst; + sin6->sin6_scope_id = 0; + sin6->__sin6_src_id = 0; + addr = (struct sockaddr *)sin6; + addrlen = sizeof (*sin6); + } + mutex_exit(&connp->conn_lock); + + if (mapped_addr) { + /* + * Handle both AF_INET and AF_INET6; the latter + * for IPV4 mapped destination addresses. Note + * here that both addr and addrlen point to the + * corresponding struct depending on the address + * family of the socket. + */ + mp = udp_output_v4(connp, mp, v4dst, dstport, 0, &error, + insert_spi); + } else { + mp = udp_output_v6(connp, mp, sin6, &error); + } + if (error == 0) { + ASSERT(mp == NULL); + return; + } + + UDP_STAT(us, udp_out_err_output); + ASSERT(mp != NULL); + /* mp is freed by the following routine */ + udp_ud_err(connp->conn_wq, mp, (uchar_t *)addr, (t_scalar_t)addrlen, + (t_scalar_t)error); +} + /* * This routine handles all messages passed downstream. It either * consumes the message or passes it downstream; it never queues a * a message. + * + * Also entry point for sockfs when udp is in "direct sockfs" mode. 
This mode + * is valid when we are directly beneath the stream head, and thus sockfs + * is able to bypass STREAMS and directly call us, passing along the sockaddr + * structure without the cumbersome T_UNITDATA_REQ interface for the case of + * connected endpoints. */ -static void -udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) +void +udp_wput(queue_t *q, mblk_t *mp) { sin6_t *sin6; sin_t *sin; ipaddr_t v4dst; uint16_t port; uint_t srcid; - queue_t *q = connp->conn_wq; + conn_t *connp = Q_TO_CONN(q); udp_t *udp = connp->conn_udp; int error = 0; - struct sockaddr_storage ss; + struct sockaddr *addr; + socklen_t addrlen; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_START, - "udp_wput_start: connp %p mp %p", connp, mp); + "udp_wput_start: queue %p mp %p", q, mp); /* * We directly handle several cases here: T_UNITDATA_REQ message - * coming down as M_PROTO/M_PCPROTO and M_DATA messages for both - * connected and non-connected socket. The latter carries the - * address structure along when this routine gets called. + * coming down as M_PROTO/M_PCPROTO and M_DATA messages for connected + * socket. */ switch (DB_TYPE(mp)) { case M_DATA: + /* + * Quick check for error cases. Checks will be done again + * under the lock later on + */ if (!udp->udp_direct_sockfs || udp->udp_state != TS_DATA_XFER) { - if (!udp->udp_direct_sockfs || - addr == NULL || addrlen == 0) { - /* Not connected; address is required */ - BUMP_MIB(&udp->udp_mib, udpOutErrors); - UDP_STAT(us, udp_out_err_notconn); - freemsg(mp); - TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, - "udp_wput_end: connp %p (%S)", connp, - "not-connected; address required"); - return; - } - ASSERT(udp->udp_issocket); - UDP_DBGSTAT(us, udp_data_notconn); - /* Not connected; do some more checks below */ - break; - } - /* M_DATA for connected socket */ - UDP_DBGSTAT(us, udp_data_conn); - IN6_V4MAPPED_TO_IPADDR(&udp->udp_v6dst, v4dst); - - /* Initialize addr and addrlen as if they're passed in */ - if (udp->udp_family == AF_INET) { - sin = (sin_t *)&ss; - sin->sin_family = AF_INET; - sin->sin_port = udp->udp_dstport; - sin->sin_addr.s_addr = v4dst; - addr = (struct sockaddr *)sin; - addrlen = sizeof (*sin); - } else { - sin6 = (sin6_t *)&ss; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = udp->udp_dstport; - sin6->sin6_flowinfo = udp->udp_flowinfo; - sin6->sin6_addr = udp->udp_v6dst; - sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; - addr = (struct sockaddr *)sin6; - addrlen = sizeof (*sin6); - } - - if (udp->udp_family == AF_INET || - IN6_IS_ADDR_V4MAPPED(&udp->udp_v6dst)) { - /* - * Handle both AF_INET and AF_INET6; the latter - * for IPV4 mapped destination addresses. Note - * here that both addr and addrlen point to the - * corresponding struct depending on the address - * family of the socket. 
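udp_output_connected() above copies the connected destination (udp_v6dst, udp_dstport, the flow label) into a local sockaddr while holding conn_lock, drops the lock, and only then formats and sends the datagram; an unconnected socket is rejected up front. The same shape in a stand-alone user-land sketch with invented names:

#include <errno.h>
#include <pthread.h>
#include <string.h>
#include <netinet/in.h>

struct endpoint {
	pthread_mutex_t		ep_lock;	/* models conn_lock */
	int			ep_connected;	/* models TS_DATA_XFER */
	struct in6_addr		ep_dst;		/* models udp_v6dst */
	in_port_t		ep_dstport;	/* models udp_dstport */
};

int
send_connected(struct endpoint *ep, const void *buf, size_t len)
{
	struct sockaddr_in6 sin6;

	(void) pthread_mutex_lock(&ep->ep_lock);
	if (!ep->ep_connected) {
		(void) pthread_mutex_unlock(&ep->ep_lock);
		return (ENOTCONN);	/* datagram dropped, error counted */
	}
	(void) memset(&sin6, 0, sizeof (sin6));
	sin6.sin6_family = AF_INET6;
	sin6.sin6_addr = ep->ep_dst;
	sin6.sin6_port = ep->ep_dstport;
	(void) pthread_mutex_unlock(&ep->ep_lock);

	/* Format and transmit using the private copy in sin6 ... */
	(void) buf;
	(void) len;
	return (0);
}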
- */ - mp = udp_output_v4(connp, mp, v4dst, - udp->udp_dstport, 0, &error, insert_spi); - } else { - mp = udp_output_v6(connp, mp, sin6, &error); - } - if (error != 0) { - ASSERT(addr != NULL && addrlen != 0); - goto ud_error; + /* Not connected; address is required */ + BUMP_MIB(&us->us_udp_mib, udpOutErrors); + UDP_STAT(us, udp_out_err_notconn); + freemsg(mp); + TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, + "udp_wput_end: connp %p (%S)", connp, + "not-connected; address required"); + return; } + udp_output_connected(connp, mp); return; + case M_PROTO: case M_PCPROTO: { struct T_unitdata_req *tudr; @@ -7128,8 +6489,7 @@ udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) /* FALLTHRU */ } default: - udp_become_writer(connp, mp, udp_wput_other_wrapper, - SQTAG_UDP_OUTPUT); + udp_wput_other(q, mp); return; } ASSERT(addr != NULL); @@ -7137,8 +6497,8 @@ udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) switch (udp->udp_family) { case AF_INET6: sin6 = (sin6_t *)addr; - if (!OK_32PTR((char *)sin6) || addrlen != sizeof (sin6_t) || - sin6->sin6_family != AF_INET6) { + if (!OK_32PTR((char *)sin6) || (addrlen != sizeof (sin6_t)) || + (sin6->sin6_family != AF_INET6)) { TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: q %p (%S)", q, "badaddr"); error = EADDRNOTAVAIL; @@ -7180,8 +6540,8 @@ udp_output(conn_t *connp, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) case AF_INET: sin = (sin_t *)addr; - if (!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t) || - sin->sin_family != AF_INET) { + if ((!OK_32PTR((char *)sin) || addrlen != sizeof (sin_t)) || + (sin->sin_family != AF_INET)) { TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_END, "udp_wput_end: q %p (%S)", q, "badaddr"); error = EADDRNOTAVAIL; @@ -7205,107 +6565,6 @@ ud_error: } } -/* ARGSUSED */ -static void -udp_output_wrapper(void *arg, mblk_t *mp, void *arg2) -{ - udp_output((conn_t *)arg, mp, NULL, 0); - _UDP_EXIT((conn_t *)arg); -} - -static void -udp_wput(queue_t *q, mblk_t *mp) -{ - _UDP_ENTER(Q_TO_CONN(UDP_WR(q)), mp, udp_output_wrapper, - SQTAG_UDP_WPUT); -} - -/* - * Allocate and prepare a T_UNITDATA_REQ message. - */ -static mblk_t * -udp_tudr_alloc(struct sockaddr *addr, socklen_t addrlen) -{ - struct T_unitdata_req *tudr; - mblk_t *mp; - - mp = allocb(sizeof (*tudr) + addrlen, BPRI_MED); - if (mp != NULL) { - mp->b_wptr += sizeof (*tudr) + addrlen; - DB_TYPE(mp) = M_PROTO; - - tudr = (struct T_unitdata_req *)mp->b_rptr; - tudr->PRIM_type = T_UNITDATA_REQ; - tudr->DEST_length = addrlen; - tudr->DEST_offset = (t_scalar_t)sizeof (*tudr); - tudr->OPT_length = 0; - tudr->OPT_offset = 0; - bcopy(addr, tudr+1, addrlen); - } - return (mp); -} - -/* - * Entry point for sockfs when udp is in "direct sockfs" mode. This mode - * is valid when we are directly beneath the stream head, and thus sockfs - * is able to bypass STREAMS and directly call us, passing along the sockaddr - * structure without the cumbersome T_UNITDATA_REQ interface. Note that - * this is done for both connected and non-connected endpoint. 
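The destination checks above boil down to three tests per family: the sockaddr pointer must be 32-bit aligned, the length must match the family's sockaddr size, and the family in the address must match the socket, with EADDRNOTAVAIL returned otherwise. As a stand-alone sketch (hypothetical helper name, POSIX sockaddr types rather than the kernel's sin_t/sin6_t):

#include <errno.h>
#include <stdint.h>
#include <sys/socket.h>
#include <netinet/in.h>

/* Models the OK_32PTR() alignment test used above. */
#define	ALIGNED_32(p)	((((uintptr_t)(p)) & 0x3) == 0)

int
check_dest(int family, const struct sockaddr *addr, socklen_t addrlen)
{
	if (family == AF_INET) {
		if (!ALIGNED_32(addr) ||
		    addrlen != sizeof (struct sockaddr_in) ||
		    addr->sa_family != AF_INET)
			return (EADDRNOTAVAIL);
	} else if (family == AF_INET6) {
		if (!ALIGNED_32(addr) ||
		    addrlen != sizeof (struct sockaddr_in6) ||
		    addr->sa_family != AF_INET6)
			return (EADDRNOTAVAIL);
	}
	return (0);
}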
- */ -void -udp_wput_data(queue_t *q, mblk_t *mp, struct sockaddr *addr, socklen_t addrlen) -{ - conn_t *connp; - udp_t *udp; - udp_stack_t *us; - - q = UDP_WR(q); - connp = Q_TO_CONN(q); - udp = connp->conn_udp; - us = udp->udp_us; - - /* udpsockfs should only send down M_DATA for this entry point */ - ASSERT(DB_TYPE(mp) == M_DATA); - - mutex_enter(&connp->conn_lock); - UDP_MODE_ASSERTIONS(udp, UDP_ENTER); - - if (udp->udp_mode != UDP_MT_HOT) { - /* - * We can't enter this conn right away because another - * thread is currently executing as writer; therefore we - * need to deposit the message into the squeue to be - * drained later. If a socket address is present, we - * need to create a T_UNITDATA_REQ message as placeholder. - */ - if (addr != NULL && addrlen != 0) { - mblk_t *tudr_mp = udp_tudr_alloc(addr, addrlen); - - if (tudr_mp == NULL) { - mutex_exit(&connp->conn_lock); - BUMP_MIB(&udp->udp_mib, udpOutErrors); - UDP_STAT(us, udp_out_err_tudr); - freemsg(mp); - return; - } - /* Tag the packet with T_UNITDATA_REQ */ - tudr_mp->b_cont = mp; - mp = tudr_mp; - } - mutex_exit(&connp->conn_lock); - udp_enter(connp, mp, udp_output_wrapper, SQTAG_UDP_WPUT); - return; - } - - /* We can execute as reader right away. */ - UDP_READERS_INCREF(udp); - mutex_exit(&connp->conn_lock); - - udp_output(connp, mp, addr, addrlen); - - udp_exit(connp); -} - /* * udp_output_v6(): * Assumes that udp_wput did some sanity checking on the destination @@ -7338,6 +6597,7 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) uint_t hopoptslen = 0; boolean_t is_ancillary = B_FALSE; udp_stack_t *us = udp->udp_us; + size_t sth_wroff = 0; *error = 0; @@ -7366,12 +6626,15 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) if (((struct T_unitdata_req *)mp->b_rptr)->OPT_length != 0) { attrs.udpattr_ipp6 = ipp; attrs.udpattr_mb = mp; - if (udp_unitdata_opt_process(q, mp, error, &attrs) < 0) + if (udp_unitdata_opt_process(q, mp, error, + &attrs) < 0) { goto done; + } ASSERT(*error == 0); opt_present = B_TRUE; } } + rw_enter(&udp->udp_rwlock, RW_READER); ignore = ipp->ipp_sticky_ignored; /* mp1 points to the M_DATA mblk carrying the packet */ @@ -7417,6 +6680,7 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) char *, "MLP mp(1) lacks SCM_UCRED attr(2) on q(3)", mblk_t *, mp1, udpattrs_t *, &attrs, queue_t *, q); *error = ECONNREFUSED; + rw_exit(&udp->udp_rwlock); mutex_exit(&connp->conn_lock); goto done; } @@ -7429,6 +6693,7 @@ udp_output_v6(conn_t *connp, mblk_t *mp, sin6_t *sin6, int *error) !IN6_ARE_ADDR_EQUAL(&udp->udp_v6lastdst, &ip6_dst) || connp->conn_mlp_type != mlptSingle) && (*error = udp_update_label_v6(q, mp, &ip6_dst)) != 0) { + rw_exit(&udp->udp_rwlock); mutex_exit(&connp->conn_lock); goto done; } @@ -7596,15 +6861,17 @@ no_options: ip6h = (ip6_t *)&mp1->b_rptr[-udp_ip_hdr_len]; if (DB_REF(mp1) != 1 || ((unsigned char *)ip6h < DB_BASE(mp1)) || !OK_32PTR(ip6h)) { + /* Try to get everything in a single mblk next time */ if (udp_ip_hdr_len > udp->udp_max_hdr_len) { udp->udp_max_hdr_len = udp_ip_hdr_len; - (void) mi_set_sth_wroff(UDP_RD(q), - udp->udp_max_hdr_len + us->us_wroff_extra); + sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra; } + mp2 = allocb(udp_ip_hdr_len + us->us_wroff_extra, BPRI_LO); if (mp2 == NULL) { *error = ENOMEM; + rw_exit(&udp->udp_rwlock); goto done; } mp2->b_wptr = DB_LIM(mp2); @@ -7801,6 +7068,7 @@ no_options: ip6h->ip6_vcf = IPV6_TCLASS_FLOW(ip6h->ip6_vcf, tipp->ipp_tclass); } + rw_exit(&udp->udp_rwlock); if (option_exists & 
IPPF_RTHDR) { ip6_rthdr_t *rth; @@ -7902,17 +7170,21 @@ no_options: mp = NULL; /* We're done. Pass the packet to IP */ - BUMP_MIB(&udp->udp_mib, udpHCOutDatagrams); + BUMP_MIB(&us->us_udp_mib, udpHCOutDatagrams); ip_output_v6(connp, mp1, q, IP_WPUT); done: + if (sth_wroff != 0) { + (void) mi_set_sth_wroff(RD(q), + udp->udp_max_hdr_len + us->us_wroff_extra); + } if (hopoptsptr != NULL && !is_ancillary) { kmem_free(hopoptsptr, hopoptslen); hopoptsptr = NULL; } if (*error != 0) { ASSERT(mp != NULL); - BUMP_MIB(&udp->udp_mib, udpOutErrors); + BUMP_MIB(&us->us_udp_mib, udpOutErrors); } return (mp); } @@ -7988,26 +7260,17 @@ udp_wput_other(queue_t *q, mblk_t *mp) "udp_wput_other_end: q %p (%S)", q, "unbindreq"); return; case T_SVR4_OPTMGMT_REQ: - if (!snmpcom_req(q, mp, udp_snmp_set, udp_snmp_get, cr)) - /* - * Use upper queue for option processing in - * case the request is not handled at this - * level and needs to be passed down to IP. - */ - (void) svr4_optcom_req(_WR(UDP_RD(q)), - mp, cr, &udp_opt_obj); + if (!snmpcom_req(q, mp, udp_snmp_set, ip_snmp_get, + cr)) { + (void) svr4_optcom_req(q, + mp, cr, &udp_opt_obj, B_TRUE); + } TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); return; case T_OPTMGMT_REQ: - /* - * Use upper queue for option processing in - * case the request is not handled at this - * level and needs to be passed down to IP. - */ - (void) tpi_optcom_req(_WR(UDP_RD(q)), - mp, cr, &udp_opt_obj); + (void) tpi_optcom_req(q, mp, cr, &udp_opt_obj, B_TRUE); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "optmgmtreq"); return; @@ -8057,7 +7320,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) iocp->ioc_error = ENOTCONN; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; - putnext(UDP_RD(q), mp); + qreply(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "getpeername"); @@ -8081,7 +7344,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) /* nd_getset performs the necessary checking */ case ND_GET: if (nd_getset(q, us->us_nd, mp)) { - putnext(UDP_RD(q), mp); + qreply(q, mp); TRACE_2(TR_FAC_UDP, TR_UDP_WPUT_OTHER_END, "udp_wput_other_end: q %p (%S)", q, "get"); return; @@ -8107,7 +7370,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) * stream interface and drain any * queued data. */ - udp_rcv_drain(UDP_RD(q), udp, + udp_rcv_drain(RD(q), udp, B_FALSE); ASSERT(!udp->udp_direct_sockfs); UDP_STAT(us, udp_sock_fallback); @@ -8117,7 +7380,7 @@ udp_wput_other(queue_t *q, mblk_t *mp) } iocp->ioc_count = 0; iocp->ioc_rval = 0; - putnext(UDP_RD(q), mp); + qreply(q, mp); return; default: break; @@ -8137,14 +7400,6 @@ udp_wput_other(queue_t *q, mblk_t *mp) ip_output(connp, mp, q, IP_WPUT); } -/* ARGSUSED */ -static void -udp_wput_other_wrapper(void *arg, mblk_t *mp, void *arg2) -{ - udp_wput_other(((conn_t *)arg)->conn_wq, mp); - udp_exit((conn_t *)arg); -} - /* * udp_wput_iocdata is called by udp_wput_other to handle all M_IOCDATA * messages. @@ -8171,7 +7426,6 @@ udp_wput_iocdata(queue_t *q, mblk_t *mp) return; } - q = WR(UDP_RD(q)); switch (mi_copy_state(q, mp, &mp1)) { case -1: return; @@ -8317,11 +7571,7 @@ udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, udreqp = (struct T_unitdata_req *)mp->b_rptr; - /* - * Use upper queue for option processing since the callback - * routines expect to be called in UDP instance instead of IP. 
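The done: handling above is the tail of a record-then-apply pattern: the new stream-head write offset is computed while udp_rwlock is held, but mi_set_sth_wroff() itself is deferred until after rw_exit(), so no STREAMS call is made with the endpoint lock held. Condensed into one place (the helper name udp_update_wroff() is illustrative and the snippet assumes udp.c's existing includes), the pattern looks like this:

static void
udp_update_wroff(udp_t *udp, queue_t *q, uint_t hdr_len)
{
	udp_stack_t *us = udp->udp_us;
	size_t sth_wroff = 0;

	rw_enter(&udp->udp_rwlock, RW_READER);
	if (hdr_len > udp->udp_max_hdr_len) {
		/*
		 * As in the hunk above, udp_max_hdr_len is adjusted while
		 * the lock is held; only the STREAMS call is deferred.
		 */
		udp->udp_max_hdr_len = hdr_len;
		sth_wroff = udp->udp_max_hdr_len + us->us_wroff_extra;
	}
	rw_exit(&udp->udp_rwlock);

	/* Now that udp_rwlock is released it is safe to call STREAMS */
	if (sth_wroff != 0)
		(void) mi_set_sth_wroff(RD(q), sth_wroff);
}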
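The same hunks also show the plumbing change in miniature: with UDP acting as a driver, ioctl replies are turned around on the stream's own read side with qreply() instead of being putnext()ed on a separately stashed upper read queue (the old UDP_RD(q)). A minimal sketch of the ack idiom used by the TI_GETPEERNAME case above; the helper name udp_ioctl_ack() is hypothetical.

static void
udp_ioctl_ack(queue_t *q, mblk_t *mp, int err)
{
	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;

	iocp->ioc_error = err;			/* e.g. ENOTCONN above */
	iocp->ioc_count = 0;
	iocp->ioc_rval = 0;
	mp->b_datap->db_type = M_IOCACK;	/* M_IOCNAK would reject it */
	qreply(q, mp);				/* up our own read queue */
}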
- */ - *errorp = tpi_optcom_buf(_WR(UDP_RD(q)), mp, &udreqp->OPT_length, + *errorp = tpi_optcom_buf(q, mp, &udreqp->OPT_length, udreqp->OPT_offset, cr, &udp_opt_obj, udpattrs, &is_absreq_failure); @@ -8339,13 +7589,9 @@ udp_unitdata_opt_process(queue_t *q, mblk_t *mp, int *errorp, void udp_ddi_init(void) { - UDP6_MAJ = ddi_name_to_major(UDP6); udp_max_optsize = optcom_max_optsize(udp_opt_obj.odb_opt_des_arr, udp_opt_obj.odb_opt_arr_cnt); - udp_cache = kmem_cache_create("udp_cache", sizeof (udp_t), - CACHE_ALIGN_SIZE, NULL, NULL, NULL, NULL, NULL, 0); - /* * We want to be informed each time a stack is created or * destroyed in the kernel, so we can maintain the @@ -8358,8 +7604,6 @@ void udp_ddi_destroy(void) { netstack_unregister(NS_UDP); - - kmem_cache_destroy(udp_cache); } /* @@ -8584,17 +7828,6 @@ udp_kstat_update(kstat_t *kp, int rw) return (0); } -/* ARGSUSED */ -static void -udp_rput(queue_t *q, mblk_t *mp) -{ - /* - * We get here whenever we do qreply() from IP, - * i.e as part of handlings ioctls, etc. - */ - putnext(q, mp); -} - /* * Read-side synchronous stream info entry point, called as a * result of handling certain STREAMS ioctl operations. @@ -8606,7 +7839,7 @@ udp_rinfop(queue_t *q, infod_t *dp) uint_t cmd = dp->d_cmd; int res = 0; int error = 0; - udp_t *udp = Q_TO_UDP(RD(UDP_WR(q))); + udp_t *udp = Q_TO_UDP(q); struct stdata *stp = STREAM(q); mutex_enter(&udp->udp_drain_lock); @@ -8681,12 +7914,9 @@ static int udp_rrw(queue_t *q, struiod_t *dp) { mblk_t *mp; - udp_t *udp = Q_TO_UDP(_RD(UDP_WR(q))); + udp_t *udp = Q_TO_UDP(q); udp_stack_t *us = udp->udp_us; - /* We should never get here when we're in SNMP mode */ - ASSERT(!(udp->udp_connp->conn_flags & IPCL_UDPMOD)); - /* * Dequeue datagram from the head of the list and return * it to caller; also ensure that RSLEEP sd_wakeq flag is @@ -8850,3 +8080,40 @@ udp_set_rcv_hiwat(udp_t *udp, size_t size) udp->udp_rcv_hiwat = size; return (size); } + +/* + * For the lower queue so that UDP can be a dummy mux. + * Nobody should be sending + * packets up this stream + */ +static void +udp_lrput(queue_t *q, mblk_t *mp) +{ + mblk_t *mp1; + + switch (mp->b_datap->db_type) { + case M_FLUSH: + /* Turn around */ + if (*mp->b_rptr & FLUSHW) { + *mp->b_rptr &= ~FLUSHR; + qreply(q, mp); + return; + } + break; + } + /* Could receive messages that passed through ar_rput */ + for (mp1 = mp; mp1; mp1 = mp1->b_cont) + mp1->b_prev = mp1->b_next = NULL; + freemsg(mp); +} + +/* + * For the lower queue so that UDP can be a dummy mux. + * Nobody should be sending packets down this stream. + */ +/* ARGSUSED */ +void +udp_lwput(queue_t *q, mblk_t *mp) +{ + freemsg(mp); +} |
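udp_lrput() and udp_lwput() only come into play once they are wired into the lower half of UDP's streamtab, which is what lets the driver be linked under as a dummy mux. The sketch below shows that shape only: every symbol prefixed udp_sketch_ and the module_info values are placeholders invented for the illustration, not the declarations that exist elsewhere in udp.c, and the upper qinits are left as extern stubs.

#include <sys/types.h>
#include <sys/stream.h>

/* Placeholder id and watermark values, for the sketch only */
static struct module_info udp_sketch_minfo = {
	5607, "udp", 1, INFPSZ, 65536, 1024
};

/* Upper half (the real udp_wput()/read-side entry points), not shown */
extern struct qinit udp_sketch_rinit, udp_sketch_winit;

/* Lower mux queues: everything lands in udp_lrput()/udp_lwput() */
static struct qinit udp_sketch_lrinit = {
	(pfi_t)udp_lrput, NULL, NULL, NULL, NULL, &udp_sketch_minfo
};

static struct qinit udp_sketch_lwinit = {
	(pfi_t)udp_lwput, NULL, NULL, NULL, NULL, &udp_sketch_minfo
};

/*
 * The third and fourth streamtab slots (st_muxrinit/st_muxwinit) are the
 * qinits STREAMS installs on the lower queues when a stream is I_LINKed
 * under the driver, so a stream linked beneath this dummy mux simply has
 * its traffic freed by the routines above.
 */
struct streamtab udp_sketch_info = {
	&udp_sketch_rinit, &udp_sketch_winit,
	&udp_sketch_lrinit, &udp_sketch_lwinit
};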