Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/uts/common/fs/sockfs/sockcommon_sops.c | 2
-rw-r--r-- | usr/src/uts/common/fs/sockfs/sockcommon_subr.c | 406
-rw-r--r-- | usr/src/uts/common/fs/sockfs/socktpi.c | 93
-rw-r--r-- | usr/src/uts/common/fs/sockfs/socktpi.h | 13
-rw-r--r-- | usr/src/uts/common/inet/ip/icmp.c | 121
-rw-r--r-- | usr/src/uts/common/inet/rawip_impl.h | 4
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 243
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_fusion.c | 1
-rw-r--r-- | usr/src/uts/common/inet/tcp_impl.h | 4
-rw-r--r-- | usr/src/uts/common/inet/udp/udp.c | 122
-rw-r--r-- | usr/src/uts/common/inet/udp_impl.h | 2
-rw-r--r-- | usr/src/uts/common/sys/socket_proto.h | 4
-rw-r--r-- | usr/src/uts/common/sys/socketvar.h | 12
13 files changed, 719 insertions, 308 deletions
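
The interface change tying these files together: the protocol fallback downcall (so_proto_fallback_func_t, implemented by tcp_fallback/udp_fallback/rawip_fallback) now returns an int instead of void, so sockfs can detect a failed fallback and revert the sonode rather than assuming success. A minimal sketch of the caller side, condensed from so_tpi_fallback() in sockcommon_subr.c below (error handling simplified for illustration):

    /* Tell the protocol to start using TPI; the downcall now reports failure. */
    error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
    if (error != 0) {
            /* The protocol could not quiesce; undo the TPI conversion. */
            sotpi_revert_sonode(so, cr);
            goto out;
    }
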
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c index 46831888d5..7b21facf35 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c @@ -1203,7 +1203,7 @@ so_queue_msg(sock_upper_handle_t sock_handle, mblk_t *mp, } mutex_enter(&so->so_lock); - if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) { + if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) { SOD_DISABLE(sodp); mutex_exit(&so->so_lock); *errorp = EOPNOTSUPP; diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c index dd114dbc26..d01447c48a 100644 --- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c +++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c @@ -37,6 +37,7 @@ #include <sys/strsubr.h> #include <sys/strsun.h> #include <sys/atomic.h> +#include <sys/tihdr.h> #include <fs/sockfs/sockcommon.h> #include <fs/sockfs/socktpi.h> @@ -1515,8 +1516,75 @@ socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode, } /* - * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified - * then the socket will fall back to TPI. + * Handle the I_NREAD STREAM ioctl. + */ +static int +so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp) +{ + size_t size = 0; + int retval; + int count = 0; + mblk_t *mp; + + if (so->so_downcalls == NULL || + so->so_downcalls->sd_recv_uio != NULL) + return (EINVAL); + + mutex_enter(&so->so_lock); + /* Wait for reader to get out of the way. */ + while (so->so_flag & SOREADLOCKED) { + /* + * If reader is waiting for data, then there should be nothing + * on the rcv queue. + */ + if (so->so_rcv_wakeup) + goto out; + + so->so_flag |= SOWANT; + /* Do a timed sleep, in case the reader goes to sleep. */ + (void) cv_timedwait(&so->so_state_cv, &so->so_lock, + lbolt + drv_usectohz(10)); + } + + /* + * Since we are holding so_lock no new reader will come in, and the + * protocol will not be able to enqueue data. So it's safe to walk + * both rcv queues. + */ + mp = so->so_rcv_q_head; + if (mp != NULL) { + size = msgdsize(so->so_rcv_q_head); + for (; mp != NULL; mp = mp->b_next) + count++; + } else { + /* + * In case the processing list was empty, get the size of the + * next msg in line. + */ + size = msgdsize(so->so_rcv_head); + } + + for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next) + count++; +out: + mutex_exit(&so->so_lock); + + /* + * Drop down from size_t to the "int" required by the + * interface. Cap at INT_MAX. + */ + retval = MIN(size, INT_MAX); + if (so_copyout(&retval, (void *)arg, sizeof (retval), + (mode & (int)FKIOCTL))) { + return (EFAULT); + } else { + *rvalp = count; + return (0); + } +} + +/* + * Process STREAM ioctls. 
* * Returns: * < 0 - ioctl was not handle @@ -1526,32 +1594,42 @@ int socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode, struct cred *cr, int32_t *rvalp) { - switch (cmd) { - case _I_INSERT: - case _I_REMOVE: - case I_FIND: - case I_LIST: - return (EOPNOTSUPP); + int retval; - case I_PUSH: - case I_POP: { - int retval; + /* Only STREAM iotcls are handled here */ + if ((cmd & 0xffffff00U) != STR) + return (-1); - if ((retval = so_tpi_fallback(so, cr)) == 0) { - /* Reissue the ioctl */ - ASSERT(so->so_rcv_q_head == NULL); - return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); - } - return (retval); - } + switch (cmd) { + case I_CANPUT: + /* + * We return an error for I_CANPUT so that isastream(3C) will + * not report the socket as being a STREAM. + */ + return (EOPNOTSUPP); + case I_NREAD: + /* Avoid doing a fallback for I_NREAD. */ + return (so_strioc_nread(so, arg, mode, rvalp)); case I_LOOK: + /* Avoid doing a fallback for I_LOOK. */ if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1, (mode & (int)FKIOCTL))) { return (EFAULT); } return (0); default: - return (-1); + break; + } + + /* + * Try to fall back to TPI, and if successful, reissue the ioctl. + */ + if ((retval = so_tpi_fallback(so, cr)) == 0) { + /* Reissue the ioctl */ + ASSERT(so->so_rcv_q_head == NULL); + return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp)); + } else { + return (retval); } } @@ -1851,7 +1929,7 @@ so_end_fallback(struct sonode *so) ASSERT(RW_ISWRITER(&so->so_fallback_rwlock)); mutex_enter(&so->so_lock); - so->so_state &= ~SS_FALLBACK_PENDING; + so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN); mutex_exit(&so->so_lock); rw_downgrade(&so->so_fallback_rwlock); @@ -1867,8 +1945,6 @@ so_end_fallback(struct sonode *so) * is safe to synchronize the state. Data can also be moved without * risk for reordering. * - * NOTE: urgent data is dropped on the floor. - * * We do not need to hold so_lock, since there can be only one thread * operating on the sonode. */ @@ -1878,15 +1954,21 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, struct sockaddr *faddr, socklen_t faddrlen, short opts) { struct sonode *so = (struct sonode *)sock_handle; + boolean_t atmark; sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts); + /* + * Some protocols do not quiece the data path during fallback. Once + * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will + * fail and the protocol is responsible for saving the data for later + * delivery (i.e., once the fallback has completed). + */ mutex_enter(&so->so_lock); + so->so_state |= SS_FALLBACK_DRAIN; SOCKET_TIMER_CANCEL(so); mutex_exit(&so->so_lock); - /* - * Move data to the STREAM head. - */ + if (so->so_rcv_head != NULL) { if (so->so_rcv_q_last_head == NULL) so->so_rcv_q_head = so->so_rcv_head; @@ -1895,6 +1977,20 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, so->so_rcv_q_last_head = so->so_rcv_last_head; } + atmark = (so->so_state & SS_RCVATMARK) != 0; + /* + * Clear any OOB state having to do with pending data. The TPI + * code path will set the appropriate oob state when we move the + * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob + * data has already been consumed. + */ + so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA); + + ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued); + + /* + * Move data to the STREAM head. 
+ */ while (so->so_rcv_q_head != NULL) { mblk_t *mp = so->so_rcv_q_head; size_t mlen = msgdsize(mp); @@ -1902,33 +1998,200 @@ so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q, so->so_rcv_q_head = mp->b_next; mp->b_next = NULL; mp->b_prev = NULL; + + /* + * Send T_EXDATA_IND if we are at the oob mark. + */ + if (atmark) { + struct T_exdata_ind *tei; + mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp; + + SOTOTPI(so)->sti_exdata_mp = NULL; + ASSERT(mp1 != NULL); + mp1->b_datap->db_type = M_PROTO; + tei = (struct T_exdata_ind *)mp1->b_rptr; + tei->PRIM_type = T_EXDATA_IND; + tei->MORE_flag = 0; + mp1->b_wptr = (uchar_t *)&tei[1]; + + if (IS_SO_OOB_INLINE(so)) { + mp1->b_cont = mp; + } else { + ASSERT(so->so_oobmsg != NULL); + mp1->b_cont = so->so_oobmsg; + so->so_oobmsg = NULL; + + /* process current mp next time around */ + mp->b_next = so->so_rcv_q_head; + so->so_rcv_q_head = mp; + mlen = 0; + } + mp = mp1; + + /* we have consumed the oob mark */ + atmark = B_FALSE; + } else if (so->so_oobmark > 0) { + /* + * Check if the OOB mark is within the current + * mblk chain. In that case we have to split it up. + */ + if (so->so_oobmark < mlen) { + mblk_t *urg_mp = mp; + + atmark = B_TRUE; + mp = NULL; + mlen = so->so_oobmark; + + /* + * It is assumed that the OOB mark does + * not land within a mblk. + */ + do { + so->so_oobmark -= MBLKL(urg_mp); + mp = urg_mp; + urg_mp = urg_mp->b_cont; + } while (so->so_oobmark > 0); + mp->b_cont = NULL; + if (urg_mp != NULL) { + urg_mp->b_next = so->so_rcv_q_head; + so->so_rcv_q_head = urg_mp; + } + } else { + so->so_oobmark -= mlen; + if (so->so_oobmark == 0) + atmark = B_TRUE; + } + } + + /* + * Queue data on the STREAM head. + */ so->so_rcv_queued -= mlen; putnext(q, mp); } - ASSERT(so->so_rcv_queued == 0); so->so_rcv_head = NULL; so->so_rcv_last_head = NULL; so->so_rcv_q_head = NULL; so->so_rcv_q_last_head = NULL; -#ifdef DEBUG - if (so->so_oobmsg != NULL || so->so_oobmark > 0) { - cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n"); + /* + * Check if the oob byte is at the end of the data stream, or if the + * oob byte has not yet arrived. In the latter case we have to send a + * SIGURG and a mark indicator to the STREAM head. The mark indicator + * is needed to guarantee correct behavior for SIOCATMARK. See block + * comment in socktpi.h for more details. + */ + if (atmark || so->so_oobmark > 0) { + mblk_t *mp; + + if (atmark && so->so_oobmsg != NULL) { + struct T_exdata_ind *tei; + + mp = SOTOTPI(so)->sti_exdata_mp; + SOTOTPI(so)->sti_exdata_mp = NULL; + ASSERT(mp != NULL); + mp->b_datap->db_type = M_PROTO; + tei = (struct T_exdata_ind *)mp->b_rptr; + tei->PRIM_type = T_EXDATA_IND; + tei->MORE_flag = 0; + mp->b_wptr = (uchar_t *)&tei[1]; + + mp->b_cont = so->so_oobmsg; + so->so_oobmsg = NULL; + + putnext(q, mp); + } else { + /* Send up the signal */ + mp = SOTOTPI(so)->sti_exdata_mp; + SOTOTPI(so)->sti_exdata_mp = NULL; + ASSERT(mp != NULL); + DB_TYPE(mp) = M_PCSIG; + *mp->b_wptr++ = (uchar_t)SIGURG; + putnext(q, mp); + + /* Send up the mark indicator */ + mp = SOTOTPI(so)->sti_urgmark_mp; + SOTOTPI(so)->sti_urgmark_mp = NULL; + mp->b_flag = atmark ? 
MSGMARKNEXT : MSGNOTMARKNEXT; + putnext(q, mp); + + so->so_oobmark = 0; + } } -#endif - if (so->so_oobmsg != NULL) { - freemsg(so->so_oobmsg); - so->so_oobmsg = NULL; + + if (SOTOTPI(so)->sti_exdata_mp != NULL) { + freeb(SOTOTPI(so)->sti_exdata_mp); + SOTOTPI(so)->sti_exdata_mp = NULL; } - so->so_oobmark = 0; + if (SOTOTPI(so)->sti_urgmark_mp != NULL) { + freeb(SOTOTPI(so)->sti_urgmark_mp); + SOTOTPI(so)->sti_urgmark_mp = NULL; + } + + ASSERT(so->so_oobmark == 0); ASSERT(so->so_rcv_queued == 0); } +#ifdef DEBUG +/* + * Do an integrity check of the sonode. This should be done if a + * fallback fails after sonode has initially been converted to use + * TPI and subsequently have to be reverted. + * + * Failure to pass the integrity check will panic the system. + */ +void +so_integrity_check(struct sonode *cur, struct sonode *orig) +{ + VERIFY(cur->so_vnode == orig->so_vnode); + VERIFY(cur->so_ops == orig->so_ops); + /* + * For so_state we can only VERIFY the state flags in CHECK_STATE. + * The other state flags might be affected by a notification from the + * protocol. + */ +#define CHECK_STATE (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \ + SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \ + SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG) + VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) == + (orig->so_state & CHECK_STATE)); + VERIFY(cur->so_mode == orig->so_mode); + VERIFY(cur->so_flag == orig->so_flag); + VERIFY(cur->so_count == orig->so_count); + /* Cannot VERIFY so_proto_connid; proto can update it */ + VERIFY(cur->so_sockparams == orig->so_sockparams); + /* an error might have been recorded, but it can not be lost */ + VERIFY(cur->so_error != 0 || orig->so_error == 0); + VERIFY(cur->so_family == orig->so_family); + VERIFY(cur->so_type == orig->so_type); + VERIFY(cur->so_protocol == orig->so_protocol); + VERIFY(cur->so_version == orig->so_version); + /* New conns might have arrived, but none should have been lost */ + VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len); + VERIFY(cur->so_acceptq_head == orig->so_acceptq_head); + VERIFY(cur->so_backlog == orig->so_backlog); + /* New OOB migth have arrived, but mark should not have been lost */ + VERIFY(cur->so_oobmark >= orig->so_oobmark); + /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */ + VERIFY(cur->so_pgrp == orig->so_pgrp); + VERIFY(cur->so_peercred == orig->so_peercred); + VERIFY(cur->so_cpid == orig->so_cpid); + VERIFY(cur->so_zoneid == orig->so_zoneid); + /* New data migth have arrived, but none should have been lost */ + VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued); + VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head); + VERIFY(cur->so_rcv_head == orig->so_rcv_head); + VERIFY(cur->so_proto_handle == orig->so_proto_handle); + VERIFY(cur->so_downcalls == orig->so_downcalls); + /* Cannot VERIFY so_proto_props; they can be updated by proto */ +} +#endif + /* * so_tpi_fallback() * - * This is fallback initation routine; things start here. + * This is the fallback initation routine; things start here. 
* * Basic strategy: * o Block new socket operations from coming in @@ -1944,10 +2207,13 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) int error; queue_t *q; struct sockparams *sp; - struct sockparams *newsp; + struct sockparams *newsp = NULL; so_proto_fallback_func_t fbfunc; boolean_t direct; - + struct sonode *nso; +#ifdef DEBUG + struct sonode origso; +#endif error = 0; sp = so->so_sockparams; fbfunc = sp->sp_smod_info->smod_proto_fallback_func; @@ -1965,6 +2231,13 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) */ if (!so_start_fallback(so)) return (EAGAIN); +#ifdef DEBUG + /* + * Make a copy of the sonode in case we need to make an integrity + * check later on. + */ + bcopy(so, &origso, sizeof (*so)); +#endif newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type, so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath, @@ -1983,29 +2256,47 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) } /* Turn sonode into a TPI socket */ - q = sotpi_convert_sonode(so, newsp, &direct, cr); - if (q == NULL) { - zcmn_err(getzoneid(), CE_WARN, - "Failed to convert socket to TPI. Pid = %d\n", - curproc->p_pid); - SOCKPARAMS_DEC_REF(newsp); - error = EINVAL; + error = sotpi_convert_sonode(so, newsp, &direct, &q, cr); + if (error != 0) goto out; - } + /* * Now tell the protocol to start using TPI. so_quiesced_cb be * called once it's safe to synchronize state. */ DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so); - /* FIXME assumes this cannot fail. TCP can fail to enter squeue */ - (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb); + error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb); DTRACE_PROBE1(proto__fallback__end, struct sonode *, so); + if (error != 0) { + /* protocol was unable to do a fallback, revert the sonode */ + sotpi_revert_sonode(so, cr); + goto out; + } + /* - * Free all pending connection indications, i.e., socket_accept() has - * not yet pulled the connection of the queue. The transport sent - * a T_CONN_IND message for each pending connection to the STREAM head. + * Walk the accept queue and notify the proto that they should + * fall back to TPI. The protocol will send up the T_CONN_IND. + */ + nso = so->so_acceptq_head; + while (nso != NULL) { + int rval; + + DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso); + rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL); + DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso); + if (rval != 0) { + zcmn_err(getzoneid(), CE_WARN, + "Failed to convert socket in accept queue to TPI. " + "Pid = %d\n", curproc->p_pid); + } + nso = nso->so_acceptq_next; + } + + /* + * Now flush the acceptq, this will destroy all sockets. They will + * be recreated in sotpi_accept(). */ so_acceptq_flush(so); @@ -2020,10 +2311,6 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) so->so_ops = &sotpi_sonodeops; /* - * No longer a non streams socket - */ - so->so_not_str = B_FALSE; - /* * Wake up any threads stuck in poll. This is needed since the poll * head changes when the fallback happens (moves from the sonode to * the STREAMS head). @@ -2032,5 +2319,16 @@ so_tpi_fallback(struct sonode *so, struct cred *cr) out: so_end_fallback(so); + if (error != 0) { +#ifdef DEBUG + so_integrity_check(so, &origso); +#endif + zcmn_err(getzoneid(), CE_WARN, + "Failed to convert socket to TPI (err=%d). 
Pid = %d\n", + error, curproc->p_pid); + if (newsp != NULL) + SOCKPARAMS_DEC_REF(newsp); + } + return (error); } diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c index d801c1e14f..80738f5fa8 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.c +++ b/usr/src/uts/common/fs/sockfs/socktpi.c @@ -6674,21 +6674,24 @@ socktpi_init(void) * Given a non-TPI sonode, allocate and prep it to be ready for TPI. * * Caller must still update state and mode using sotpi_update_state(). - * - * Returns the STREAM queue that the protocol should use. */ -queue_t * +int sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, - boolean_t *direct, struct cred *cr) + boolean_t *direct, queue_t **qp, struct cred *cr) { sotpi_info_t *sti; struct sockparams *origsp = so->so_sockparams; sock_lower_handle_t handle = so->so_proto_handle; - uint_t old_state = so->so_state; struct stdata *stp; struct vnode *vp; queue_t *q; + int error = 0; + ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) == + SS_FALLBACK_PENDING); + ASSERT(SOCK_IS_NONSTR(so)); + + *qp = NULL; *direct = B_FALSE; so->so_sockparams = newsp; /* @@ -6697,11 +6700,10 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, (void) sotpi_info_create(so, KM_SLEEP); sotpi_info_init(so); - if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) { + if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) { sotpi_info_fini(so); sotpi_info_destroy(so); - so->so_state = old_state; - return (NULL); + return (error); } ASSERT(handle == so->so_proto_handle); sti = SOTOTPI(so); @@ -6709,6 +6711,23 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, *direct = B_TRUE; /* + * When it comes to urgent data we have two cases to deal with; + * (1) The oob byte has already arrived, or (2) the protocol has + * notified that oob data is pending, but it has not yet arrived. + * + * For (1) all we need to do is send a T_EXDATA_IND to indicate were + * in the byte stream the oob byte is. For (2) we have to send a + * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether + * the oob byte will be the next byte from the protocol. + * + * So in the worst case we need two mblks, one for the signal, another + * for mark indication. In that case we use the exdata_mp for the sig. + */ + sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED, + STR_NOSIG, NULL); + sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); + + /* * Keep the original sp around so we can properly dispose of the * sonode when the socket is being closed. */ @@ -6728,10 +6747,8 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, * connection indications. */ if (so->so_pgrp != 0) { - mutex_enter(&so->so_lock); if (so_set_events(so, so->so_vnode, cr) != 0) so->so_pgrp = 0; - mutex_exit(&so->so_lock); } /* @@ -6748,9 +6765,52 @@ sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp, */ while (q->q_next != NULL) q = q->q_next; - q = _RD(q); + *qp = _RD(q); - return (q); + /* This is now a STREAMS sockets */ + so->so_not_str = B_FALSE; + + return (error); +} + +/* + * Revert a TPI sonode. It is only allowed to revert the sonode during + * the fallback process. 
+ */ +void +sotpi_revert_sonode(struct sonode *so, struct cred *cr) +{ + vnode_t *vp = SOTOV(so); + + ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) == + SS_FALLBACK_PENDING); + ASSERT(!SOCK_IS_NONSTR(so)); + ASSERT(vp->v_stream != NULL); + + if (SOTOTPI(so)->sti_exdata_mp != NULL) { + freeb(SOTOTPI(so)->sti_exdata_mp); + SOTOTPI(so)->sti_exdata_mp = NULL; + } + + if (SOTOTPI(so)->sti_urgmark_mp != NULL) { + freeb(SOTOTPI(so)->sti_urgmark_mp); + SOTOTPI(so)->sti_urgmark_mp = NULL; + } + + strclean(vp); + (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr); + + /* + * Restore the original sockparams. The caller is responsible for + * dropping the ref to the new sp. + */ + so->so_sockparams = SOTOTPI(so)->sti_orig_sp; + + sotpi_info_fini(so); + sotpi_info_destroy(so); + + /* This is no longer a STREAMS sockets */ + so->so_not_str = B_TRUE; } void @@ -6815,8 +6875,7 @@ sotpi_sototpi(struct sonode *so) { sotpi_info_t *sti; - if (so == NULL) - return (NULL); + ASSERT(so != NULL); sti = (sotpi_info_t *)so->so_priv; @@ -6845,6 +6904,9 @@ i_sotpi_info_constructor(sotpi_info_t *sti) sti->sti_nl7c_uri = NULL; sti->sti_nl7c_rcv_mp = NULL; + sti->sti_exdata_mp = NULL; + sti->sti_urgmark_mp = NULL; + mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL); @@ -6870,6 +6932,9 @@ i_sotpi_info_destructor(sotpi_info_t *sti) ASSERT(sti->sti_nl7c_uri == NULL); ASSERT(sti->sti_nl7c_rcv_mp == NULL); + ASSERT(sti->sti_exdata_mp == NULL); + ASSERT(sti->sti_urgmark_mp == NULL); + mutex_destroy(&sti->sti_plumb_lock); cv_destroy(&sti->sti_ack_cv); } diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h index 4c1a5de268..cee3a5da43 100644 --- a/usr/src/uts/common/fs/sockfs/socktpi.h +++ b/usr/src/uts/common/fs/sockfs/socktpi.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -252,6 +252,12 @@ typedef struct sotpi_info { kssl_endpt_type_t sti_kssl_type; /* is proxy/is proxied/none */ kssl_ent_t sti_kssl_ent; /* SSL config entry */ kssl_ctx_t sti_kssl_ctx; /* SSL session context */ + + /* + * The mblks below are only allocated and used during fallback. 
+ */ + mblk_t *sti_exdata_mp; /* T_EXDATA_IND or SIGURG */ + mblk_t *sti_urgmark_mp; /* mark indication */ } sotpi_info_t; struct T_capability_ack; @@ -259,8 +265,9 @@ struct T_capability_ack; extern sonodeops_t sotpi_sonodeops; extern int socktpi_init(void); -extern queue_t *sotpi_convert_sonode(struct sonode *, struct sockparams *, - boolean_t *, struct cred *); +extern int sotpi_convert_sonode(struct sonode *, struct sockparams *, + boolean_t *, queue_t **, struct cred *); +extern void sotpi_revert_sonode(struct sonode *, struct cred *); extern void sotpi_update_state(struct sonode *, struct T_capability_ack *, struct sockaddr *, socklen_t, struct sockaddr *, socklen_t, short); diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index e26254a51d..eb0162ae98 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -151,6 +151,7 @@ static int raw_ip_send_data_v4(queue_t *q, conn_t *connp, mblk_t *mp, static void icmp_wput_other(queue_t *q, mblk_t *mp); static void icmp_wput_iocdata(queue_t *q, mblk_t *mp); static void icmp_wput_restricted(queue_t *q, mblk_t *mp); +static void icmp_ulp_recv(conn_t *, mblk_t *); static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns); static void rawip_stack_fini(netstackid_t stackid, void *arg); @@ -1131,6 +1132,7 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) sin = sin_null; sin.sin_family = AF_INET; sin.sin_addr.s_addr = ipha->ipha_dst; + if (IPCL_IS_NONSTR(connp)) { rw_enter(&icmp->icmp_rwlock, RW_WRITER); if (icmp->icmp_state == TS_DATA_XFER) { @@ -1147,13 +1149,13 @@ icmp_icmp_error(conn_t *connp, mblk_t *mp) } rw_exit(&icmp->icmp_rwlock); } else { - mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL, 0, error); if (mp1 != NULL) putnext(connp->conn_rq, mp1); } done: + ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } @@ -1264,14 +1266,8 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - if (!IPCL_IS_NONSTR(connp)) { - putnext(connp->conn_rq, newmp); - } else { - (*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, newmp, 0, 0, &error, - NULL); - ASSERT(error == 0); - } + icmp_ulp_recv(connp, newmp); + return; } case ICMP6_TIME_EXCEEDED: @@ -1322,13 +1318,13 @@ icmp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) } rw_exit(&icmp->icmp_rwlock); } else { - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); if (mp1 != NULL) putnext(connp->conn_rq, mp1); } done: + ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock)); freemsg(mp); } @@ -3339,7 +3335,8 @@ icmp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) icmppa->icmp_param_value = new_value; return (0); } -static void + +static mblk_t * icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) { ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock)); @@ -3356,13 +3353,56 @@ icmp_queue_fallback(icmp_t *icmp, mblk_t *mp) icmp->icmp_fallback_queue_tail->b_next = mp; icmp->icmp_fallback_queue_tail = mp; } - mutex_exit(&icmp->icmp_recv_lock); + return (NULL); } else { /* - * no more fallbacks possible, ok to drop lock. + * Fallback completed, let the caller putnext() the mblk. */ - mutex_exit(&icmp->icmp_recv_lock); - putnext(icmp->icmp_connp->conn_rq, mp); + return (mp); + } +} + +/* + * Deliver data to ULP. In case we have a socket, and it's falling back to + * TPI, then we'll queue the mp for later processing. 
+ */ +static void +icmp_ulp_recv(conn_t *connp, mblk_t *mp) +{ + + if (IPCL_IS_NONSTR(connp)) { + icmp_t *icmp = connp->conn_icmp; + int error; + + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, + NULL) < 0) { + mutex_enter(&icmp->icmp_recv_lock); + if (error == ENOSPC) { + /* + * let's confirm while holding the lock + */ + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, + &error, NULL) < 0) { + ASSERT(error == ENOSPC); + if (error == ENOSPC) { + connp->conn_flow_cntrld = + B_TRUE; + } + } + mutex_exit(&icmp->icmp_recv_lock); + } else { + ASSERT(error == EOPNOTSUPP); + mp = icmp_queue_fallback(icmp, mp); + mutex_exit(&icmp->icmp_recv_lock); + if (mp != NULL) + putnext(connp->conn_rq, mp); + } + } + ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); + } else { + putnext(connp->conn_rq, mp); } } @@ -3391,7 +3431,6 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) uint_t icmp_opt = 0; boolean_t icmp_ipv6_recvhoplimit = B_FALSE; uint_t hopstrip; - int error; ASSERT(connp->conn_flags & IPCL_RAWIPCONN); @@ -4038,35 +4077,8 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2) BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams); deliver: - if (IPCL_IS_NONSTR(connp)) { - if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { - mutex_enter(&icmp->icmp_recv_lock); - if (error == ENOSPC) { - /* - * let's confirm while holding the lock - */ - if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, NULL, 0, 0, - &error, NULL) < 0) { - if (error == ENOSPC) { - connp->conn_flow_cntrld = - B_TRUE; - } else { - ASSERT(error == EOPNOTSUPP); - } - } - mutex_exit(&icmp->icmp_recv_lock); - } else { - ASSERT(error == EOPNOTSUPP); - icmp_queue_fallback(icmp, mp); - } - } - } else { - putnext(connp->conn_rq, mp); - } - ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock)); + icmp_ulp_recv(connp, mp); + } /* @@ -5968,7 +5980,7 @@ rawip_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, } /* ARGSUSED */ -void +int rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) { @@ -6032,20 +6044,14 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (icmp->icmp_dontroute) opts |= SO_DONTROUTE; - /* - * Once we grab the drain lock, no data will be send up - * to the socket. So we notify the socket that the endpoint - * is quiescent and it's therefore safe move data from - * the socket to the stream head. - */ (*quiesced_cb)(connp->conn_upper_handle, q, &tca, (struct sockaddr *)&laddr, laddrlen, (struct sockaddr *)&faddr, faddrlen, opts); /* - * push up any packets that were queued in icmp_t + * Attempts to send data up during fallback will result in it being + * queued in udp_t. Now we push up any queued packets. 
*/ - mutex_enter(&icmp->icmp_recv_lock); while (icmp->icmp_fallback_queue_head != NULL) { mblk_t *mp; @@ -6058,15 +6064,22 @@ rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q, mutex_enter(&icmp->icmp_recv_lock); } icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head; + /* * No longer a streams less socket */ + rw_enter(&icmp->icmp_rwlock, RW_WRITER); connp->conn_flags &= ~IPCL_NONSTR; + rw_exit(&icmp->icmp_rwlock); + mutex_exit(&icmp->icmp_recv_lock); + ASSERT(icmp->icmp_fallback_queue_head == NULL && icmp->icmp_fallback_queue_tail == NULL); ASSERT(connp->conn_ref >= 1); + + return (0); } /* ARGSUSED */ diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index f818247b67..241132b683 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -170,7 +170,7 @@ extern void icmp_ddi_g_destroy(void); extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); -extern void rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t, +extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t, so_proto_quiesced_cb_t); extern sock_downcalls_t sock_rawip_downcalls; diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index ddbcb82428..af748fe140 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -14059,6 +14059,11 @@ ok:; (*connp->conn_upcalls->su_recv) (connp->conn_upper_handle, mp, seg_len, MSG_OOB, &error, NULL); + /* + * We should never be in middle of a + * fallback, the squeue guarantees that. + */ + ASSERT(error != EOPNOTSUPP); mp = NULL; goto update_ack; } else if (!tcp->tcp_urp_mp) { @@ -15157,12 +15162,13 @@ update_ack: if ((*connp->conn_upcalls->su_recv) (connp->conn_upper_handle, mp, seg_len, 0, &error, NULL) <= 0) { - if (error == ENOSPC) { + /* + * We should never be in middle of a + * fallback, the squeue guarantees that. + */ + ASSERT(error != EOPNOTSUPP); + if (error == ENOSPC) tcp->tcp_rwnd -= seg_len; - } else if (error == EOPNOTSUPP) { - tcp_rcv_enqueue(tcp, mp, - seg_len); - } } } else if (sodp != NULL) { mutex_enter(sodp->sod_lockp); @@ -15216,11 +15222,13 @@ update_ack: if ((*connp->conn_upcalls->su_recv)( connp->conn_upper_handle, mp, seg_len, 0, &error, &push) <= 0) { - if (error == ENOSPC) { + /* + * We should never be in middle of a + * fallback, the squeue guarantees that. + */ + ASSERT(error != EOPNOTSUPP); + if (error == ENOSPC) tcp->tcp_rwnd -= seg_len; - } else if (error == EOPNOTSUPP) { - tcp_rcv_enqueue(tcp, mp, seg_len); - } } else if (push) { /* * PUSH bit set and sockfs is not @@ -18169,9 +18177,8 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) 0, &error, &push); if (space_left < 0) { /* - * At this point the eager is not - * visible to anyone, so fallback - * can not happen. + * We should never be in middle of a + * fallback, the squeue guarantees that. */ ASSERT(error != EOPNOTSUPP); } @@ -27700,72 +27707,34 @@ tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, /* * tcp_fallback * - * A direct socket is falling back to using STREAMS. Hanging - * off of the queue is a temporary tcp_t, which was created using - * tcp_open(). 
The tcp_open() was called as part of the regular - * sockfs create path, i.e., the SO_SOCKSTR flag is passed down, - * and therefore the temporary tcp_t is marked to be a socket - * (i.e., IPCL_SOCKET, tcp_issocket). So the optimizations - * introduced by FireEngine will be used. + * A direct socket is falling back to using STREAMS. The queue + * that is being passed down was created using tcp_open() with + * the SO_FALLBACK flag set. As a result, the queue is not + * associated with a conn, and the q_ptrs instead contain the + * dev and minor area that should be used. * - * The tcp_t associated with the socket falling back will - * still be marked as a socket, although the direct socket flag - * (IPCL_NONSTR) is removed. A fall back to true TPI semantics - * will not take place until a _SIOCSOCKFALLBACK ioctl is issued. - * - * If the above mentioned behavior, i.e., the tmp tcp_t is created - * as a STREAMS/TPI endpoint, then we will need to do more work here. - * Such as inserting the direct socket into the acceptor hash. + * The 'direct_sockfs' flag indicates whether the FireEngine + * optimizations should be used. The common case would be that + * optimizations are enabled, and they might be subsequently + * disabled using the _SIOCSOCKFALLBACK ioctl. + */ + +/* + * An active connection is falling back to TPI. Gather all the information + * required by the STREAM head and TPI sonode and send it up. */ void -tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, +tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) { - tcp_t *tcp, *eager; - conn_t *connp = (conn_t *)proto_handle; - int error; + conn_t *connp = tcp->tcp_connp; + struct stroptions *stropt; struct T_capability_ack tca; struct sockaddr_in6 laddr, faddr; socklen_t laddrlen, faddrlen; short opts; - struct stroptions *stropt; - mblk_t *stropt_mp; + int error; mblk_t *mp; - mblk_t *conn_ind_head = NULL; - mblk_t *conn_ind_tail = NULL; - mblk_t *ordrel_mp; - mblk_t *fused_sigurp_mp; - - tcp = connp->conn_tcp; - /* - * No support for acceptor fallback - */ - ASSERT(q->q_qinfo != &tcp_acceptor_rinit); - - stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL); - - /* Pre-allocate the T_ordrel_ind mblk. */ - ASSERT(tcp->tcp_ordrel_mp == NULL); - ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, - STR_NOSIG, NULL); - ordrel_mp->b_datap->db_type = M_PROTO; - ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; - ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); - - /* Pre-allocate the M_PCSIG anyway */ - fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL); - - /* - * Enter the squeue so that no new packets can come in - */ - error = squeue_synch_enter(connp->conn_sqp, connp, 0); - if (error != 0) { - /* failed to enter, free all the pre-allocated messages. 
*/ - freeb(stropt_mp); - freeb(ordrel_mp); - freeb(fused_sigurp_mp); - return; - } /* Disable I/OAT during fallback */ tcp->tcp_sodirect = NULL; @@ -27814,10 +27783,8 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_getsockname(proto_handle, (struct sockaddr *)&laddr, - &laddrlen, CRED()); - error = tcp_getpeername(proto_handle, (struct sockaddr *)&faddr, - &faddrlen, CRED()); + (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen); + error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen); if (error != 0) faddrlen = 0; @@ -27844,60 +27811,112 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, tcp->tcp_rcv_last_head = NULL; tcp->tcp_rcv_last_tail = NULL; tcp->tcp_rcv_cnt = 0; +} + +/* + * An eager is falling back to TPI. All we have to do is send + * up a T_CONN_IND. + */ +void +tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) +{ + tcp_t *listener = eager->tcp_listener; + mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind; + + ASSERT(listener != NULL); + ASSERT(mp != NULL); + + eager->tcp_conn.tcp_eager_conn_ind = NULL; /* - * No longer a direct socket + * TLI/XTI applications will get confused by + * sending eager as an option since it violates + * the option semantics. So remove the eager as + * option since TLI/XTI app doesn't need it anyway. */ - connp->conn_flags &= ~IPCL_NONSTR; - - tcp->tcp_ordrel_mp = ordrel_mp; + if (!direct_sockfs) { + struct T_conn_ind *conn_ind; - if (tcp->tcp_fused) { - ASSERT(tcp->tcp_fused_sigurg_mp == NULL); - tcp->tcp_fused_sigurg_mp = fused_sigurp_mp; - } else { - freeb(fused_sigurp_mp); + conn_ind = (struct T_conn_ind *)mp->b_rptr; + conn_ind->OPT_length = 0; + conn_ind->OPT_offset = 0; } /* - * Send T_CONN_IND messages for all ESTABLISHED connections. + * Sockfs guarantees that the listener will not be closed + * during fallback. So we can safely use the listener's queue. */ - mutex_enter(&tcp->tcp_eager_lock); - for (eager = tcp->tcp_eager_next_q; eager != NULL; - eager = eager->tcp_eager_next_q) { - mp = eager->tcp_conn.tcp_eager_conn_ind; + putnext(listener->tcp_rq, mp); +} - eager->tcp_conn.tcp_eager_conn_ind = NULL; - ASSERT(mp != NULL); +int +tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) +{ + tcp_t *tcp; + conn_t *connp = (conn_t *)proto_handle; + int error; + mblk_t *stropt_mp; + mblk_t *ordrel_mp; + mblk_t *fused_sigurp_mp; + + tcp = connp->conn_tcp; + + stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, + NULL); + + /* Pre-allocate the T_ordrel_ind mblk. */ + ASSERT(tcp->tcp_ordrel_mp == NULL); + ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, + STR_NOSIG, NULL); + ordrel_mp->b_datap->db_type = M_PROTO; + ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; + ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); + + /* Pre-allocate the M_PCSIG used by fusion */ + fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL); + + /* + * Enter the squeue so that no new packets can come in + */ + error = squeue_synch_enter(connp->conn_sqp, connp, 0); + if (error != 0) { + /* failed to enter, free all the pre-allocated messages. */ + freeb(stropt_mp); + freeb(ordrel_mp); + freeb(fused_sigurp_mp); /* - * TLI/XTI applications will get confused by - * sending eager as an option since it violates - * the option semantics. 
So remove the eager as - * option since TLI/XTI app doesn't need it anyway. + * We cannot process the eager, so at least send out a + * RST so the peer can reconnect. */ - if (!TCP_IS_SOCKET(tcp)) { - struct T_conn_ind *conn_ind; - - conn_ind = (struct T_conn_ind *)mp->b_rptr; - conn_ind->OPT_length = 0; - conn_ind->OPT_offset = 0; + if (tcp->tcp_listener != NULL) { + (void) tcp_eager_blowoff(tcp->tcp_listener, + tcp->tcp_conn_req_seqnum); } - if (conn_ind_head == NULL) { - conn_ind_head = mp; - } else { - conn_ind_tail->b_next = mp; - } - conn_ind_tail = mp; + return (ENOMEM); } - mutex_exit(&tcp->tcp_eager_lock); - mp = conn_ind_head; - while (mp != NULL) { - mblk_t *nmp = mp->b_next; - mp->b_next = NULL; + /* + * No longer a direct socket + */ + connp->conn_flags &= ~IPCL_NONSTR; - putnext(tcp->tcp_rq, mp); - mp = nmp; + tcp->tcp_ordrel_mp = ordrel_mp; + + if (tcp->tcp_fused) { + ASSERT(tcp->tcp_fused_sigurg_mp == NULL); + tcp->tcp_fused_sigurg_mp = fused_sigurp_mp; + } else { + freeb(fused_sigurp_mp); + } + + if (tcp->tcp_listener != NULL) { + /* The eager will deal with opts when accept() is called */ + freeb(stropt_mp); + tcp_fallback_eager(tcp, direct_sockfs); + } else { + tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, + quiesced_cb); } /* @@ -27905,6 +27924,8 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, */ ASSERT(connp->conn_ref >= 2); squeue_synch_exit(connp->conn_sqp, connp); + + return (0); } /* ARGSUSED */ diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index a8b6780cef..7ac90ce8ca 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -805,6 +805,7 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) (*peer_tcp->tcp_connp->conn_upcalls->su_recv)( peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size, flags, &error, &push); + ASSERT(error != EOPNOTSUPP); } else { if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) && (tcp->tcp_valid_bits & TCP_URG_VALID) && diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 97374be482..e61c854923 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -277,7 +277,7 @@ extern uint_t tcp_max_optsize; extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); -extern void tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, +extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, so_proto_quiesced_cb_t); extern sock_downcalls_t sock_tcp_downcalls; diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index f141ee0222..c473afdfc5 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -227,6 +227,7 @@ static void udp_xmit(queue_t *, mblk_t *, ire_t *ire, conn_t *, zoneid_t); static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *, cred_t *, pid_t); +static void udp_ulp_recv(conn_t *, mblk_t *); /* Common routine for TPI and socket module */ static conn_t *udp_do_open(cred_t *, boolean_t, int); @@ -1206,7 +1207,7 @@ udp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, */ static void udp_icmp_error(conn_t *connp, mblk_t *mp) - { +{ icmph_t *icmph; ipha_t *ipha; int iph_hdr_length; @@ -1286,7 +1287,6 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) if (sin.sin_port == udp->udp_dstport && sin.sin_addr.s_addr == V4_PART_OF_V6(udp->udp_v6dst)) { - rw_exit(&udp->udp_rwlock); (*connp->conn_upcalls->su_set_error) (connp->conn_upper_handle, error); @@ -1324,7 +1324,6 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) } rw_exit(&udp->udp_rwlock); } else { - mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t), NULL, 0, error); } @@ -1333,6 +1332,7 @@ udp_icmp_error(conn_t *connp, mblk_t *mp) if (mp1 != NULL) putnext(connp->conn_rq, mp1); done: + ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -1444,13 +1444,8 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) * message. Free it, then send our empty message. */ freemsg(mp); - if (!IPCL_IS_NONSTR(connp)) { - putnext(connp->conn_rq, newmp); - } else { - (*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, newmp, 0, 0, &error, - NULL); - } + udp_ulp_recv(connp, newmp); + return; } case ICMP6_TIME_EXCEEDED: @@ -1508,8 +1503,8 @@ udp_icmp_error_ipv6(conn_t *connp, mblk_t *mp) if (mp1 != NULL) putnext(connp->conn_rq, mp1); } - done: + ASSERT(!RW_ISWRITER(&udp->udp_rwlock)); freemsg(mp); } @@ -3689,7 +3684,7 @@ udp_save_ip_rcv_opt(udp_t *udp, void *opt, int opt_len) } } -static void +static mblk_t * udp_queue_fallback(udp_t *udp, mblk_t *mp) { ASSERT(MUTEX_HELD(&udp->udp_recv_lock)); @@ -3706,13 +3701,55 @@ udp_queue_fallback(udp_t *udp, mblk_t *mp) udp->udp_fallback_queue_tail->b_next = mp; udp->udp_fallback_queue_tail = mp; } - mutex_exit(&udp->udp_recv_lock); + return (NULL); } else { /* - * no more fallbacks possible, ok to drop lock. + * Fallback completed, let the caller putnext() the mblk. */ - mutex_exit(&udp->udp_recv_lock); - putnext(udp->udp_connp->conn_rq, mp); + return (mp); + } +} + +/* + * Deliver data to ULP. In case we have a socket, and it's falling back to + * TPI, then we'll queue the mp for later processing. 
+ */ +static void +udp_ulp_recv(conn_t *connp, mblk_t *mp) +{ + if (IPCL_IS_NONSTR(connp)) { + udp_t *udp = connp->conn_udp; + int error; + + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, + NULL) < 0) { + mutex_enter(&udp->udp_recv_lock); + if (error == ENOSPC) { + /* + * let's confirm while holding the lock + */ + if ((*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, NULL, 0, 0, + &error, NULL) < 0) { + ASSERT(error == ENOSPC); + if (error == ENOSPC) { + connp->conn_flow_cntrld = + B_TRUE; + } + } + mutex_exit(&udp->udp_recv_lock); + } else { + ASSERT(error == EOPNOTSUPP); + mp = udp_queue_fallback(udp, mp); + mutex_exit(&udp->udp_recv_lock); + if (mp != NULL) + putnext(connp->conn_rq, mp); + } + } + ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock)); + } else { + putnext(connp->conn_rq, mp); } } @@ -4463,37 +4500,8 @@ udp_input(void *arg1, mblk_t *mp, void *arg2) if (options_mp != NULL) freeb(options_mp); - if (IPCL_IS_NONSTR(connp)) { - int error; + udp_ulp_recv(connp, mp); - if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error, - NULL) < 0) { - mutex_enter(&udp->udp_recv_lock); - if (error == ENOSPC) { - /* - * let's confirm while holding the lock - */ - if ((*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, NULL, 0, 0, - &error, NULL) < 0) { - if (error == ENOSPC) { - connp->conn_flow_cntrld = - B_TRUE; - } else { - ASSERT(error == EOPNOTSUPP); - } - } - mutex_exit(&udp->udp_recv_lock); - } else { - ASSERT(error == EOPNOTSUPP); - udp_queue_fallback(udp, mp); - } - } - } else { - putnext(connp->conn_rq, mp); - } - ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock)); return; tossit: @@ -5846,7 +5854,7 @@ udp_send_connected(conn_t *connp, mblk_t *mp, struct nmsghdr *msg, cred_t *cr, /* M_DATA for connected socket */ - ASSERT(udp->udp_issocket || IPCL_IS_NONSTR(connp)); + ASSERT(udp->udp_issocket); UDP_DBGSTAT(us, udp_data_conn); mutex_enter(&connp->conn_lock); @@ -7990,6 +7998,7 @@ udp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, us = udp->udp_us; ASSERT(us != NULL); + udp->udp_issocket = B_TRUE; connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET; /* Set flow control */ @@ -9272,7 +9281,7 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, return (udp->udp_dgram_errind ? error : 0); } -void +int udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) { @@ -9340,21 +9349,15 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, if (udp->udp_dontroute) opts |= SO_DONTROUTE; - /* - * Once we grab the drain lock, no data will be send up - * to the socket. So we notify the socket that the endpoint - * is quiescent and it's therefore safe move data from - * the socket to the stream head. - */ (*quiesced_cb)(connp->conn_upper_handle, q, &tca, (struct sockaddr *)&laddr, laddrlen, (struct sockaddr *)&faddr, faddrlen, opts); + mutex_enter(&udp->udp_recv_lock); /* - * push up any packets that were queued in udp_t + * Attempts to send data up during fallback will result in it being + * queued in udp_t. Now we push up any queued packets. 
*/ - - mutex_enter(&udp->udp_recv_lock); while (udp->udp_fallback_queue_head != NULL) { mblk_t *mp; mp = udp->udp_fallback_queue_head; @@ -9368,10 +9371,15 @@ udp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* * No longer a streams less socket */ + rw_enter(&udp->udp_rwlock, RW_WRITER); connp->conn_flags &= ~IPCL_NONSTR; + rw_exit(&udp->udp_rwlock); + mutex_exit(&udp->udp_recv_lock); ASSERT(connp->conn_ref >= 1); + + return (0); } static int diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 96f84e43bc..ba370cbc18 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -391,7 +391,7 @@ extern uint_t udp_max_optsize; extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **, uint_t *, int *, int, cred_t *); -extern void udp_fallback(sock_lower_handle_t, queue_t *, boolean_t, +extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t, so_proto_quiesced_cb_t); extern sock_downcalls_t sock_udp_downcalls; diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h index 8f60ea9e31..12c95474c6 100644 --- a/usr/src/uts/common/sys/socket_proto.h +++ b/usr/src/uts/common/sys/socket_proto.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -129,7 +129,7 @@ typedef sock_lower_handle_t (*so_proto_create_func_t)(int, int, int, typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *, struct T_capability_ack *, struct sockaddr *, socklen_t, struct sockaddr *, socklen_t, short); -typedef void (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *, +typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *, boolean_t, so_proto_quiesced_cb_t); /* diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index c7c0f0aae7..f4f026fd77 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -291,14 +291,12 @@ struct sonode { #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ -/* unused 0x01000000 */ /* was SS_LADDR_VALID */ -/* unused 0x02000000 */ /* was SS_FADDR_VALID */ +#define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ +#define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ -#define SS_SENTLASTREADSIG 0x10000000 /* last rx signal has been sent */ -#define SS_SENTLASTWRITESIG 0x20000000 /* last tx signal has been sent */ - -#define SS_FALLBACK_PENDING 0x40000000 -#define SS_FALLBACK_COMP 0x80000000 +#define SS_FALLBACK_DRAIN 0x20000000 /* data was/is being drained */ +#define SS_FALLBACK_PENDING 0x40000000 /* fallback is pending */ +#define SS_FALLBACK_COMP 0x80000000 /* fallback has completed */ /* Set of states when the socket can't be rebound */ |
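
For reference, the receive-side contract that udp_ulp_recv() and icmp_ulp_recv() above both implement can be summarized in a single routine. This is an illustrative sketch only: proto_ulp_recv() and proto_queue_fallback() are placeholder names, and the recv-lock/re-confirm handling and the "fallback already completed" putnext path in the real code are elided.

    static void
    proto_ulp_recv(conn_t *connp, mblk_t *mp)
    {
            int error;

            if (!IPCL_IS_NONSTR(connp)) {
                    /* Already a STREAMS socket; send straight up the stream. */
                    putnext(connp->conn_rq, mp);
                    return;
            }

            if ((*connp->conn_upcalls->su_recv)(connp->conn_upper_handle,
                mp, msgdsize(mp), 0, &error, NULL) < 0) {
                    if (error == ENOSPC) {
                            /* Receiver is full; enable flow control. */
                            connp->conn_flow_cntrld = B_TRUE;
                    } else {
                            /*
                             * EOPNOTSUPP: a fallback to TPI is in progress.
                             * Hold the data; the protocol's fallback routine
                             * drains it up the new stream once quiesced.
                             */
                            ASSERT(error == EOPNOTSUPP);
                            proto_queue_fallback(connp, mp);
                    }
            }
    }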