diff options
author | ja97890 <none@none> | 2008-06-02 07:48:31 -0700 |
---|---|---|
committer | ja97890 <none@none> | 2008-06-02 07:48:31 -0700 |
commit | 301ce41f42acb0fca24b2abbaddde9bdb71ab449 (patch) | |
tree | 0c6288b0b9f7f7a4fcd71211f5ae102fa6aee6ed /usr | |
parent | 13a55820e952b584554bc6b9d4e9303052a2cf29 (diff) | |
download | illumos-gate-301ce41f42acb0fca24b2abbaddde9bdb71ab449.tar.gz |
6666472 Network performance regression under heavy load (introduced by 4868863)
Diffstat (limited to 'usr')
-rw-r--r-- | usr/src/uts/common/io/stream.c | 176 | ||||
-rw-r--r-- | usr/src/uts/common/os/logsubr.c | 8 | ||||
-rw-r--r-- | usr/src/uts/common/os/streamio.c | 106 | ||||
-rw-r--r-- | usr/src/uts/common/sys/stream.h | 1 | ||||
-rw-r--r-- | usr/src/uts/common/sys/strsubr.h | 2 |
5 files changed, 213 insertions, 80 deletions
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c index d8dad37e58..7de836d800 100644 --- a/usr/src/uts/common/io/stream.c +++ b/usr/src/uts/common/io/stream.c @@ -1670,7 +1670,7 @@ getq(queue_t *q) mblk_t *bp; uchar_t band = 0; - bp = getq_noenab(q); + bp = getq_noenab(q, 0); if (bp != NULL) band = bp->b_band; @@ -1701,15 +1701,43 @@ getq(queue_t *q) } /* + * Returns the number of bytes in a message (a message is defined as a + * chain of mblks linked by b_cont). If a non-NULL mblkcnt is supplied we + * also return the number of distinct mblks in the message. + */ +int +mp_cont_len(mblk_t *bp, int *mblkcnt) +{ + mblk_t *mp; + int mblks = 0; + int bytes = 0; + + for (mp = bp; mp != NULL; mp = mp->b_cont) { + ADD_MBLK_SIZE(mp, bytes); + mblks++; + } + + if (mblkcnt != NULL) + *mblkcnt = mblks; + + return (bytes); +} + +/* * Like getq() but does not backenable. This is used by the stream * head when a putback() is likely. The caller must call qbackenable() * after it is done with accessing the queue. + * The rbytes argument to getq_noenab() allows callers to specify + * the maximum number of bytes to return. If the current amount on the + * queue is less than this then the entire message will be returned. + * A value of 0 returns the entire message and is equivalent to the old + * default behaviour prior to the addition of the rbytes argument. 
*/ mblk_t * -getq_noenab(queue_t *q) +getq_noenab(queue_t *q, ssize_t rbytes) { - mblk_t *bp; - mblk_t *tmp; + mblk_t *bp, *mp1; + mblk_t *mp2 = NULL; qband_t *qbp; kthread_id_t freezer; int bytecnt = 0, mblkcnt = 0; @@ -1725,17 +1753,115 @@ getq_noenab(queue_t *q) if ((bp = q->q_first) == 0) { q->q_flag |= QWANTR; } else { - if ((q->q_first = bp->b_next) == NULL) - q->q_last = NULL; - else - q->q_first->b_prev = NULL; - - /* Get message byte count for q_count accounting */ - for (tmp = bp; tmp; tmp = tmp->b_cont) { - ADD_MBLK_SIZE(tmp, bytecnt); - mblkcnt++; + /* + * If the caller supplied a byte threshold and there is + * more than this amount on the queue then break up the + * the message appropriately. We can only safely do + * this for M_DATA messages. + */ + if ((DB_TYPE(bp) == M_DATA) && (rbytes > 0) && + (q->q_count > rbytes)) { + /* + * Inline version of mp_cont_len() which terminates + * when we meet or exceed rbytes. + */ + for (mp1 = bp; mp1 != NULL; mp1 = mp1->b_cont) { + mblkcnt++; + ADD_MBLK_SIZE(mp1, bytecnt); + if (bytecnt >= rbytes) + break; + } + /* + * We need to account for the following scenarios: + * + * 1) Too much data in the first message: + * mp1 will be the mblk which puts us over our + * byte limit. + * 2) Not enough data in the first message: + * mp1 will be NULL. + * 3) Exactly the right amount of data contained within + * whole mblks: + * mp1->b_cont will be where we break the message. + */ + if (bytecnt > rbytes) { + /* + * Dup/copy mp1 and put what we don't need + * back onto the queue. Adjust the read/write + * and continuation pointers appropriately + * and decrement the current mblk count to + * reflect we are putting an mblk back onto + * the queue. + * When adjusting the message pointers, it's + * OK to use the existing bytecnt and the + * requested amount (rbytes) to calculate the + * the new write offset (b_wptr) of what we + * are taking. 
However, we cannot use these + * values when calculating the read offset of + * the mblk we are putting back on the queue. + * This is because the beginning (b_rptr) of the + * mblk represents some arbitrary point within + * the message. + * It's simplest to do this by advancing b_rptr + * by the new length of mp1 as we don't have to + * remember any intermediate state. + */ + ASSERT(mp1 != NULL); + mblkcnt--; + if ((mp2 = dupb(mp1)) == NULL && + (mp2 = copyb(mp1)) == NULL) { + bytecnt = mblkcnt = 0; + goto dup_failed; + } + mp2->b_cont = mp1->b_cont; + mp1->b_wptr -= bytecnt - rbytes; + mp2->b_rptr += mp1->b_wptr - mp1->b_rptr; + mp1->b_cont = NULL; + bytecnt = rbytes; + } else { + /* + * Either there is not enough data in the first + * message or there is no excess data to deal + * with. If mp1 is NULL, we are taking the + * whole message. No need to do anything. + * Otherwise we assign mp1->b_cont to mp2 as + * we will be putting this back onto the head of + * the queue. + */ + if (mp1 != NULL) { + mp2 = mp1->b_cont; + mp1->b_cont = NULL; + } + } + /* + * If mp2 is not NULL then we have part of the message + * to put back onto the queue. + */ + if (mp2 != NULL) { + if ((mp2->b_next = bp->b_next) == NULL) + q->q_last = mp2; + else + bp->b_next->b_prev = mp2; + q->q_first = mp2; + } else { + if ((q->q_first = bp->b_next) == NULL) + q->q_last = NULL; + else + q->q_first->b_prev = NULL; + } + } else { + /* + * Either no byte threshold was supplied, there is + * not enough on the queue or we failed to + * duplicate/copy a data block. In these cases we + * just take the entire first message. 
+ */ +dup_failed: + bytecnt = mp_cont_len(bp, &mblkcnt); + if ((q->q_first = bp->b_next) == NULL) + q->q_last = NULL; + else + q->q_first->b_prev = NULL; } - if (bp->b_band == 0) { q->q_count -= bytecnt; q->q_mblkcnt -= mblkcnt; @@ -1900,7 +2026,6 @@ rmvq(queue_t *q, mblk_t *mp) void rmvq_noenab(queue_t *q, mblk_t *mp) { - mblk_t *tmp; int i; qband_t *qbp = NULL; kthread_id_t freezer; @@ -1952,10 +2077,7 @@ rmvq_noenab(queue_t *q, mblk_t *mp) mp->b_prev = NULL; /* Get the size of the message for q_count accounting */ - for (tmp = mp; tmp; tmp = tmp->b_cont) { - ADD_MBLK_SIZE(tmp, bytecnt); - mblkcnt++; - } + bytecnt = mp_cont_len(mp, &mblkcnt); if (mp->b_band == 0) { /* Perform q_count accounting */ q->q_count -= bytecnt; @@ -2444,10 +2566,7 @@ putq(queue_t *q, mblk_t *bp) } /* Get message byte count for q_count accounting */ - for (tmp = bp; tmp; tmp = tmp->b_cont) { - ADD_MBLK_SIZE(tmp, bytecnt); - mblkcnt++; - } + bytecnt = mp_cont_len(bp, &mblkcnt); if (qbp) { qbp->qb_count += bytecnt; @@ -2629,10 +2748,8 @@ putbq(queue_t *q, mblk_t *bp) } /* Get message byte count for q_count accounting */ - for (tmp = bp; tmp; tmp = tmp->b_cont) { - ADD_MBLK_SIZE(tmp, bytecnt); - mblkcnt++; - } + bytecnt = mp_cont_len(bp, &mblkcnt); + if (qbp) { qbp->qb_count += bytecnt; qbp->qb_mblkcnt += mblkcnt; @@ -2760,10 +2877,7 @@ badord: } /* Get mblk and byte count for q_count accounting */ - for (tmp = mp; tmp; tmp = tmp->b_cont) { - ADD_MBLK_SIZE(tmp, bytecnt); - mblkcnt++; - } + bytecnt = mp_cont_len(mp, &mblkcnt); if (qbp) { /* adjust qband pointers and count */ if (!qbp->qb_first) { diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c index 13854e0c23..46c64a141c 100644 --- a/usr/src/uts/common/os/logsubr.c +++ b/usr/src/uts/common/os/logsubr.c @@ -121,7 +121,7 @@ log_flushq(queue_t *q) log_t *lp = (log_t *)q->q_ptr; /* lp will be NULL if the queue was created via log_makeq */ - while ((mp = getq_noenab(q)) != NULL) + while ((mp = getq_noenab(q, 0)) != 
NULL) log_sendmsg(mp, lp == NULL ? GLOBAL_ZONEID : lp->log_zoneid); } @@ -322,7 +322,7 @@ log_conswitch(log_t *src, log_t *dst) mblk_t *tmp = NULL; log_ctl_t *hlc; - while ((mp = getq_noenab(src->log_q)) != NULL) { + while ((mp = getq_noenab(src->log_q, 0)) != NULL) { log_ctl_t *lc = (log_ctl_t *)mp->b_rptr; lc->flags |= SL_LOGONLY; @@ -500,7 +500,7 @@ log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg, if (size <= LOG_MSGSIZE && (on_intr || log_freeq->q_count > log_freeq->q_lowat)) - mp = getq_noenab(log_freeq); + mp = getq_noenab(log_freeq, 0); if (mp == NULL) { if (on_intr || @@ -688,7 +688,7 @@ log_sendmsg(mblk_t *mp, zoneid_t zoneid) (mp2 = copymsg(mp)) != NULL) { mp2->b_cont->b_rptr += body; if (log_recentq->q_flag & QFULL) - freemsg(getq_noenab(log_recentq)); + freemsg(getq_noenab(log_recentq, 0)); (void) putq(log_recentq, mp2); } } diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c index a1cada7964..a6ef94da96 100644 --- a/usr/src/uts/common/os/streamio.c +++ b/usr/src/uts/common/os/streamio.c @@ -987,9 +987,10 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, sodirect_t *sodp = stp->sd_sodirect; mblk_t *bp; int error; + ssize_t rbytes = 0; - ASSERT(MUTEX_HELD(&stp->sd_lock)); /* Holding sd_lock prevents the read queue from changing */ + ASSERT(MUTEX_HELD(&stp->sd_lock)); if (uiop != NULL && stp->sd_struiordq != NULL && q->q_first == NULL && @@ -1073,6 +1074,7 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, } *errorp = 0; ASSERT(MUTEX_HELD(&stp->sd_lock)); + if (sodp != NULL && (sodp->sod_state & SOD_ENABLED) && (sodp->sod_uioa.uioa_state & UIOA_INIT)) { /* @@ -1085,10 +1087,18 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first, if (q->q_first != NULL) { struioainit(q, sodp, uiop); } + } else { + /* + * If we have a valid uio, try and use this as a guide for how + * many bytes to retrieve from the queue via getq_noenab(). 
+ * Doing this can avoid unnecessary counting of overlong + * messages in putback(). We currently only do this for sockets. + */ + if ((uiop != NULL) && (stp->sd_vnode->v_type == VSOCK)) + rbytes = uiop->uio_resid; } - bp = getq_noenab(q); - + bp = getq_noenab(q, rbytes); if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) { /* * A uioa flaged mblk_t chain, already uio processed, @@ -1431,7 +1441,7 @@ ismdata: */ while ((((bp = q->q_first)) != NULL) && (bp->b_datap->db_type == M_SIG)) { - bp = getq_noenab(q); + bp = getq_noenab(q, 0); /* * sd_lock is held so the content of the * read queue can not change. @@ -6745,14 +6755,14 @@ strgetmsg( */ } else if ((*flagsp & MSG_HIPRI) && q_first != NULL && q_first->b_datap->db_type >= QPCTL && - (bp = getq_noenab(q)) != NULL) { + (bp = getq_noenab(q, 0)) != NULL) { /* Asked for HIPRI and got one */ ASSERT(bp->b_datap->db_type >= QPCTL); break; } else if ((*flagsp & MSG_BAND) && q_first != NULL && ((q_first->b_band >= *prip) || q_first->b_datap->db_type >= QPCTL) && - (bp = getq_noenab(q)) != NULL) { + (bp = getq_noenab(q, 0)) != NULL) { /* * Asked for at least band "prip" and got either at * least that band or a hipri message. @@ -7286,13 +7296,13 @@ retry: */ } else if ((flags & MSG_HIPRI) && q_first != NULL && q_first->b_datap->db_type >= QPCTL && - (bp = getq_noenab(q)) != NULL) { + (bp = getq_noenab(q, 0)) != NULL) { ASSERT(bp->b_datap->db_type >= QPCTL); break; } else if ((flags & MSG_BAND) && q_first != NULL && ((q_first->b_band >= *prip) || q_first->b_datap->db_type >= QPCTL) && - (bp = getq_noenab(q)) != NULL) { + (bp = getq_noenab(q, 0)) != NULL) { /* * Asked for at least band "prip" and got either at * least that band or a hipri message. 
@@ -8467,19 +8477,32 @@ chkrd: static void putback(struct stdata *stp, queue_t *q, mblk_t *bp, int band) { - mblk_t *qfirst = q->q_first; + mblk_t *qfirst; ASSERT(MUTEX_HELD(&stp->sd_lock)); + /* + * As a result of lock-step ordering around q_lock and sd_lock, + * it's possible for function calls like putnext() and + * canputnext() to get an inaccurate picture of how much + * data is really being processed at the stream head. + * We only consolidate with existing messages on the queue + * if the length of the message we want to put back is smaller + * than the queue hiwater mark. + */ if ((stp->sd_rput_opt & SR_CONSOL_DATA) && - (qfirst != NULL) && - (qfirst->b_datap->db_type == M_DATA) && - ((qfirst->b_flag & (MSGMARK|MSGDELIM)) == 0)) { + (DB_TYPE(bp) == M_DATA) && ((qfirst = q->q_first) != NULL) && + (DB_TYPE(qfirst) == M_DATA) && + ((qfirst->b_flag & (MSGMARK|MSGDELIM)) == 0) && + ((bp->b_flag & (MSGMARK|MSGDELIM|MSGMARKNEXT)) == 0) && + (mp_cont_len(bp, NULL) < q->q_hiwat)) { /* * We use the same logic as defined in strrput() * but in reverse as we are putting back onto the * queue and want to retain byte ordering. - * Consolidate an M_DATA message onto an M_DATA, - * M_PROTO, or M_PCPROTO by merging it with q_first. + * Consolidate M_DATA messages with M_DATA ONLY. + * strrput() allows the consolidation of M_DATA onto + * M_PROTO | M_PCPROTO but not the other way round. + * * The consolidation does not take place if the message * we are returning to the queue is marked with either * of the marks or the delim flag or if q_first @@ -8489,38 +8512,33 @@ putback(struct stdata *stp, queue_t *q, mblk_t *bp, int band) * Carry any MSGMARKNEXT and MSGNOTMARKNEXT from q_first * to the front of the b_cont chain. 
*/ - unsigned char db_type = bp->b_datap->db_type; + rmvq_noenab(q, qfirst); - if ((db_type == M_DATA || db_type == M_PROTO || - db_type == M_PCPROTO) && - !(bp->b_flag & (MSGMARK|MSGDELIM|MSGMARKNEXT))) { - rmvq_noenab(q, qfirst); - /* - * The first message in the b_cont list - * tracks MSGMARKNEXT and MSGNOTMARKNEXT. - * We need to handle the case where we - * are appending: - * - * 1) a MSGMARKNEXT to a MSGNOTMARKNEXT. - * 2) a MSGMARKNEXT to a plain message. - * 3) a MSGNOTMARKNEXT to a plain message - * 4) a MSGNOTMARKNEXT to a MSGNOTMARKNEXT - * message. - * - * Thus we never append a MSGMARKNEXT or - * MSGNOTMARKNEXT to a MSGMARKNEXT message. - */ - if (qfirst->b_flag & MSGMARKNEXT) { - bp->b_flag |= MSGMARKNEXT; - bp->b_flag &= ~MSGNOTMARKNEXT; - qfirst->b_flag &= ~MSGMARKNEXT; - } else if (qfirst->b_flag & MSGNOTMARKNEXT) { - bp->b_flag |= MSGNOTMARKNEXT; - qfirst->b_flag &= ~MSGNOTMARKNEXT; - } - - linkb(bp, qfirst); + /* + * The first message in the b_cont list + * tracks MSGMARKNEXT and MSGNOTMARKNEXT. + * We need to handle the case where we + * are appending: + * + * 1) a MSGMARKNEXT to a MSGNOTMARKNEXT. + * 2) a MSGMARKNEXT to a plain message. + * 3) a MSGNOTMARKNEXT to a plain message + * 4) a MSGNOTMARKNEXT to a MSGNOTMARKNEXT + * message. + * + * Thus we never append a MSGMARKNEXT or + * MSGNOTMARKNEXT to a MSGMARKNEXT message. 
+ */ + if (qfirst->b_flag & MSGMARKNEXT) { + bp->b_flag |= MSGMARKNEXT; + bp->b_flag &= ~MSGNOTMARKNEXT; + qfirst->b_flag &= ~MSGMARKNEXT; + } else if (qfirst->b_flag & MSGNOTMARKNEXT) { + bp->b_flag |= MSGNOTMARKNEXT; + qfirst->b_flag &= ~MSGNOTMARKNEXT; } + + linkb(bp, qfirst); } (void) putbq(q, bp); diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 0da91f7d38..30a2870552 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -848,6 +848,7 @@ extern int rwnext(queue_t *, struiod_t *); extern int infonext(queue_t *, infod_t *); extern int isuioq(queue_t *); extern void create_putlocks(queue_t *, int); +extern int mp_cont_len(mblk_t *, int *); /* * shared or externally configured data structures diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 71c26a3212..401e69dc5e 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -1159,7 +1159,7 @@ extern void backenable(queue_t *, uchar_t); extern void set_qend(queue_t *); extern int strgeterr(stdata_t *, int32_t, int); extern void qenable_locked(queue_t *); -extern mblk_t *getq_noenab(queue_t *); +extern mblk_t *getq_noenab(queue_t *, ssize_t); extern void rmvq_noenab(queue_t *, mblk_t *); extern void qbackenable(queue_t *, uchar_t); extern void set_qfull(queue_t *); |