summaryrefslogtreecommitdiff
path: root/usr
diff options
context:
space:
mode:
author ja97890 <none@none> 2008-06-02 07:48:31 -0700
committer ja97890 <none@none> 2008-06-02 07:48:31 -0700
commit 301ce41f42acb0fca24b2abbaddde9bdb71ab449 (patch)
tree 0c6288b0b9f7f7a4fcd71211f5ae102fa6aee6ed /usr
parent 13a55820e952b584554bc6b9d4e9303052a2cf29 (diff)
download illumos-gate-301ce41f42acb0fca24b2abbaddde9bdb71ab449.tar.gz
6666472 Network performance regression under heavy load (introduced by 4868863)
Diffstat (limited to 'usr')
-rw-r--r-- usr/src/uts/common/io/stream.c 176
-rw-r--r-- usr/src/uts/common/os/logsubr.c 8
-rw-r--r-- usr/src/uts/common/os/streamio.c 106
-rw-r--r-- usr/src/uts/common/sys/stream.h 1
-rw-r--r-- usr/src/uts/common/sys/strsubr.h 2
5 files changed, 213 insertions, 80 deletions
diff --git a/usr/src/uts/common/io/stream.c b/usr/src/uts/common/io/stream.c
index d8dad37e58..7de836d800 100644
--- a/usr/src/uts/common/io/stream.c
+++ b/usr/src/uts/common/io/stream.c
@@ -1670,7 +1670,7 @@ getq(queue_t *q)
mblk_t *bp;
uchar_t band = 0;
- bp = getq_noenab(q);
+ bp = getq_noenab(q, 0);
if (bp != NULL)
band = bp->b_band;
@@ -1701,15 +1701,43 @@ getq(queue_t *q)
}
/*
+ * Returns the number of bytes in a message (a message is defined as a
+ * chain of mblks linked by b_cont). If a non-NULL mblkcnt is supplied we
+ * also return the number of distinct mblks in the message.
+ */
+int
+mp_cont_len(mblk_t *bp, int *mblkcnt)
+{
+ mblk_t *mp;
+ int mblks = 0;
+ int bytes = 0;
+
+ for (mp = bp; mp != NULL; mp = mp->b_cont) {
+ ADD_MBLK_SIZE(mp, bytes);
+ mblks++;
+ }
+
+ if (mblkcnt != NULL)
+ *mblkcnt = mblks;
+
+ return (bytes);
+}
+
+/*
* Like getq() but does not backenable. This is used by the stream
* head when a putback() is likely. The caller must call qbackenable()
* after it is done with accessing the queue.
+ * The rbytes argument to getq_noenab() allows callers to specify
+ * the maximum number of bytes to return. If the current amount on the
+ * queue does not exceed this then the entire message will be returned.
+ * A value of 0 returns the entire message and is equivalent to the old
+ * default behaviour prior to the addition of the rbytes argument.
*/
mblk_t *
-getq_noenab(queue_t *q)
+getq_noenab(queue_t *q, ssize_t rbytes)
{
- mblk_t *bp;
- mblk_t *tmp;
+ mblk_t *bp, *mp1;
+ mblk_t *mp2 = NULL;
qband_t *qbp;
kthread_id_t freezer;
int bytecnt = 0, mblkcnt = 0;
@@ -1725,17 +1753,115 @@ getq_noenab(queue_t *q)
if ((bp = q->q_first) == 0) {
q->q_flag |= QWANTR;
} else {
- if ((q->q_first = bp->b_next) == NULL)
- q->q_last = NULL;
- else
- q->q_first->b_prev = NULL;
-
- /* Get message byte count for q_count accounting */
- for (tmp = bp; tmp; tmp = tmp->b_cont) {
- ADD_MBLK_SIZE(tmp, bytecnt);
- mblkcnt++;
+ /*
+ * If the caller supplied a byte threshold and there is
+ * more than this amount on the queue then break up the
+ * message appropriately. We can only safely do
+ * this for M_DATA messages.
+ */
+ if ((DB_TYPE(bp) == M_DATA) && (rbytes > 0) &&
+ (q->q_count > rbytes)) {
+ /*
+ * Inline version of mp_cont_len() which terminates
+ * when we meet or exceed rbytes.
+ */
+ for (mp1 = bp; mp1 != NULL; mp1 = mp1->b_cont) {
+ mblkcnt++;
+ ADD_MBLK_SIZE(mp1, bytecnt);
+ if (bytecnt >= rbytes)
+ break;
+ }
+ /*
+ * We need to account for the following scenarios:
+ *
+ * 1) Too much data in the first message:
+ * mp1 will be the mblk which puts us over our
+ * byte limit.
+ * 2) Not enough data in the first message:
+ * mp1 will be NULL.
+ * 3) Exactly the right amount of data contained within
+ * whole mblks:
+ * mp1->b_cont will be where we break the message.
+ */
+ if (bytecnt > rbytes) {
+ /*
+ * Dup/copy mp1 and put what we don't need
+ * back onto the queue. Adjust the read/write
+ * and continuation pointers appropriately
+ * and decrement the current mblk count to
+ * reflect we are putting an mblk back onto
+ * the queue.
+ * When adjusting the message pointers, it's
+ * OK to use the existing bytecnt and the
+ * requested amount (rbytes) to calculate the
+ * new write offset (b_wptr) of what we
+ * are taking. However, we cannot use these
+ * values when calculating the read offset of
+ * the mblk we are putting back on the queue.
+ * This is because the beginning (b_rptr) of the
+ * mblk represents some arbitrary point within
+ * the message.
+ * It's simplest to do this by advancing b_rptr
+ * by the new length of mp1 as we don't have to
+ * remember any intermediate state.
+ */
+ ASSERT(mp1 != NULL);
+ mblkcnt--;
+ if ((mp2 = dupb(mp1)) == NULL &&
+ (mp2 = copyb(mp1)) == NULL) {
+ bytecnt = mblkcnt = 0;
+ goto dup_failed;
+ }
+ mp2->b_cont = mp1->b_cont;
+ mp1->b_wptr -= bytecnt - rbytes;
+ mp2->b_rptr += mp1->b_wptr - mp1->b_rptr;
+ mp1->b_cont = NULL;
+ bytecnt = rbytes;
+ } else {
+ /*
+ * Either there is not enough data in the first
+ * message or there is no excess data to deal
+ * with. If mp1 is NULL, we are taking the
+ * whole message. No need to do anything.
+ * Otherwise we assign mp1->b_cont to mp2 as
+ * we will be putting this back onto the head of
+ * the queue.
+ */
+ if (mp1 != NULL) {
+ mp2 = mp1->b_cont;
+ mp1->b_cont = NULL;
+ }
+ }
+ /*
+ * If mp2 is not NULL then we have part of the message
+ * to put back onto the queue.
+ */
+ if (mp2 != NULL) {
+ if ((mp2->b_next = bp->b_next) == NULL)
+ q->q_last = mp2;
+ else
+ bp->b_next->b_prev = mp2;
+ q->q_first = mp2;
+ } else {
+ if ((q->q_first = bp->b_next) == NULL)
+ q->q_last = NULL;
+ else
+ q->q_first->b_prev = NULL;
+ }
+ } else {
+ /*
+ * Either no byte threshold was supplied, the message is
+ * not M_DATA, there is not enough on the queue or we
+ * failed to duplicate/copy a data block. In these cases
+ * we just take the entire first message.
+ */
+dup_failed:
+ bytecnt = mp_cont_len(bp, &mblkcnt);
+ if ((q->q_first = bp->b_next) == NULL)
+ q->q_last = NULL;
+ else
+ q->q_first->b_prev = NULL;
}
-
if (bp->b_band == 0) {
q->q_count -= bytecnt;
q->q_mblkcnt -= mblkcnt;
@@ -1900,7 +2026,6 @@ rmvq(queue_t *q, mblk_t *mp)
void
rmvq_noenab(queue_t *q, mblk_t *mp)
{
- mblk_t *tmp;
int i;
qband_t *qbp = NULL;
kthread_id_t freezer;
@@ -1952,10 +2077,7 @@ rmvq_noenab(queue_t *q, mblk_t *mp)
mp->b_prev = NULL;
/* Get the size of the message for q_count accounting */
- for (tmp = mp; tmp; tmp = tmp->b_cont) {
- ADD_MBLK_SIZE(tmp, bytecnt);
- mblkcnt++;
- }
+ bytecnt = mp_cont_len(mp, &mblkcnt);
if (mp->b_band == 0) { /* Perform q_count accounting */
q->q_count -= bytecnt;
@@ -2444,10 +2566,7 @@ putq(queue_t *q, mblk_t *bp)
}
/* Get message byte count for q_count accounting */
- for (tmp = bp; tmp; tmp = tmp->b_cont) {
- ADD_MBLK_SIZE(tmp, bytecnt);
- mblkcnt++;
- }
+ bytecnt = mp_cont_len(bp, &mblkcnt);
if (qbp) {
qbp->qb_count += bytecnt;
@@ -2629,10 +2748,8 @@ putbq(queue_t *q, mblk_t *bp)
}
/* Get message byte count for q_count accounting */
- for (tmp = bp; tmp; tmp = tmp->b_cont) {
- ADD_MBLK_SIZE(tmp, bytecnt);
- mblkcnt++;
- }
+ bytecnt = mp_cont_len(bp, &mblkcnt);
+
if (qbp) {
qbp->qb_count += bytecnt;
qbp->qb_mblkcnt += mblkcnt;
@@ -2760,10 +2877,7 @@ badord:
}
/* Get mblk and byte count for q_count accounting */
- for (tmp = mp; tmp; tmp = tmp->b_cont) {
- ADD_MBLK_SIZE(tmp, bytecnt);
- mblkcnt++;
- }
+ bytecnt = mp_cont_len(mp, &mblkcnt);
if (qbp) { /* adjust qband pointers and count */
if (!qbp->qb_first) {
diff --git a/usr/src/uts/common/os/logsubr.c b/usr/src/uts/common/os/logsubr.c
index 13854e0c23..46c64a141c 100644
--- a/usr/src/uts/common/os/logsubr.c
+++ b/usr/src/uts/common/os/logsubr.c
@@ -121,7 +121,7 @@ log_flushq(queue_t *q)
log_t *lp = (log_t *)q->q_ptr;
/* lp will be NULL if the queue was created via log_makeq */
- while ((mp = getq_noenab(q)) != NULL)
+ while ((mp = getq_noenab(q, 0)) != NULL)
log_sendmsg(mp, lp == NULL ? GLOBAL_ZONEID : lp->log_zoneid);
}
@@ -322,7 +322,7 @@ log_conswitch(log_t *src, log_t *dst)
mblk_t *tmp = NULL;
log_ctl_t *hlc;
- while ((mp = getq_noenab(src->log_q)) != NULL) {
+ while ((mp = getq_noenab(src->log_q, 0)) != NULL) {
log_ctl_t *lc = (log_ctl_t *)mp->b_rptr;
lc->flags |= SL_LOGONLY;
@@ -500,7 +500,7 @@ log_makemsg(int mid, int sid, int level, int sl, int pri, void *msg,
if (size <= LOG_MSGSIZE &&
(on_intr || log_freeq->q_count > log_freeq->q_lowat))
- mp = getq_noenab(log_freeq);
+ mp = getq_noenab(log_freeq, 0);
if (mp == NULL) {
if (on_intr ||
@@ -688,7 +688,7 @@ log_sendmsg(mblk_t *mp, zoneid_t zoneid)
(mp2 = copymsg(mp)) != NULL) {
mp2->b_cont->b_rptr += body;
if (log_recentq->q_flag & QFULL)
- freemsg(getq_noenab(log_recentq));
+ freemsg(getq_noenab(log_recentq, 0));
(void) putq(log_recentq, mp2);
}
}
diff --git a/usr/src/uts/common/os/streamio.c b/usr/src/uts/common/os/streamio.c
index a1cada7964..a6ef94da96 100644
--- a/usr/src/uts/common/os/streamio.c
+++ b/usr/src/uts/common/os/streamio.c
@@ -987,9 +987,10 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
sodirect_t *sodp = stp->sd_sodirect;
mblk_t *bp;
int error;
+ ssize_t rbytes = 0;
- ASSERT(MUTEX_HELD(&stp->sd_lock));
/* Holding sd_lock prevents the read queue from changing */
+ ASSERT(MUTEX_HELD(&stp->sd_lock));
if (uiop != NULL && stp->sd_struiordq != NULL &&
q->q_first == NULL &&
@@ -1073,6 +1074,7 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
}
*errorp = 0;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+
if (sodp != NULL && (sodp->sod_state & SOD_ENABLED) &&
(sodp->sod_uioa.uioa_state & UIOA_INIT)) {
/*
@@ -1085,10 +1087,18 @@ strget(struct stdata *stp, queue_t *q, struct uio *uiop, int first,
if (q->q_first != NULL) {
struioainit(q, sodp, uiop);
}
+ } else {
+ /*
+ * If we have a valid uio, try and use this as a guide for how
+ * many bytes to retrieve from the queue via getq_noenab().
+ * Doing this can avoid unnecessary counting of overlong
+ * messages in putback(). We currently only do this for sockets.
+ */
+ if ((uiop != NULL) && (stp->sd_vnode->v_type == VSOCK))
+ rbytes = uiop->uio_resid;
}
- bp = getq_noenab(q);
-
+ bp = getq_noenab(q, rbytes);
if (bp != NULL && (bp->b_datap->db_flags & DBLK_UIOA)) {
/*
* A uioa flaged mblk_t chain, already uio processed,
@@ -1431,7 +1441,7 @@ ismdata:
*/
while ((((bp = q->q_first)) != NULL) &&
(bp->b_datap->db_type == M_SIG)) {
- bp = getq_noenab(q);
+ bp = getq_noenab(q, 0);
/*
* sd_lock is held so the content of the
* read queue can not change.
@@ -6745,14 +6755,14 @@ strgetmsg(
*/
} else if ((*flagsp & MSG_HIPRI) && q_first != NULL &&
q_first->b_datap->db_type >= QPCTL &&
- (bp = getq_noenab(q)) != NULL) {
+ (bp = getq_noenab(q, 0)) != NULL) {
/* Asked for HIPRI and got one */
ASSERT(bp->b_datap->db_type >= QPCTL);
break;
} else if ((*flagsp & MSG_BAND) && q_first != NULL &&
((q_first->b_band >= *prip) ||
q_first->b_datap->db_type >= QPCTL) &&
- (bp = getq_noenab(q)) != NULL) {
+ (bp = getq_noenab(q, 0)) != NULL) {
/*
* Asked for at least band "prip" and got either at
* least that band or a hipri message.
@@ -7286,13 +7296,13 @@ retry:
*/
} else if ((flags & MSG_HIPRI) && q_first != NULL &&
q_first->b_datap->db_type >= QPCTL &&
- (bp = getq_noenab(q)) != NULL) {
+ (bp = getq_noenab(q, 0)) != NULL) {
ASSERT(bp->b_datap->db_type >= QPCTL);
break;
} else if ((flags & MSG_BAND) && q_first != NULL &&
((q_first->b_band >= *prip) ||
q_first->b_datap->db_type >= QPCTL) &&
- (bp = getq_noenab(q)) != NULL) {
+ (bp = getq_noenab(q, 0)) != NULL) {
/*
* Asked for at least band "prip" and got either at
* least that band or a hipri message.
@@ -8467,19 +8477,32 @@ chkrd:
static void
putback(struct stdata *stp, queue_t *q, mblk_t *bp, int band)
{
- mblk_t *qfirst = q->q_first;
+ mblk_t *qfirst;
ASSERT(MUTEX_HELD(&stp->sd_lock));
+ /*
+ * As a result of lock-step ordering around q_lock and sd_lock,
+ * it's possible for function calls like putnext() and
+ * canputnext() to get an inaccurate picture of how much
+ * data is really being processed at the stream head.
+ * We only consolidate with existing messages on the queue
+ * if the length of the message we want to put back is smaller
+ * than the queue hiwater mark.
+ */
if ((stp->sd_rput_opt & SR_CONSOL_DATA) &&
- (qfirst != NULL) &&
- (qfirst->b_datap->db_type == M_DATA) &&
- ((qfirst->b_flag & (MSGMARK|MSGDELIM)) == 0)) {
+ (DB_TYPE(bp) == M_DATA) && ((qfirst = q->q_first) != NULL) &&
+ (DB_TYPE(qfirst) == M_DATA) &&
+ ((qfirst->b_flag & (MSGMARK|MSGDELIM)) == 0) &&
+ ((bp->b_flag & (MSGMARK|MSGDELIM|MSGMARKNEXT)) == 0) &&
+ (mp_cont_len(bp, NULL) < q->q_hiwat)) {
/*
* We use the same logic as defined in strrput()
* but in reverse as we are putting back onto the
* queue and want to retain byte ordering.
- * Consolidate an M_DATA message onto an M_DATA,
- * M_PROTO, or M_PCPROTO by merging it with q_first.
+ * Consolidate M_DATA messages with M_DATA ONLY.
+ * strrput() allows the consolidation of M_DATA onto
+ * M_PROTO | M_PCPROTO but not the other way round.
+ *
* The consolidation does not take place if the message
* we are returning to the queue is marked with either
* of the marks or the delim flag or if q_first
@@ -8489,38 +8512,33 @@ putback(struct stdata *stp, queue_t *q, mblk_t *bp, int band)
* Carry any MSGMARKNEXT and MSGNOTMARKNEXT from q_first
* to the front of the b_cont chain.
*/
- unsigned char db_type = bp->b_datap->db_type;
+ rmvq_noenab(q, qfirst);
- if ((db_type == M_DATA || db_type == M_PROTO ||
- db_type == M_PCPROTO) &&
- !(bp->b_flag & (MSGMARK|MSGDELIM|MSGMARKNEXT))) {
- rmvq_noenab(q, qfirst);
- /*
- * The first message in the b_cont list
- * tracks MSGMARKNEXT and MSGNOTMARKNEXT.
- * We need to handle the case where we
- * are appending:
- *
- * 1) a MSGMARKNEXT to a MSGNOTMARKNEXT.
- * 2) a MSGMARKNEXT to a plain message.
- * 3) a MSGNOTMARKNEXT to a plain message
- * 4) a MSGNOTMARKNEXT to a MSGNOTMARKNEXT
- * message.
- *
- * Thus we never append a MSGMARKNEXT or
- * MSGNOTMARKNEXT to a MSGMARKNEXT message.
- */
- if (qfirst->b_flag & MSGMARKNEXT) {
- bp->b_flag |= MSGMARKNEXT;
- bp->b_flag &= ~MSGNOTMARKNEXT;
- qfirst->b_flag &= ~MSGMARKNEXT;
- } else if (qfirst->b_flag & MSGNOTMARKNEXT) {
- bp->b_flag |= MSGNOTMARKNEXT;
- qfirst->b_flag &= ~MSGNOTMARKNEXT;
- }
-
- linkb(bp, qfirst);
+ /*
+ * The first message in the b_cont list
+ * tracks MSGMARKNEXT and MSGNOTMARKNEXT.
+ * We need to handle the case where we
+ * are appending:
+ *
+ * 1) a MSGMARKNEXT to a MSGNOTMARKNEXT.
+ * 2) a MSGMARKNEXT to a plain message.
+ * 3) a MSGNOTMARKNEXT to a plain message
+ * 4) a MSGNOTMARKNEXT to a MSGNOTMARKNEXT
+ * message.
+ *
+ * Thus we never append a MSGMARKNEXT or
+ * MSGNOTMARKNEXT to a MSGMARKNEXT message.
+ */
+ if (qfirst->b_flag & MSGMARKNEXT) {
+ bp->b_flag |= MSGMARKNEXT;
+ bp->b_flag &= ~MSGNOTMARKNEXT;
+ qfirst->b_flag &= ~MSGMARKNEXT;
+ } else if (qfirst->b_flag & MSGNOTMARKNEXT) {
+ bp->b_flag |= MSGNOTMARKNEXT;
+ qfirst->b_flag &= ~MSGNOTMARKNEXT;
}
+
+ linkb(bp, qfirst);
}
(void) putbq(q, bp);
diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h
index 0da91f7d38..30a2870552 100644
--- a/usr/src/uts/common/sys/stream.h
+++ b/usr/src/uts/common/sys/stream.h
@@ -848,6 +848,7 @@ extern int rwnext(queue_t *, struiod_t *);
extern int infonext(queue_t *, infod_t *);
extern int isuioq(queue_t *);
extern void create_putlocks(queue_t *, int);
+extern int mp_cont_len(mblk_t *, int *);
/*
* shared or externally configured data structures
diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h
index 71c26a3212..401e69dc5e 100644
--- a/usr/src/uts/common/sys/strsubr.h
+++ b/usr/src/uts/common/sys/strsubr.h
@@ -1159,7 +1159,7 @@ extern void backenable(queue_t *, uchar_t);
extern void set_qend(queue_t *);
extern int strgeterr(stdata_t *, int32_t, int);
extern void qenable_locked(queue_t *);
-extern mblk_t *getq_noenab(queue_t *);
+extern mblk_t *getq_noenab(queue_t *, ssize_t);
extern void rmvq_noenab(queue_t *, mblk_t *);
extern void qbackenable(queue_t *, uchar_t);
extern void set_qfull(queue_t *);