author    Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>  2010-02-23 04:19:01 -0500
committer Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>  2010-02-23 04:19:01 -0500
commit    060cedfb73ddd0d3b0742264bece958dd77d1008 (patch)
tree      8a93e2b5bb2a1002a59d190bbbbfe4b43d5c3c48
parent    c9a5bc8f8ef20fd68ff0b46331bbaf26e1415abb (diff)
download  illumos-joyent-060cedfb73ddd0d3b0742264bece958dd77d1008.tar.gz
6839498 callout subsystem should be less brittle wrt memory allocation failure
-rw-r--r--  usr/src/cmd/mdb/common/modules/genunix/genunix.c  112
-rw-r--r--  usr/src/uts/common/os/callout.c                   474
-rw-r--r--  usr/src/uts/common/sys/callo.h                     50
3 files changed, 489 insertions, 147 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index 99f26333f4..968f0284c4 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -651,33 +651,35 @@ static const char *co_typenames[] = { "R", "N" };
#define TABLE_TO_SEQID(x) ((x) >> CALLOUT_TYPE_BITS)
/* callout flags, in no particular order */
-#define COF_REAL 0x0000001
-#define COF_NORM 0x0000002
-#define COF_LONG 0x0000004
-#define COF_SHORT 0x0000008
-#define COF_EMPTY 0x0000010
-#define COF_TIME 0x0000020
-#define COF_BEFORE 0x0000040
-#define COF_AFTER 0x0000080
-#define COF_SEQID 0x0000100
-#define COF_FUNC 0x0000200
-#define COF_ADDR 0x0000400
-#define COF_EXEC 0x0000800
-#define COF_HIRES 0x0001000
-#define COF_ABS 0x0002000
-#define COF_TABLE 0x0004000
-#define COF_BYIDH 0x0008000
-#define COF_FREE 0x0010000
-#define COF_LIST 0x0020000
-#define COF_EXPREL 0x0040000
-#define COF_HDR 0x0080000
-#define COF_VERBOSE 0x0100000
-#define COF_LONGLIST 0x0200000
-#define COF_THDR 0x0400000
-#define COF_LHDR 0x0800000
-#define COF_CHDR 0x1000000
-#define COF_PARAM 0x2000000
-#define COF_DECODE 0x4000000
+#define COF_REAL 0x00000001
+#define COF_NORM 0x00000002
+#define COF_LONG 0x00000004
+#define COF_SHORT 0x00000008
+#define COF_EMPTY 0x00000010
+#define COF_TIME 0x00000020
+#define COF_BEFORE 0x00000040
+#define COF_AFTER 0x00000080
+#define COF_SEQID 0x00000100
+#define COF_FUNC 0x00000200
+#define COF_ADDR 0x00000400
+#define COF_EXEC 0x00000800
+#define COF_HIRES 0x00001000
+#define COF_ABS 0x00002000
+#define COF_TABLE 0x00004000
+#define COF_BYIDH 0x00008000
+#define COF_FREE 0x00010000
+#define COF_LIST 0x00020000
+#define COF_EXPREL 0x00040000
+#define COF_HDR 0x00080000
+#define COF_VERBOSE 0x00100000
+#define COF_LONGLIST 0x00200000
+#define COF_THDR 0x00400000
+#define COF_LHDR 0x00800000
+#define COF_CHDR 0x01000000
+#define COF_PARAM 0x02000000
+#define COF_DECODE 0x04000000
+#define COF_HEAP 0x08000000
+#define COF_QUEUE 0x10000000
/* show real and normal, short and long, expired and unexpired. */
#define COF_DEFAULT (COF_REAL | COF_NORM | COF_LONG | COF_SHORT)
@@ -719,14 +721,14 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
return (WALK_ERR);
}
- if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_FREE)) {
+ if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_ID_FREE)) {
/*
* The callout must have been reallocated. No point in
* walking any more.
*/
return (WALK_DONE);
}
- if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_FREE)) {
+ if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_ID_FREE)) {
/*
* The callout must have been freed. No point in
* walking any more.
@@ -806,6 +808,21 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
return (WALK_NEXT);
}
}
+ /*
+ * We do the checks for COF_HEAP and COF_QUEUE here only if we
+ * are traversing BYIDH. If the traversal is by callout list,
+ * we do this check in callout_list_cb() to be more
+ * efficient.
+ */
+ if ((coargs->flags & COF_HEAP) &&
+ !(list_flags & CALLOUT_LIST_FLAG_HEAPED)) {
+ return (WALK_NEXT);
+ }
+
+ if ((coargs->flags & COF_QUEUE) &&
+ !(list_flags & CALLOUT_LIST_FLAG_QUEUED)) {
+ return (WALK_NEXT);
+ }
}
#define callout_table_mask ((1 << coargs->ctbits) - 1)
@@ -937,6 +954,16 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
}
}
+ if ((coargs->flags & COF_HEAP) &&
+ !(coargs->list_flags & CALLOUT_LIST_FLAG_HEAPED)) {
+ return (WALK_NEXT);
+ }
+
+ if ((coargs->flags & COF_QUEUE) &&
+ !(coargs->list_flags & CALLOUT_LIST_FLAG_QUEUED)) {
+ return (WALK_NEXT);
+ }
+
if ((coargs->flags & COF_LHDR) && !(coargs->flags & COF_ADDR) &&
(coargs->flags & (COF_LIST | COF_VERBOSE))) {
if (!(coargs->flags & COF_VERBOSE)) {
@@ -1063,9 +1090,9 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
coargs->flags |= (COF_LHDR | COF_CHDR);
if (coargs->flags & COF_LONGLIST) {
/* more info! */
- mdb_printf("%<u> %-T%-7s %-7s %-?s %-?s"
+ mdb_printf("%<u> %-T%-7s %-7s %-?s %-?s %-?s"
" %-?s %-?s %-?s%</u>",
- "HEAPNUM", "HEAPMAX", "TASKQ", "EXPQ",
+ "HEAPNUM", "HEAPMAX", "TASKQ", "EXPQ", "QUE",
"PEND", "FREE", "LOCK");
}
mdb_printf("\n");
@@ -1078,10 +1105,11 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
ct->ct_heap);
if (coargs->flags & COF_LONGLIST) {
/* more info! */
- mdb_printf(" %-7d %-7d %-?p %-?p"
+ mdb_printf(" %-7d %-7d %-?p %-?p %-?p"
" %-?lld %-?lld %-?p",
ct->ct_heap_num, ct->ct_heap_max,
ct->ct_taskq, ct->ct_expired.ch_head,
+ ct->ct_queue.ch_head,
cotwd->ct_timeouts_pending,
cotwd->ct_allocations -
cotwd->ct_timeouts_pending,
@@ -1128,6 +1156,17 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
return (WALK_ERR);
}
}
+ /* then, print the callout queue */
+ clptr = (callout_list_t *)ct->ct_queue.ch_head;
+ if (clptr != NULL) {
+ coargs->bucket = -1;
+ if (mdb_pwalk("callout_list", callout_list_cb,
+ coargs, (uintptr_t)clptr) == -1) {
+ mdb_warn("cannot walk callout_list"
+ " at %p", clptr);
+ return (WALK_ERR);
+ }
+ }
for (i = 0; i < CALLOUT_BUCKETS; i++) {
if (ct->ct_clhash == NULL) {
/* nothing to do */
@@ -1272,6 +1311,8 @@ callout(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
'v', MDB_OPT_SETBITS, COF_LONGLIST, &coargs.flags,
'i', MDB_OPT_SETBITS, COF_BYIDH, &coargs.flags,
'F', MDB_OPT_SETBITS, COF_FREE, &coargs.flags,
+ 'H', MDB_OPT_SETBITS, COF_HEAP, &coargs.flags,
+ 'Q', MDB_OPT_SETBITS, COF_QUEUE, &coargs.flags,
'A', MDB_OPT_SETBITS, COF_ADDR, &coargs.flags,
NULL) != argc) {
return (DCMD_USAGE);
@@ -1347,6 +1388,11 @@ callout(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_USAGE);
}
+ if ((coargs.flags & COF_HEAP) && (coargs.flags & COF_QUEUE)) {
+ mdb_printf("-H and -Q are mutually exclusive\n");
+ return (DCMD_USAGE);
+ }
+
if (funcname != NULL) {
GElf_Sym sym;
@@ -1552,6 +1598,8 @@ callout_help(void)
" -F : walk free callout list (free list with -i) instead\n"
" -v : display more info for each item\n"
" -V : show details of each level of info as it is traversed\n"
+ " -H : limit display to callouts in the callout heap\n"
+ " -Q : limit display to callouts in the callout queue\n"
" -A : show only addresses. Useful for pipelines.\n");
}
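As a usage sketch, the new -H and -Q filters added to the ::callout dcmd above could be exercised as follows (hypothetical mdb session; output elided):

    Show only callouts whose callout lists sit in the callout heap:
        > ::callout -H
    Show only callouts on the overflow queue, with more detail:
        > ::callout -Q -v

Combining -H and -Q returns DCMD_USAGE, since the patch makes the two filters mutually exclusive.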
diff --git a/usr/src/uts/common/os/callout.c b/usr/src/uts/common/os/callout.c
index 7ae72da94c..0421dc94d1 100644
--- a/usr/src/uts/common/os/callout.c
+++ b/usr/src/uts/common/os/callout.c
@@ -42,6 +42,7 @@
*/
static int callout_threads; /* callout normal threads */
static hrtime_t callout_debug_hrtime; /* debugger entry time */
+static int callout_chunk; /* callout heap chunk size */
static int callout_min_reap; /* callout minimum reap count */
static int callout_tolerance; /* callout hires tolerance */
static callout_table_t *callout_boot_ct; /* Boot CPU's callout tables */
@@ -170,6 +171,15 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
#define CALLOUT_LIST_DELETE(hash, cl) \
CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
+#define CALLOUT_LIST_BEFORE(cl, nextcl) \
+{ \
+ (cl)->cl_prev = (nextcl)->cl_prev; \
+ (cl)->cl_next = (nextcl); \
+ (nextcl)->cl_prev = (cl); \
+ if ((cl)->cl_prev != NULL) \
+ (cl)->cl_prev->cl_next = (cl); \
+}
+
/*
* For normal callouts, there is a deadlock scenario if two callouts that
* have an inter-dependency end up on the same callout list. To break the
@@ -179,7 +189,7 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
* necessary (sigh).
*/
#define CALLOUT_THRESHOLD 100000000
-#define CALLOUT_EXEC_COMPUTE(ct, exec) \
+#define CALLOUT_EXEC_COMPUTE(ct, nextexp, exec) \
{ \
callout_list_t *cl; \
\
@@ -197,14 +207,10 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
* only one callout, there is no need for two threads. \
*/ \
exec = 1; \
- } else if ((ct->ct_heap_num == 0) || \
- (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
+ } else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) { \
/* \
- * If the heap has become empty, we need two threads as \
- * there is no one to kick off the second thread in the \
- * future. If the heap is not empty and the top of the \
- * heap does not expire in the near future, we need two \
- * threads. \
+ * If the next expiration of the cyclic is way out into \
+ * the future, we need two threads. \
*/ \
exec = 2; \
} else { \
@@ -240,6 +246,16 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
}
/*
+ * Macro to free a callout.
+ */
+#define CALLOUT_FREE(ct, cp) \
+{ \
+ cp->c_idnext = ct->ct_free; \
+ ct->ct_free = cp; \
+ cp->c_xid |= CALLOUT_ID_FREE; \
+}
+
+/*
* Allocate a callout structure. We try quite hard because we
* can't sleep, and if we can't do the allocation, we're toast.
* Failing all, we try a KM_PANIC allocation. Note that we never
@@ -333,6 +349,164 @@ callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
}
/*
+ * Add a new callout list into a callout table's queue in sorted order by
+ * expiration.
+ */
+static int
+callout_queue_add(callout_table_t *ct, callout_list_t *cl)
+{
+ callout_list_t *nextcl;
+ hrtime_t expiration;
+
+ expiration = cl->cl_expiration;
+ nextcl = ct->ct_queue.ch_head;
+ if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
+ CALLOUT_LIST_INSERT(ct->ct_queue, cl);
+ return (1);
+ }
+
+ while (nextcl != NULL) {
+ if (expiration < nextcl->cl_expiration) {
+ CALLOUT_LIST_BEFORE(cl, nextcl);
+ return (0);
+ }
+ nextcl = nextcl->cl_next;
+ }
+ CALLOUT_LIST_APPEND(ct->ct_queue, cl);
+
+ return (0);
+}
+
+/*
+ * Insert a callout list into a callout table's queue and reprogram the queue
+ * cyclic if needed.
+ */
+static void
+callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
+{
+ cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
+
+ /*
+ * Add the callout to the callout queue. If it ends up at the head,
+ * the cyclic needs to be reprogrammed as we have an earlier
+ * expiration.
+ *
+ * Also, during the CPR suspend phase, do not reprogram the cyclic.
+ * We don't want any callout activity. When the CPR resume phase is
+ * entered, the cyclic will be programmed for the earliest expiration
+ * in the queue.
+ */
+ if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
+ (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
+}
+
+/*
+ * Delete and handle all past expirations in a callout table's queue.
+ */
+static hrtime_t
+callout_queue_delete(callout_table_t *ct)
+{
+ callout_list_t *cl;
+ hrtime_t now;
+
+ ASSERT(MUTEX_HELD(&ct->ct_mutex));
+
+ now = gethrtime();
+ while ((cl = ct->ct_queue.ch_head) != NULL) {
+ if (cl->cl_expiration > now)
+ break;
+ cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
+ CALLOUT_LIST_DELETE(ct->ct_queue, cl);
+ CALLOUT_LIST_APPEND(ct->ct_expired, cl);
+ }
+
+ /*
+ * If this callout queue is empty or callouts have been suspended,
+ * just return.
+ */
+ if ((cl == NULL) || (ct->ct_suspend > 0))
+ return (CY_INFINITY);
+
+ (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
+
+ return (cl->cl_expiration);
+}
+
+static hrtime_t
+callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
+{
+ callout_list_t *firstcl, *cl;
+ hrtime_t expiration, now;
+ int clflags;
+ callout_hash_t temp;
+
+ ASSERT(MUTEX_HELD(&ct->ct_mutex));
+
+ firstcl = ct->ct_queue.ch_head;
+ if (firstcl == NULL)
+ return (CY_INFINITY);
+
+ /*
+ * We walk the callout queue. If we encounter a hrestime entry that
+ * must be removed, we clean it out. Otherwise, we apply any
+ * adjustments needed to it. Because of the latter, we need to
+ * recreate the list as we go along.
+ */
+ temp = ct->ct_queue;
+ ct->ct_queue.ch_head = NULL;
+ ct->ct_queue.ch_tail = NULL;
+
+ clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
+ now = gethrtime();
+ while ((cl = temp.ch_head) != NULL) {
+ CALLOUT_LIST_DELETE(temp, cl);
+
+ /*
+ * Delete the callout and expire it, if one of the following
+ * is true:
+ * - the callout has expired
+ * - the callout is an absolute hrestime one and
+ * there has been a system time change
+ */
+ if ((cl->cl_expiration <= now) ||
+ (timechange && ((cl->cl_flags & clflags) == clflags))) {
+ cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
+ CALLOUT_LIST_APPEND(ct->ct_expired, cl);
+ continue;
+ }
+
+ /*
+ * Apply adjustments, if any. Adjustments are applied after
+ * the system returns from KMDB or OBP. They are only applied
+ * to relative callout lists.
+ */
+ if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
+ expiration = cl->cl_expiration + delta;
+ if (expiration <= 0)
+ expiration = CY_INFINITY;
+ cl->cl_expiration = expiration;
+ }
+
+ (void) callout_queue_add(ct, cl);
+ }
+
+ /*
+ * We need to return the expiration to help program the cyclic.
+ * If there are expired callouts, the cyclic needs to go off
+ * immediately. If the queue has become empty, then we return infinity.
+ * Else, we return the expiration of the earliest callout in the queue.
+ */
+ if (ct->ct_expired.ch_head != NULL)
+ return (gethrtime());
+
+ cl = ct->ct_queue.ch_head;
+ if (cl == NULL)
+ return (CY_INFINITY);
+
+ return (cl->cl_expiration);
+}
+
+/*
* Initialize a callout table's heap, if necessary. Preallocate some free
* entries so we don't have to check for NULL elsewhere.
*/
@@ -345,17 +519,16 @@ callout_heap_init(callout_table_t *ct)
ASSERT(ct->ct_heap == NULL);
ct->ct_heap_num = 0;
- ct->ct_heap_max = CALLOUT_CHUNK;
- size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
+ ct->ct_heap_max = callout_chunk;
+ size = sizeof (callout_heap_t) * callout_chunk;
ct->ct_heap = kmem_alloc(size, KM_SLEEP);
}
/*
- * Reallocate the heap. We try quite hard because we can't sleep, and if
- * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
- * allocation. Note that the heap only expands, it never contracts.
+ * Reallocate the heap. Return 0 if the heap is still full at the end of it.
+ * Return 1 otherwise. Note that the heap only expands, it never contracts.
*/
-static void
+static int
callout_heap_expand(callout_table_t *ct)
{
size_t max, size, osize;
@@ -369,10 +542,25 @@ callout_heap_expand(callout_table_t *ct)
mutex_exit(&ct->ct_mutex);
osize = sizeof (callout_heap_t) * max;
- size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
- heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
+ size = sizeof (callout_heap_t) * (max + callout_chunk);
+ heap = kmem_alloc(size, KM_NOSLEEP);
mutex_enter(&ct->ct_mutex);
+ if (heap == NULL) {
+ /*
+ * We could not allocate memory. If we can free up
+ * some entries, that would be great.
+ */
+ if (ct->ct_nreap > 0)
+ (void) callout_heap_process(ct, 0, 0);
+ /*
+ * If we still have no space in the heap, inform the
+ * caller.
+ */
+ if (ct->ct_heap_num == ct->ct_heap_max)
+ return (0);
+ return (1);
+ }
if (max < ct->ct_heap_max) {
/*
* Someone beat us to the allocation. Free what we
@@ -387,6 +575,8 @@ callout_heap_expand(callout_table_t *ct)
ct->ct_heap = heap;
ct->ct_heap_max = size / sizeof (callout_heap_t);
}
+
+ return (1);
}
/*
@@ -448,6 +638,7 @@ callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
ASSERT(MUTEX_HELD(&ct->ct_mutex));
ASSERT(ct->ct_heap_num < ct->ct_heap_max);
+ cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
/*
* First, copy the expiration and callout list pointer to the bottom
* of the heap.
@@ -553,7 +744,7 @@ comp_left:
/*
* Delete and handle all past expirations in a callout table's heap.
*/
-static void
+static hrtime_t
callout_heap_delete(callout_table_t *ct)
{
hrtime_t now, expiration, next;
@@ -601,6 +792,7 @@ callout_heap_delete(callout_table_t *ct)
* list of expired callout lists. It will be processed
* by the callout executor.
*/
+ cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
CALLOUT_LIST_APPEND(ct->ct_expired, cl);
}
@@ -622,7 +814,7 @@ callout_heap_delete(callout_table_t *ct)
* infinity by the cyclic subsystem.
*/
if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
- return;
+ return (CY_INFINITY);
/*
* If the top expirations are within callout_tolerance of each other,
@@ -638,6 +830,8 @@ callout_heap_delete(callout_table_t *ct)
}
(void) cyclic_reprogram(ct->ct_cyclic, expiration);
+
+ return (expiration);
}
/*
@@ -664,21 +858,20 @@ static hrtime_t
callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
{
callout_heap_t *heap;
- callout_list_t *cl, *rootcl;
+ callout_list_t *cl;
hrtime_t expiration, now;
- int i, hash, clflags, expired;
+ int i, hash, clflags;
ulong_t num;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
if (ct->ct_heap_num == 0)
- return (0);
+ return (CY_INFINITY);
if (ct->ct_nreap > 0)
ct->ct_cleanups++;
heap = ct->ct_heap;
- rootcl = heap->ch_list;
/*
* We walk the heap from the top to the bottom. If we encounter
@@ -700,7 +893,6 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
ct->ct_heap_num = 0;
clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
now = gethrtime();
- expired = 0;
for (i = 0; i < num; i++) {
cl = heap[i].ch_list;
/*
@@ -724,9 +916,9 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
if ((cl->cl_expiration <= now) ||
(timechange && ((cl->cl_flags & clflags) == clflags))) {
hash = CALLOUT_CLHASH(cl->cl_expiration);
+ cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
CALLOUT_LIST_APPEND(ct->ct_expired, cl);
- expired = 1;
continue;
}
@@ -758,16 +950,19 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
ct->ct_nreap = 0;
- if (expired)
- expiration = gethrtime();
- else if (ct->ct_heap_num == 0)
- expiration = CY_INFINITY;
- else if (rootcl != heap->ch_list)
- expiration = heap->ch_expiration;
- else
- expiration = 0;
+ /*
+ * We need to return the expiration to help program the cyclic.
+ * If there are expired callouts, the cyclic needs to go off
+ * immediately. If the heap has become empty, then we return infinity.
+ * Else, return the expiration of the earliest callout in the heap.
+ */
+ if (ct->ct_expired.ch_head != NULL)
+ return (gethrtime());
- return (expiration);
+ if (ct->ct_heap_num == 0)
+ return (CY_INFINITY);
+
+ return (heap->ch_expiration);
}
/*
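The comment above spells out the return convention now shared by callout_heap_process() and callout_queue_process() (added earlier in this diff): a time suitable for handing straight to cyclic_reprogram(). A self-contained sketch of that convention (kernel types and gethrtime() stubbed for illustration; next_program_time() is a hypothetical helper, not part of this patch):

    typedef long long hrtime_t;     /* stand-in for the kernel type */
    #define CY_INFINITY ((hrtime_t)0x7fffffffffffffffLL)

    static hrtime_t
    gethrtime(void)                 /* stand-in; the real gethrtime() */
    {                               /* returns monotonic nanoseconds */
            return (0);
    }

    /*
     * The value handed to cyclic_reprogram(): fire at once if expired
     * work is pending, go idle if nothing is stored, otherwise fire at
     * the earliest pending expiration.
     */
    static hrtime_t
    next_program_time(int have_expired, int nentries, hrtime_t earliest)
    {
            if (have_expired)
                    return (gethrtime());
            if (nentries == 0)
                    return (CY_INFINITY);
            return (earliest);
    }

callout_resume() and callout_hrestime_one() later in this patch consume one such value for ct_cyclic and another for ct_qcyclic.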
@@ -788,7 +983,7 @@ timeout_generic(int type, void (*func)(void *), void *arg,
callout_t *cp;
callout_id_t id;
callout_list_t *cl;
- hrtime_t now, interval, rexpiration;
+ hrtime_t now, interval;
int hash, clflags;
ASSERT(resolution > 0);
@@ -829,11 +1024,11 @@ timeout_generic(int type, void (*func)(void *), void *arg,
if (CALLOUT_CLEANUP(ct)) {
/*
* There are too many heap elements pointing to empty callout
- * lists. Clean them out.
+ * lists. Clean them out. Since cleanup is only done once
+ * in a while, no need to reprogram the cyclic if the root
+ * of the heap gets cleaned out.
*/
- rexpiration = callout_heap_process(ct, 0, 0);
- if ((rexpiration != 0) && (ct->ct_suspend == 0))
- (void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
+ (void) callout_heap_process(ct, 0, 0);
}
if ((cp = ct->ct_free) == NULL)
@@ -911,23 +1106,6 @@ again:
cl = callout_list_get(ct, expiration, clflags, hash);
if (cl == NULL) {
/*
- * Check if we have enough space in the heap to insert one
- * expiration. If not, expand the heap.
- */
- if (ct->ct_heap_num == ct->ct_heap_max) {
- callout_heap_expand(ct);
- /*
- * In the above call, we drop the lock, allocate and
- * reacquire the lock. So, we could have been away
- * for a while. In the meantime, someone could have
- * inserted a callout list with the same expiration.
- * So, the best course is to repeat the steps. This
- * should be an infrequent event.
- */
- goto again;
- }
-
- /*
* Check the free list. If we don't find one, we have to
* take the slow path and allocate from kmem.
*/
@@ -948,6 +1126,30 @@ again:
cl->cl_expiration = expiration;
cl->cl_flags = clflags;
+ /*
+ * Check if we have enough space in the heap to insert one
+ * expiration. If not, expand the heap.
+ */
+ if (ct->ct_heap_num == ct->ct_heap_max) {
+ if (callout_heap_expand(ct) == 0) {
+ /*
+ * Could not expand the heap. Just queue it.
+ */
+ callout_queue_insert(ct, cl);
+ goto out;
+ }
+
+ /*
+ * In the above call, we drop the lock, allocate and
+ * reacquire the lock. So, we could have been away
+ * for a while. In the meantime, someone could have
+ * inserted a callout list with the same expiration.
+ * But we will not go back and check for it as this
+ * should be a really infrequent event. There is no
+ * point.
+ */
+ }
+
if (clflags & CALLOUT_LIST_FLAG_NANO) {
CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
} else {
@@ -969,6 +1171,7 @@ again:
if (cl->cl_callouts.ch_head == NULL)
ct->ct_nreap--;
}
+out:
cp->c_list = cl;
CALLOUT_APPEND(ct, cp);
@@ -1077,7 +1280,7 @@ untimeout_generic(callout_id_t id, int nowait)
callout_t *cp;
callout_id_t xid;
callout_list_t *cl;
- int hash;
+ int hash, flags;
callout_id_t bogus;
ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
@@ -1113,19 +1316,30 @@ untimeout_generic(callout_id_t id, int nowait)
cl = cp->c_list;
expiration = cl->cl_expiration;
CALLOUT_DELETE(ct, cp);
- cp->c_idnext = ct->ct_free;
- ct->ct_free = cp;
- cp->c_xid |= CALLOUT_FREE;
+ CALLOUT_FREE(ct, cp);
ct->ct_untimeouts_unexpired++;
ct->ct_timeouts_pending--;
/*
- * If the callout list has become empty, it needs
- * to be cleaned along with its heap entry. Increment
- * a reap count.
+ * If the callout list has become empty, there are 3
+ * possibilities. If it is present:
+ * - in the heap, it needs to be cleaned along
+ * with its heap entry. Increment a reap count.
+ * - in the callout queue, free it.
+ * - in the expired list, free it.
*/
- if (cl->cl_callouts.ch_head == NULL)
- ct->ct_nreap++;
+ if (cl->cl_callouts.ch_head == NULL) {
+ flags = cl->cl_flags;
+ if (flags & CALLOUT_LIST_FLAG_HEAPED) {
+ ct->ct_nreap++;
+ } else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
+ CALLOUT_LIST_DELETE(ct->ct_queue, cl);
+ CALLOUT_LIST_FREE(ct, cl);
+ } else {
+ CALLOUT_LIST_DELETE(ct->ct_expired, cl);
+ CALLOUT_LIST_FREE(ct, cl);
+ }
+ }
mutex_exit(&ct->ct_mutex);
expiration -= gethrtime();
@@ -1282,9 +1496,7 @@ callout_list_expire(callout_table_t *ct, callout_list_t *cl)
* cares that we're done.
*/
CALLOUT_DELETE(ct, cp);
- cp->c_idnext = ct->ct_free;
- ct->ct_free = cp;
- cp->c_xid |= CALLOUT_FREE;
+ CALLOUT_FREE(ct, cp);
if (cp->c_waiting) {
cp->c_waiting = 0;
@@ -1339,13 +1551,22 @@ callout_expire(callout_table_t *ct)
*/
/*
- * Realtime callout cyclic handler.
+ * Realtime callout cyclic handlers.
*/
void
callout_realtime(callout_table_t *ct)
{
mutex_enter(&ct->ct_mutex);
- callout_heap_delete(ct);
+ (void) callout_heap_delete(ct);
+ callout_expire(ct);
+ mutex_exit(&ct->ct_mutex);
+}
+
+void
+callout_queue_realtime(callout_table_t *ct)
+{
+ mutex_enter(&ct->ct_mutex);
+ (void) callout_queue_delete(ct);
callout_expire(ct);
mutex_exit(&ct->ct_mutex);
}
@@ -1359,16 +1580,35 @@ callout_execute(callout_table_t *ct)
}
/*
- * Normal callout cyclic handler.
+ * Normal callout cyclic handlers.
*/
void
callout_normal(callout_table_t *ct)
{
int i, exec;
+ hrtime_t exp;
mutex_enter(&ct->ct_mutex);
- callout_heap_delete(ct);
- CALLOUT_EXEC_COMPUTE(ct, exec);
+ exp = callout_heap_delete(ct);
+ CALLOUT_EXEC_COMPUTE(ct, exp, exec);
+ mutex_exit(&ct->ct_mutex);
+
+ for (i = 0; i < exec; i++) {
+ ASSERT(ct->ct_taskq != NULL);
+ (void) taskq_dispatch(ct->ct_taskq,
+ (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
+ }
+}
+
+void
+callout_queue_normal(callout_table_t *ct)
+{
+ int i, exec;
+ hrtime_t exp;
+
+ mutex_enter(&ct->ct_mutex);
+ exp = callout_queue_delete(ct);
+ CALLOUT_EXEC_COMPUTE(ct, exp, exec);
mutex_exit(&ct->ct_mutex);
for (i = 0; i < exec; i++) {
@@ -1405,9 +1645,12 @@ callout_suspend(void)
mutex_exit(&ct->ct_mutex);
continue;
}
- if (ct->ct_suspend == 1)
+ if (ct->ct_suspend == 1) {
(void) cyclic_reprogram(ct->ct_cyclic,
CY_INFINITY);
+ (void) cyclic_reprogram(ct->ct_qcyclic,
+ CY_INFINITY);
+ }
mutex_exit(&ct->ct_mutex);
}
}
@@ -1419,7 +1662,7 @@ callout_suspend(void)
static void
callout_resume(hrtime_t delta, int timechange)
{
- hrtime_t exp;
+ hrtime_t hexp, qexp;
int t, f;
callout_table_t *ct;
@@ -1446,24 +1689,13 @@ callout_resume(hrtime_t delta, int timechange)
* out any empty callout lists that might happen to
* be there.
*/
- (void) callout_heap_process(ct, delta, timechange);
+ hexp = callout_heap_process(ct, delta, timechange);
+ qexp = callout_queue_process(ct, delta, timechange);
ct->ct_suspend--;
if (ct->ct_suspend == 0) {
- /*
- * If the expired list is non-empty, then have
- * the cyclic expire immediately. Else, program
- * the cyclic based on the heap.
- */
- if (ct->ct_expired.ch_head != NULL)
- exp = gethrtime();
- else if (ct->ct_heap_num > 0)
- exp = ct->ct_heap[0].ch_expiration;
- else
- exp = 0;
- if (exp != 0)
- (void) cyclic_reprogram(ct->ct_cyclic,
- exp);
+ (void) cyclic_reprogram(ct->ct_cyclic, hexp);
+ (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
}
mutex_exit(&ct->ct_mutex);
@@ -1524,10 +1756,10 @@ callout_debug_callb(void *arg, int code)
static void
callout_hrestime_one(callout_table_t *ct)
{
- hrtime_t expiration;
+ hrtime_t hexp, qexp;
mutex_enter(&ct->ct_mutex);
- if (ct->ct_heap_num == 0) {
+ if (ct->ct_cyclic == CYCLIC_NONE) {
mutex_exit(&ct->ct_mutex);
return;
}
@@ -1535,10 +1767,13 @@ callout_hrestime_one(callout_table_t *ct)
/*
* Walk the heap and process all the absolute hrestime entries.
*/
- expiration = callout_heap_process(ct, 0, 1);
+ hexp = callout_heap_process(ct, 0, 1);
+ qexp = callout_queue_process(ct, 0, 1);
- if ((expiration != 0) && (ct->ct_suspend == 0))
- (void) cyclic_reprogram(ct->ct_cyclic, expiration);
+ if (ct->ct_suspend == 0) {
+ (void) cyclic_reprogram(ct->ct_cyclic, hexp);
+ (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
+ }
mutex_exit(&ct->ct_mutex);
}
@@ -1623,11 +1858,11 @@ callout_cyclic_init(callout_table_t *ct)
cyc_time_t when;
processorid_t seqid;
int t;
- cyclic_id_t cyclic;
+ cyclic_id_t cyclic, qcyclic;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
- t = CALLOUT_TABLE_TYPE(ct);
+ t = ct->ct_type;
seqid = CALLOUT_TABLE_SEQID(ct);
/*
@@ -1684,19 +1919,29 @@ callout_cyclic_init(callout_table_t *ct)
*/
ASSERT(ct->ct_cyclic == CYCLIC_NONE);
- hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
- if (ct->ct_type == CALLOUT_REALTIME)
+ if (t == CALLOUT_REALTIME) {
hdlr.cyh_level = callout_realtime_level;
- else
+ hdlr.cyh_func = (cyc_func_t)callout_realtime;
+ } else {
hdlr.cyh_level = callout_normal_level;
+ hdlr.cyh_func = (cyc_func_t)callout_normal;
+ }
hdlr.cyh_arg = ct;
when.cyt_when = CY_INFINITY;
when.cyt_interval = CY_INFINITY;
cyclic = cyclic_add(&hdlr, &when);
+ if (t == CALLOUT_REALTIME)
+ hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
+ else
+ hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
+
+ qcyclic = cyclic_add(&hdlr, &when);
+
mutex_enter(&ct->ct_mutex);
ct->ct_cyclic = cyclic;
+ ct->ct_qcyclic = qcyclic;
}
void
@@ -1768,9 +2013,10 @@ callout_cpu_online(cpu_t *cp)
mutex_exit(&ct->ct_mutex);
/*
- * Move the cyclic to this CPU by doing a bind.
+ * Move the cyclics to this CPU by doing a bind.
*/
cyclic_bind(ct->ct_cyclic, cp, NULL);
+ cyclic_bind(ct->ct_qcyclic, cp, NULL);
}
}
@@ -1789,10 +2035,11 @@ callout_cpu_offline(cpu_t *cp)
ct = &callout_table[CALLOUT_TABLE(t, seqid)];
/*
- * Unbind the cyclic. This will allow the cyclic subsystem
- * to juggle the cyclic during CPU offline.
+ * Unbind the cyclics. This will allow the cyclic subsystem
+ * to juggle the cyclics during CPU offline.
*/
cyclic_bind(ct->ct_cyclic, NULL, NULL);
+ cyclic_bind(ct->ct_qcyclic, NULL, NULL);
}
}
@@ -1804,6 +2051,22 @@ void
callout_mp_init(void)
{
cpu_t *cp;
+ size_t min, max;
+
+ if (callout_chunk == CALLOUT_CHUNK) {
+ /*
+ * No one has specified a chunk in /etc/system. We need to
+ * compute it here based on the number of online CPUs and
+ * available physical memory.
+ */
+ min = CALLOUT_MIN_HEAP_SIZE;
+ max = ptob(physmem) / CALLOUT_MEM_FRACTION;
+ if (min > max)
+ min = max;
+ callout_chunk = min / sizeof (callout_heap_t);
+ callout_chunk /= ncpus_online;
+ callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
+ }
mutex_enter(&cpu_lock);
@@ -1846,6 +2109,10 @@ callout_init(void)
callout_tolerance = CALLOUT_TOLERANCE;
if (callout_threads <= 0)
callout_threads = CALLOUT_THREADS;
+ if (callout_chunk <= 0)
+ callout_chunk = CALLOUT_CHUNK;
+ else
+ callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
/*
* Allocate all the callout tables based on max_ncpus. We have chosen
@@ -1893,12 +2160,13 @@ callout_init(void)
*/
ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
/*
- * Initialize the cyclic as NONE. This will get set
+ * Initialize the cyclics as NONE. This will get set
* during CPU online. This is so that partially
* populated systems will only have the required
* number of cyclics, not more.
*/
ct->ct_cyclic = CYCLIC_NONE;
+ ct->ct_qcyclic = CYCLIC_NONE;
ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
}
}
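The overflow queue introduced in this file is an ordered doubly linked list: callout_queue_add() scans from the head and links the new callout list in front of the first entry with a later expiration, falling back to an append. A minimal user-level sketch of that insert, with simplified node and queue types (hypothetical names, not the kernel structures):

    #include <stddef.h>
    #include <stdio.h>

    typedef struct node {
            struct node *next, *prev;
            long long exp;                  /* expiration, ns */
    } node_t;

    typedef struct queue {
            node_t *head, *tail;
    } queue_t;

    /*
     * Insert n in ascending order of exp. Return 1 if n became the new
     * head (the caller would then reprogram its timer), else 0. Mirrors
     * callout_queue_add()/CALLOUT_LIST_BEFORE.
     */
    static int
    queue_add(queue_t *q, node_t *n)
    {
            node_t *cur = q->head;

            if (cur == NULL || n->exp < cur->exp) {
                    n->prev = NULL;                 /* new head */
                    n->next = cur;
                    if (cur != NULL)
                            cur->prev = n;
                    else
                            q->tail = n;
                    q->head = n;
                    return (1);
            }
            for (; cur != NULL; cur = cur->next) {
                    if (n->exp < cur->exp) {
                            n->prev = cur->prev;    /* link before cur */
                            n->next = cur;
                            cur->prev->next = n;
                            cur->prev = n;
                            return (0);
                    }
            }
            n->prev = q->tail;                      /* append at tail */
            n->next = NULL;
            q->tail->next = n;
            q->tail = n;
            return (0);
    }

    int
    main(void)
    {
            node_t a = { 0 }, b = { 0 }, c = { 0 };
            queue_t q = { NULL, NULL };

            a.exp = 30; b.exp = 10; c.exp = 20;
            (void) queue_add(&q, &a);               /* first entry */
            (void) queue_add(&q, &b);               /* becomes head */
            (void) queue_add(&q, &c);               /* lands in the middle */
            for (node_t *n = q.head; n != NULL; n = n->next)
                    (void) printf("%lld\n", n->exp); /* 10 20 30 */
            return (0);
    }

Keeping the queue sorted makes callout_queue_delete() a cheap scan from the head; the O(n) insert is acceptable because the queue is only the fallback path taken when the heap cannot be grown.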
diff --git a/usr/src/uts/common/sys/callo.h b/usr/src/uts/common/sys/callo.h
index 6a464f9dd9..8ffc38ada6 100644
--- a/usr/src/uts/common/sys/callo.h
+++ b/usr/src/uts/common/sys/callo.h
@@ -23,7 +23,7 @@
/*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -138,9 +138,9 @@ typedef struct callout {
* This bit represents the callout (table) type. Each CPU has one realtime
* and one normal callout table.
*/
-#define CALLOUT_FREE 0x8000000000000000ULL
+#define CALLOUT_ID_FREE 0x8000000000000000ULL
#define CALLOUT_EXECUTING 0x4000000000000000ULL
-#define CALLOUT_ID_FLAGS (CALLOUT_FREE | CALLOUT_EXECUTING)
+#define CALLOUT_ID_FLAGS (CALLOUT_ID_FREE | CALLOUT_EXECUTING)
#define CALLOUT_ID_MASK ~CALLOUT_ID_FLAGS
#define CALLOUT_GENERATION_LOW 0x100000000ULL
#define CALLOUT_LONGTERM 0x80000000
@@ -151,7 +151,6 @@ typedef struct callout {
#define CALLOUT_COUNTER_SHIFT callout_table_bits
#define CALLOUT_TABLE(t, f) (((f) << CALLOUT_TYPE_BITS) | (t))
#define CALLOUT_TABLE_NUM(ct) ((ct) - callout_table)
-#define CALLOUT_TABLE_TYPE(ct) (CALLOUT_TABLE_NUM(ct) & CALLOUT_TYPE_MASK)
#define CALLOUT_TABLE_SEQID(ct) (CALLOUT_TABLE_NUM(ct) >> CALLOUT_TYPE_BITS)
/*
@@ -227,11 +226,17 @@ typedef struct callout_hash {
* Callout list contains hrestime timers.
* CALLOUT_LIST_FLAG_NANO
* Callout list contains 1-nanosecond resolution callouts.
+ * CALLOUT_LIST_FLAG_HEAPED
+ * Callout list is present in the callout heap.
+ * CALLOUT_LIST_FLAG_QUEUED
+ * Callout list is present in the callout queue.
*/
#define CALLOUT_LIST_FLAG_FREE 0x1
#define CALLOUT_LIST_FLAG_ABSOLUTE 0x2
#define CALLOUT_LIST_FLAG_HRESTIME 0x4
#define CALLOUT_LIST_FLAG_NANO 0x8
+#define CALLOUT_LIST_FLAG_HEAPED 0x10
+#define CALLOUT_LIST_FLAG_QUEUED 0x20
struct callout_list {
callout_list_t *cl_next; /* next in clhash */
@@ -248,6 +253,9 @@ struct callout_list {
* callout list pointer in the heap element, we have to always remove
* a heap element and its callout list together. We cannot remove one
* without the other.
+ *
+ * This structure's size must be a power of two because we want an
+ * integral number of these to fit into a page.
*/
typedef struct callout_heap {
hrtime_t ch_expiration;
@@ -362,11 +370,16 @@ typedef struct callout_table {
taskq_t *ct_taskq; /* taskq to execute normal callouts */
kstat_t *ct_kstats; /* callout kstats */
int ct_nreap; /* # heap entries that need reaping */
-#ifdef _LP64
- char ct_pad[28]; /* cache alignment */
-#else
- char ct_pad[24]; /* cache alignment */
+ cyclic_id_t ct_qcyclic; /* cyclic for the callout queue */
+ callout_hash_t ct_queue; /* overflow queue of callouts */
+#ifndef _LP64
+ char ct_pad[12]; /* cache alignment */
#endif
+ /*
+ * This structure should be aligned to a 64-byte (cache-line)
+ * boundary. Make sure the padding is right for 32-bit as well
+ * as 64-bit kernels.
+ */
} callout_table_t;
/*
@@ -389,15 +402,28 @@ typedef struct callout_table {
#define ct_cleanups \
ct_kstat_data[CALLOUT_CLEANUPS].value.ui64
-#define CALLOUT_CHUNK 128
+/*
+ * CALLOUT_CHUNK is the minimum initial size of each heap, and the amount
+ * by which a full heap is expanded to make room for new entries.
+ */
+#define CALLOUT_CHUNK (PAGESIZE / sizeof (callout_heap_t))
+
+/*
+ * CALLOUT_MIN_HEAP_SIZE defines the minimum size for the callout heap for
+ * the whole system.
+ */
+#define CALLOUT_MIN_HEAP_SIZE (64 * 1024 * sizeof (callout_heap_t))
+
+/*
+ * CALLOUT_MEM_FRACTION defines the fraction of available physical memory that
+ * can be allocated towards the callout heap for the whole system.
+ */
+#define CALLOUT_MEM_FRACTION 4096
#define CALLOUT_HEAP_PARENT(index) (((index) - 1) >> 1)
#define CALLOUT_HEAP_RIGHT(index) (((index) + 1) << 1)
#define CALLOUT_HEAP_LEFT(index) ((((index) + 1) << 1) - 1)
-#define CALLOUT_CYCLIC_HANDLER(t) \
- ((t == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
-
#define CALLOUT_TCP_RESOLUTION 10000000ULL
#define CALLOUT_ALIGN 64 /* cache line size */
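The heap sizing policy defined by the new CALLOUT_CHUNK, CALLOUT_MIN_HEAP_SIZE, and CALLOUT_MEM_FRACTION macros (and applied in callout_mp_init() above) can be checked with a little arithmetic. A standalone sketch, where every machine parameter is an assumed value noted in the comments:

    #include <stdio.h>

    /* Assumed machine parameters; the kernel uses PAGESIZE, physmem,
     * and ncpus_online instead. */
    #define PAGESIZE        4096ULL                   /* assume 4K pages */
    #define HEAP_ENT        16ULL                     /* sizeof (callout_heap_t) */
    #define CHUNK           (PAGESIZE / HEAP_ENT)     /* CALLOUT_CHUNK = 256 */
    #define MIN_HEAP        (64ULL * 1024 * HEAP_ENT) /* CALLOUT_MIN_HEAP_SIZE */
    #define MEM_FRACTION    4096ULL                   /* CALLOUT_MEM_FRACTION */
    #define P2ROUNDUP(x, a) (-(-(x) & -(a)))          /* a: power of 2 */

    int
    main(void)
    {
            unsigned long long physmem_bytes = 8ULL << 30;  /* assume 8 GB */
            unsigned long long ncpus = 8;                   /* assume 8 CPUs */
            unsigned long long min = MIN_HEAP;              /* 1 MB */
            unsigned long long max = physmem_bytes / MEM_FRACTION; /* 2 MB */
            unsigned long long chunk;

            if (min > max)
                    min = max;
            chunk = min / HEAP_ENT / ncpus;         /* heap entries per chunk */
            chunk = P2ROUNDUP(chunk, CHUNK);
            (void) printf("callout_chunk = %llu entries\n", chunk); /* 8192 */
            return (0);
    }

With these assumptions each table's heap grows in 8192-entry (128 KB) steps rather than the old fixed 128-entry CALLOUT_CHUNK, and the baseline sizing is clamped to 1/4096 of physical memory.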