author      Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>    2010-02-23 04:19:01 -0500
committer   Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>    2010-02-23 04:19:01 -0500
commit      060cedfb73ddd0d3b0742264bece958dd77d1008
tree        8a93e2b5bb2a1002a59d190bbbbfe4b43d5c3c48
parent      c9a5bc8f8ef20fd68ff0b46331bbaf26e1415abb
download    illumos-joyent-060cedfb73ddd0d3b0742264bece958dd77d1008.tar.gz
6839498 callout subsystem should be less brittle wrt memory allocation failure
-rw-r--r--    usr/src/cmd/mdb/common/modules/genunix/genunix.c    112
-rw-r--r--    usr/src/uts/common/os/callout.c                     474
-rw-r--r--    usr/src/uts/common/sys/callo.h                       50
3 files changed, 489 insertions, 147 deletions
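The gist of the change, before the diff itself: timeout_generic() used to grow a full callout heap with kmem_alloc_tryhard(..., KM_NOSLEEP | KM_PANIC), so a burst of timeouts under memory pressure could panic the system. callout_heap_expand() now attempts a plain KM_NOSLEEP allocation and reports failure honestly, and on failure timeout_generic() parks the new callout list on a per-table overflow queue (ct_queue), kept sorted by expiration and fired by a second cyclic (ct_qcyclic). What follows is a minimal userland sketch of that sorted insert; qent_t and queue_add() are simplified stand-ins invented for this note, not the kernel's types. Like callout_queue_add(), it reports whether the new entry became the head of the queue, the one case where the timer must be reprogrammed to the new, earlier expiration.

#include <stddef.h>

typedef long long hrtime_t;

typedef struct qent {
    struct qent *q_next;
    hrtime_t q_expiration;
} qent_t;

/*
 * Insert 'e' into the singly linked queue at '*headp', keeping the queue
 * sorted by ascending expiration. Entries with equal expirations go after
 * existing ones, matching the kernel's strict '<' comparison. Returns 1
 * if 'e' became the new head (i.e. the timer must be reprogrammed).
 */
static int
queue_add(qent_t **headp, qent_t *e)
{
    qent_t **pp;

    for (pp = headp; *pp != NULL; pp = &(*pp)->q_next) {
        if (e->q_expiration < (*pp)->q_expiration)
            break;
    }
    e->q_next = *pp;
    *pp = e;
    return (pp == headp);
}

The kernel version differs in mechanics (doubly linked callout_list_t chains via CALLOUT_LIST_INSERT/APPEND and the new CALLOUT_LIST_BEFORE macro) but not in shape: O(n) insertion is acceptable because the queue is an overflow path, used only while the heap cannot grow.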
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index 99f26333f4..968f0284c4 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -651,33 +651,35 @@ static const char *co_typenames[] = {
     "R", "N"
 };
 
 #define TABLE_TO_SEQID(x)   ((x) >> CALLOUT_TYPE_BITS)
 
 /* callout flags, in no particular order */
-#define COF_REAL        0x0000001
-#define COF_NORM        0x0000002
-#define COF_LONG        0x0000004
-#define COF_SHORT       0x0000008
-#define COF_EMPTY       0x0000010
-#define COF_TIME        0x0000020
-#define COF_BEFORE      0x0000040
-#define COF_AFTER       0x0000080
-#define COF_SEQID       0x0000100
-#define COF_FUNC        0x0000200
-#define COF_ADDR        0x0000400
-#define COF_EXEC        0x0000800
-#define COF_HIRES       0x0001000
-#define COF_ABS         0x0002000
-#define COF_TABLE       0x0004000
-#define COF_BYIDH       0x0008000
-#define COF_FREE        0x0010000
-#define COF_LIST        0x0020000
-#define COF_EXPREL      0x0040000
-#define COF_HDR         0x0080000
-#define COF_VERBOSE     0x0100000
-#define COF_LONGLIST    0x0200000
-#define COF_THDR        0x0400000
-#define COF_LHDR        0x0800000
-#define COF_CHDR        0x1000000
-#define COF_PARAM       0x2000000
-#define COF_DECODE      0x4000000
+#define COF_REAL        0x00000001
+#define COF_NORM        0x00000002
+#define COF_LONG        0x00000004
+#define COF_SHORT       0x00000008
+#define COF_EMPTY       0x00000010
+#define COF_TIME        0x00000020
+#define COF_BEFORE      0x00000040
+#define COF_AFTER       0x00000080
+#define COF_SEQID       0x00000100
+#define COF_FUNC        0x00000200
+#define COF_ADDR        0x00000400
+#define COF_EXEC        0x00000800
+#define COF_HIRES       0x00001000
+#define COF_ABS         0x00002000
+#define COF_TABLE       0x00004000
+#define COF_BYIDH       0x00008000
+#define COF_FREE        0x00010000
+#define COF_LIST        0x00020000
+#define COF_EXPREL      0x00040000
+#define COF_HDR         0x00080000
+#define COF_VERBOSE     0x00100000
+#define COF_LONGLIST    0x00200000
+#define COF_THDR        0x00400000
+#define COF_LHDR        0x00800000
+#define COF_CHDR        0x01000000
+#define COF_PARAM       0x02000000
+#define COF_DECODE      0x04000000
+#define COF_HEAP        0x08000000
+#define COF_QUEUE       0x10000000
 
 /* show real and normal, short and long, expired and unexpired. */
 #define COF_DEFAULT     (COF_REAL | COF_NORM | COF_LONG | COF_SHORT)
@@ -719,14 +721,14 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
         return (WALK_ERR);
     }
 
-    if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_FREE)) {
+    if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_ID_FREE)) {
         /*
         * The callout must have been reallocated. No point in
         * walking any more.
         */
         return (WALK_DONE);
     }
-    if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_FREE)) {
+    if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_ID_FREE)) {
         /*
         * The callout must have been freed. No point in
         * walking any more.
@@ -806,6 +808,21 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
             return (WALK_NEXT);
         }
     }
+    /*
+     * We do the checks for COF_HEAP and COF_QUEUE here only if we
+     * are traversing BYIDH. If the traversal is by callout list,
+     * we do this check in callout_list_cb() to be more
+     * efficient.
+     */
+    if ((coargs->flags & COF_HEAP) &&
+        !(list_flags & CALLOUT_LIST_FLAG_HEAPED)) {
+        return (WALK_NEXT);
+    }
+
+    if ((coargs->flags & COF_QUEUE) &&
+        !(list_flags & CALLOUT_LIST_FLAG_QUEUED)) {
+        return (WALK_NEXT);
+    }
 }
 
 #define callout_table_mask  ((1 << coargs->ctbits) - 1)
@@ -937,6 +954,16 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
         }
     }
 
+    if ((coargs->flags & COF_HEAP) &&
+        !(coargs->list_flags & CALLOUT_LIST_FLAG_HEAPED)) {
+        return (WALK_NEXT);
+    }
+
+    if ((coargs->flags & COF_QUEUE) &&
+        !(coargs->list_flags & CALLOUT_LIST_FLAG_QUEUED)) {
+        return (WALK_NEXT);
+    }
+
     if ((coargs->flags & COF_LHDR) && !(coargs->flags & COF_ADDR) &&
         (coargs->flags & (COF_LIST | COF_VERBOSE))) {
         if (!(coargs->flags & COF_VERBOSE)) {
@@ -1063,9 +1090,9 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
             coargs->flags |= (COF_LHDR | COF_CHDR);
         if (coargs->flags & COF_LONGLIST) {
             /* more info! */
-            mdb_printf("%<u> %-T%-7s %-7s %-?s %-?s"
+            mdb_printf("%<u> %-T%-7s %-7s %-?s %-?s %-?s"
                 " %-?s %-?s %-?s%</u>",
-                "HEAPNUM", "HEAPMAX", "TASKQ", "EXPQ",
+                "HEAPNUM", "HEAPMAX", "TASKQ", "EXPQ", "QUE",
                 "PEND", "FREE", "LOCK");
         }
         mdb_printf("\n");
@@ -1078,10 +1105,11 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
             ct->ct_heap);
         if (coargs->flags & COF_LONGLIST) {
             /* more info! */
-            mdb_printf(" %-7d %-7d %-?p %-?p"
+            mdb_printf(" %-7d %-7d %-?p %-?p %-?p"
                 " %-?lld %-?lld %-?p",
                 ct->ct_heap_num, ct->ct_heap_max,
                 ct->ct_taskq, ct->ct_expired.ch_head,
+                ct->ct_queue.ch_head,
                 cotwd->ct_timeouts_pending,
                 cotwd->ct_allocations -
                 cotwd->ct_timeouts_pending,
@@ -1128,6 +1156,17 @@ callout_t_cb(uintptr_t addr, const void *data, void *priv)
             return (WALK_ERR);
         }
     }
+    /* then, print the callout queue */
+    clptr = (callout_list_t *)ct->ct_queue.ch_head;
+    if (clptr != NULL) {
+        coargs->bucket = -1;
+        if (mdb_pwalk("callout_list", callout_list_cb,
+            coargs, (uintptr_t)clptr) == -1) {
+            mdb_warn("cannot walk callout_list"
+                " at %p", clptr);
+            return (WALK_ERR);
+        }
+    }
     for (i = 0; i < CALLOUT_BUCKETS; i++) {
         if (ct->ct_clhash == NULL) {
             /* nothing to do */
@@ -1272,6 +1311,8 @@ callout(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
         'v', MDB_OPT_SETBITS, COF_LONGLIST, &coargs.flags,
         'i', MDB_OPT_SETBITS, COF_BYIDH, &coargs.flags,
         'F', MDB_OPT_SETBITS, COF_FREE, &coargs.flags,
+        'H', MDB_OPT_SETBITS, COF_HEAP, &coargs.flags,
+        'Q', MDB_OPT_SETBITS, COF_QUEUE, &coargs.flags,
         'A', MDB_OPT_SETBITS, COF_ADDR, &coargs.flags,
         NULL) != argc) {
         return (DCMD_USAGE);
@@ -1347,6 +1388,11 @@ callout(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
         return (DCMD_USAGE);
     }
 
+    if ((coargs.flags & COF_HEAP) && (coargs.flags & COF_QUEUE)) {
+        mdb_printf("-H and -Q are mutually exclusive\n");
+        return (DCMD_USAGE);
+    }
+
     if (funcname != NULL) {
         GElf_Sym sym;
 
@@ -1552,6 +1598,8 @@ callout_help(void)
     " -F : walk free callout list (free list with -i) instead\n"
     " -v : display more info for each item\n"
     " -V : show details of each level of info as it is traversed\n"
+    " -H : limit display to callouts in the callout heap\n"
+    " -Q : limit display to callouts in the callout queue\n"
     " -A : show only addresses. Useful for pipelines.\n");
 }
diff --git a/usr/src/uts/common/os/callout.c b/usr/src/uts/common/os/callout.c
index 7ae72da94c..0421dc94d1 100644
--- a/usr/src/uts/common/os/callout.c
+++ b/usr/src/uts/common/os/callout.c
@@ -42,6 +42,7 @@
  */
 static int callout_threads;                 /* callout normal threads */
 static hrtime_t callout_debug_hrtime;       /* debugger entry time */
+static int callout_chunk;                   /* callout heap chunk size */
 static int callout_min_reap;                /* callout minimum reap count */
 static int callout_tolerance;               /* callout hires tolerance */
 static callout_table_t *callout_boot_ct;    /* Boot CPU's callout tables */
@@ -170,6 +171,15 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
 #define CALLOUT_LIST_DELETE(hash, cl)   \
     CALLOUT_HASH_DELETE(hash, cl, cl_next, cl_prev)
 
+#define CALLOUT_LIST_BEFORE(cl, nextcl) \
+{ \
+    (cl)->cl_prev = (nextcl)->cl_prev; \
+    (cl)->cl_next = (nextcl); \
+    (nextcl)->cl_prev = (cl); \
+    if (cl->cl_prev != NULL) \
+        cl->cl_prev->cl_next = cl; \
+}
+
 /*
  * For normal callouts, there is a deadlock scenario if two callouts that
  * have an inter-dependency end up on the same callout list. To break the
@@ -179,7 +189,7 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
  * necessary (sigh).
  */
 #define CALLOUT_THRESHOLD   100000000
-#define CALLOUT_EXEC_COMPUTE(ct, exec) \
+#define CALLOUT_EXEC_COMPUTE(ct, nextexp, exec) \
 { \
     callout_list_t *cl; \
 \
@@ -197,14 +207,10 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
         * only one callout, there is no need for two threads. \
         */ \
         exec = 1; \
-    } else if ((ct->ct_heap_num == 0) || \
-        (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
+    } else if ((nextexp) > (gethrtime() + CALLOUT_THRESHOLD)) { \
         /* \
-        * If the heap has become empty, we need two threads as \
-        * there is no one to kick off the second thread in the \
-        * future. If the heap is not empty and the top of the \
-        * heap does not expire in the near future, we need two \
-        * threads. \
+        * If the next expiration of the cyclic is way out into \
+        * the future, we need two threads. \
         */ \
         exec = 2; \
     } else { \
@@ -240,6 +246,16 @@ static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
 }
 
 /*
+ * Macro to free a callout.
+ */
+#define CALLOUT_FREE(ct, cl) \
+{ \
+    cp->c_idnext = ct->ct_free; \
+    ct->ct_free = cp; \
+    cp->c_xid |= CALLOUT_ID_FREE; \
+}
+
+/*
  * Allocate a callout structure. We try quite hard because we
  * can't sleep, and if we can't do the allocation, we're toast.
  * Failing all, we try a KM_PANIC allocation. Note that we never
@@ -333,6 +349,164 @@ callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
 }
 
 /*
+ * Add a new callout list into a callout table's queue in sorted order by
+ * expiration.
+ */
+static int
+callout_queue_add(callout_table_t *ct, callout_list_t *cl)
+{
+    callout_list_t *nextcl;
+    hrtime_t expiration;
+
+    expiration = cl->cl_expiration;
+    nextcl = ct->ct_queue.ch_head;
+    if ((nextcl == NULL) || (expiration < nextcl->cl_expiration)) {
+        CALLOUT_LIST_INSERT(ct->ct_queue, cl);
+        return (1);
+    }
+
+    while (nextcl != NULL) {
+        if (expiration < nextcl->cl_expiration) {
+            CALLOUT_LIST_BEFORE(cl, nextcl);
+            return (0);
+        }
+        nextcl = nextcl->cl_next;
+    }
+    CALLOUT_LIST_APPEND(ct->ct_queue, cl);
+
+    return (0);
+}
+
+/*
+ * Insert a callout list into a callout table's queue and reprogram the queue
+ * cyclic if needed.
+ */
+static void
+callout_queue_insert(callout_table_t *ct, callout_list_t *cl)
+{
+    cl->cl_flags |= CALLOUT_LIST_FLAG_QUEUED;
+
+    /*
+     * Add the callout to the callout queue. If it ends up at the head,
+     * the cyclic needs to be reprogrammed as we have an earlier
+     * expiration.
+     *
+     * Also, during the CPR suspend phase, do not reprogram the cyclic.
+     * We don't want any callout activity. When the CPR resume phase is
+     * entered, the cyclic will be programmed for the earliest expiration
+     * in the queue.
+     */
+    if (callout_queue_add(ct, cl) && (ct->ct_suspend == 0))
+        (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
+}
+
+/*
+ * Delete and handle all past expirations in a callout table's queue.
+ */
+static hrtime_t
+callout_queue_delete(callout_table_t *ct)
+{
+    callout_list_t *cl;
+    hrtime_t now;
+
+    ASSERT(MUTEX_HELD(&ct->ct_mutex));
+
+    now = gethrtime();
+    while ((cl = ct->ct_queue.ch_head) != NULL) {
+        if (cl->cl_expiration > now)
+            break;
+        cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
+        CALLOUT_LIST_DELETE(ct->ct_queue, cl);
+        CALLOUT_LIST_APPEND(ct->ct_expired, cl);
+    }
+
+    /*
+     * If this callout queue is empty or callouts have been suspended,
+     * just return.
+     */
+    if ((cl == NULL) || (ct->ct_suspend > 0))
+        return (CY_INFINITY);
+
+    (void) cyclic_reprogram(ct->ct_qcyclic, cl->cl_expiration);
+
+    return (cl->cl_expiration);
+}
+
+static hrtime_t
+callout_queue_process(callout_table_t *ct, hrtime_t delta, int timechange)
+{
+    callout_list_t *firstcl, *cl;
+    hrtime_t expiration, now;
+    int clflags;
+    callout_hash_t temp;
+
+    ASSERT(MUTEX_HELD(&ct->ct_mutex));
+
+    firstcl = ct->ct_queue.ch_head;
+    if (firstcl == NULL)
+        return (CY_INFINITY);
+
+    /*
+     * We walk the callout queue. If we encounter a hrestime entry that
+     * must be removed, we clean it out. Otherwise, we apply any
+     * adjustments needed to it. Because of the latter, we need to
+     * recreate the list as we go along.
+     */
+    temp = ct->ct_queue;
+    ct->ct_queue.ch_head = NULL;
+    ct->ct_queue.ch_tail = NULL;
+
+    clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
+    now = gethrtime();
+    while ((cl = temp.ch_head) != NULL) {
+        CALLOUT_LIST_DELETE(temp, cl);
+
+        /*
+         * Delete the callout and expire it, if one of the following
+         * is true:
+         *      - the callout has expired
+         *      - the callout is an absolute hrestime one and
+         *        there has been a system time change
+         */
+        if ((cl->cl_expiration <= now) ||
+            (timechange && ((cl->cl_flags & clflags) == clflags))) {
+            cl->cl_flags &= ~CALLOUT_LIST_FLAG_QUEUED;
+            CALLOUT_LIST_APPEND(ct->ct_expired, cl);
+            continue;
+        }
+
+        /*
+         * Apply adjustments, if any. Adjustments are applied after
+         * the system returns from KMDB or OBP. They are only applied
+         * to relative callout lists.
+         */
+        if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
+            expiration = cl->cl_expiration + delta;
+            if (expiration <= 0)
+                expiration = CY_INFINITY;
+            cl->cl_expiration = expiration;
+        }
+
+        (void) callout_queue_add(ct, cl);
+    }
+
+    /*
+     * We need to return the expiration to help program the cyclic.
+     * If there are expired callouts, the cyclic needs to go off
+     * immediately. If the queue has become empty, then we return infinity.
+     * Else, we return the expiration of the earliest callout in the queue.
+     */
+    if (ct->ct_expired.ch_head != NULL)
+        return (gethrtime());
+
+    cl = ct->ct_queue.ch_head;
+    if (cl == NULL)
+        return (CY_INFINITY);
+
+    return (cl->cl_expiration);
+}
+
+/*
  * Initialize a callout table's heap, if necessary. Preallocate some free
  * entries so we don't have to check for NULL elsewhere.
  */
@@ -345,17 +519,16 @@ callout_heap_init(callout_table_t *ct)
     ASSERT(ct->ct_heap == NULL);
 
     ct->ct_heap_num = 0;
-    ct->ct_heap_max = CALLOUT_CHUNK;
-    size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
+    ct->ct_heap_max = callout_chunk;
+    size = sizeof (callout_heap_t) * callout_chunk;
     ct->ct_heap = kmem_alloc(size, KM_SLEEP);
 }
 
 /*
- * Reallocate the heap. We try quite hard because we can't sleep, and if
- * we can't do the allocation, we're toast. Failing all, we try a KM_PANIC
- * allocation. Note that the heap only expands, it never contracts.
+ * Reallocate the heap. Return 0 if the heap is still full at the end of it.
+ * Return 1 otherwise. Note that the heap only expands, it never contracts.
  */
-static void
+static int
 callout_heap_expand(callout_table_t *ct)
 {
     size_t max, size, osize;
@@ -369,10 +542,25 @@ callout_heap_expand(callout_table_t *ct)
         mutex_exit(&ct->ct_mutex);
 
         osize = sizeof (callout_heap_t) * max;
-        size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
-        heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
+        size = sizeof (callout_heap_t) * (max + callout_chunk);
+        heap = kmem_alloc(size, KM_NOSLEEP);
 
         mutex_enter(&ct->ct_mutex);
+        if (heap == NULL) {
+            /*
+             * We could not allocate memory. If we can free up
+             * some entries, that would be great.
+             */
+            if (ct->ct_nreap > 0)
+                (void) callout_heap_process(ct, 0, 0);
+            /*
+             * If we still have no space in the heap, inform the
+             * caller.
+             */
+            if (ct->ct_heap_num == ct->ct_heap_max)
+                return (0);
+            return (1);
+        }
         if (max < ct->ct_heap_max) {
             /*
             * Someone beat us to the allocation. Free what we
@@ -387,6 +575,8 @@ callout_heap_expand(callout_table_t *ct)
         ct->ct_heap = heap;
         ct->ct_heap_max = size / sizeof (callout_heap_t);
     }
+
+    return (1);
 }
 
 /*
@@ -448,6 +638,7 @@ callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
     ASSERT(MUTEX_HELD(&ct->ct_mutex));
     ASSERT(ct->ct_heap_num < ct->ct_heap_max);
 
+    cl->cl_flags |= CALLOUT_LIST_FLAG_HEAPED;
     /*
     * First, copy the expiration and callout list pointer to the bottom
     * of the heap.
@@ -553,7 +744,7 @@ comp_left:
 /*
  * Delete and handle all past expirations in a callout table's heap.
  */
-static void
+static hrtime_t
 callout_heap_delete(callout_table_t *ct)
 {
     hrtime_t now, expiration, next;
@@ -601,6 +792,7 @@ callout_heap_delete(callout_table_t *ct)
         * list of expired callout lists. It will be processed
         * by the callout executor.
         */
+        cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
         CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
         CALLOUT_LIST_APPEND(ct->ct_expired, cl);
     }
@@ -622,7 +814,7 @@ callout_heap_delete(callout_table_t *ct)
     * infinity by the cyclic subsystem.
     */
     if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
-        return;
+        return (CY_INFINITY);
 
     /*
     * If the top expirations are within callout_tolerance of each other,
@@ -638,6 +830,8 @@ callout_heap_delete(callout_table_t *ct)
     }
 
     (void) cyclic_reprogram(ct->ct_cyclic, expiration);
+
+    return (expiration);
 }
 
 /*
@@ -664,21 +858,20 @@ static hrtime_t
 callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
 {
     callout_heap_t *heap;
-    callout_list_t *cl, *rootcl;
+    callout_list_t *cl;
     hrtime_t expiration, now;
-    int i, hash, clflags, expired;
+    int i, hash, clflags;
     ulong_t num;
 
     ASSERT(MUTEX_HELD(&ct->ct_mutex));
 
     if (ct->ct_heap_num == 0)
-        return (0);
+        return (CY_INFINITY);
 
     if (ct->ct_nreap > 0)
        ct->ct_cleanups++;
 
     heap = ct->ct_heap;
-    rootcl = heap->ch_list;
 
     /*
     * We walk the heap from the top to the bottom. If we encounter
@@ -700,7 +893,6 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
     ct->ct_heap_num = 0;
     clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
     now = gethrtime();
-    expired = 0;
     for (i = 0; i < num; i++) {
         cl = heap[i].ch_list;
         /*
@@ -724,9 +916,9 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
         if ((cl->cl_expiration <= now) ||
             (timechange && ((cl->cl_flags & clflags) == clflags))) {
             hash = CALLOUT_CLHASH(cl->cl_expiration);
+            cl->cl_flags &= ~CALLOUT_LIST_FLAG_HEAPED;
             CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
             CALLOUT_LIST_APPEND(ct->ct_expired, cl);
-            expired = 1;
             continue;
         }
 
@@ -758,16 +950,19 @@ callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
 
     ct->ct_nreap = 0;
 
-    if (expired)
-        expiration = gethrtime();
-    else if (ct->ct_heap_num == 0)
-        expiration = CY_INFINITY;
-    else if (rootcl != heap->ch_list)
-        expiration = heap->ch_expiration;
-    else
-        expiration = 0;
+    /*
+     * We need to return the expiration to help program the cyclic.
+     * If there are expired callouts, the cyclic needs to go off
+     * immediately. If the heap has become empty, then we return infinity.
+     * Else, return the expiration of the earliest callout in the heap.
+     */
+    if (ct->ct_expired.ch_head != NULL)
+        return (gethrtime());
 
-    return (expiration);
+    if (ct->ct_heap_num == 0)
+        return (CY_INFINITY);
+
+    return (heap->ch_expiration);
 }
 
 /*
@@ -788,7 +983,7 @@ timeout_generic(int type, void (*func)(void *), void *arg,
     callout_t *cp;
     callout_id_t id;
     callout_list_t *cl;
-    hrtime_t now, interval, rexpiration;
+    hrtime_t now, interval;
     int hash, clflags;
 
     ASSERT(resolution > 0);
@@ -829,11 +1024,11 @@ timeout_generic(int type, void (*func)(void *), void *arg,
     if (CALLOUT_CLEANUP(ct)) {
         /*
         * There are too many heap elements pointing to empty callout
-        * lists. Clean them out.
+        * lists. Clean them out. Since cleanup is only done once
+        * in a while, no need to reprogram the cyclic if the root
+        * of the heap gets cleaned out.
         */
-        rexpiration = callout_heap_process(ct, 0, 0);
-        if ((rexpiration != 0) && (ct->ct_suspend == 0))
-            (void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
+        (void) callout_heap_process(ct, 0, 0);
     }
 
     if ((cp = ct->ct_free) == NULL)
@@ -911,23 +1106,6 @@ again:
     cl = callout_list_get(ct, expiration, clflags, hash);
     if (cl == NULL) {
         /*
-        * Check if we have enough space in the heap to insert one
-        * expiration. If not, expand the heap.
-        */
-        if (ct->ct_heap_num == ct->ct_heap_max) {
-            callout_heap_expand(ct);
-            /*
-             * In the above call, we drop the lock, allocate and
-             * reacquire the lock. So, we could have been away
-             * for a while. In the meantime, someone could have
-             * inserted a callout list with the same expiration.
-             * So, the best course is to repeat the steps. This
-             * should be an infrequent event.
-             */
-            goto again;
-        }
-
-        /*
         * Check the free list. If we don't find one, we have to
         * take the slow path and allocate from kmem.
         */
@@ -948,6 +1126,30 @@ again:
     cl->cl_expiration = expiration;
     cl->cl_flags = clflags;
 
+    /*
+     * Check if we have enough space in the heap to insert one
+     * expiration. If not, expand the heap.
+     */
+    if (ct->ct_heap_num == ct->ct_heap_max) {
+        if (callout_heap_expand(ct) == 0) {
+            /*
+             * Could not expand the heap. Just queue it.
+             */
+            callout_queue_insert(ct, cl);
+            goto out;
+        }
+
+        /*
+         * In the above call, we drop the lock, allocate and
+         * reacquire the lock. So, we could have been away
+         * for a while. In the meantime, someone could have
+         * inserted a callout list with the same expiration.
+         * But we will not go back and check for it as this
+         * should be a really infrequent event. There is no
+         * point.
+         */
+    }
+
     if (clflags & CALLOUT_LIST_FLAG_NANO) {
         CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
     } else {
@@ -969,6 +1171,7 @@ again:
         if (cl->cl_callouts.ch_head == NULL)
             ct->ct_nreap--;
     }
+out:
     cp->c_list = cl;
     CALLOUT_APPEND(ct, cp);
 
@@ -1077,7 +1280,7 @@ untimeout_generic(callout_id_t id, int nowait)
     callout_t *cp;
     callout_id_t xid;
     callout_list_t *cl;
-    int hash;
+    int hash, flags;
     callout_id_t bogus;
 
     ct = &callout_table[CALLOUT_ID_TO_TABLE(id)];
@@ -1113,19 +1316,30 @@ untimeout_generic(callout_id_t id, int nowait)
             cl = cp->c_list;
             expiration = cl->cl_expiration;
             CALLOUT_DELETE(ct, cp);
-            cp->c_idnext = ct->ct_free;
-            ct->ct_free = cp;
-            cp->c_xid |= CALLOUT_FREE;
+            CALLOUT_FREE(ct, cp);
             ct->ct_untimeouts_unexpired++;
             ct->ct_timeouts_pending--;
 
             /*
-             * If the callout list has become empty, it needs
-             * to be cleaned along with its heap entry. Increment
-             * a reap count.
+             * If the callout list has become empty, there are 3
+             * possibilities. If it is present:
+             *      - in the heap, it needs to be cleaned along
+             *        with its heap entry. Increment a reap count.
+             *      - in the callout queue, free it.
+             *      - in the expired list, free it.
             */
-            if (cl->cl_callouts.ch_head == NULL)
-                ct->ct_nreap++;
+            if (cl->cl_callouts.ch_head == NULL) {
+                flags = cl->cl_flags;
+                if (flags & CALLOUT_LIST_FLAG_HEAPED) {
+                    ct->ct_nreap++;
+                } else if (flags & CALLOUT_LIST_FLAG_QUEUED) {
+                    CALLOUT_LIST_DELETE(ct->ct_queue, cl);
+                    CALLOUT_LIST_FREE(ct, cl);
+                } else {
+                    CALLOUT_LIST_DELETE(ct->ct_expired, cl);
+                    CALLOUT_LIST_FREE(ct, cl);
+                }
+            }
 
             mutex_exit(&ct->ct_mutex);
             expiration -= gethrtime();
@@ -1282,9 +1496,7 @@ callout_list_expire(callout_table_t *ct, callout_list_t *cl)
         * cares that we're done.
         */
         CALLOUT_DELETE(ct, cp);
-        cp->c_idnext = ct->ct_free;
-        ct->ct_free = cp;
-        cp->c_xid |= CALLOUT_FREE;
+        CALLOUT_FREE(ct, cp);
 
         if (cp->c_waiting) {
             cp->c_waiting = 0;
@@ -1339,13 +1551,22 @@ callout_expire(callout_table_t *ct)
  */
 
 /*
- * Realtime callout cyclic handler.
+ * Realtime callout cyclic handlers.
  */
 void
 callout_realtime(callout_table_t *ct)
 {
     mutex_enter(&ct->ct_mutex);
-    callout_heap_delete(ct);
+    (void) callout_heap_delete(ct);
+    callout_expire(ct);
+    mutex_exit(&ct->ct_mutex);
+}
+
+void
+callout_queue_realtime(callout_table_t *ct)
+{
+    mutex_enter(&ct->ct_mutex);
+    (void) callout_queue_delete(ct);
     callout_expire(ct);
     mutex_exit(&ct->ct_mutex);
 }
@@ -1359,16 +1580,35 @@ callout_execute(callout_table_t *ct)
 }
 
 /*
- * Normal callout cyclic handler.
+ * Normal callout cyclic handlers.
  */
 void
 callout_normal(callout_table_t *ct)
 {
     int i, exec;
+    hrtime_t exp;
 
     mutex_enter(&ct->ct_mutex);
-    callout_heap_delete(ct);
-    CALLOUT_EXEC_COMPUTE(ct, exec);
+    exp = callout_heap_delete(ct);
+    CALLOUT_EXEC_COMPUTE(ct, exp, exec);
+    mutex_exit(&ct->ct_mutex);
+
+    for (i = 0; i < exec; i++) {
+        ASSERT(ct->ct_taskq != NULL);
+        (void) taskq_dispatch(ct->ct_taskq,
+            (task_func_t *)callout_execute, ct, TQ_NOSLEEP);
+    }
+}
+
+void
+callout_queue_normal(callout_table_t *ct)
+{
+    int i, exec;
+    hrtime_t exp;
+
+    mutex_enter(&ct->ct_mutex);
+    exp = callout_queue_delete(ct);
+    CALLOUT_EXEC_COMPUTE(ct, exp, exec);
     mutex_exit(&ct->ct_mutex);
 
     for (i = 0; i < exec; i++) {
@@ -1405,9 +1645,12 @@ callout_suspend(void)
             mutex_exit(&ct->ct_mutex);
             continue;
         }
-        if (ct->ct_suspend == 1)
+        if (ct->ct_suspend == 1) {
             (void) cyclic_reprogram(ct->ct_cyclic,
                 CY_INFINITY);
+            (void) cyclic_reprogram(ct->ct_qcyclic,
+                CY_INFINITY);
+        }
 
         mutex_exit(&ct->ct_mutex);
     }
@@ -1419,7 +1662,7 @@ callout_suspend(void)
 static void
 callout_resume(hrtime_t delta, int timechange)
 {
-    hrtime_t exp;
+    hrtime_t hexp, qexp;
     int t, f;
     callout_table_t *ct;
 
@@ -1446,24 +1689,13 @@ callout_resume(hrtime_t delta, int timechange)
         * out any empty callout lists that might happen to
         * be there.
         */
-        (void) callout_heap_process(ct, delta, timechange);
+        hexp = callout_heap_process(ct, delta, timechange);
+        qexp = callout_queue_process(ct, delta, timechange);
 
         ct->ct_suspend--;
         if (ct->ct_suspend == 0) {
-            /*
-             * If the expired list is non-empty, then have
-             * the cyclic expire immediately. Else, program
-             * the cyclic based on the heap.
-             */
-            if (ct->ct_expired.ch_head != NULL)
-                exp = gethrtime();
-            else if (ct->ct_heap_num > 0)
-                exp = ct->ct_heap[0].ch_expiration;
-            else
-                exp = 0;
-            if (exp != 0)
-                (void) cyclic_reprogram(ct->ct_cyclic,
-                    exp);
+            (void) cyclic_reprogram(ct->ct_cyclic, hexp);
+            (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
         }
 
         mutex_exit(&ct->ct_mutex);
@@ -1524,10 +1756,10 @@ callout_debug_callb(void *arg, int code)
 static void
 callout_hrestime_one(callout_table_t *ct)
 {
-    hrtime_t expiration;
+    hrtime_t hexp, qexp;
 
     mutex_enter(&ct->ct_mutex);
-    if (ct->ct_heap_num == 0) {
+    if (ct->ct_cyclic == CYCLIC_NONE) {
         mutex_exit(&ct->ct_mutex);
         return;
     }
@@ -1535,10 +1767,13 @@ callout_hrestime_one(callout_table_t *ct)
     /*
     * Walk the heap and process all the absolute hrestime entries.
     */
-    expiration = callout_heap_process(ct, 0, 1);
+    hexp = callout_heap_process(ct, 0, 1);
+    qexp = callout_queue_process(ct, 0, 1);
 
-    if ((expiration != 0) && (ct->ct_suspend == 0))
-        (void) cyclic_reprogram(ct->ct_cyclic, expiration);
+    if (ct->ct_suspend == 0) {
+        (void) cyclic_reprogram(ct->ct_cyclic, hexp);
+        (void) cyclic_reprogram(ct->ct_qcyclic, qexp);
+    }
 
     mutex_exit(&ct->ct_mutex);
 }
@@ -1623,11 +1858,11 @@ callout_cyclic_init(callout_table_t *ct)
     cyc_time_t when;
     processorid_t seqid;
     int t;
-    cyclic_id_t cyclic;
+    cyclic_id_t cyclic, qcyclic;
 
     ASSERT(MUTEX_HELD(&ct->ct_mutex));
 
-    t = CALLOUT_TABLE_TYPE(ct);
+    t = ct->ct_type;
     seqid = CALLOUT_TABLE_SEQID(ct);
 
     /*
@@ -1684,19 +1919,29 @@ callout_cyclic_init(callout_table_t *ct)
     */
     ASSERT(ct->ct_cyclic == CYCLIC_NONE);
 
-    hdlr.cyh_func = (cyc_func_t)CALLOUT_CYCLIC_HANDLER(t);
-    if (ct->ct_type == CALLOUT_REALTIME)
+    if (t == CALLOUT_REALTIME) {
         hdlr.cyh_level = callout_realtime_level;
-    else
+        hdlr.cyh_func = (cyc_func_t)callout_realtime;
+    } else {
         hdlr.cyh_level = callout_normal_level;
+        hdlr.cyh_func = (cyc_func_t)callout_normal;
+    }
     hdlr.cyh_arg = ct;
     when.cyt_when = CY_INFINITY;
     when.cyt_interval = CY_INFINITY;
 
     cyclic = cyclic_add(&hdlr, &when);
+
+    if (t == CALLOUT_REALTIME)
+        hdlr.cyh_func = (cyc_func_t)callout_queue_realtime;
+    else
+        hdlr.cyh_func = (cyc_func_t)callout_queue_normal;
+
+    qcyclic = cyclic_add(&hdlr, &when);
+
     mutex_enter(&ct->ct_mutex);
     ct->ct_cyclic = cyclic;
+    ct->ct_qcyclic = qcyclic;
 }
 
 void
@@ -1768,9 +2013,10 @@ callout_cpu_online(cpu_t *cp)
         mutex_exit(&ct->ct_mutex);
 
         /*
-         * Move the cyclic to this CPU by doing a bind.
+         * Move the cyclics to this CPU by doing a bind.
         */
         cyclic_bind(ct->ct_cyclic, cp, NULL);
+        cyclic_bind(ct->ct_qcyclic, cp, NULL);
     }
 }
@@ -1789,10 +2035,11 @@ callout_cpu_offline(cpu_t *cp)
         ct = &callout_table[CALLOUT_TABLE(t, seqid)];
 
         /*
-         * Unbind the cyclic. This will allow the cyclic subsystem
-         * to juggle the cyclic during CPU offline.
+         * Unbind the cyclics. This will allow the cyclic subsystem
+         * to juggle the cyclics during CPU offline.
         */
         cyclic_bind(ct->ct_cyclic, NULL, NULL);
+        cyclic_bind(ct->ct_qcyclic, NULL, NULL);
     }
 }
@@ -1804,6 +2051,22 @@ void
 callout_mp_init(void)
 {
     cpu_t *cp;
+    size_t min, max;
+
+    if (callout_chunk == CALLOUT_CHUNK) {
+        /*
+         * No one has specified a chunk in /etc/system. We need to
+         * compute it here based on the number of online CPUs and
+         * available physical memory.
+         */
+        min = CALLOUT_MIN_HEAP_SIZE;
+        max = ptob(physmem) / CALLOUT_MEM_FRACTION;
+        if (min > max)
+            min = max;
+        callout_chunk = min / sizeof (callout_heap_t);
+        callout_chunk /= ncpus_online;
+        callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
+    }
 
     mutex_enter(&cpu_lock);
 
@@ -1846,6 +2109,10 @@ callout_init(void)
         callout_tolerance = CALLOUT_TOLERANCE;
     if (callout_threads <= 0)
         callout_threads = CALLOUT_THREADS;
+    if (callout_chunk <= 0)
+        callout_chunk = CALLOUT_CHUNK;
+    else
+        callout_chunk = P2ROUNDUP(callout_chunk, CALLOUT_CHUNK);
 
     /*
     * Allocate all the callout tables based on max_ncpus. We have chosen
@@ -1893,12 +2160,13 @@ callout_init(void)
             */
             ct->ct_gen_id = CALLOUT_SHORT_ID(table_id);
             /*
-             * Initialize the cyclic as NONE. This will get set
+             * Initialize the cyclics as NONE. This will get set
             * during CPU online. This is so that partially
             * populated systems will only have the required
            * number of cyclics, not more.
            */
            ct->ct_cyclic = CYCLIC_NONE;
+           ct->ct_qcyclic = CYCLIC_NONE;
            ct->ct_kstat_data = kmem_zalloc(size, KM_SLEEP);
        }
    }
diff --git a/usr/src/uts/common/sys/callo.h b/usr/src/uts/common/sys/callo.h
index 6a464f9dd9..8ffc38ada6 100644
--- a/usr/src/uts/common/sys/callo.h
+++ b/usr/src/uts/common/sys/callo.h
@@ -23,7 +23,7 @@
 
 /*
- * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -138,9 +138,9 @@ typedef struct callout {
  * This bit represents the callout (table) type. Each CPU has one realtime
  * and one normal callout table.
  */
-#define CALLOUT_FREE            0x8000000000000000ULL
+#define CALLOUT_ID_FREE         0x8000000000000000ULL
 #define CALLOUT_EXECUTING       0x4000000000000000ULL
-#define CALLOUT_ID_FLAGS        (CALLOUT_FREE | CALLOUT_EXECUTING)
+#define CALLOUT_ID_FLAGS        (CALLOUT_ID_FREE | CALLOUT_EXECUTING)
 #define CALLOUT_ID_MASK         ~CALLOUT_ID_FLAGS
 #define CALLOUT_GENERATION_LOW  0x100000000ULL
 #define CALLOUT_LONGTERM        0x80000000
@@ -151,7 +151,6 @@ typedef struct callout {
 #define CALLOUT_COUNTER_SHIFT   callout_table_bits
 #define CALLOUT_TABLE(t, f)     (((f) << CALLOUT_TYPE_BITS) | (t))
 #define CALLOUT_TABLE_NUM(ct)   ((ct) - callout_table)
-#define CALLOUT_TABLE_TYPE(ct)  (CALLOUT_TABLE_NUM(ct) & CALLOUT_TYPE_MASK)
 #define CALLOUT_TABLE_SEQID(ct) (CALLOUT_TABLE_NUM(ct) >> CALLOUT_TYPE_BITS)
 
 /*
@@ -227,11 +226,17 @@ typedef struct callout_hash {
  *      Callout list contains hrestime timers.
  * CALLOUT_LIST_FLAG_NANO
  *      Callout list contains 1-nanosecond resolution callouts.
+ * CALLOUT_LIST_FLAG_HEAPED
+ *      Callout list is present in the callout heap.
+ * CALLOUT_LIST_FLAG_QUEUED
+ *      Callout list is present in the callout queue.
  */
 #define CALLOUT_LIST_FLAG_FREE          0x1
 #define CALLOUT_LIST_FLAG_ABSOLUTE      0x2
 #define CALLOUT_LIST_FLAG_HRESTIME      0x4
 #define CALLOUT_LIST_FLAG_NANO          0x8
+#define CALLOUT_LIST_FLAG_HEAPED        0x10
+#define CALLOUT_LIST_FLAG_QUEUED        0x20
 
 struct callout_list {
     callout_list_t  *cl_next;   /* next in clhash */
@@ -248,6 +253,9 @@ struct callout_list {
  * callout list pointer in the heap element, we have to always remove
  * a heap element and its callout list together. We cannot remove one
  * without the other.
+ *
+ * This structure's size must be a power of two because we want an
+ * integral number of these to fit into a page.
  */
 typedef struct callout_heap {
     hrtime_t    ch_expiration;
@@ -362,11 +370,16 @@ typedef struct callout_table {
     taskq_t *ct_taskq;          /* taskq to execute normal callouts */
     kstat_t *ct_kstats;         /* callout kstats */
     int ct_nreap;               /* # heap entries that need reaping */
-#ifdef _LP64
-    char ct_pad[28];            /* cache alignment */
-#else
-    char ct_pad[24];            /* cache alignment */
+    cyclic_id_t ct_qcyclic;     /* cyclic for the callout queue */
+    callout_hash_t ct_queue;    /* overflow queue of callouts */
+#ifndef _LP64
+    char ct_pad[12];            /* cache alignment */
 #endif
+    /*
+     * This structure should be aligned to a 64-byte (cache-line)
+     * boundary. Make sure the padding is right for 32-bit as well
+     * as 64-bit kernels.
+     */
 } callout_table_t;
 
 /*
@@ -389,15 +402,28 @@ typedef struct callout_table {
 #define ct_cleanups \
     ct_kstat_data[CALLOUT_CLEANUPS].value.ui64
 
-#define CALLOUT_CHUNK   128
+/*
+ * CALLOUT_CHUNK is the minimum initial size of each heap, and the amount
+ * by which a full heap is expanded to make room for new entries.
+ */
+#define CALLOUT_CHUNK   (PAGESIZE / sizeof (callout_heap_t))
+
+/*
+ * CALLOUT_MIN_HEAP_SIZE defines the minimum size for the callout heap for
+ * the whole system.
+ */
+#define CALLOUT_MIN_HEAP_SIZE   (64 * 1024 * sizeof (callout_heap_t))
+
+/*
+ * CALLOUT_MEM_FRACTION defines the fraction of available physical memory that
+ * can be allocated towards the callout heap for the whole system.
+ */
+#define CALLOUT_MEM_FRACTION    4096
 
 #define CALLOUT_HEAP_PARENT(index)  (((index) - 1) >> 1)
 #define CALLOUT_HEAP_RIGHT(index)   (((index) + 1) << 1)
 #define CALLOUT_HEAP_LEFT(index)    ((((index) + 1) << 1) - 1)
 
-#define CALLOUT_CYCLIC_HANDLER(t)   \
-    ((t == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
-
 #define CALLOUT_TCP_RESOLUTION  10000000ULL
 
 #define CALLOUT_ALIGN   64  /* cache line size */
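A worked example of the new auto-sizing, since the macros above only hint at the magnitudes. Every number here is an illustrative assumption, not a value from the commit: 8K pages, 16 GB of physical memory, 32 online CPUs, and a 16-byte callout_heap_t (one hrtime_t plus one pointer on a 64-bit kernel). Under those assumptions, the callout_mp_init() arithmetic goes:

#include <stddef.h>
#include <stdio.h>

/* illumos's power-of-two round-up; 'align' must be a power of two */
#define P2ROUNDUP(x, align) (-(-(x) & -(align)))

int
main(void)
{
    size_t pagesize = 8192;                 /* assumed PAGESIZE */
    size_t entsize = 16;                    /* assumed sizeof (callout_heap_t) */
    size_t chunk = pagesize / entsize;      /* CALLOUT_CHUNK: 512 entries */
    size_t minbytes = 64 * 1024 * entsize;  /* CALLOUT_MIN_HEAP_SIZE: 1 MB */
    size_t membytes = 16ULL << 30;          /* assumed ptob(physmem): 16 GB */
    size_t maxbytes = membytes / 4096;      /* CALLOUT_MEM_FRACTION: 4 MB */
    size_t ncpus = 32;                      /* assumed ncpus_online */
    size_t callout_chunk;

    if (minbytes > maxbytes)
        minbytes = maxbytes;                /* not hit here: 1 MB < 4 MB */
    callout_chunk = minbytes / entsize;     /* 65536 heap entries */
    callout_chunk /= ncpus;                 /* 2048 entries per CPU */
    callout_chunk = P2ROUNDUP(callout_chunk, chunk);
    printf("callout_chunk = %zu entries (%zu bytes)\n",
        callout_chunk, callout_chunk * entsize);    /* 2048 entries, 32 KB */
    return (0);
}

So on such a machine each table's heap starts at, and grows by, 2048 entries (32 KB) at a time, against the old fixed CALLOUT_CHUNK of 128; an administrator can still force a value via callout_chunk in /etc/system, which callout_init() rounds up to a multiple of CALLOUT_CHUNK.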
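Finally, the reshaped expansion policy as a standalone sketch, under the same caveat: simplified stand-in types, malloc() in place of kmem_alloc(..., KM_NOSLEEP), a stub reap in place of callout_heap_process(), and none of the kernel's locking. What it preserves is the new contract of callout_heap_expand(): try to grow without sleeping, reap empty entries when the allocation fails, and tell the caller whether there is room rather than falling back to KM_PANIC.

#include <stdlib.h>
#include <string.h>

typedef struct heap {
    void    *h_ents;        /* array of heap entries */
    size_t  h_num;          /* entries in use */
    size_t  h_max;          /* entries allocated */
    size_t  h_entsize;      /* size of one entry */
} heap_t;

/* Stand-in for the callout_heap_process() reap pass; a real one frees slots. */
static void
heap_reap(heap_t *hp)
{
    (void) hp;
}

/*
 * Grow the heap by 'chunk' entries without sleeping. Returns 1 if the
 * heap has room for at least one more entry afterwards, 0 if it is
 * still full -- in which case the caller queues the timeout instead.
 */
static int
heap_expand(heap_t *hp, size_t chunk)
{
    size_t nmax = hp->h_max + chunk;
    void *ents = malloc(nmax * hp->h_entsize);  /* KM_NOSLEEP analogue */

    if (ents == NULL) {
        heap_reap(hp);                          /* try to free some slots */
        return (hp->h_num < hp->h_max);         /* report honestly */
    }
    if (hp->h_num > 0)
        memcpy(ents, hp->h_ents, hp->h_num * hp->h_entsize);
    free(hp->h_ents);
    hp->h_ents = ents;
    hp->h_max = nmax;
    return (1);
}

A zero return is exactly the signal timeout_generic() now acts on: it calls callout_queue_insert() and jumps to the common "out" path, so the timeout is still honored — later, from the queue cyclic — instead of the whole system panicking for want of a small allocation.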