diff options
author | Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM> | 2009-04-10 07:14:10 -0700 |
---|---|---|
committer | Madhavan Venkataraman <Madhavan.Venkataraman@Sun.COM> | 2009-04-10 07:14:10 -0700 |
commit | 51b32bdd07bc63a6e416c5759f0f445147703107 (patch) | |
tree | 649c9f6f9d9ac24c56450f908276b1f83d1c4ff0 | |
parent | 845e9415a97ec0124f099537b21fc0364883850f (diff) | |
download | illumos-joyent-51b32bdd07bc63a6e416c5759f0f445147703107.tar.gz |
6789031 High resolution timers needed for time-sensitive applications
6822357 assertion failed: expiration > 0, file: ../../common/os/cyclic.c, line: 3048
6827248 Empty callout lists need to be cleaned up more proactively
6827371 Solaris must support absolute and relative timers at the callout level
-rw-r--r-- | usr/src/cmd/mdb/common/modules/genunix/genunix.c | 66 | ||||
-rw-r--r-- | usr/src/uts/common/os/callout.c | 562 | ||||
-rw-r--r-- | usr/src/uts/common/os/condvar.c | 68 | ||||
-rw-r--r-- | usr/src/uts/common/sys/callo.h | 86 | ||||
-rw-r--r-- | usr/src/uts/common/syscall/lwp_timer.c | 20 |
5 files changed, 533 insertions, 269 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c index fdf8d0679d..7efcc26d54 100644 --- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c +++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c @@ -685,6 +685,9 @@ static const char *co_typenames[] = { "R", "N" }; /* show real and normal, short and long, expired and unexpired. */ #define COF_DEFAULT (COF_REAL | COF_NORM | COF_LONG | COF_SHORT) +#define COF_LIST_FLAGS \ + (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE) + /* private callout data for callback functions */ typedef struct callout_data { uint_t flags; /* COF_* */ @@ -712,13 +715,27 @@ callouts_cb(uintptr_t addr, const void *data, void *priv) { callout_data_t *coargs = (callout_data_t *)priv; callout_t *co = (callout_t *)data; - int tableid; + int tableid, list_flags; callout_id_t coid; if ((coargs == NULL) || (co == NULL)) { return (WALK_ERR); } + if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_FREE)) { + /* + * The callout must have been reallocated. No point in + * walking any more. + */ + return (WALK_DONE); + } + if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_FREE)) { + /* + * The callout must have been freed. No point in + * walking any more. + */ + return (WALK_DONE); + } if ((coargs->flags & COF_FUNC) && (coargs->funcaddr != (uintptr_t)co->c_func)) { return (WALK_NEXT); @@ -736,8 +753,7 @@ callouts_cb(uintptr_t addr, const void *data, void *priv) if ((coargs->flags & COF_EXEC) && !(co->c_xid & CALLOUT_EXECUTING)) { return (WALK_NEXT); } - - /* it is possible we don't have the exp time */ + /* it is possible we don't have the exp time or flags */ if (coargs->flags & COF_BYIDH) { if (!(coargs->flags & COF_FREE)) { /* we have to fetch the expire time ourselves. 
*/ @@ -776,20 +792,20 @@ callouts_cb(uintptr_t addr, const void *data, void *priv) } } /* tricky part, since both HIRES and ABS can be set */ + list_flags = coargs->list_flags; if ((coargs->flags & COF_HIRES) && (coargs->flags & COF_ABS)) { /* both flags are set, only skip "regular" ones */ - if (! (coargs->list_flags & - (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ABSOLUTE))) { + if (! (list_flags & COF_LIST_FLAGS)) { return (WALK_NEXT); } } else { /* individual flags, or no flags */ if ((coargs->flags & COF_HIRES) && - !(coargs->list_flags & CALLOUT_FLAG_HRESTIME)) { + !(list_flags & CALLOUT_LIST_FLAG_HRESTIME)) { return (WALK_NEXT); } if ((coargs->flags & COF_ABS) && - !(coargs->list_flags & CALLOUT_FLAG_ABSOLUTE)) { + !(list_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) { return (WALK_NEXT); } } @@ -826,7 +842,6 @@ callouts_cb(uintptr_t addr, const void *data, void *priv) } if (!(coargs->flags & COF_ADDR)) { - int list_flags = coargs->list_flags; if (!(coargs->flags & COF_VERBOSE)) { mdb_printf("%-3d %1s %-14llx ", TABLE_TO_SEQID(tableid), @@ -838,10 +853,11 @@ callouts_cb(uintptr_t addr, const void *data, void *priv) (coargs->flags & COF_EXPREL) ? coargs->exp - coargs->now : coargs->exp); } + list_flags = coargs->list_flags; mdb_printf("%1s%1s%1s%1s %-?llx %a(%p)", (co->c_xid & CALLOUT_EXECUTING) ? "X" : " ", - (list_flags & CALLOUT_FLAG_HRESTIME) ? "H" : " ", - (list_flags & CALLOUT_FLAG_ABSOLUTE) ? "A" : " ", + (list_flags & CALLOUT_LIST_FLAG_HRESTIME) ? "H" : " ", + (list_flags & CALLOUT_LIST_FLAG_ABSOLUTE) ? "A" : " ", (co->c_xid & CALLOUT_LONGTERM) ? 
"L" : " ", (long long)coid, co->c_func, co->c_arg); if (coargs->flags & COF_LONGLIST) { @@ -867,6 +883,7 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv) callout_data_t *coargs = (callout_data_t *)priv; callout_list_t *cl = (callout_list_t *)data; callout_t *coptr; + int list_flags; if ((coargs == NULL) || (cl == NULL)) { return (WALK_ERR); @@ -874,7 +891,22 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv) coargs->exp = cl->cl_expiration; coargs->list_flags = cl->cl_flags; - + if ((coargs->flags & COF_FREE) && + !(cl->cl_flags & CALLOUT_LIST_FLAG_FREE)) { + /* + * The callout list must have been reallocated. No point in + * walking any more. + */ + return (WALK_DONE); + } + if (!(coargs->flags & COF_FREE) && + (cl->cl_flags & CALLOUT_LIST_FLAG_FREE)) { + /* + * The callout list must have been freed. No point in + * walking any more. + */ + return (WALK_DONE); + } if ((coargs->flags & COF_TIME) && (cl->cl_expiration != coargs->time)) { return (WALK_NEXT); @@ -894,17 +926,16 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv) /* FOUR cases, each different, !A!B, !AB, A!B, AB */ if ((coargs->flags & COF_HIRES) && (coargs->flags & COF_ABS)) { /* both flags are set, only skip "regular" ones */ - if (! (cl->cl_flags & - (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ABSOLUTE))) { + if (! (cl->cl_flags & COF_LIST_FLAGS)) { return (WALK_NEXT); } } else { if ((coargs->flags & COF_HIRES) && - !(cl->cl_flags & CALLOUT_FLAG_HRESTIME)) { + !(cl->cl_flags & CALLOUT_LIST_FLAG_HRESTIME)) { return (WALK_NEXT); } if ((coargs->flags & COF_ABS) && - !(cl->cl_flags & CALLOUT_FLAG_ABSOLUTE)) { + !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) { return (WALK_NEXT); } } @@ -935,12 +966,13 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv) CALLOUT_TYPE_MASK]); } + list_flags = coargs->list_flags; mdb_printf("%-14llx %1s%1s %-6d %-0?p ", (coargs->flags & COF_EXPREL) ? 
coargs->exp - coargs->now : coargs->exp, - (coargs->list_flags & CALLOUT_FLAG_HRESTIME) ? + (list_flags & CALLOUT_LIST_FLAG_HRESTIME) ? "H" : " ", - (coargs->list_flags & CALLOUT_FLAG_ABSOLUTE) ? + (list_flags & CALLOUT_LIST_FLAG_ABSOLUTE) ? "A" : " ", coargs->bucket, cl->cl_callouts.ch_head); diff --git a/usr/src/uts/common/os/callout.c b/usr/src/uts/common/os/callout.c index adab6f16e6..ed1ae9aa83 100644 --- a/usr/src/uts/common/os/callout.c +++ b/usr/src/uts/common/os/callout.c @@ -40,8 +40,10 @@ /* * Callout tables. See timeout(9F) for details. */ +static int callout_threads; /* callout normal threads */ static hrtime_t callout_debug_hrtime; /* debugger entry time */ -static int callout_min_resolution; /* Minimum resolution */ +static int callout_min_reap; /* callout minimum reap count */ +static int callout_tolerance; /* callout hires tolerance */ static callout_table_t *callout_boot_ct; /* Boot CPU's callout tables */ static clock_t callout_max_ticks; /* max interval */ static hrtime_t callout_longterm; /* longterm nanoseconds */ @@ -58,8 +60,8 @@ static callout_table_t *callout_table; /* global callout table array */ * as it will cause a deadlock. This has always been an unwritten rule. * We are making it explicit here. */ -static int callout_realtime_level = CY_LOW_LEVEL; -static int callout_normal_level = CY_LOCK_LEVEL; +static volatile int callout_realtime_level = CY_LOW_LEVEL; +static volatile int callout_normal_level = CY_LOCK_LEVEL; static char *callout_kstat_names[] = { "callout_timeouts", @@ -69,8 +71,11 @@ static char *callout_kstat_names[] = { "callout_untimeouts_expired", "callout_expirations", "callout_allocations", + "callout_cleanups", }; +static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int); + #define CALLOUT_HASH_INSERT(hash, cp, cnext, cprev) \ { \ callout_hash_t *hashp = &(hash); \ @@ -125,9 +130,22 @@ static char *callout_kstat_names[] = { * they were queued. This is fair. 
Plus, it helps to make each * callout expiration timely. It also favors cancellations. * - * - callout lists are queued in a LIFO manner in the callout list hash - * table. This ensures that long term timers stay at the rear of the - * hash lists. + * - callout lists are queued in the following manner in the callout + * hash table buckets: + * + * - appended, if the callout list is a 1-nanosecond resolution + * callout list. When a callout is created, we first look for + * a callout list that has the same expiration so we can avoid + * allocating a callout list and inserting the expiration into + * the heap. However, we do not want to look at 1-nanosecond + * resolution callout lists as we will seldom find a match in + * them. Keeping these callout lists in the rear of the hash + * buckets allows us to skip these during the lookup. + * + * - inserted at the beginning, if the callout list is not a + * 1-nanosecond resolution callout list. This also has the + * side-effect of keeping the long term timers away from the + * front of the buckets. * * - callout lists are queued in a FIFO manner in the expired callouts * list. This ensures that callout lists are executed in the order @@ -180,7 +198,7 @@ static char *callout_kstat_names[] = { */ \ exec = 1; \ } else if ((ct->ct_heap_num == 0) || \ - (ct->ct_heap[0] > gethrtime() + CALLOUT_THRESHOLD)) { \ + (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\ /* \ * If the heap has become empty, we need two threads as \ * there is no one to kick off the second thread in the \ @@ -200,6 +218,28 @@ static char *callout_kstat_names[] = { } /* + * Macro to swap two heap items. + */ +#define CALLOUT_SWAP(h1, h2) \ +{ \ + callout_heap_t tmp; \ + \ + tmp = *h1; \ + *h1 = *h2; \ + *h2 = tmp; \ +} + +/* + * Macro to free a callout list. 
+ */ +#define CALLOUT_LIST_FREE(ct, cl) \ +{ \ + cl->cl_next = ct->ct_lfree; \ + ct->ct_lfree = cl; \ + cl->cl_flags |= CALLOUT_LIST_FLAG_FREE; \ +} + +/* * Allocate a callout structure. We try quite hard because we * can't sleep, and if we can't do the allocation, we're toast. * Failing all, we try a KM_PANIC allocation. Note that we never @@ -252,59 +292,46 @@ callout_list_alloc(callout_table_t *ct) bzero(cl, sizeof (callout_list_t)); mutex_enter(&ct->ct_mutex); - cl->cl_next = ct->ct_lfree; - ct->ct_lfree = cl; + CALLOUT_LIST_FREE(ct, cl); } /* - * Find a callout list that corresponds to an expiration. + * Find a callout list that corresponds to an expiration and matching flags. */ static callout_list_t * callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash) { callout_list_t *cl; + int clflags; ASSERT(MUTEX_HELD(&ct->ct_mutex)); - for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) { - if ((cl->cl_expiration == expiration) && - (cl->cl_flags == flags)) - return (cl); + if (flags & CALLOUT_LIST_FLAG_NANO) { + /* + * This is a 1-nanosecond resolution callout. We will rarely + * find a match for this. So, bail out. + */ + return (NULL); } - return (NULL); -} - -/* - * Find the callout list that corresponds to an expiration. - * If the callout list is null, free it. Else, return it. - */ -static callout_list_t * -callout_list_check(callout_table_t *ct, hrtime_t expiration, int hash) -{ - callout_list_t *cl; - - ASSERT(MUTEX_HELD(&ct->ct_mutex)); - + clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME); for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) { - if (cl->cl_expiration == expiration) { - if (cl->cl_callouts.ch_head != NULL) { - /* - * Found a match. 
- */ - return (cl); - } - - CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); - cl->cl_next = ct->ct_lfree; - ct->ct_lfree = cl; - + /* + * If we have reached a 1-nanosecond resolution callout list, + * we don't have much hope of finding a match in this hash + * bucket. So, just bail out. + */ + if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) return (NULL); - } + + if ((cl->cl_expiration == expiration) && + ((cl->cl_flags & clflags) == (flags & clflags))) + return (cl); } return (NULL); } + /* * Initialize a callout table's heap, if necessary. Preallocate some free * entries so we don't have to check for NULL elsewhere. @@ -319,7 +346,7 @@ callout_heap_init(callout_table_t *ct) ct->ct_heap_num = 0; ct->ct_heap_max = CALLOUT_CHUNK; - size = sizeof (hrtime_t) * CALLOUT_CHUNK; + size = sizeof (callout_heap_t) * CALLOUT_CHUNK; ct->ct_heap = kmem_alloc(size, KM_SLEEP); } @@ -332,7 +359,7 @@ static void callout_heap_expand(callout_table_t *ct) { size_t max, size, osize; - hrtime_t *heap; + callout_heap_t *heap; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num <= ct->ct_heap_max); @@ -341,8 +368,8 @@ callout_heap_expand(callout_table_t *ct) max = ct->ct_heap_max; mutex_exit(&ct->ct_mutex); - osize = sizeof (hrtime_t) * max; - size = sizeof (hrtime_t) * (max + CALLOUT_CHUNK); + osize = sizeof (callout_heap_t) * max; + size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK); heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC); mutex_enter(&ct->ct_mutex); @@ -358,7 +385,7 @@ callout_heap_expand(callout_table_t *ct) bcopy(ct->ct_heap, heap, osize); kmem_free(ct->ct_heap, osize); ct->ct_heap = heap; - ct->ct_heap_max = size / sizeof (hrtime_t); + ct->ct_heap_max = size / sizeof (callout_heap_t); } } @@ -371,7 +398,7 @@ static int callout_upheap(callout_table_t *ct) { int current, parent; - hrtime_t *heap, current_expiration, parent_expiration; + callout_heap_t *heap, *hcurrent, *hparent; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num >= 1); @@ -385,21 
+412,20 @@ callout_upheap(callout_table_t *ct) for (;;) { parent = CALLOUT_HEAP_PARENT(current); - current_expiration = heap[current]; - parent_expiration = heap[parent]; + hparent = &heap[parent]; + hcurrent = &heap[current]; /* * We have an expiration later than our parent; we're done. */ - if (current_expiration >= parent_expiration) { + if (hcurrent->ch_expiration >= hparent->ch_expiration) { return (0); } /* * We need to swap with our parent, and continue up the heap. */ - heap[parent] = current_expiration; - heap[current] = parent_expiration; + CALLOUT_SWAP(hparent, hcurrent); /* * If we just reached the root, we're done. @@ -414,18 +440,20 @@ callout_upheap(callout_table_t *ct) } /* - * Insert a new expiration into a callout table's heap. + * Insert a new heap item into a callout table's heap. */ static void -callout_heap_insert(callout_table_t *ct, hrtime_t expiration) +callout_heap_insert(callout_table_t *ct, callout_list_t *cl) { ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num < ct->ct_heap_max); /* - * First, copy the expiration to the bottom of the heap. + * First, copy the expiration and callout list pointer to the bottom + * of the heap. */ - ct->ct_heap[ct->ct_heap_num] = expiration; + ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration; + ct->ct_heap[ct->ct_heap_num].ch_list = cl; ct->ct_heap_num++; /* @@ -439,7 +467,7 @@ callout_heap_insert(callout_table_t *ct, hrtime_t expiration) * in the heap. 
*/ if (callout_upheap(ct) && (ct->ct_suspend == 0)) - (void) cyclic_reprogram(ct->ct_cyclic, expiration); + (void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration); } /* @@ -449,8 +477,8 @@ callout_heap_insert(callout_table_t *ct, hrtime_t expiration) static void callout_downheap(callout_table_t *ct) { - int left, right, current, nelems; - hrtime_t *heap, left_expiration, right_expiration, current_expiration; + int current, left, right, nelems; + callout_heap_t *heap, *hleft, *hright, *hcurrent; ASSERT(MUTEX_HELD(&ct->ct_mutex)); ASSERT(ct->ct_heap_num >= 1); @@ -467,8 +495,8 @@ callout_downheap(callout_table_t *ct) if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems) return; - left_expiration = heap[left]; - current_expiration = heap[current]; + hleft = &heap[left]; + hcurrent = &heap[current]; right = CALLOUT_HEAP_RIGHT(current); @@ -479,28 +507,27 @@ callout_downheap(callout_table_t *ct) if (right >= nelems) goto comp_left; - right_expiration = heap[right]; + hright = &heap[right]; /* * We have both a left and a right child. We need to compare * the expiration of the children to determine which * expires earlier. */ - if (right_expiration < left_expiration) { + if (hright->ch_expiration < hleft->ch_expiration) { /* * Our right child is the earlier of our children. * We'll now compare our expiration to its expiration. * If ours is the earlier one, we're done. */ - if (current_expiration <= right_expiration) + if (hcurrent->ch_expiration <= hright->ch_expiration) return; /* * Our right child expires earlier than we do; swap * with our right child, and descend right. */ - heap[right] = current_expiration; - heap[current] = right_expiration; + CALLOUT_SWAP(hright, hcurrent); current = right; continue; } @@ -511,15 +538,14 @@ comp_left: * no right child). We'll now compare our expiration * to its expiration. If ours is the earlier one, we're done. 
*/ - if (current_expiration <= left_expiration) + if (hcurrent->ch_expiration <= hleft->ch_expiration) return; /* * Our left child expires earlier than we do; swap with our * left child, and descend left. */ - heap[left] = current_expiration; - heap[current] = left_expiration; + CALLOUT_SWAP(hleft, hcurrent); current = left; } } @@ -530,29 +556,42 @@ comp_left: static void callout_heap_delete(callout_table_t *ct) { - hrtime_t now, expiration; + hrtime_t now, expiration, next; callout_list_t *cl; + callout_heap_t *heap; int hash; ASSERT(MUTEX_HELD(&ct->ct_mutex)); + if (CALLOUT_CLEANUP(ct)) { + /* + * There are too many heap elements pointing to empty callout + * lists. Clean them out. + */ + (void) callout_heap_process(ct, 0, 0); + } + now = gethrtime(); + heap = ct->ct_heap; while (ct->ct_heap_num > 0) { - expiration = ct->ct_heap[0]; - /* - * Find the callout list that corresponds to the expiration. - * If the callout list is empty, callout_list_check() - * will free the callout list and return NULL. - */ + expiration = heap->ch_expiration; hash = CALLOUT_CLHASH(expiration); - cl = callout_list_check(ct, expiration, hash); - if (cl != NULL) { + cl = heap->ch_list; + ASSERT(expiration == cl->cl_expiration); + + if (cl->cl_callouts.ch_head == NULL) { /* - * If the root of the heap expires in the future, we are - * done. We are doing this check here instead of at the - * beginning because we want to first free all the - * empty callout lists at the top of the heap. + * If the callout list is empty, reap it. + * Decrement the reap count. + */ + CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); + CALLOUT_LIST_FREE(ct, cl); + ct->ct_nreap--; + } else { + /* + * If the root of the heap expires in the future, + * bail out. 
*/ if (expiration > now) break; @@ -572,23 +611,166 @@ callout_heap_delete(callout_table_t *ct) */ ct->ct_heap_num--; if (ct->ct_heap_num > 0) { - ct->ct_heap[0] = ct->ct_heap[ct->ct_heap_num]; + heap[0] = heap[ct->ct_heap_num]; callout_downheap(ct); } } /* - * If this callout table is empty or callouts have been suspended - * by CPR, just return. The cyclic has already been programmed to + * If this callout table is empty or callouts have been suspended, + * just return. The cyclic has already been programmed to * infinity by the cyclic subsystem. */ if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0)) return; + /* + * If the top expirations are within callout_tolerance of each other, + * delay the cyclic expire so that they can be processed together. + * This is to prevent high resolution timers from swamping the system + * with cyclic activity. + */ + if (ct->ct_heap_num > 2) { + next = expiration + callout_tolerance; + if ((heap[1].ch_expiration < next) || + (heap[2].ch_expiration < next)) + expiration = next; + } + (void) cyclic_reprogram(ct->ct_cyclic, expiration); } /* + * There are some situations when the entire heap is walked and processed. + * This function is called to do the processing. These are the situations: + * + * 1. When the reap count reaches its threshold, the heap has to be cleared + * of all empty callout lists. + * + * 2. When the system enters and exits KMDB/OBP, all entries in the heap + * need to be adjusted by the interval spent in KMDB/OBP. + * + * 3. When system time is changed, the heap has to be scanned for + * absolute hrestime timers. These need to be removed from the heap + * and expired immediately. + * + * In cases 2 and 3, it is a good idea to do 1 as well since we are + * scanning the heap anyway. + * + * If the root gets changed and/or callout lists are expired, return the + * new expiration to the caller so he can reprogram the cyclic accordingly. 
+ */ +static hrtime_t +callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange) +{ + callout_heap_t *heap; + callout_list_t *cl, *rootcl; + hrtime_t expiration, now; + int i, hash, clflags, expired; + ulong_t num; + + ASSERT(MUTEX_HELD(&ct->ct_mutex)); + + if (ct->ct_heap_num == 0) + return (0); + + if (ct->ct_nreap > 0) + ct->ct_cleanups++; + + heap = ct->ct_heap; + rootcl = heap->ch_list; + + /* + * We walk the heap from the top to the bottom. If we encounter + * a heap item that points to an empty callout list, we clean + * it out. If we encounter a hrestime entry that must be removed, + * again we clean it out. Otherwise, we apply any adjustments needed + * to an element. + * + * During the walk, we also compact the heap from the bottom and + * reconstruct the heap using upheap operations. This is very + * efficient if the number of elements to be cleaned is greater than + * or equal to half the heap. This is the common case. + * + * Even in the non-common case, the upheap operations should be short + * as the entries below generally tend to be bigger than the entries + * above. + */ + num = ct->ct_heap_num; + ct->ct_heap_num = 0; + clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE); + now = gethrtime(); + expired = 0; + for (i = 0; i < num; i++) { + cl = heap[i].ch_list; + /* + * If the callout list is empty, delete the heap element and + * free the callout list. 
+ */ + if (cl->cl_callouts.ch_head == NULL) { + hash = CALLOUT_CLHASH(cl->cl_expiration); + CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); + CALLOUT_LIST_FREE(ct, cl); + continue; + } + + /* + * Delete the heap element and expire the callout list, if + * one of the following is true: + * - the callout list has expired + * - the callout list is an absolute hrestime one and + * there has been a system time change + */ + if ((cl->cl_expiration <= now) || + (timechange && ((cl->cl_flags & clflags) == clflags))) { + hash = CALLOUT_CLHASH(cl->cl_expiration); + CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); + CALLOUT_LIST_APPEND(ct->ct_expired, cl); + expired = 1; + continue; + } + + /* + * Apply adjustments, if any. Adjustments are applied after + * the system returns from KMDB or OBP. They are only applied + * to relative callout lists. + */ + if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) { + hash = CALLOUT_CLHASH(cl->cl_expiration); + CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); + expiration = cl->cl_expiration + delta; + if (expiration <= 0) + expiration = CY_INFINITY; + heap[i].ch_expiration = expiration; + cl->cl_expiration = expiration; + hash = CALLOUT_CLHASH(cl->cl_expiration); + if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) { + CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl); + } else { + CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl); + } + } + + heap[ct->ct_heap_num] = heap[i]; + ct->ct_heap_num++; + (void) callout_upheap(ct); + } + + ct->ct_nreap = 0; + + if (expired) + expiration = gethrtime(); + else if (ct->ct_heap_num == 0) + expiration = CY_INFINITY; + else if (rootcl != heap->ch_list) + expiration = heap->ch_expiration; + else + expiration = 0; + + return (expiration); +} + +/* * Common function used to create normal and realtime callouts. * * Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. 
So, @@ -606,17 +788,17 @@ timeout_generic(int type, void (*func)(void *), void *arg, callout_t *cp; callout_id_t id; callout_list_t *cl; - hrtime_t now, interval; - int hash; + hrtime_t now, interval, rexpiration; + int hash, clflags; ASSERT(resolution > 0); ASSERT(func != NULL); /* - * Please see comment about minimum resolution in callout_init(). + * We get the current hrtime right upfront so that latencies in + * this function do not affect the accuracy of the callout. */ - if (resolution < callout_min_resolution) - resolution = callout_min_resolution; + now = gethrtime(); /* * We disable kernel preemption so that we remain on the same CPU @@ -644,6 +826,16 @@ timeout_generic(int type, void (*func)(void *), void *arg, mutex_enter(&ct->ct_mutex); } + if (CALLOUT_CLEANUP(ct)) { + /* + * There are too many heap elements pointing to empty callout + * lists. Clean them out. + */ + rexpiration = callout_heap_process(ct, 0, 0); + if ((rexpiration != 0) && (ct->ct_suspend == 0)) + (void) cyclic_reprogram(ct->ct_cyclic, rexpiration); + } + if ((cp = ct->ct_free) == NULL) cp = callout_alloc(ct); else @@ -655,16 +847,22 @@ timeout_generic(int type, void (*func)(void *), void *arg, /* * Compute the expiration hrtime. */ - now = gethrtime(); if (flags & CALLOUT_FLAG_ABSOLUTE) { interval = expiration - now; } else { interval = expiration; expiration += now; } - if (flags & CALLOUT_FLAG_ROUNDUP) - expiration += resolution - 1; - expiration = (expiration / resolution) * resolution; + + if (resolution > 1) { + /* + * Align expiration to the specified resolution. + */ + if (flags & CALLOUT_FLAG_ROUNDUP) + expiration += resolution - 1; + expiration = (expiration / resolution) * resolution; + } + if (expiration <= 0) { /* * expiration hrtime overflow has occurred. 
Just set the @@ -697,15 +895,20 @@ timeout_generic(int type, void (*func)(void *), void *arg, cp->c_xid = id; - flags &= CALLOUT_LIST_FLAGS; + clflags = 0; + if (flags & CALLOUT_FLAG_ABSOLUTE) + clflags |= CALLOUT_LIST_FLAG_ABSOLUTE; + if (flags & CALLOUT_FLAG_HRESTIME) + clflags |= CALLOUT_LIST_FLAG_HRESTIME; + if (resolution == 1) + clflags |= CALLOUT_LIST_FLAG_NANO; hash = CALLOUT_CLHASH(expiration); again: /* * Try to see if a callout list already exists for this expiration. - * Most of the time, this will be the case. */ - cl = callout_list_get(ct, expiration, flags, hash); + cl = callout_list_get(ct, expiration, clflags, hash); if (cl == NULL) { /* * Check if we have enough space in the heap to insert one @@ -743,16 +946,28 @@ again: } ct->ct_lfree = cl->cl_next; cl->cl_expiration = expiration; - cl->cl_flags = flags; + cl->cl_flags = clflags; - CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl); + if (clflags & CALLOUT_LIST_FLAG_NANO) { + CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl); + } else { + CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl); + } /* * This is a new expiration. So, insert it into the heap. * This will also reprogram the cyclic, if the expiration * propagated to the root of the heap. */ - callout_heap_insert(ct, expiration); + callout_heap_insert(ct, cl); + } else { + /* + * If the callout list was empty, untimeout_generic() would + * have incremented a reap count. Decrement the reap count + * as we are going to insert a callout into this list. + */ + if (cl->cl_callouts.ch_head == NULL) + ct->ct_nreap--; } cp->c_list = cl; CALLOUT_APPEND(ct, cp); @@ -861,6 +1076,7 @@ untimeout_generic(callout_id_t id, int nowait) callout_table_t *ct; callout_t *cp; callout_id_t xid; + callout_list_t *cl; int hash; callout_id_t bogus; @@ -894,12 +1110,22 @@ untimeout_generic(callout_id_t id, int nowait) * order to avoid lots of X-calls to the CPU associated * with the callout table. 
*/ - expiration = cp->c_list->cl_expiration; + cl = cp->c_list; + expiration = cl->cl_expiration; CALLOUT_DELETE(ct, cp); cp->c_idnext = ct->ct_free; ct->ct_free = cp; + cp->c_xid |= CALLOUT_FREE; ct->ct_untimeouts_unexpired++; ct->ct_timeouts_pending--; + + /* + * If the callout list has become empty, it needs + * to be cleaned along with its heap entry. Increment + * a reap count. + */ + if (cl->cl_callouts.ch_head == NULL) + ct->ct_nreap++; mutex_exit(&ct->ct_mutex); expiration -= gethrtime(); @@ -957,7 +1183,7 @@ untimeout_generic(callout_id_t id, int nowait) * (1) the callout already fired, or (2) the caller passed us * a bogus value. Perform a sanity check to detect case (2). */ - bogus = (CALLOUT_EXECUTING | CALLOUT_COUNTER_HIGH); + bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH); if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0)) panic("untimeout: impossible timeout id %llx", (unsigned long long)id); @@ -1058,6 +1284,7 @@ callout_list_expire(callout_table_t *ct, callout_list_t *cl) CALLOUT_DELETE(ct, cp); cp->c_idnext = ct->ct_free; ct->ct_free = cp; + cp->c_xid |= CALLOUT_FREE; if (cp->c_waiting) { cp->c_waiting = 0; @@ -1088,8 +1315,7 @@ callout_expire(callout_table_t *ct) * Free the callout list. */ CALLOUT_LIST_DELETE(ct->ct_expired, cl); - cl->cl_next = ct->ct_lfree; - ct->ct_lfree = cl; + CALLOUT_LIST_FREE(ct, cl); } } } @@ -1187,59 +1413,11 @@ callout_suspend(void) } } -static void -callout_adjust(callout_table_t *ct, hrtime_t delta) -{ - int hash, newhash; - hrtime_t expiration; - callout_list_t *cl; - callout_hash_t list; - - ASSERT(MUTEX_HELD(&ct->ct_mutex)); - - /* - * In order to adjust the expirations, we null out the heap. Then, - * we reinsert adjusted expirations in the heap. Keeps it simple. - * Note that since the CALLOUT_TABLE_SUSPENDED flag is set by the - * caller, the heap insert does not result in cyclic reprogramming. 
- */ - ct->ct_heap_num = 0; - - /* - * First, remove all the callout lists from the table and string them - * in a list. - */ - list.ch_head = list.ch_tail = NULL; - for (hash = 0; hash < CALLOUT_BUCKETS; hash++) { - while ((cl = ct->ct_clhash[hash].ch_head) != NULL) { - CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); - CALLOUT_LIST_APPEND(list, cl); - } - } - - /* - * Now, traverse the callout lists and adjust their expirations. - */ - while ((cl = list.ch_head) != NULL) { - CALLOUT_LIST_DELETE(list, cl); - /* - * Set the new expiration and reinsert in the right - * hash bucket. - */ - expiration = cl->cl_expiration; - expiration += delta; - cl->cl_expiration = expiration; - newhash = CALLOUT_CLHASH(expiration); - CALLOUT_LIST_INSERT(ct->ct_clhash[newhash], cl); - callout_heap_insert(ct, expiration); - } -} - /* * Resume callout processing. */ static void -callout_resume(hrtime_t delta) +callout_resume(hrtime_t delta, int timechange) { hrtime_t exp; int t, f; @@ -1261,8 +1439,14 @@ callout_resume(hrtime_t delta) continue; } - if (delta) - callout_adjust(ct, delta); + /* + * If a delta is specified, adjust the expirations in + * the heap by delta. Also, if the caller indicates + * a timechange, process that. This step also cleans + * out any empty callout lists that might happen to + * be there. + */ + (void) callout_heap_process(ct, delta, timechange); ct->ct_suspend--; if (ct->ct_suspend == 0) { @@ -1274,13 +1458,14 @@ callout_resume(hrtime_t delta) if (ct->ct_expired.ch_head != NULL) exp = gethrtime(); else if (ct->ct_heap_num > 0) - exp = ct->ct_heap[0]; + exp = ct->ct_heap[0].ch_expiration; else exp = 0; if (exp != 0) (void) cyclic_reprogram(ct->ct_cyclic, exp); } + mutex_exit(&ct->ct_mutex); } } @@ -1288,6 +1473,11 @@ callout_resume(hrtime_t delta) /* * Callback handler used by CPR to stop and resume callouts. + * The cyclic subsystem saves and restores hrtime during CPR. + * That is why callout_resume() is called with a 0 delta. 
+ * Although hrtime is the same, hrestime (system time) has + * progressed during CPR. So, we have to indicate a time change + * to expire the absolute hrestime timers. */ /*ARGSUSED*/ static boolean_t @@ -1296,7 +1486,7 @@ callout_cpr_callb(void *arg, int code) if (code == CB_CODE_CPR_CHKPT) callout_suspend(); else - callout_resume(0); + callout_resume(0, 1); return (B_TRUE); } @@ -1320,7 +1510,7 @@ callout_debug_callb(void *arg, int code) callout_debug_hrtime = gethrtime(); } else { delta = gethrtime() - callout_debug_hrtime; - callout_resume(delta); + callout_resume(delta, 0); } return (B_TRUE); @@ -1334,8 +1524,7 @@ callout_debug_callb(void *arg, int code) static void callout_hrestime_one(callout_table_t *ct) { - callout_list_t *cl, *clnext; - int hash, flags; + hrtime_t expiration; mutex_enter(&ct->ct_mutex); if (ct->ct_heap_num == 0) { @@ -1343,19 +1532,13 @@ callout_hrestime_one(callout_table_t *ct) return; } - flags = CALLOUT_LIST_FLAGS; - for (hash = 0; hash < CALLOUT_BUCKETS; hash++) { - for (cl = ct->ct_clhash[hash].ch_head; cl; cl = clnext) { - clnext = cl->cl_next; - if (cl->cl_flags == flags) { - CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl); - CALLOUT_LIST_APPEND(ct->ct_expired, cl); - } - } - } + /* + * Walk the heap and process all the absolute hrestime entries. + */ + expiration = callout_heap_process(ct, 0, 1); - if ((ct->ct_expired.ch_head != NULL) && (ct->ct_suspend == 0)) - (void) cyclic_reprogram(ct->ct_cyclic, gethrtime()); + if ((expiration != 0) && (ct->ct_suspend == 0)) + (void) cyclic_reprogram(ct->ct_cyclic, expiration); mutex_exit(&ct->ct_mutex); } @@ -1456,7 +1639,7 @@ callout_cyclic_init(callout_table_t *ct) /* * Each callout thread consumes exactly one * task structure while active. Therefore, - * prepopulating with 2 * CALLOUT_THREADS tasks + * prepopulating with 2 * callout_threads tasks * ensures that there's at least one task per * thread that's either scheduled or on the * freelist. 
In turn, this guarantees that @@ -1467,8 +1650,8 @@ callout_cyclic_init(callout_table_t *ct) */ ct->ct_taskq = taskq_create_instance("callout_taskq", seqid, - CALLOUT_THREADS, maxclsyspri, - 2 * CALLOUT_THREADS, 2 * CALLOUT_THREADS, + callout_threads, maxclsyspri, + 2 * callout_threads, 2 * callout_threads, TASKQ_PREPOPULATE | TASKQ_CPR_SAFE); } @@ -1642,30 +1825,13 @@ callout_init(void) callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT; callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS); callout_max_ticks = CALLOUT_MAX_TICKS; + if (callout_min_reap == 0) + callout_min_reap = CALLOUT_MIN_REAP; - /* - * Because of the variability in timing behavior across systems with - * different architectures, we cannot allow arbitrarily low - * resolutions. The minimum resolution has to be determined in a - * platform-specific way. Until then, we define a blanket minimum - * resolution for callouts of CALLOUT_MIN_RESOLUTION. - * - * If, in the future, someone requires lower resolution timers, they - * can do one of two things: - * - * - Define a lower value for callout_min_resolution. This would - * affect all clients of the callout subsystem. If this done - * via /etc/system, then no code changes are required and it - * would affect only that customer. - * - * - Define a flag to be passed to timeout creation that allows - * the lower resolution. This involves code changes. But it - * would affect only the calling module. It is the developer's - * responsibility to test on all systems and make sure that - * everything works. - */ - if (callout_min_resolution <= 0) - callout_min_resolution = CALLOUT_MIN_RESOLUTION; + if (callout_tolerance <= 0) + callout_tolerance = CALLOUT_TOLERANCE; + if (callout_threads <= 0) + callout_threads = CALLOUT_THREADS; /* * Allocate all the callout tables based on max_ncpus. 
We have chosen diff --git a/usr/src/uts/common/os/condvar.c b/usr/src/uts/common/os/condvar.c index cb1543e767..18406bea26 100644 --- a/usr/src/uts/common/os/condvar.c +++ b/usr/src/uts/common/os/condvar.c @@ -39,6 +39,8 @@ #include <sys/sdt.h> #include <sys/callo.h> +clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, int); + /* * CV_MAX_WAITERS is the maximum number of waiters we track; once * the number becomes higher than that, we look at the sleepq to @@ -221,19 +223,34 @@ cv_wakeup(void *arg) clock_t cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t tim) { + hrtime_t hrtim; + + if (tim <= lbolt) + return (-1); + + hrtim = TICK_TO_NSEC(tim - lbolt); + return (cv_timedwait_hires(cvp, mp, hrtim, nsec_per_tick, 0)); +} + +clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ kthread_t *t = curthread; callout_id_t id; clock_t timeleft; + hrtime_t limit; int signalled; if (panicstr) return (-1); - timeleft = tim - lbolt; - if (timeleft <= 0) + limit = (flag & CALLOUT_FLAG_ABSOLUTE) ? 
gethrtime() : 0; + if (tim <= limit) return (-1); mutex_enter(&t->t_wait_mutex); - id = realtime_timeout_default((void (*)(void *))cv_wakeup, t, timeleft); + id = timeout_generic(CALLOUT_REALTIME, (void (*)(void *))cv_wakeup, t, + tim, res, flag); thread_lock(t); /* lock the thread */ cv_block((condvar_impl_t *)cvp); thread_unlock_nopreempt(t); @@ -315,7 +332,8 @@ cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) } static clock_t -cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag) +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) { kthread_t *t = curthread; proc_t *p = ttoproc(t); @@ -323,16 +341,9 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag) int cancel_pending = 0; callout_id_t id; clock_t rval = 1; - clock_t timeleft; + hrtime_t limit; int signalled = 0; - /* - * If the flag is 0, then realtime_timeout() below creates a - * regular realtime timeout. If the flag is CALLOUT_FLAG_HRESTIME, - * then, it creates a special realtime timeout which is affected by - * changes to hrestime. See callo.h for details. - */ - ASSERT((flag == 0) || (flag == CALLOUT_FLAG_HRESTIME)); if (panicstr) return (rval); @@ -342,17 +353,17 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag) * that has not yet unpinned the thread underneath. */ if (lwp == NULL || t->t_intr) - return (cv_timedwait(cvp, mp, tim)); + return (cv_timedwait_hires(cvp, mp, tim, res, flag)); /* - * If tim is less than or equal to lbolt, then the timeout + * If tim is less than or equal to current hrtime, then the timeout * has already occurred. So just check to see if there is a signal * pending. If so return 0 indicating that there is a signal pending. * Else return -1 indicating that the timeout occurred. No need to * wait on anything. */ - timeleft = tim - lbolt; - if (timeleft <= 0) { + limit = (flag & CALLOUT_FLAG_ABSOLUTE) ? 
gethrtime() : 0; + if (tim <= limit) { lwp->lwp_asleep = 1; lwp->lwp_sysabort = 0; rval = -1; @@ -365,7 +376,7 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag) cancel_pending = schedctl_cancel_pending(); mutex_enter(&t->t_wait_mutex); id = timeout_generic(CALLOUT_REALTIME, (void (*)(void *))cv_wakeup, t, - TICK_TO_NSEC(timeleft), nsec_per_tick, flag); + tim, res, flag); lwp->lwp_asleep = 1; lwp->lwp_sysabort = 0; thread_lock(t); @@ -427,12 +438,15 @@ out: * * cv_timedwait_sig() is now part of the DDI. * - * This function is now just a wrapper for cv_timedwait_sig_internal(). + * This function is now just a wrapper for cv_timedwait_sig_hires(). */ clock_t cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t tim) { - return (cv_timedwait_sig_internal(cvp, mp, tim, 0)); + hrtime_t hrtim; + + hrtim = TICK_TO_NSEC(tim - lbolt); + return (cv_timedwait_sig_hires(cvp, mp, hrtim, nsec_per_tick, 0)); } /* @@ -680,6 +694,7 @@ cv_waituntil_sig(kcondvar_t *cvp, kmutex_t *mp, { timestruc_t now; timestruc_t delta; + hrtime_t interval; int rval; if (when == NULL) @@ -694,14 +709,19 @@ cv_waituntil_sig(kcondvar_t *cvp, kmutex_t *mp, * Call cv_timedwait_sig() just to check for signals. * We will return immediately with either 0 or -1. */ - rval = cv_timedwait_sig(cvp, mp, lbolt); + rval = cv_timedwait_sig_hires(cvp, mp, 0, 1, 0); } else { - gethrestime_lasttick(&now); if (timecheck == timechanged) { - rval = cv_timedwait_sig_internal(cvp, mp, - lbolt + timespectohz(when, now), + /* + * Make sure that the interval is at least one tick. + * This is to prevent a user from flooding the system + * with very small, high resolution timers. 
+ */ + interval = ts2hrt(&delta); + if (interval < nsec_per_tick) + interval = nsec_per_tick; + rval = cv_timedwait_sig_hires(cvp, mp, interval, 1, CALLOUT_FLAG_HRESTIME); - } else { /* * Someone reset the system time; diff --git a/usr/src/uts/common/sys/callo.h b/usr/src/uts/common/sys/callo.h index 2b76fe62a8..6a464f9dd9 100644 --- a/usr/src/uts/common/sys/callo.h +++ b/usr/src/uts/common/sys/callo.h @@ -81,17 +81,20 @@ typedef struct callout { * returned. In such cases, a default generation number of 0 is assigned to * the legacy IDs. * - * The lower 32-bit ID space is partitioned into two spaces - one for 32-bit - * IDs and the other for 64-bit IDs. The 32-bit ID space is further divided - * into two spaces - one for short-term callouts and one for long-term. + * The lower 32-bit ID space is partitioned into two spaces - one for + * short-term callouts and one for long-term. * * Here is the bit layout for the callout ID: * - * 63 62 ... 32 31 30 29 .. X+1 X ... 1 0 - * ---------------------------------------------------------------- - * | Exec | Generation | Long | Counter | ID bits | Table | Type | - * | | number | term | High | | number | | - * ---------------------------------------------------------------- + * 63 62 61 ... 32 31 30 29 .. X+1 X ... 1 0 + * ----------------------------------------------------------------------- + * | Free | Exec | Generation | Long | Counter | ID bits | Table | Type | + * | | | number | term | High | | number | | + * ----------------------------------------------------------------------- + * + * Free: + * This bit indicates that this callout has been freed. This is for + * debugging purposes. * * Exec(uting): * This is the executing bit which is only set in the extended callout @@ -135,8 +138,10 @@ typedef struct callout { * This bit represents the callout (table) type. Each CPU has one realtime * and one normal callout table. 
*/ -#define CALLOUT_EXECUTING 0x8000000000000000ULL -#define CALLOUT_ID_MASK ~(CALLOUT_EXECUTING) +#define CALLOUT_FREE 0x8000000000000000ULL +#define CALLOUT_EXECUTING 0x4000000000000000ULL +#define CALLOUT_ID_FLAGS (CALLOUT_FREE | CALLOUT_EXECUTING) +#define CALLOUT_ID_MASK ~CALLOUT_ID_FLAGS #define CALLOUT_GENERATION_LOW 0x100000000ULL #define CALLOUT_LONGTERM 0x80000000 #define CALLOUT_COUNTER_HIGH 0x40000000 @@ -178,7 +183,7 @@ typedef struct callout { #define CALLOUT_LONG_ID(table) \ (CALLOUT_SHORT_ID(table) | CALLOUT_LONGTERM) -#define CALLOUT_THREADS 2 /* keep it simple for now */ +#define CALLOUT_THREADS 2 #define CALLOUT_REALTIME 0 /* realtime callout type */ #define CALLOUT_NORMAL 1 /* normal callout type */ @@ -213,6 +218,21 @@ typedef struct callout_hash { void *ch_tail; } callout_hash_t; +/* + * CALLOUT_LIST_FLAG_FREE + * Callout list is free. + * CALLOUT_LIST_FLAG_ABSOLUTE + * Callout list contains absolute timers. + * CALLOUT_LIST_FLAG_HRESTIME + * Callout list contains hrestime timers. + * CALLOUT_LIST_FLAG_NANO + * Callout list contains 1-nanosecond resolution callouts. + */ +#define CALLOUT_LIST_FLAG_FREE 0x1 +#define CALLOUT_LIST_FLAG_ABSOLUTE 0x2 +#define CALLOUT_LIST_FLAG_HRESTIME 0x4 +#define CALLOUT_LIST_FLAG_NANO 0x8 + struct callout_list { callout_list_t *cl_next; /* next in clhash */ callout_list_t *cl_prev; /* prev in clhash */ @@ -222,6 +242,29 @@ struct callout_list { }; /* + * Callout heap element. Each element in the heap stores the expiration + * as well as the corresponding callout list. This is to avoid a lookup + * of the callout list when the heap is processed. Because we store the + * callout list pointer in the heap element, we have to always remove + * a heap element and its callout list together. We cannot remove one + * without the other. 
+ */ +typedef struct callout_heap { + hrtime_t ch_expiration; + callout_list_t *ch_list; +} callout_heap_t; + +/* + * When the heap contains too many empty callout lists, it needs to be + * cleaned up. The decision to clean up the heap is a function of the + * number of empty entries and the heap size. Also, we don't want to + * clean up small heaps. + */ +#define CALLOUT_MIN_REAP (CALLOUT_BUCKETS >> 3) +#define CALLOUT_CLEANUP(ct) ((ct->ct_nreap >= callout_min_reap) && \ + (ct->ct_nreap >= (ct->ct_heap_num >> 1))) + +/* * Per-callout table kstats. * * CALLOUT_TIMEOUTS @@ -240,6 +283,8 @@ struct callout_list { * Number of callouts that expired. * CALLOUT_ALLOCATIONS * Number of callout structures allocated. + * CALLOUT_CLEANUPS + * Number of times a callout table is cleaned up. */ typedef enum callout_stat_type { CALLOUT_TIMEOUTS, @@ -249,6 +294,7 @@ typedef enum callout_stat_type { CALLOUT_UNTIMEOUTS_EXPIRED, CALLOUT_EXPIRATIONS, CALLOUT_ALLOCATIONS, + CALLOUT_CLEANUPS, CALLOUT_NUM_STATS } callout_stat_type_t; @@ -277,7 +323,6 @@ typedef enum callout_stat_type { #define CALLOUT_FLAG_HRESTIME 0x4 #define CALLOUT_FLAG_32BIT 0x8 -#define CALLOUT_LIST_FLAGS (CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_HRESTIME) /* * On 32-bit systems, the legacy interfaces, timeout() and realtime_timeout(), * must pass CALLOUT_FLAG_32BIT to timeout_generic() so that a 32-bit ID @@ -306,7 +351,7 @@ typedef struct callout_table { uint_t ct_type; /* callout table type */ uint_t ct_suspend; /* suspend count */ cyclic_id_t ct_cyclic; /* cyclic for this table */ - hrtime_t *ct_heap; /* callout expiration heap */ + callout_heap_t *ct_heap; /* callout expiration heap */ ulong_t ct_heap_num; /* occupied slots in the heap */ ulong_t ct_heap_max; /* end of the heap */ kmem_cache_t *ct_cache; /* callout kmem cache */ @@ -316,10 +361,11 @@ typedef struct callout_table { callout_hash_t ct_expired; /* list of expired callout lists */ taskq_t *ct_taskq; /* taskq to execute normal callouts */ kstat_t 
*ct_kstats; /* callout kstats */ + int ct_nreap; /* # heap entries that need reaping */ #ifdef _LP64 - ulong_t ct_pad[4]; /* cache alignment */ + char ct_pad[28]; /* cache alignment */ #else - ulong_t ct_pad[7]; /* cache alignment */ + char ct_pad[24]; /* cache alignment */ #endif } callout_table_t; @@ -340,6 +386,8 @@ typedef struct callout_table { ct_kstat_data[CALLOUT_EXPIRATIONS].value.ui64 #define ct_allocations \ ct_kstat_data[CALLOUT_ALLOCATIONS].value.ui64 +#define ct_cleanups \ + ct_kstat_data[CALLOUT_CLEANUPS].value.ui64 #define CALLOUT_CHUNK 128 @@ -350,12 +398,6 @@ typedef struct callout_table { #define CALLOUT_CYCLIC_HANDLER(t) \ ((t == CALLOUT_REALTIME) ? callout_realtime : callout_normal) -/* - * We define a blanket minimum resolution for callouts of 1 millisecond. - * 1 millisecond is a safe value as it is already supported when the clock - * resolution is set to high. - */ -#define CALLOUT_MIN_RESOLUTION 1000000ULL #define CALLOUT_TCP_RESOLUTION 10000000ULL #define CALLOUT_ALIGN 64 /* cache line size */ @@ -366,6 +408,8 @@ typedef struct callout_table { #define CALLOUT_MAX_TICKS LONG_MAX #endif +#define CALLOUT_TOLERANCE 200000 /* nanoseconds */ + extern void callout_init(void); extern void membar_sync(void); extern void callout_cpu_online(cpu_t *); diff --git a/usr/src/uts/common/syscall/lwp_timer.c b/usr/src/uts/common/syscall/lwp_timer.c index 134e42b06e..7d4592bbcb 100644 --- a/usr/src/uts/common/syscall/lwp_timer.c +++ b/usr/src/uts/common/syscall/lwp_timer.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -55,7 +55,7 @@ lwp_timer_timeout(void *arg) { lwp_timer_t *lwptp = arg; kthread_t *t = lwptp->lwpt_thread; - timespec_t now; + timespec_t now, delta; mutex_enter(&t->t_delay_lock); gethrestime(&now); @@ -68,10 +68,11 @@ lwp_timer_timeout(void *arg) (lwptp->lwpt_rqtime.tv_sec == now.tv_sec && lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) { lwptp->lwpt_imm_timeout = 0; + delta = lwptp->lwpt_rqtime; + timespecsub(&delta, &now); lwptp->lwpt_id = timeout_generic(CALLOUT_REALTIME, - lwp_timer_timeout, lwptp, - TICK_TO_NSEC(timespectohz(&lwptp->lwpt_rqtime, now)), - nsec_per_tick, CALLOUT_FLAG_HRESTIME); + lwp_timer_timeout, lwptp, ts2hrt(&delta), nsec_per_tick, + (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ROUNDUP)); } else { /* * Set the thread running only if it is asleep on @@ -144,7 +145,7 @@ err: int lwp_timer_enqueue(lwp_timer_t *lwptp) { - timespec_t now; + timespec_t now, delta; ASSERT(lwptp->lwpt_thread == curthread); ASSERT(MUTEX_HELD(&curthread->t_delay_lock)); @@ -157,10 +158,11 @@ lwp_timer_enqueue(lwp_timer_t *lwptp) * Queue the timeout. */ lwptp->lwpt_imm_timeout = 0; + delta = lwptp->lwpt_rqtime; + timespecsub(&delta, &now); lwptp->lwpt_id = timeout_generic(CALLOUT_REALTIME, - lwp_timer_timeout, lwptp, - TICK_TO_NSEC(timespectohz(&lwptp->lwpt_rqtime, now)), - nsec_per_tick, CALLOUT_FLAG_HRESTIME); + lwp_timer_timeout, lwptp, ts2hrt(&delta), nsec_per_tick, + (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ROUNDUP)); return (0); } |