summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMadhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>2009-04-10 07:14:10 -0700
committerMadhavan Venkataraman <Madhavan.Venkataraman@Sun.COM>2009-04-10 07:14:10 -0700
commit51b32bdd07bc63a6e416c5759f0f445147703107 (patch)
tree649c9f6f9d9ac24c56450f908276b1f83d1c4ff0
parent845e9415a97ec0124f099537b21fc0364883850f (diff)
downloadillumos-joyent-51b32bdd07bc63a6e416c5759f0f445147703107.tar.gz
6789031 High resolution timers needed for time-sensitive applications
6822357 assertion failed: expiration > 0, file: ../../common/os/cyclic.c, line: 3048
6827248 Empty callout lists need to be cleaned up more proactively
6827371 Solaris must support absolute and relative timers at the callout level
-rw-r--r-- usr/src/cmd/mdb/common/modules/genunix/genunix.c 66
-rw-r--r-- usr/src/uts/common/os/callout.c 562
-rw-r--r-- usr/src/uts/common/os/condvar.c 68
-rw-r--r-- usr/src/uts/common/sys/callo.h 86
-rw-r--r-- usr/src/uts/common/syscall/lwp_timer.c 20
5 files changed, 533 insertions, 269 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index fdf8d0679d..7efcc26d54 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -685,6 +685,9 @@ static const char *co_typenames[] = { "R", "N" };
/* show real and normal, short and long, expired and unexpired. */
#define COF_DEFAULT (COF_REAL | COF_NORM | COF_LONG | COF_SHORT)
+#define COF_LIST_FLAGS \
+ (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE)
+
/* private callout data for callback functions */
typedef struct callout_data {
uint_t flags; /* COF_* */
@@ -712,13 +715,27 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
{
callout_data_t *coargs = (callout_data_t *)priv;
callout_t *co = (callout_t *)data;
- int tableid;
+ int tableid, list_flags;
callout_id_t coid;
if ((coargs == NULL) || (co == NULL)) {
return (WALK_ERR);
}
+ if ((coargs->flags & COF_FREE) && !(co->c_xid & CALLOUT_FREE)) {
+ /*
+ * The callout must have been reallocated. No point in
+ * walking any more.
+ */
+ return (WALK_DONE);
+ }
+ if (!(coargs->flags & COF_FREE) && (co->c_xid & CALLOUT_FREE)) {
+ /*
+ * The callout must have been freed. No point in
+ * walking any more.
+ */
+ return (WALK_DONE);
+ }
if ((coargs->flags & COF_FUNC) &&
(coargs->funcaddr != (uintptr_t)co->c_func)) {
return (WALK_NEXT);
@@ -736,8 +753,7 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
if ((coargs->flags & COF_EXEC) && !(co->c_xid & CALLOUT_EXECUTING)) {
return (WALK_NEXT);
}
-
- /* it is possible we don't have the exp time */
+ /* it is possible we don't have the exp time or flags */
if (coargs->flags & COF_BYIDH) {
if (!(coargs->flags & COF_FREE)) {
/* we have to fetch the expire time ourselves. */
@@ -776,20 +792,20 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
}
}
/* tricky part, since both HIRES and ABS can be set */
+ list_flags = coargs->list_flags;
if ((coargs->flags & COF_HIRES) && (coargs->flags & COF_ABS)) {
/* both flags are set, only skip "regular" ones */
- if (! (coargs->list_flags &
- (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ABSOLUTE))) {
+ if (! (list_flags & COF_LIST_FLAGS)) {
return (WALK_NEXT);
}
} else {
/* individual flags, or no flags */
if ((coargs->flags & COF_HIRES) &&
- !(coargs->list_flags & CALLOUT_FLAG_HRESTIME)) {
+ !(list_flags & CALLOUT_LIST_FLAG_HRESTIME)) {
return (WALK_NEXT);
}
if ((coargs->flags & COF_ABS) &&
- !(coargs->list_flags & CALLOUT_FLAG_ABSOLUTE)) {
+ !(list_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
return (WALK_NEXT);
}
}
@@ -826,7 +842,6 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
}
if (!(coargs->flags & COF_ADDR)) {
- int list_flags = coargs->list_flags;
if (!(coargs->flags & COF_VERBOSE)) {
mdb_printf("%-3d %1s %-14llx ",
TABLE_TO_SEQID(tableid),
@@ -838,10 +853,11 @@ callouts_cb(uintptr_t addr, const void *data, void *priv)
(coargs->flags & COF_EXPREL) ?
coargs->exp - coargs->now : coargs->exp);
}
+ list_flags = coargs->list_flags;
mdb_printf("%1s%1s%1s%1s %-?llx %a(%p)",
(co->c_xid & CALLOUT_EXECUTING) ? "X" : " ",
- (list_flags & CALLOUT_FLAG_HRESTIME) ? "H" : " ",
- (list_flags & CALLOUT_FLAG_ABSOLUTE) ? "A" : " ",
+ (list_flags & CALLOUT_LIST_FLAG_HRESTIME) ? "H" : " ",
+ (list_flags & CALLOUT_LIST_FLAG_ABSOLUTE) ? "A" : " ",
(co->c_xid & CALLOUT_LONGTERM) ? "L" : " ",
(long long)coid, co->c_func, co->c_arg);
if (coargs->flags & COF_LONGLIST) {
@@ -867,6 +883,7 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
callout_data_t *coargs = (callout_data_t *)priv;
callout_list_t *cl = (callout_list_t *)data;
callout_t *coptr;
+ int list_flags;
if ((coargs == NULL) || (cl == NULL)) {
return (WALK_ERR);
@@ -874,7 +891,22 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
coargs->exp = cl->cl_expiration;
coargs->list_flags = cl->cl_flags;
-
+ if ((coargs->flags & COF_FREE) &&
+ !(cl->cl_flags & CALLOUT_LIST_FLAG_FREE)) {
+ /*
+ * The callout list must have been reallocated. No point in
+ * walking any more.
+ */
+ return (WALK_DONE);
+ }
+ if (!(coargs->flags & COF_FREE) &&
+ (cl->cl_flags & CALLOUT_LIST_FLAG_FREE)) {
+ /*
+ * The callout list must have been freed. No point in
+ * walking any more.
+ */
+ return (WALK_DONE);
+ }
if ((coargs->flags & COF_TIME) &&
(cl->cl_expiration != coargs->time)) {
return (WALK_NEXT);
@@ -894,17 +926,16 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
/* FOUR cases, each different, !A!B, !AB, A!B, AB */
if ((coargs->flags & COF_HIRES) && (coargs->flags & COF_ABS)) {
/* both flags are set, only skip "regular" ones */
- if (! (cl->cl_flags &
- (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ABSOLUTE))) {
+ if (! (cl->cl_flags & COF_LIST_FLAGS)) {
return (WALK_NEXT);
}
} else {
if ((coargs->flags & COF_HIRES) &&
- !(cl->cl_flags & CALLOUT_FLAG_HRESTIME)) {
+ !(cl->cl_flags & CALLOUT_LIST_FLAG_HRESTIME)) {
return (WALK_NEXT);
}
if ((coargs->flags & COF_ABS) &&
- !(cl->cl_flags & CALLOUT_FLAG_ABSOLUTE)) {
+ !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
return (WALK_NEXT);
}
}
@@ -935,12 +966,13 @@ callout_list_cb(uintptr_t addr, const void *data, void *priv)
CALLOUT_TYPE_MASK]);
}
+ list_flags = coargs->list_flags;
mdb_printf("%-14llx %1s%1s %-6d %-0?p ",
(coargs->flags & COF_EXPREL) ?
coargs->exp - coargs->now : coargs->exp,
- (coargs->list_flags & CALLOUT_FLAG_HRESTIME) ?
+ (list_flags & CALLOUT_LIST_FLAG_HRESTIME) ?
"H" : " ",
- (coargs->list_flags & CALLOUT_FLAG_ABSOLUTE) ?
+ (list_flags & CALLOUT_LIST_FLAG_ABSOLUTE) ?
"A" : " ",
coargs->bucket, cl->cl_callouts.ch_head);
diff --git a/usr/src/uts/common/os/callout.c b/usr/src/uts/common/os/callout.c
index adab6f16e6..ed1ae9aa83 100644
--- a/usr/src/uts/common/os/callout.c
+++ b/usr/src/uts/common/os/callout.c
@@ -40,8 +40,10 @@
/*
* Callout tables. See timeout(9F) for details.
*/
+static int callout_threads; /* callout normal threads */
static hrtime_t callout_debug_hrtime; /* debugger entry time */
-static int callout_min_resolution; /* Minimum resolution */
+static int callout_min_reap; /* callout minimum reap count */
+static int callout_tolerance; /* callout hires tolerance */
static callout_table_t *callout_boot_ct; /* Boot CPU's callout tables */
static clock_t callout_max_ticks; /* max interval */
static hrtime_t callout_longterm; /* longterm nanoseconds */
@@ -58,8 +60,8 @@ static callout_table_t *callout_table; /* global callout table array */
* as it will cause a deadlock. This has always been an unwritten rule.
* We are making it explicit here.
*/
-static int callout_realtime_level = CY_LOW_LEVEL;
-static int callout_normal_level = CY_LOCK_LEVEL;
+static volatile int callout_realtime_level = CY_LOW_LEVEL;
+static volatile int callout_normal_level = CY_LOCK_LEVEL;
static char *callout_kstat_names[] = {
"callout_timeouts",
@@ -69,8 +71,11 @@ static char *callout_kstat_names[] = {
"callout_untimeouts_expired",
"callout_expirations",
"callout_allocations",
+ "callout_cleanups",
};
+static hrtime_t callout_heap_process(callout_table_t *, hrtime_t, int);
+
#define CALLOUT_HASH_INSERT(hash, cp, cnext, cprev) \
{ \
callout_hash_t *hashp = &(hash); \
@@ -125,9 +130,22 @@ static char *callout_kstat_names[] = {
* they were queued. This is fair. Plus, it helps to make each
* callout expiration timely. It also favors cancellations.
*
- * - callout lists are queued in a LIFO manner in the callout list hash
- * table. This ensures that long term timers stay at the rear of the
- * hash lists.
+ * - callout lists are queued in the following manner in the callout
+ * hash table buckets:
+ *
+ * - appended, if the callout list is a 1-nanosecond resolution
+ * callout list. When a callout is created, we first look for
+ * a callout list that has the same expiration so we can avoid
+ * allocating a callout list and inserting the expiration into
+ * the heap. However, we do not want to look at 1-nanosecond
+ * resolution callout lists as we will seldom find a match in
+ * them. Keeping these callout lists in the rear of the hash
+ * buckets allows us to skip these during the lookup.
+ *
+ * - inserted at the beginning, if the callout list is not a
+ * 1-nanosecond resolution callout list. This also has the
+ * side-effect of keeping the long term timers away from the
+ * front of the buckets.
*
* - callout lists are queued in a FIFO manner in the expired callouts
* list. This ensures that callout lists are executed in the order
@@ -180,7 +198,7 @@ static char *callout_kstat_names[] = {
*/ \
exec = 1; \
} else if ((ct->ct_heap_num == 0) || \
- (ct->ct_heap[0] > gethrtime() + CALLOUT_THRESHOLD)) { \
+ (ct->ct_heap[0].ch_expiration > gethrtime() + CALLOUT_THRESHOLD)) {\
/* \
* If the heap has become empty, we need two threads as \
* there is no one to kick off the second thread in the \
@@ -200,6 +218,28 @@ static char *callout_kstat_names[] = {
}
/*
+ * Macro to swap two heap items.
+ */
+#define CALLOUT_SWAP(h1, h2) \
+{ \
+ callout_heap_t tmp; \
+ \
+ tmp = *h1; \
+ *h1 = *h2; \
+ *h2 = tmp; \
+}
+
+/*
+ * Macro to free a callout list.
+ */
+#define CALLOUT_LIST_FREE(ct, cl) \
+{ \
+ cl->cl_next = ct->ct_lfree; \
+ ct->ct_lfree = cl; \
+ cl->cl_flags |= CALLOUT_LIST_FLAG_FREE; \
+}
+
+/*
* Allocate a callout structure. We try quite hard because we
* can't sleep, and if we can't do the allocation, we're toast.
* Failing all, we try a KM_PANIC allocation. Note that we never
@@ -252,59 +292,46 @@ callout_list_alloc(callout_table_t *ct)
bzero(cl, sizeof (callout_list_t));
mutex_enter(&ct->ct_mutex);
- cl->cl_next = ct->ct_lfree;
- ct->ct_lfree = cl;
+ CALLOUT_LIST_FREE(ct, cl);
}
/*
- * Find a callout list that corresponds to an expiration.
+ * Find a callout list that corresponds to an expiration and matching flags.
*/
static callout_list_t *
callout_list_get(callout_table_t *ct, hrtime_t expiration, int flags, int hash)
{
callout_list_t *cl;
+ int clflags;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
- for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
- if ((cl->cl_expiration == expiration) &&
- (cl->cl_flags == flags))
- return (cl);
+ if (flags & CALLOUT_LIST_FLAG_NANO) {
+ /*
+ * This is a 1-nanosecond resolution callout. We will rarely
+ * find a match for this. So, bail out.
+ */
+ return (NULL);
}
- return (NULL);
-}
-
-/*
- * Find the callout list that corresponds to an expiration.
- * If the callout list is null, free it. Else, return it.
- */
-static callout_list_t *
-callout_list_check(callout_table_t *ct, hrtime_t expiration, int hash)
-{
- callout_list_t *cl;
-
- ASSERT(MUTEX_HELD(&ct->ct_mutex));
-
+ clflags = (CALLOUT_LIST_FLAG_ABSOLUTE | CALLOUT_LIST_FLAG_HRESTIME);
for (cl = ct->ct_clhash[hash].ch_head; (cl != NULL); cl = cl->cl_next) {
- if (cl->cl_expiration == expiration) {
- if (cl->cl_callouts.ch_head != NULL) {
- /*
- * Found a match.
- */
- return (cl);
- }
-
- CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
- cl->cl_next = ct->ct_lfree;
- ct->ct_lfree = cl;
-
+ /*
+ * If we have reached a 1-nanosecond resolution callout list,
+ * we don't have much hope of finding a match in this hash
+ * bucket. So, just bail out.
+ */
+ if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO)
return (NULL);
- }
+
+ if ((cl->cl_expiration == expiration) &&
+ ((cl->cl_flags & clflags) == (flags & clflags)))
+ return (cl);
}
return (NULL);
}
+
/*
* Initialize a callout table's heap, if necessary. Preallocate some free
* entries so we don't have to check for NULL elsewhere.
@@ -319,7 +346,7 @@ callout_heap_init(callout_table_t *ct)
ct->ct_heap_num = 0;
ct->ct_heap_max = CALLOUT_CHUNK;
- size = sizeof (hrtime_t) * CALLOUT_CHUNK;
+ size = sizeof (callout_heap_t) * CALLOUT_CHUNK;
ct->ct_heap = kmem_alloc(size, KM_SLEEP);
}
@@ -332,7 +359,7 @@ static void
callout_heap_expand(callout_table_t *ct)
{
size_t max, size, osize;
- hrtime_t *heap;
+ callout_heap_t *heap;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
ASSERT(ct->ct_heap_num <= ct->ct_heap_max);
@@ -341,8 +368,8 @@ callout_heap_expand(callout_table_t *ct)
max = ct->ct_heap_max;
mutex_exit(&ct->ct_mutex);
- osize = sizeof (hrtime_t) * max;
- size = sizeof (hrtime_t) * (max + CALLOUT_CHUNK);
+ osize = sizeof (callout_heap_t) * max;
+ size = sizeof (callout_heap_t) * (max + CALLOUT_CHUNK);
heap = kmem_alloc_tryhard(size, &size, KM_NOSLEEP | KM_PANIC);
mutex_enter(&ct->ct_mutex);
@@ -358,7 +385,7 @@ callout_heap_expand(callout_table_t *ct)
bcopy(ct->ct_heap, heap, osize);
kmem_free(ct->ct_heap, osize);
ct->ct_heap = heap;
- ct->ct_heap_max = size / sizeof (hrtime_t);
+ ct->ct_heap_max = size / sizeof (callout_heap_t);
}
}
@@ -371,7 +398,7 @@ static int
callout_upheap(callout_table_t *ct)
{
int current, parent;
- hrtime_t *heap, current_expiration, parent_expiration;
+ callout_heap_t *heap, *hcurrent, *hparent;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
ASSERT(ct->ct_heap_num >= 1);
@@ -385,21 +412,20 @@ callout_upheap(callout_table_t *ct)
for (;;) {
parent = CALLOUT_HEAP_PARENT(current);
- current_expiration = heap[current];
- parent_expiration = heap[parent];
+ hparent = &heap[parent];
+ hcurrent = &heap[current];
/*
* We have an expiration later than our parent; we're done.
*/
- if (current_expiration >= parent_expiration) {
+ if (hcurrent->ch_expiration >= hparent->ch_expiration) {
return (0);
}
/*
* We need to swap with our parent, and continue up the heap.
*/
- heap[parent] = current_expiration;
- heap[current] = parent_expiration;
+ CALLOUT_SWAP(hparent, hcurrent);
/*
* If we just reached the root, we're done.
@@ -414,18 +440,20 @@ callout_upheap(callout_table_t *ct)
}
/*
- * Insert a new expiration into a callout table's heap.
+ * Insert a new heap item into a callout table's heap.
*/
static void
-callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
+callout_heap_insert(callout_table_t *ct, callout_list_t *cl)
{
ASSERT(MUTEX_HELD(&ct->ct_mutex));
ASSERT(ct->ct_heap_num < ct->ct_heap_max);
/*
- * First, copy the expiration to the bottom of the heap.
+ * First, copy the expiration and callout list pointer to the bottom
+ * of the heap.
*/
- ct->ct_heap[ct->ct_heap_num] = expiration;
+ ct->ct_heap[ct->ct_heap_num].ch_expiration = cl->cl_expiration;
+ ct->ct_heap[ct->ct_heap_num].ch_list = cl;
ct->ct_heap_num++;
/*
@@ -439,7 +467,7 @@ callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
* in the heap.
*/
if (callout_upheap(ct) && (ct->ct_suspend == 0))
- (void) cyclic_reprogram(ct->ct_cyclic, expiration);
+ (void) cyclic_reprogram(ct->ct_cyclic, cl->cl_expiration);
}
/*
@@ -449,8 +477,8 @@ callout_heap_insert(callout_table_t *ct, hrtime_t expiration)
static void
callout_downheap(callout_table_t *ct)
{
- int left, right, current, nelems;
- hrtime_t *heap, left_expiration, right_expiration, current_expiration;
+ int current, left, right, nelems;
+ callout_heap_t *heap, *hleft, *hright, *hcurrent;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
ASSERT(ct->ct_heap_num >= 1);
@@ -467,8 +495,8 @@ callout_downheap(callout_table_t *ct)
if ((left = CALLOUT_HEAP_LEFT(current)) >= nelems)
return;
- left_expiration = heap[left];
- current_expiration = heap[current];
+ hleft = &heap[left];
+ hcurrent = &heap[current];
right = CALLOUT_HEAP_RIGHT(current);
@@ -479,28 +507,27 @@ callout_downheap(callout_table_t *ct)
if (right >= nelems)
goto comp_left;
- right_expiration = heap[right];
+ hright = &heap[right];
/*
* We have both a left and a right child. We need to compare
* the expiration of the children to determine which
* expires earlier.
*/
- if (right_expiration < left_expiration) {
+ if (hright->ch_expiration < hleft->ch_expiration) {
/*
* Our right child is the earlier of our children.
* We'll now compare our expiration to its expiration.
* If ours is the earlier one, we're done.
*/
- if (current_expiration <= right_expiration)
+ if (hcurrent->ch_expiration <= hright->ch_expiration)
return;
/*
* Our right child expires earlier than we do; swap
* with our right child, and descend right.
*/
- heap[right] = current_expiration;
- heap[current] = right_expiration;
+ CALLOUT_SWAP(hright, hcurrent);
current = right;
continue;
}
@@ -511,15 +538,14 @@ comp_left:
* no right child). We'll now compare our expiration
* to its expiration. If ours is the earlier one, we're done.
*/
- if (current_expiration <= left_expiration)
+ if (hcurrent->ch_expiration <= hleft->ch_expiration)
return;
/*
* Our left child expires earlier than we do; swap with our
* left child, and descend left.
*/
- heap[left] = current_expiration;
- heap[current] = left_expiration;
+ CALLOUT_SWAP(hleft, hcurrent);
current = left;
}
}
@@ -530,29 +556,42 @@ comp_left:
static void
callout_heap_delete(callout_table_t *ct)
{
- hrtime_t now, expiration;
+ hrtime_t now, expiration, next;
callout_list_t *cl;
+ callout_heap_t *heap;
int hash;
ASSERT(MUTEX_HELD(&ct->ct_mutex));
+ if (CALLOUT_CLEANUP(ct)) {
+ /*
+ * There are too many heap elements pointing to empty callout
+ * lists. Clean them out.
+ */
+ (void) callout_heap_process(ct, 0, 0);
+ }
+
now = gethrtime();
+ heap = ct->ct_heap;
while (ct->ct_heap_num > 0) {
- expiration = ct->ct_heap[0];
- /*
- * Find the callout list that corresponds to the expiration.
- * If the callout list is empty, callout_list_check()
- * will free the callout list and return NULL.
- */
+ expiration = heap->ch_expiration;
hash = CALLOUT_CLHASH(expiration);
- cl = callout_list_check(ct, expiration, hash);
- if (cl != NULL) {
+ cl = heap->ch_list;
+ ASSERT(expiration == cl->cl_expiration);
+
+ if (cl->cl_callouts.ch_head == NULL) {
/*
- * If the root of the heap expires in the future, we are
- * done. We are doing this check here instead of at the
- * beginning because we want to first free all the
- * empty callout lists at the top of the heap.
+ * If the callout list is empty, reap it.
+ * Decrement the reap count.
+ */
+ CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
+ CALLOUT_LIST_FREE(ct, cl);
+ ct->ct_nreap--;
+ } else {
+ /*
+ * If the root of the heap expires in the future,
+ * bail out.
*/
if (expiration > now)
break;
@@ -572,23 +611,166 @@ callout_heap_delete(callout_table_t *ct)
*/
ct->ct_heap_num--;
if (ct->ct_heap_num > 0) {
- ct->ct_heap[0] = ct->ct_heap[ct->ct_heap_num];
+ heap[0] = heap[ct->ct_heap_num];
callout_downheap(ct);
}
}
/*
- * If this callout table is empty or callouts have been suspended
- * by CPR, just return. The cyclic has already been programmed to
+ * If this callout table is empty or callouts have been suspended,
+ * just return. The cyclic has already been programmed to
* infinity by the cyclic subsystem.
*/
if ((ct->ct_heap_num == 0) || (ct->ct_suspend > 0))
return;
+ /*
+ * If the top expirations are within callout_tolerance of each other,
+ * delay the cyclic expire so that they can be processed together.
+ * This is to prevent high resolution timers from swamping the system
+ * with cyclic activity.
+ */
+ if (ct->ct_heap_num > 2) {
+ next = expiration + callout_tolerance;
+ if ((heap[1].ch_expiration < next) ||
+ (heap[2].ch_expiration < next))
+ expiration = next;
+ }
+
(void) cyclic_reprogram(ct->ct_cyclic, expiration);
}
/*
+ * There are some situations when the entire heap is walked and processed.
+ * This function is called to do the processing. These are the situations:
+ *
+ * 1. When the reap count reaches its threshold, the heap has to be cleared
+ * of all empty callout lists.
+ *
+ * 2. When the system enters and exits KMDB/OBP, all entries in the heap
+ * need to be adjusted by the interval spent in KMDB/OBP.
+ *
+ * 3. When system time is changed, the heap has to be scanned for
+ * absolute hrestime timers. These need to be removed from the heap
+ * and expired immediately.
+ *
+ * In cases 2 and 3, it is a good idea to do 1 as well since we are
+ * scanning the heap anyway.
+ *
+ * If the root gets changed and/or callout lists are expired, return the
+ * new expiration to the caller so he can reprogram the cyclic accordingly.
+ */
+static hrtime_t
+callout_heap_process(callout_table_t *ct, hrtime_t delta, int timechange)
+{
+ callout_heap_t *heap;
+ callout_list_t *cl, *rootcl;
+ hrtime_t expiration, now;
+ int i, hash, clflags, expired;
+ ulong_t num;
+
+ ASSERT(MUTEX_HELD(&ct->ct_mutex));
+
+ if (ct->ct_heap_num == 0)
+ return (0);
+
+ if (ct->ct_nreap > 0)
+ ct->ct_cleanups++;
+
+ heap = ct->ct_heap;
+ rootcl = heap->ch_list;
+
+ /*
+ * We walk the heap from the top to the bottom. If we encounter
+ * a heap item that points to an empty callout list, we clean
+ * it out. If we encounter a hrestime entry that must be removed,
+ * again we clean it out. Otherwise, we apply any adjustments needed
+ * to an element.
+ *
+ * During the walk, we also compact the heap from the bottom and
+ * reconstruct the heap using upheap operations. This is very
+ * efficient if the number of elements to be cleaned is greater than
+ * or equal to half the heap. This is the common case.
+ *
+ * Even in the non-common case, the upheap operations should be short
+ * as the entries below generally tend to be bigger than the entries
+ * above.
+ */
+ num = ct->ct_heap_num;
+ ct->ct_heap_num = 0;
+ clflags = (CALLOUT_LIST_FLAG_HRESTIME | CALLOUT_LIST_FLAG_ABSOLUTE);
+ now = gethrtime();
+ expired = 0;
+ for (i = 0; i < num; i++) {
+ cl = heap[i].ch_list;
+ /*
+ * If the callout list is empty, delete the heap element and
+ * free the callout list.
+ */
+ if (cl->cl_callouts.ch_head == NULL) {
+ hash = CALLOUT_CLHASH(cl->cl_expiration);
+ CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
+ CALLOUT_LIST_FREE(ct, cl);
+ continue;
+ }
+
+ /*
+ * Delete the heap element and expire the callout list, if
+ * one of the following is true:
+ * - the callout list has expired
+ * - the callout list is an absolute hrestime one and
+ * there has been a system time change
+ */
+ if ((cl->cl_expiration <= now) ||
+ (timechange && ((cl->cl_flags & clflags) == clflags))) {
+ hash = CALLOUT_CLHASH(cl->cl_expiration);
+ CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
+ CALLOUT_LIST_APPEND(ct->ct_expired, cl);
+ expired = 1;
+ continue;
+ }
+
+ /*
+ * Apply adjustments, if any. Adjustments are applied after
+ * the system returns from KMDB or OBP. They are only applied
+ * to relative callout lists.
+ */
+ if (delta && !(cl->cl_flags & CALLOUT_LIST_FLAG_ABSOLUTE)) {
+ hash = CALLOUT_CLHASH(cl->cl_expiration);
+ CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
+ expiration = cl->cl_expiration + delta;
+ if (expiration <= 0)
+ expiration = CY_INFINITY;
+ heap[i].ch_expiration = expiration;
+ cl->cl_expiration = expiration;
+ hash = CALLOUT_CLHASH(cl->cl_expiration);
+ if (cl->cl_flags & CALLOUT_LIST_FLAG_NANO) {
+ CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
+ } else {
+ CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
+ }
+ }
+
+ heap[ct->ct_heap_num] = heap[i];
+ ct->ct_heap_num++;
+ (void) callout_upheap(ct);
+ }
+
+ ct->ct_nreap = 0;
+
+ if (expired)
+ expiration = gethrtime();
+ else if (ct->ct_heap_num == 0)
+ expiration = CY_INFINITY;
+ else if (rootcl != heap->ch_list)
+ expiration = heap->ch_expiration;
+ else
+ expiration = 0;
+
+ return (expiration);
+}
+
+/*
* Common function used to create normal and realtime callouts.
*
* Realtime callouts are handled at CY_LOW_PIL by a cyclic handler. So,
@@ -606,17 +788,17 @@ timeout_generic(int type, void (*func)(void *), void *arg,
callout_t *cp;
callout_id_t id;
callout_list_t *cl;
- hrtime_t now, interval;
- int hash;
+ hrtime_t now, interval, rexpiration;
+ int hash, clflags;
ASSERT(resolution > 0);
ASSERT(func != NULL);
/*
- * Please see comment about minimum resolution in callout_init().
+ * We get the current hrtime right upfront so that latencies in
+ * this function do not affect the accuracy of the callout.
*/
- if (resolution < callout_min_resolution)
- resolution = callout_min_resolution;
+ now = gethrtime();
/*
* We disable kernel preemption so that we remain on the same CPU
@@ -644,6 +826,16 @@ timeout_generic(int type, void (*func)(void *), void *arg,
mutex_enter(&ct->ct_mutex);
}
+ if (CALLOUT_CLEANUP(ct)) {
+ /*
+ * There are too many heap elements pointing to empty callout
+ * lists. Clean them out.
+ */
+ rexpiration = callout_heap_process(ct, 0, 0);
+ if ((rexpiration != 0) && (ct->ct_suspend == 0))
+ (void) cyclic_reprogram(ct->ct_cyclic, rexpiration);
+ }
+
if ((cp = ct->ct_free) == NULL)
cp = callout_alloc(ct);
else
@@ -655,16 +847,22 @@ timeout_generic(int type, void (*func)(void *), void *arg,
/*
* Compute the expiration hrtime.
*/
- now = gethrtime();
if (flags & CALLOUT_FLAG_ABSOLUTE) {
interval = expiration - now;
} else {
interval = expiration;
expiration += now;
}
- if (flags & CALLOUT_FLAG_ROUNDUP)
- expiration += resolution - 1;
- expiration = (expiration / resolution) * resolution;
+
+ if (resolution > 1) {
+ /*
+ * Align expiration to the specified resolution.
+ */
+ if (flags & CALLOUT_FLAG_ROUNDUP)
+ expiration += resolution - 1;
+ expiration = (expiration / resolution) * resolution;
+ }
+
if (expiration <= 0) {
/*
* expiration hrtime overflow has occurred. Just set the
@@ -697,15 +895,20 @@ timeout_generic(int type, void (*func)(void *), void *arg,
cp->c_xid = id;
- flags &= CALLOUT_LIST_FLAGS;
+ clflags = 0;
+ if (flags & CALLOUT_FLAG_ABSOLUTE)
+ clflags |= CALLOUT_LIST_FLAG_ABSOLUTE;
+ if (flags & CALLOUT_FLAG_HRESTIME)
+ clflags |= CALLOUT_LIST_FLAG_HRESTIME;
+ if (resolution == 1)
+ clflags |= CALLOUT_LIST_FLAG_NANO;
hash = CALLOUT_CLHASH(expiration);
again:
/*
* Try to see if a callout list already exists for this expiration.
- * Most of the time, this will be the case.
*/
- cl = callout_list_get(ct, expiration, flags, hash);
+ cl = callout_list_get(ct, expiration, clflags, hash);
if (cl == NULL) {
/*
* Check if we have enough space in the heap to insert one
@@ -743,16 +946,28 @@ again:
}
ct->ct_lfree = cl->cl_next;
cl->cl_expiration = expiration;
- cl->cl_flags = flags;
+ cl->cl_flags = clflags;
- CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
+ if (clflags & CALLOUT_LIST_FLAG_NANO) {
+ CALLOUT_LIST_APPEND(ct->ct_clhash[hash], cl);
+ } else {
+ CALLOUT_LIST_INSERT(ct->ct_clhash[hash], cl);
+ }
/*
* This is a new expiration. So, insert it into the heap.
* This will also reprogram the cyclic, if the expiration
* propagated to the root of the heap.
*/
- callout_heap_insert(ct, expiration);
+ callout_heap_insert(ct, cl);
+ } else {
+ /*
+ * If the callout list was empty, untimeout_generic() would
+ * have incremented a reap count. Decrement the reap count
+ * as we are going to insert a callout into this list.
+ */
+ if (cl->cl_callouts.ch_head == NULL)
+ ct->ct_nreap--;
}
cp->c_list = cl;
CALLOUT_APPEND(ct, cp);
@@ -861,6 +1076,7 @@ untimeout_generic(callout_id_t id, int nowait)
callout_table_t *ct;
callout_t *cp;
callout_id_t xid;
+ callout_list_t *cl;
int hash;
callout_id_t bogus;
@@ -894,12 +1110,22 @@ untimeout_generic(callout_id_t id, int nowait)
* order to avoid lots of X-calls to the CPU associated
* with the callout table.
*/
- expiration = cp->c_list->cl_expiration;
+ cl = cp->c_list;
+ expiration = cl->cl_expiration;
CALLOUT_DELETE(ct, cp);
cp->c_idnext = ct->ct_free;
ct->ct_free = cp;
+ cp->c_xid |= CALLOUT_FREE;
ct->ct_untimeouts_unexpired++;
ct->ct_timeouts_pending--;
+
+ /*
+ * If the callout list has become empty, it needs
+ * to be cleaned along with its heap entry. Increment
+ * a reap count.
+ */
+ if (cl->cl_callouts.ch_head == NULL)
+ ct->ct_nreap++;
mutex_exit(&ct->ct_mutex);
expiration -= gethrtime();
@@ -957,7 +1183,7 @@ untimeout_generic(callout_id_t id, int nowait)
* (1) the callout already fired, or (2) the caller passed us
* a bogus value. Perform a sanity check to detect case (2).
*/
- bogus = (CALLOUT_EXECUTING | CALLOUT_COUNTER_HIGH);
+ bogus = (CALLOUT_ID_FLAGS | CALLOUT_COUNTER_HIGH);
if (((id & bogus) != CALLOUT_COUNTER_HIGH) && (id != 0))
panic("untimeout: impossible timeout id %llx",
(unsigned long long)id);
@@ -1058,6 +1284,7 @@ callout_list_expire(callout_table_t *ct, callout_list_t *cl)
CALLOUT_DELETE(ct, cp);
cp->c_idnext = ct->ct_free;
ct->ct_free = cp;
+ cp->c_xid |= CALLOUT_FREE;
if (cp->c_waiting) {
cp->c_waiting = 0;
@@ -1088,8 +1315,7 @@ callout_expire(callout_table_t *ct)
* Free the callout list.
*/
CALLOUT_LIST_DELETE(ct->ct_expired, cl);
- cl->cl_next = ct->ct_lfree;
- ct->ct_lfree = cl;
+ CALLOUT_LIST_FREE(ct, cl);
}
}
}
@@ -1187,59 +1413,11 @@ callout_suspend(void)
}
}
-static void
-callout_adjust(callout_table_t *ct, hrtime_t delta)
-{
- int hash, newhash;
- hrtime_t expiration;
- callout_list_t *cl;
- callout_hash_t list;
-
- ASSERT(MUTEX_HELD(&ct->ct_mutex));
-
- /*
- * In order to adjust the expirations, we null out the heap. Then,
- * we reinsert adjusted expirations in the heap. Keeps it simple.
- * Note that since the CALLOUT_TABLE_SUSPENDED flag is set by the
- * caller, the heap insert does not result in cyclic reprogramming.
- */
- ct->ct_heap_num = 0;
-
- /*
- * First, remove all the callout lists from the table and string them
- * in a list.
- */
- list.ch_head = list.ch_tail = NULL;
- for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
- while ((cl = ct->ct_clhash[hash].ch_head) != NULL) {
- CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
- CALLOUT_LIST_APPEND(list, cl);
- }
- }
-
- /*
- * Now, traverse the callout lists and adjust their expirations.
- */
- while ((cl = list.ch_head) != NULL) {
- CALLOUT_LIST_DELETE(list, cl);
- /*
- * Set the new expiration and reinsert in the right
- * hash bucket.
- */
- expiration = cl->cl_expiration;
- expiration += delta;
- cl->cl_expiration = expiration;
- newhash = CALLOUT_CLHASH(expiration);
- CALLOUT_LIST_INSERT(ct->ct_clhash[newhash], cl);
- callout_heap_insert(ct, expiration);
- }
-}
-
/*
* Resume callout processing.
*/
static void
-callout_resume(hrtime_t delta)
+callout_resume(hrtime_t delta, int timechange)
{
hrtime_t exp;
int t, f;
@@ -1261,8 +1439,14 @@ callout_resume(hrtime_t delta)
continue;
}
- if (delta)
- callout_adjust(ct, delta);
+ /*
+ * If a delta is specified, adjust the expirations in
+ * the heap by delta. Also, if the caller indicates
+ * a timechange, process that. This step also cleans
+ * out any empty callout lists that might happen to
+ * be there.
+ */
+ (void) callout_heap_process(ct, delta, timechange);
ct->ct_suspend--;
if (ct->ct_suspend == 0) {
@@ -1274,13 +1458,14 @@ callout_resume(hrtime_t delta)
if (ct->ct_expired.ch_head != NULL)
exp = gethrtime();
else if (ct->ct_heap_num > 0)
- exp = ct->ct_heap[0];
+ exp = ct->ct_heap[0].ch_expiration;
else
exp = 0;
if (exp != 0)
(void) cyclic_reprogram(ct->ct_cyclic,
exp);
}
+
mutex_exit(&ct->ct_mutex);
}
}
@@ -1288,6 +1473,11 @@ callout_resume(hrtime_t delta)
/*
* Callback handler used by CPR to stop and resume callouts.
+ * The cyclic subsystem saves and restores hrtime during CPR.
+ * That is why callout_resume() is called with a 0 delta.
+ * Although hrtime is the same, hrestime (system time) has
+ * progressed during CPR. So, we have to indicate a time change
+ * to expire the absolute hrestime timers.
*/
/*ARGSUSED*/
static boolean_t
@@ -1296,7 +1486,7 @@ callout_cpr_callb(void *arg, int code)
if (code == CB_CODE_CPR_CHKPT)
callout_suspend();
else
- callout_resume(0);
+ callout_resume(0, 1);
return (B_TRUE);
}
@@ -1320,7 +1510,7 @@ callout_debug_callb(void *arg, int code)
callout_debug_hrtime = gethrtime();
} else {
delta = gethrtime() - callout_debug_hrtime;
- callout_resume(delta);
+ callout_resume(delta, 0);
}
return (B_TRUE);
@@ -1334,8 +1524,7 @@ callout_debug_callb(void *arg, int code)
static void
callout_hrestime_one(callout_table_t *ct)
{
- callout_list_t *cl, *clnext;
- int hash, flags;
+ hrtime_t expiration;
mutex_enter(&ct->ct_mutex);
if (ct->ct_heap_num == 0) {
@@ -1343,19 +1532,13 @@ callout_hrestime_one(callout_table_t *ct)
return;
}
- flags = CALLOUT_LIST_FLAGS;
- for (hash = 0; hash < CALLOUT_BUCKETS; hash++) {
- for (cl = ct->ct_clhash[hash].ch_head; cl; cl = clnext) {
- clnext = cl->cl_next;
- if (cl->cl_flags == flags) {
- CALLOUT_LIST_DELETE(ct->ct_clhash[hash], cl);
- CALLOUT_LIST_APPEND(ct->ct_expired, cl);
- }
- }
- }
+ /*
+ * Walk the heap and process all the absolute hrestime entries.
+ */
+ expiration = callout_heap_process(ct, 0, 1);
- if ((ct->ct_expired.ch_head != NULL) && (ct->ct_suspend == 0))
- (void) cyclic_reprogram(ct->ct_cyclic, gethrtime());
+ if ((expiration != 0) && (ct->ct_suspend == 0))
+ (void) cyclic_reprogram(ct->ct_cyclic, expiration);
mutex_exit(&ct->ct_mutex);
}
@@ -1456,7 +1639,7 @@ callout_cyclic_init(callout_table_t *ct)
/*
* Each callout thread consumes exactly one
* task structure while active. Therefore,
- * prepopulating with 2 * CALLOUT_THREADS tasks
+ * prepopulating with 2 * callout_threads tasks
* ensures that there's at least one task per
* thread that's either scheduled or on the
* freelist. In turn, this guarantees that
@@ -1467,8 +1650,8 @@ callout_cyclic_init(callout_table_t *ct)
*/
ct->ct_taskq =
taskq_create_instance("callout_taskq", seqid,
- CALLOUT_THREADS, maxclsyspri,
- 2 * CALLOUT_THREADS, 2 * CALLOUT_THREADS,
+ callout_threads, maxclsyspri,
+ 2 * callout_threads, 2 * callout_threads,
TASKQ_PREPOPULATE | TASKQ_CPR_SAFE);
}
@@ -1642,30 +1825,13 @@ callout_init(void)
callout_counter_low = 1 << CALLOUT_COUNTER_SHIFT;
callout_longterm = TICK_TO_NSEC(CALLOUT_LONGTERM_TICKS);
callout_max_ticks = CALLOUT_MAX_TICKS;
+ if (callout_min_reap == 0)
+ callout_min_reap = CALLOUT_MIN_REAP;
- /*
- * Because of the variability in timing behavior across systems with
- * different architectures, we cannot allow arbitrarily low
- * resolutions. The minimum resolution has to be determined in a
- * platform-specific way. Until then, we define a blanket minimum
- * resolution for callouts of CALLOUT_MIN_RESOLUTION.
- *
- * If, in the future, someone requires lower resolution timers, they
- * can do one of two things:
- *
- * - Define a lower value for callout_min_resolution. This would
- * affect all clients of the callout subsystem. If this done
- * via /etc/system, then no code changes are required and it
- * would affect only that customer.
- *
- * - Define a flag to be passed to timeout creation that allows
- * the lower resolution. This involves code changes. But it
- * would affect only the calling module. It is the developer's
- * responsibility to test on all systems and make sure that
- * everything works.
- */
- if (callout_min_resolution <= 0)
- callout_min_resolution = CALLOUT_MIN_RESOLUTION;
+ if (callout_tolerance <= 0)
+ callout_tolerance = CALLOUT_TOLERANCE;
+ if (callout_threads <= 0)
+ callout_threads = CALLOUT_THREADS;
/*
* Allocate all the callout tables based on max_ncpus. We have chosen
diff --git a/usr/src/uts/common/os/condvar.c b/usr/src/uts/common/os/condvar.c
index cb1543e767..18406bea26 100644
--- a/usr/src/uts/common/os/condvar.c
+++ b/usr/src/uts/common/os/condvar.c
@@ -39,6 +39,8 @@
#include <sys/sdt.h>
#include <sys/callo.h>
+clock_t cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, hrtime_t, int);
+
/*
* CV_MAX_WAITERS is the maximum number of waiters we track; once
* the number becomes higher than that, we look at the sleepq to
@@ -221,19 +223,34 @@ cv_wakeup(void *arg)
clock_t
cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t tim)
{
+ hrtime_t hrtim;
+
+ if (tim <= lbolt)
+ return (-1);
+
+ hrtim = TICK_TO_NSEC(tim - lbolt);
+ return (cv_timedwait_hires(cvp, mp, hrtim, nsec_per_tick, 0));
+}
+
+clock_t
+cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
+{
kthread_t *t = curthread;
callout_id_t id;
clock_t timeleft;
+ hrtime_t limit;
int signalled;
if (panicstr)
return (-1);
- timeleft = tim - lbolt;
- if (timeleft <= 0)
+ limit = (flag & CALLOUT_FLAG_ABSOLUTE) ? gethrtime() : 0;
+ if (tim <= limit)
return (-1);
mutex_enter(&t->t_wait_mutex);
- id = realtime_timeout_default((void (*)(void *))cv_wakeup, t, timeleft);
+ id = timeout_generic(CALLOUT_REALTIME, (void (*)(void *))cv_wakeup, t,
+ tim, res, flag);
thread_lock(t); /* lock the thread */
cv_block((condvar_impl_t *)cvp);
thread_unlock_nopreempt(t);
@@ -315,7 +332,8 @@ cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp)
}
static clock_t
-cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag)
+cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
+ hrtime_t res, int flag)
{
kthread_t *t = curthread;
proc_t *p = ttoproc(t);
@@ -323,16 +341,9 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag)
int cancel_pending = 0;
callout_id_t id;
clock_t rval = 1;
- clock_t timeleft;
+ hrtime_t limit;
int signalled = 0;
- /*
- * If the flag is 0, then realtime_timeout() below creates a
- * regular realtime timeout. If the flag is CALLOUT_FLAG_HRESTIME,
- * then, it creates a special realtime timeout which is affected by
- * changes to hrestime. See callo.h for details.
- */
- ASSERT((flag == 0) || (flag == CALLOUT_FLAG_HRESTIME));
if (panicstr)
return (rval);
@@ -342,17 +353,17 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag)
* that has not yet unpinned the thread underneath.
*/
if (lwp == NULL || t->t_intr)
- return (cv_timedwait(cvp, mp, tim));
+ return (cv_timedwait_hires(cvp, mp, tim, res, flag));
/*
- * If tim is less than or equal to lbolt, then the timeout
+	 * If tim is less than or equal to the current hrtime, then the timeout
* has already occured. So just check to see if there is a signal
* pending. If so return 0 indicating that there is a signal pending.
* Else return -1 indicating that the timeout occured. No need to
* wait on anything.
*/
- timeleft = tim - lbolt;
- if (timeleft <= 0) {
+ limit = (flag & CALLOUT_FLAG_ABSOLUTE) ? gethrtime() : 0;
+ if (tim <= limit) {
lwp->lwp_asleep = 1;
lwp->lwp_sysabort = 0;
rval = -1;
@@ -365,7 +376,7 @@ cv_timedwait_sig_internal(kcondvar_t *cvp, kmutex_t *mp, clock_t tim, int flag)
cancel_pending = schedctl_cancel_pending();
mutex_enter(&t->t_wait_mutex);
id = timeout_generic(CALLOUT_REALTIME, (void (*)(void *))cv_wakeup, t,
- TICK_TO_NSEC(timeleft), nsec_per_tick, flag);
+ tim, res, flag);
lwp->lwp_asleep = 1;
lwp->lwp_sysabort = 0;
thread_lock(t);
@@ -427,12 +438,15 @@ out:
*
* cv_timedwait_sig() is now part of the DDI.
*
- * This function is now just a wrapper for cv_timedwait_sig_internal().
+ * This function is now just a wrapper for cv_timedwait_sig_hires().
*/
clock_t
cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t tim)
{
- return (cv_timedwait_sig_internal(cvp, mp, tim, 0));
+ hrtime_t hrtim;
+
+ hrtim = TICK_TO_NSEC(tim - lbolt);
+ return (cv_timedwait_sig_hires(cvp, mp, hrtim, nsec_per_tick, 0));
}
/*
@@ -680,6 +694,7 @@ cv_waituntil_sig(kcondvar_t *cvp, kmutex_t *mp,
{
timestruc_t now;
timestruc_t delta;
+ hrtime_t interval;
int rval;
if (when == NULL)
@@ -694,14 +709,19 @@ cv_waituntil_sig(kcondvar_t *cvp, kmutex_t *mp,
* Call cv_timedwait_sig() just to check for signals.
* We will return immediately with either 0 or -1.
*/
- rval = cv_timedwait_sig(cvp, mp, lbolt);
+ rval = cv_timedwait_sig_hires(cvp, mp, 0, 1, 0);
} else {
- gethrestime_lasttick(&now);
if (timecheck == timechanged) {
- rval = cv_timedwait_sig_internal(cvp, mp,
- lbolt + timespectohz(when, now),
+ /*
+			 * Make sure that the interval is at least one tick.
+ * This is to prevent a user from flooding the system
+ * with very small, high resolution timers.
+ */
+ interval = ts2hrt(&delta);
+ if (interval < nsec_per_tick)
+ interval = nsec_per_tick;
+ rval = cv_timedwait_sig_hires(cvp, mp, interval, 1,
CALLOUT_FLAG_HRESTIME);
-
} else {
/*
* Someone reset the system time;
diff --git a/usr/src/uts/common/sys/callo.h b/usr/src/uts/common/sys/callo.h
index 2b76fe62a8..6a464f9dd9 100644
--- a/usr/src/uts/common/sys/callo.h
+++ b/usr/src/uts/common/sys/callo.h
@@ -81,17 +81,20 @@ typedef struct callout {
* returned. In such cases, a default generation number of 0 is assigned to
* the legacy IDs.
*
- * The lower 32-bit ID space is partitioned into two spaces - one for 32-bit
- * IDs and the other for 64-bit IDs. The 32-bit ID space is further divided
- * into two spaces - one for short-term callouts and one for long-term.
+ * The lower 32-bit ID space is partitioned into two spaces - one for
+ * short-term callouts and one for long-term.
*
* Here is the bit layout for the callout ID:
*
- * 63 62 ... 32 31 30 29 .. X+1 X ... 1 0
- * ----------------------------------------------------------------
- * | Exec | Generation | Long | Counter | ID bits | Table | Type |
- * | | number | term | High | | number | |
- * ----------------------------------------------------------------
+ * 63 62 61 ... 32 31 30 29 .. X+1 X ... 1 0
+ * -----------------------------------------------------------------------
+ * | Free | Exec | Generation | Long | Counter | ID bits | Table | Type |
+ * | | | number | term | High | | number | |
+ * -----------------------------------------------------------------------
+ *
+ * Free:
+ * This bit indicates that this callout has been freed. This is for
+ * debugging purposes.
*
* Exec(uting):
* This is the executing bit which is only set in the extended callout
@@ -135,8 +138,10 @@ typedef struct callout {
* This bit represents the callout (table) type. Each CPU has one realtime
* and one normal callout table.
*/
-#define CALLOUT_EXECUTING 0x8000000000000000ULL
-#define CALLOUT_ID_MASK ~(CALLOUT_EXECUTING)
+#define CALLOUT_FREE 0x8000000000000000ULL
+#define CALLOUT_EXECUTING 0x4000000000000000ULL
+#define CALLOUT_ID_FLAGS (CALLOUT_FREE | CALLOUT_EXECUTING)
+#define CALLOUT_ID_MASK ~CALLOUT_ID_FLAGS
#define CALLOUT_GENERATION_LOW 0x100000000ULL
#define CALLOUT_LONGTERM 0x80000000
#define CALLOUT_COUNTER_HIGH 0x40000000
@@ -178,7 +183,7 @@ typedef struct callout {
#define CALLOUT_LONG_ID(table) \
(CALLOUT_SHORT_ID(table) | CALLOUT_LONGTERM)
-#define CALLOUT_THREADS 2 /* keep it simple for now */
+#define CALLOUT_THREADS 2
#define CALLOUT_REALTIME 0 /* realtime callout type */
#define CALLOUT_NORMAL 1 /* normal callout type */
@@ -213,6 +218,21 @@ typedef struct callout_hash {
void *ch_tail;
} callout_hash_t;
+/*
+ * CALLOUT_LIST_FLAG_FREE
+ * Callout list is free.
+ * CALLOUT_LIST_FLAG_ABSOLUTE
+ * Callout list contains absolute timers.
+ * CALLOUT_LIST_FLAG_HRESTIME
+ * Callout list contains hrestime timers.
+ * CALLOUT_LIST_FLAG_NANO
+ * Callout list contains 1-nanosecond resolution callouts.
+ */
+#define CALLOUT_LIST_FLAG_FREE 0x1
+#define CALLOUT_LIST_FLAG_ABSOLUTE 0x2
+#define CALLOUT_LIST_FLAG_HRESTIME 0x4
+#define CALLOUT_LIST_FLAG_NANO 0x8
+
struct callout_list {
callout_list_t *cl_next; /* next in clhash */
callout_list_t *cl_prev; /* prev in clhash */
@@ -222,6 +242,29 @@ struct callout_list {
};
/*
+ * Callout heap element. Each element in the heap stores the expiration
+ * as well as the corresponding callout list. This is to avoid a lookup
+ * of the callout list when the heap is processed. Because we store the
+ * callout list pointer in the heap element, we have to always remove
+ * a heap element and its callout list together. We cannot remove one
+ * without the other.
+ */
+typedef struct callout_heap {
+ hrtime_t ch_expiration;
+ callout_list_t *ch_list;
+} callout_heap_t;
+
+/*
+ * When the heap contains too many empty callout lists, it needs to be
+ * cleaned up. The decision to clean up the heap is a function of the
+ * number of empty entries and the heap size. Also, we don't want to
+ * clean up small heaps.
+ */
+#define CALLOUT_MIN_REAP (CALLOUT_BUCKETS >> 3)
+#define CALLOUT_CLEANUP(ct) ((ct->ct_nreap >= callout_min_reap) && \
+ (ct->ct_nreap >= (ct->ct_heap_num >> 1)))
+
+/*
* Per-callout table kstats.
*
* CALLOUT_TIMEOUTS
@@ -240,6 +283,8 @@ struct callout_list {
* Number of callouts that expired.
* CALLOUT_ALLOCATIONS
* Number of callout structures allocated.
+ * CALLOUT_CLEANUPS
+ * Number of times a callout table is cleaned up.
*/
typedef enum callout_stat_type {
CALLOUT_TIMEOUTS,
@@ -249,6 +294,7 @@ typedef enum callout_stat_type {
CALLOUT_UNTIMEOUTS_EXPIRED,
CALLOUT_EXPIRATIONS,
CALLOUT_ALLOCATIONS,
+ CALLOUT_CLEANUPS,
CALLOUT_NUM_STATS
} callout_stat_type_t;
@@ -277,7 +323,6 @@ typedef enum callout_stat_type {
#define CALLOUT_FLAG_HRESTIME 0x4
#define CALLOUT_FLAG_32BIT 0x8
-#define CALLOUT_LIST_FLAGS (CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_HRESTIME)
/*
* On 32-bit systems, the legacy interfaces, timeout() and realtime_timeout(),
* must pass CALLOUT_FLAG_32BIT to timeout_generic() so that a 32-bit ID
@@ -306,7 +351,7 @@ typedef struct callout_table {
uint_t ct_type; /* callout table type */
uint_t ct_suspend; /* suspend count */
cyclic_id_t ct_cyclic; /* cyclic for this table */
- hrtime_t *ct_heap; /* callout expiration heap */
+ callout_heap_t *ct_heap; /* callout expiration heap */
ulong_t ct_heap_num; /* occupied slots in the heap */
ulong_t ct_heap_max; /* end of the heap */
kmem_cache_t *ct_cache; /* callout kmem cache */
@@ -316,10 +361,11 @@ typedef struct callout_table {
callout_hash_t ct_expired; /* list of expired callout lists */
taskq_t *ct_taskq; /* taskq to execute normal callouts */
kstat_t *ct_kstats; /* callout kstats */
+ int ct_nreap; /* # heap entries that need reaping */
#ifdef _LP64
- ulong_t ct_pad[4]; /* cache alignment */
+ char ct_pad[28]; /* cache alignment */
#else
- ulong_t ct_pad[7]; /* cache alignment */
+ char ct_pad[24]; /* cache alignment */
#endif
} callout_table_t;
@@ -340,6 +386,8 @@ typedef struct callout_table {
ct_kstat_data[CALLOUT_EXPIRATIONS].value.ui64
#define ct_allocations \
ct_kstat_data[CALLOUT_ALLOCATIONS].value.ui64
+#define ct_cleanups \
+ ct_kstat_data[CALLOUT_CLEANUPS].value.ui64
#define CALLOUT_CHUNK 128
@@ -350,12 +398,6 @@ typedef struct callout_table {
#define CALLOUT_CYCLIC_HANDLER(t) \
((t == CALLOUT_REALTIME) ? callout_realtime : callout_normal)
-/*
- * We define a blanket minimum resolution for callouts of 1 millisecond.
- * 1 millisecond is a safe value as it is already supported when the clock
- * resolution is set to high.
- */
-#define CALLOUT_MIN_RESOLUTION 1000000ULL
#define CALLOUT_TCP_RESOLUTION 10000000ULL
#define CALLOUT_ALIGN 64 /* cache line size */
@@ -366,6 +408,8 @@ typedef struct callout_table {
#define CALLOUT_MAX_TICKS LONG_MAX
#endif
+#define CALLOUT_TOLERANCE 200000 /* nanoseconds */
+
extern void callout_init(void);
extern void membar_sync(void);
extern void callout_cpu_online(cpu_t *);
diff --git a/usr/src/uts/common/syscall/lwp_timer.c b/usr/src/uts/common/syscall/lwp_timer.c
index 134e42b06e..7d4592bbcb 100644
--- a/usr/src/uts/common/syscall/lwp_timer.c
+++ b/usr/src/uts/common/syscall/lwp_timer.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -55,7 +55,7 @@ lwp_timer_timeout(void *arg)
{
lwp_timer_t *lwptp = arg;
kthread_t *t = lwptp->lwpt_thread;
- timespec_t now;
+ timespec_t now, delta;
mutex_enter(&t->t_delay_lock);
gethrestime(&now);
@@ -68,10 +68,11 @@ lwp_timer_timeout(void *arg)
(lwptp->lwpt_rqtime.tv_sec == now.tv_sec &&
lwptp->lwpt_rqtime.tv_nsec > now.tv_nsec))) {
lwptp->lwpt_imm_timeout = 0;
+ delta = lwptp->lwpt_rqtime;
+ timespecsub(&delta, &now);
lwptp->lwpt_id = timeout_generic(CALLOUT_REALTIME,
- lwp_timer_timeout, lwptp,
- TICK_TO_NSEC(timespectohz(&lwptp->lwpt_rqtime, now)),
- nsec_per_tick, CALLOUT_FLAG_HRESTIME);
+ lwp_timer_timeout, lwptp, ts2hrt(&delta), nsec_per_tick,
+ (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ROUNDUP));
} else {
/*
* Set the thread running only if it is asleep on
@@ -144,7 +145,7 @@ err:
int
lwp_timer_enqueue(lwp_timer_t *lwptp)
{
- timespec_t now;
+ timespec_t now, delta;
ASSERT(lwptp->lwpt_thread == curthread);
ASSERT(MUTEX_HELD(&curthread->t_delay_lock));
@@ -157,10 +158,11 @@ lwp_timer_enqueue(lwp_timer_t *lwptp)
* Queue the timeout.
*/
lwptp->lwpt_imm_timeout = 0;
+ delta = lwptp->lwpt_rqtime;
+ timespecsub(&delta, &now);
lwptp->lwpt_id = timeout_generic(CALLOUT_REALTIME,
- lwp_timer_timeout, lwptp,
- TICK_TO_NSEC(timespectohz(&lwptp->lwpt_rqtime, now)),
- nsec_per_tick, CALLOUT_FLAG_HRESTIME);
+ lwp_timer_timeout, lwptp, ts2hrt(&delta), nsec_per_tick,
+ (CALLOUT_FLAG_HRESTIME | CALLOUT_FLAG_ROUNDUP));
return (0);
}