diff options
Diffstat (limited to 'lib/dns/rbtdb.c')
-rw-r--r-- | lib/dns/rbtdb.c | 208 |
1 files changed, 170 insertions, 38 deletions
diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 1019cd44..9741c157 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.270.12.4 2009/03/05 04:57:40 marka Exp $ */ +/* $Id: rbtdb.c,v 1.270.12.6 2009/05/06 23:34:30 jinmei Exp $ */ /*! \file */ @@ -344,7 +344,7 @@ struct acachectl { */ #ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT #if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1 -#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger 1" +#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1" #else #define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT #endif @@ -533,6 +533,7 @@ static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, isc_stdtime_t now, isc_boolean_t tree_locked); static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader); +static void prune_tree(isc_task_t *task, isc_event_t *event); static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate, @@ -1551,7 +1552,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { /* * This function is assumed to be called when a node is newly referenced * and can be in the deadnode list. In that case the node must be retrieved - * from the list because the it is going to be used. In addition, if the caller + * from the list because it is going to be used. In addition, if the caller * happens to hold a write lock on the tree, it's a good chance to purge dead * nodes. * Note: while a new reference is gained in multiple places, there are only very @@ -1604,13 +1605,15 @@ reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, static isc_boolean_t decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rbtdb_serial_t least_serial, - isc_rwlocktype_t nlock, isc_rwlocktype_t tlock) + isc_rwlocktype_t nlock, isc_rwlocktype_t tlock, + isc_boolean_t pruning) { isc_result_t result; isc_boolean_t write_locked; rbtdb_nodelock_t *nodelock; unsigned int refs, nrefs; int bucket = node->locknum; + isc_boolean_t no_reference; nodelock = &rbtdb->node_locks[bucket]; @@ -1693,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, } else write_locked = ISC_TRUE; + no_reference = ISC_TRUE; if (write_locked && dns_rbtnode_refcurrent(node) == 0) { /* * We can now delete the node if the reference counter is @@ -1701,35 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * current thread locks the tree (e.g., in findnode()). */ - if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { - char printname[DNS_NAME_FORMATSIZE]; + /* + * If this node is the only one in the level it's in, deleting + * this node may recursively make its parent the only node in + * the parent level; if so, and if no one is currently using + * the parent node, this is almost the only opportunity to + * clean it up. But the recursive cleanup is not that trivial + * since the child and parent may be in different lock buckets, + * which would cause a lock order reversal problem. To avoid + * the trouble, we'll dispatch a separate event for batch + * cleaning. We need to check whether we're deleting the node + * as a result of pruning to avoid infinite dispatching. + * Note: pruning happens only when a task has been set for the + * rbtdb. If the user of the rbtdb chooses not to set a task, + * it's their responsibility to purge stale leaves (e.g. by + * periodic walk-through). + */ + if (!pruning && node->parent != NULL && + node->parent->down == node && node->left == NULL && + node->right == NULL && rbtdb->task != NULL) { + isc_event_t *ev; + dns_db_t *db; + + ev = isc_event_allocate(rbtdb->common.mctx, NULL, + DNS_EVENT_RBTPRUNE, + prune_tree, node, + sizeof(isc_event_t)); + if (ev != NULL) { + new_reference(rbtdb, node); + db = NULL; + attach((dns_db_t *)rbtdb, &db); + ev->ev_sender = db; + isc_task_send(rbtdb->task, &ev); + no_reference = ISC_FALSE; + } else { + /* + * XXX: this is a weird situation. We could + * ignore this error case, but then the stale + * node will unlikely be purged except via a + * rare condition such as manual cleanup. So + * we queue it in the deadnodes list, hoping + * the memory shortage is temporary and the node + * will be deleted later. + */ + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_INFO, + "decrement_reference: failed to " + "allocate pruning event"); + INSIST(!ISC_LINK_LINKED(node, deadlink)); + ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, + deadlink); + } + } else { + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { + char printname[DNS_NAME_FORMATSIZE]; + + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_DEBUG(1), + "decrement_reference: " + "delete from rbt: %p %s", + node, + dns_rbt_formatnodename(node, + printname, + sizeof(printname))); + } - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), - "decrement_reference: " - "delete from rbt: %p %s", - node, - dns_rbt_formatnodename(node, printname, - sizeof(printname))); + INSIST(!ISC_LINK_LINKED(node, deadlink)); + if (node->nsec3) + result = dns_rbt_deletenode(rbtdb->nsec3, node, + ISC_FALSE); + else + result = dns_rbt_deletenode(rbtdb->tree, node, + ISC_FALSE); + if (result != ISC_R_SUCCESS) { + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_WARNING, + "decrement_reference: " + "dns_rbt_deletenode: %s", + isc_result_totext(result)); + } } - - INSIST(!ISC_LINK_LINKED(node, deadlink)); - if (node->nsec3) - result = dns_rbt_deletenode(rbtdb->nsec3, node, - ISC_FALSE); - else - result = dns_rbt_deletenode(rbtdb->tree, node, - ISC_FALSE); - if (result != ISC_R_SUCCESS) - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, - "decrement_reference: " - "dns_rbt_deletenode: %s", - isc_result_totext(result)); } else if (dns_rbtnode_refcurrent(node) == 0) { INSIST(!ISC_LINK_LINKED(node, deadlink)); ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink); - } + } else + no_reference = ISC_FALSE; /* Restore the lock? */ if (nlock == isc_rwlocktype_read) @@ -1746,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (write_locked) isc_rwlock_downgrade(&rbtdb->tree_lock); - return (ISC_TRUE); + return (no_reference); +} + +/* + * Prune the tree by recursively cleaning-up single leaves. In the worst + * case, the number of iteration is the number of tree levels, which is at + * most the maximum number of domain name labels, i.e, 127. In practice, this + * should be much smaller (only a few times), and even the worst case would be + * acceptable for a single event. + */ +static void +prune_tree(isc_task_t *task, isc_event_t *event) { + dns_rbtdb_t *rbtdb = event->ev_sender; + dns_rbtnode_t *node = event->ev_arg; + dns_rbtnode_t *parent; + unsigned int locknum; + + UNUSED(task); + + isc_event_free(&event); + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + locknum = node->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + do { + parent = node->parent; + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, ISC_TRUE); + + if (parent != NULL && parent->down == NULL) { + /* + * node was the only down child of the parent and has + * just been removed. We'll then need to examine the + * parent. Keep the lock if possible; otherwise, + * release the old lock and acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + locknum = parent->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } + + /* + * We need to gain a reference to the node before + * decrementing it in the next iteration. In addition, + * if the node is in the dead-nodes list, extract it + * from the list beforehand as we do in + * reactivate_node(). + */ + new_reference(rbtdb, parent); + if (ISC_LINK_LINKED(parent, deadlink)) { + ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], + parent, deadlink); + } + } else + parent = NULL; + + node = parent; + } while (node != NULL); + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + + detach((dns_db_t **)&rbtdb); } static inline void @@ -2163,8 +2293,8 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { NODE_UNLOCK(lock, isc_rwlocktype_write); } decrement_reference(rbtdb, header->node, least_serial, - isc_rwlocktype_write, - isc_rwlocktype_none); + isc_rwlocktype_write, isc_rwlocktype_none, + ISC_FALSE); } if (!EMPTY(cleanup_list)) { @@ -2197,7 +2327,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { rollback_node(rbtnode, serial); decrement_reference(rbtdb, rbtnode, least_serial, isc_rwlocktype_write, - isc_rwlocktype_write); + isc_rwlocktype_write, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_write); @@ -3747,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -4525,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -4743,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { NODE_LOCK(&nodelock->lock, isc_rwlocktype_read); if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - isc_rwlocktype_none)) { + isc_rwlocktype_none, ISC_FALSE)) { if (isc_refcount_current(&nodelock->references) == 0 && nodelock->exiting) { inactive = ISC_TRUE; @@ -7469,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { lock = &rbtdb->node_locks[node->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); rbtdbiter->node = NULL; @@ -7510,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -8446,6 +8578,6 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, decrement_reference(rbtdb, header->node, 0, isc_rwlocktype_write, tree_locked ? isc_rwlocktype_write : - isc_rwlocktype_none); + isc_rwlocktype_none, ISC_FALSE); } } |