summaryrefslogtreecommitdiff
path: root/lib/dns/rbtdb.c
diff options
context:
space:
mode:
Diffstat (limited to 'lib/dns/rbtdb.c')
-rw-r--r--lib/dns/rbtdb.c208
1 files changed, 170 insertions, 38 deletions
diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c
index 1019cd44..9741c157 100644
--- a/lib/dns/rbtdb.c
+++ b/lib/dns/rbtdb.c
@@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
-/* $Id: rbtdb.c,v 1.270.12.4 2009/03/05 04:57:40 marka Exp $ */
+/* $Id: rbtdb.c,v 1.270.12.6 2009/05/06 23:34:30 jinmei Exp $ */
/*! \file */
@@ -344,7 +344,7 @@ struct acachectl {
*/
#ifdef DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#if DNS_RBTDB_CACHE_NODE_LOCK_COUNT <= 1
-#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger 1"
+#error "DNS_RBTDB_CACHE_NODE_LOCK_COUNT must be larger than 1"
#else
#define DEFAULT_CACHE_NODE_LOCK_COUNT DNS_RBTDB_CACHE_NODE_LOCK_COUNT
#endif
@@ -533,6 +533,7 @@ static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
isc_stdtime_t now, isc_boolean_t tree_locked);
static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
rdatasetheader_t *newheader);
+static void prune_tree(isc_task_t *task, isc_event_t *event);
static dns_rdatasetmethods_t rdataset_methods = {
rdataset_disassociate,
@@ -1551,7 +1552,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
/*
* This function is assumed to be called when a node is newly referenced
* and can be in the deadnode list. In that case the node must be retrieved
- * from the list because the it is going to be used. In addition, if the caller
+ * from the list because it is going to be used. In addition, if the caller
* happens to hold a write lock on the tree, it's a good chance to purge dead
* nodes.
* Note: while a new reference is gained in multiple places, there are only very
@@ -1604,13 +1605,15 @@ reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
static isc_boolean_t
decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
rbtdb_serial_t least_serial,
- isc_rwlocktype_t nlock, isc_rwlocktype_t tlock)
+ isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
+ isc_boolean_t pruning)
{
isc_result_t result;
isc_boolean_t write_locked;
rbtdb_nodelock_t *nodelock;
unsigned int refs, nrefs;
int bucket = node->locknum;
+ isc_boolean_t no_reference;
nodelock = &rbtdb->node_locks[bucket];
@@ -1693,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
} else
write_locked = ISC_TRUE;
+ no_reference = ISC_TRUE;
if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
/*
* We can now delete the node if the reference counter is
@@ -1701,35 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
* current thread locks the tree (e.g., in findnode()).
*/
- if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
- char printname[DNS_NAME_FORMATSIZE];
+ /*
+ * If this node is the only one in the level it's in, deleting
+ * this node may recursively make its parent the only node in
+ * the parent level; if so, and if no one is currently using
+ * the parent node, this is almost the only opportunity to
+ * clean it up. But the recursive cleanup is not that trivial
+ * since the child and parent may be in different lock buckets,
+ * which would cause a lock order reversal problem. To avoid
+ * the trouble, we'll dispatch a separate event for batch
+ * cleaning. We need to check whether we're deleting the node
+ * as a result of pruning to avoid infinite dispatching.
+ * Note: pruning happens only when a task has been set for the
+ * rbtdb. If the user of the rbtdb chooses not to set a task,
+ * it's their responsibility to purge stale leaves (e.g. by
+ * periodic walk-through).
+ */
+ if (!pruning && node->parent != NULL &&
+ node->parent->down == node && node->left == NULL &&
+ node->right == NULL && rbtdb->task != NULL) {
+ isc_event_t *ev;
+ dns_db_t *db;
+
+ ev = isc_event_allocate(rbtdb->common.mctx, NULL,
+ DNS_EVENT_RBTPRUNE,
+ prune_tree, node,
+ sizeof(isc_event_t));
+ if (ev != NULL) {
+ new_reference(rbtdb, node);
+ db = NULL;
+ attach((dns_db_t *)rbtdb, &db);
+ ev->ev_sender = db;
+ isc_task_send(rbtdb->task, &ev);
+ no_reference = ISC_FALSE;
+ } else {
+ /*
+ * XXX: this is a weird situation. We could
+ * ignore this error case, but then the stale
+ * node will unlikely be purged except via a
+ * rare condition such as manual cleanup. So
+ * we queue it in the deadnodes list, hoping
+ * the memory shortage is temporary and the node
+ * will be deleted later.
+ */
+ isc_log_write(dns_lctx,
+ DNS_LOGCATEGORY_DATABASE,
+ DNS_LOGMODULE_CACHE,
+ ISC_LOG_INFO,
+ "decrement_reference: failed to "
+ "allocate pruning event");
+ INSIST(!ISC_LINK_LINKED(node, deadlink));
+ ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
+ deadlink);
+ }
+ } else {
+ if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
+ char printname[DNS_NAME_FORMATSIZE];
+
+ isc_log_write(dns_lctx,
+ DNS_LOGCATEGORY_DATABASE,
+ DNS_LOGMODULE_CACHE,
+ ISC_LOG_DEBUG(1),
+ "decrement_reference: "
+ "delete from rbt: %p %s",
+ node,
+ dns_rbt_formatnodename(node,
+ printname,
+ sizeof(printname)));
+ }
- isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
- DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
- "decrement_reference: "
- "delete from rbt: %p %s",
- node,
- dns_rbt_formatnodename(node, printname,
- sizeof(printname)));
+ INSIST(!ISC_LINK_LINKED(node, deadlink));
+ if (node->nsec3)
+ result = dns_rbt_deletenode(rbtdb->nsec3, node,
+ ISC_FALSE);
+ else
+ result = dns_rbt_deletenode(rbtdb->tree, node,
+ ISC_FALSE);
+ if (result != ISC_R_SUCCESS) {
+ isc_log_write(dns_lctx,
+ DNS_LOGCATEGORY_DATABASE,
+ DNS_LOGMODULE_CACHE,
+ ISC_LOG_WARNING,
+ "decrement_reference: "
+ "dns_rbt_deletenode: %s",
+ isc_result_totext(result));
+ }
}
-
- INSIST(!ISC_LINK_LINKED(node, deadlink));
- if (node->nsec3)
- result = dns_rbt_deletenode(rbtdb->nsec3, node,
- ISC_FALSE);
- else
- result = dns_rbt_deletenode(rbtdb->tree, node,
- ISC_FALSE);
- if (result != ISC_R_SUCCESS)
- isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
- DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
- "decrement_reference: "
- "dns_rbt_deletenode: %s",
- isc_result_totext(result));
} else if (dns_rbtnode_refcurrent(node) == 0) {
INSIST(!ISC_LINK_LINKED(node, deadlink));
ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
- }
+ } else
+ no_reference = ISC_FALSE;
/* Restore the lock? */
if (nlock == isc_rwlocktype_read)
@@ -1746,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
if (write_locked)
isc_rwlock_downgrade(&rbtdb->tree_lock);
- return (ISC_TRUE);
+ return (no_reference);
+}
+
+/*
+ * Prune the tree by recursively cleaning-up single leaves. In the worst
+ * case, the number of iteration is the number of tree levels, which is at
+ * most the maximum number of domain name labels, i.e, 127. In practice, this
+ * should be much smaller (only a few times), and even the worst case would be
+ * acceptable for a single event.
+ */
+static void
+prune_tree(isc_task_t *task, isc_event_t *event) {
+ dns_rbtdb_t *rbtdb = event->ev_sender;
+ dns_rbtnode_t *node = event->ev_arg;
+ dns_rbtnode_t *parent;
+ unsigned int locknum;
+
+ UNUSED(task);
+
+ isc_event_free(&event);
+
+ RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
+ locknum = node->locknum;
+ NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+ do {
+ parent = node->parent;
+ decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
+ isc_rwlocktype_write, ISC_TRUE);
+
+ if (parent != NULL && parent->down == NULL) {
+ /*
+ * node was the only down child of the parent and has
+ * just been removed. We'll then need to examine the
+ * parent. Keep the lock if possible; otherwise,
+ * release the old lock and acquire one for the parent.
+ */
+ if (parent->locknum != locknum) {
+ NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ locknum = parent->locknum;
+ NODE_LOCK(&rbtdb->node_locks[locknum].lock,
+ isc_rwlocktype_write);
+ }
+
+ /*
+ * We need to gain a reference to the node before
+ * decrementing it in the next iteration. In addition,
+ * if the node is in the dead-nodes list, extract it
+ * from the list beforehand as we do in
+ * reactivate_node().
+ */
+ new_reference(rbtdb, parent);
+ if (ISC_LINK_LINKED(parent, deadlink)) {
+ ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
+ parent, deadlink);
+ }
+ } else
+ parent = NULL;
+
+ node = parent;
+ } while (node != NULL);
+ NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
+ RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
+
+ detach((dns_db_t **)&rbtdb);
}
static inline void
@@ -2163,8 +2293,8 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
NODE_UNLOCK(lock, isc_rwlocktype_write);
}
decrement_reference(rbtdb, header->node, least_serial,
- isc_rwlocktype_write,
- isc_rwlocktype_none);
+ isc_rwlocktype_write, isc_rwlocktype_none,
+ ISC_FALSE);
}
if (!EMPTY(cleanup_list)) {
@@ -2197,7 +2327,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
rollback_node(rbtnode, serial);
decrement_reference(rbtdb, rbtnode, least_serial,
isc_rwlocktype_write,
- isc_rwlocktype_write);
+ isc_rwlocktype_write, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_write);
@@ -3747,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(search.rbtdb, node, 0,
- isc_rwlocktype_read, isc_rwlocktype_none);
+ isc_rwlocktype_read, isc_rwlocktype_none,
+ ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@@ -4525,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(search.rbtdb, node, 0,
- isc_rwlocktype_read, isc_rwlocktype_none);
+ isc_rwlocktype_read, isc_rwlocktype_none,
+ ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@@ -4743,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
- isc_rwlocktype_none)) {
+ isc_rwlocktype_none, ISC_FALSE)) {
if (isc_refcount_current(&nodelock->references) == 0 &&
nodelock->exiting) {
inactive = ISC_TRUE;
@@ -7469,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
lock = &rbtdb->node_locks[node->locknum].lock;
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
- rbtdbiter->tree_locked);
+ rbtdbiter->tree_locked, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
rbtdbiter->node = NULL;
@@ -7510,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(rbtdb, node, 0,
isc_rwlocktype_read,
- rbtdbiter->tree_locked);
+ rbtdbiter->tree_locked, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@@ -8446,6 +8578,6 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
decrement_reference(rbtdb, header->node, 0,
isc_rwlocktype_write,
tree_locked ? isc_rwlocktype_write :
- isc_rwlocktype_none);
+ isc_rwlocktype_none, ISC_FALSE);
}
}