Diffstat (limited to 'db/btree.cpp')
-rw-r--r-- | db/btree.cpp | 1242
1 file changed, 905 insertions, 337 deletions
diff --git a/db/btree.cpp b/db/btree.cpp index d646de8..d547a1b 100644 --- a/db/btree.cpp +++ b/db/btree.cpp @@ -24,48 +24,92 @@ #include "clientcursor.h" #include "client.h" #include "dbhelpers.h" -#include "curop.h" +#include "curop-inl.h" #include "stats/counters.h" +#include "dur_commitjob.h" namespace mongo { #define VERIFYTHISLOC dassert( thisLoc.btree() == this ); + /** + * give us a writable version of the btree bucket (declares write intent). + * note it is likely more efficient to declare write intent on something smaller when you can. + */ + BtreeBucket* DiskLoc::btreemod() const { + assert( _a != -1 ); + BtreeBucket *b = const_cast< BtreeBucket * >( btree() ); + return static_cast< BtreeBucket* >( getDur().writingPtr( b, BucketSize ) ); + } + + _KeyNode& _KeyNode::writing() const { + return *getDur().writing( const_cast< _KeyNode* >( this ) ); + } + KeyNode::KeyNode(const BucketBasics& bb, const _KeyNode &k) : - prevChildBucket(k.prevChildBucket), - recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) + prevChildBucket(k.prevChildBucket), + recordLoc(k.recordLoc), key(bb.data+k.keyDataOfs()) { } - const int KeyMax = BucketSize / 10; + // largest key size we allow. note we very much need to support bigger keys (somehow) in the future. + static const int KeyMax = BucketSize / 10; + + // We define this value as the maximum number of bytes such that, if we have + // fewer than this many bytes, we must be able to either merge with or receive + // keys from any neighboring node. If our utilization goes below this value we + // know we can bring up the utilization with a simple operation. Ignoring the + // 90/10 split policy which is sometimes employed and our 'unused' nodes, this + // is a lower bound on bucket utilization for non root buckets. + // + // Note that the exact value here depends on the implementation of + // rebalancedSeparatorPos(). The conditions for lowWaterMark - 1 are as + // follows: We know we cannot merge with the neighbor, so the total data size + // for us, the neighbor, and the separator must be at least + // BtreeBucket::bodySize() + 1. We must be able to accept one key of any + // allowed size, so our size plus storage for that additional key must be + // <= BtreeBucket::bodySize() / 2. This way, with the extra key we'll have a + // new bucket data size < half the total data size and by the implementation + // of rebalancedSeparatorPos() the key must be added. + static const int lowWaterMark = BtreeBucket::bodySize() / 2 - KeyMax - sizeof( _KeyNode ) + 1; + + static const int split_debug = 0; + static const int insert_debug = 0; extern int otherTraceLevel; - const int split_debug = 0; - const int insert_debug = 0; - static void alreadyInIndex() { + /** + * this error is ok/benign when doing a background indexing -- that logic in pdfile checks explicitly + * for the 10287 error code. 
+ */ + static void alreadyInIndex() { // we don't use massert() here as that does logging and this is 'benign' - see catches in _indexRecord() throw MsgAssertionException(10287, "btree: key+recloc already in index"); } /* BucketBasics --------------------------------------------------- */ - inline void BucketBasics::modified(const DiskLoc& thisLoc) { - VERIFYTHISLOC - btreeStore->modified(thisLoc); + void BucketBasics::assertWritable() { + if( cmdLine.dur ) + dur::assertAlreadyDeclared(this, sizeof(*this)); + } + + string BtreeBucket::bucketSummary() const { + stringstream ss; + ss << " Bucket info:" << endl; + ss << " n: " << n << endl; + ss << " parent: " << parent.toString() << endl; + ss << " nextChild: " << parent.toString() << endl; + ss << " flags:" << flags << endl; + ss << " emptySize: " << emptySize << " topSize: " << topSize << endl; + return ss.str(); } int BucketBasics::Size() const { assert( _wasSize == BucketSize ); return BucketSize; } - inline void BucketBasics::setNotPacked() { - flags &= ~Packed; - } - inline void BucketBasics::setPacked() { - flags |= Packed; - } - void BucketBasics::_shape(int level, stringstream& ss) { + void BucketBasics::_shape(int level, stringstream& ss) const { for ( int i = 0; i < level; i++ ) ss << ' '; ss << "*\n"; for ( int i = 0; i < n; i++ ) @@ -78,13 +122,13 @@ namespace mongo { int bt_fv=0; int bt_dmp=0; - void BucketBasics::dumpTree(DiskLoc thisLoc, const BSONObj &order) { + void BtreeBucket::dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const { bt_dmp=1; fullValidate(thisLoc, order); bt_dmp=0; } - int BucketBasics::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount) { + int BtreeBucket::fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount, bool strict) const { { bool f = false; assert( f = true ); @@ -93,8 +137,6 @@ namespace mongo { killCurrentOp.checkForInterrupt(); assertValid(order, true); -// if( bt_fv==0 ) -// return; if ( bt_dmp ) { out() << thisLoc.toString() << ' '; @@ -105,26 +147,37 @@ namespace mongo { int kc = 0; for ( int i = 0; i < n; i++ ) { - _KeyNode& kn = k(i); + const _KeyNode& kn = k(i); if ( kn.isUsed() ) { kc++; - } else { + } + else { if ( unusedCount ) { ++( *unusedCount ); } } if ( !kn.prevChildBucket.isNull() ) { DiskLoc left = kn.prevChildBucket; - BtreeBucket *b = left.btree(); - wassert( b->parent == thisLoc ); - kc += b->fullValidate(kn.prevChildBucket, order, unusedCount); + const BtreeBucket *b = left.btree(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(kn.prevChildBucket, order, unusedCount, strict); } } if ( !nextChild.isNull() ) { - BtreeBucket *b = nextChild.btree(); - wassert( b->parent == thisLoc ); - kc += b->fullValidate(nextChild, order, unusedCount); + const BtreeBucket *b = nextChild.btree(); + if ( strict ) { + assert( b->parent == thisLoc ); + } + else { + wassert( b->parent == thisLoc ); + } + kc += b->fullValidate(nextChild, order, unusedCount, strict); } return kc; @@ -132,12 +185,20 @@ namespace mongo { int nDumped = 0; - void BucketBasics::assertValid(const Ordering &order, bool force) { + void BucketBasics::assertValid(const Ordering &order, bool force) const { if ( !debug && !force ) return; wassert( n >= 0 && n < Size() ); wassert( emptySize >= 0 && emptySize < BucketSize ); wassert( topSize >= n && topSize <= BucketSize ); + + // this is very slow so don't do often + { + static int _k; + if( ++_k % 128 ) + return; + } + DEV { // slow: for ( int 
i = 0; i < n-1; i++ ) { @@ -204,15 +265,16 @@ namespace mongo { reserved = 0; } - /* see _alloc */ + /** see _alloc */ inline void BucketBasics::_unalloc(int bytes) { topSize -= bytes; emptySize += bytes; } - /* we allocate space from the end of the buffer for data. - the keynodes grow from the front. - */ + /** + * we allocate space from the end of the buffer for data. + * the keynodes grow from the front. + */ inline int BucketBasics::_alloc(int bytes) { topSize += bytes; emptySize -= bytes; @@ -221,21 +283,23 @@ namespace mongo { return ofs; } - void BucketBasics::_delKeyAtPos(int keypos) { + void BucketBasics::_delKeyAtPos(int keypos, bool mayEmpty) { assert( keypos >= 0 && keypos <= n ); assert( childForPos(keypos).isNull() ); + // TODO audit cases where nextChild is null + assert( ( mayEmpty && n > 0 ) || n > 1 || nextChild.isNull() ); + emptySize += sizeof(_KeyNode); n--; - assert( n > 0 || nextChild.isNull() ); for ( int j = keypos; j < n; j++ ) k(j) = k(j+1); - emptySize += sizeof(_KeyNode); setNotPacked(); } - /* pull rightmost key from the bucket. this version requires its right child to be null so it - does not bother returning that value. - */ - void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { + /** + * pull rightmost key from the bucket. this version requires its right child to be null so it + * does not bother returning that value. + */ + void BucketBasics::popBack(DiskLoc& recLoc, BSONObj& key) { massert( 10282 , "n==0 in btree popBack()", n > 0 ); assert( k(n-1).isUsed() ); // no unused skipping in this function at this point - btreebuilder doesn't require that KeyNode kn = keyNode(n-1); @@ -243,18 +307,18 @@ namespace mongo { key = kn.key; int keysize = kn.key.objsize(); - massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); + massert( 10283 , "rchild not null in btree popBack()", nextChild.isNull()); - /* weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. */ - nextChild = kn.prevChildBucket; + // weirdly, we also put the rightmost down pointer in nextchild, even when bucket isn't full. + nextChild = kn.prevChildBucket; n--; emptySize += sizeof(_KeyNode); _unalloc(keysize); } - /* add a key. must be > all existing. be careful to set next ptr right. */ - bool BucketBasics::_pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild) { + /** add a key. must be > all existing. be careful to set next ptr right. */ + bool BucketBasics::_pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) { int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) return false; @@ -269,38 +333,96 @@ namespace mongo { memcpy(p, key.objdata(), key.objsize()); return true; } - /*void BucketBasics::pushBack(const DiskLoc& recordLoc, BSONObj& key, const BSONObj &order, DiskLoc prevChild, DiskLoc nextChild) { - pushBack(recordLoc, key, order, prevChild); - childForPos(n) = nextChild; - }*/ - /* insert a key in a bucket with no complexity -- no splits required */ - bool BucketBasics::basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order) { - modified(thisLoc); + /* durability note + we do separate intent declarations herein. arguably one could just declare + the whole bucket given we do group commits. this is something we could investigate + later as to what is faster under what situations. 
+ */ + /** insert a key in a bucket with no complexity -- no splits required + @return false if a split is required. + */ + bool BucketBasics::basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const { assert( keypos >= 0 && keypos <= n ); int bytesNeeded = key.objsize() + sizeof(_KeyNode); if ( bytesNeeded > emptySize ) { - pack( order, keypos ); + _pack(thisLoc, order, keypos); if ( bytesNeeded > emptySize ) return false; } - for ( int j = n; j > keypos; j-- ) // make room - k(j) = k(j-1); - n++; - emptySize -= sizeof(_KeyNode); - _KeyNode& kn = k(keypos); + + BucketBasics *b; + { + const char *p = (const char *) &k(keypos); + const char *q = (const char *) &k(n+1); + // declare that we will write to [k(keypos),k(n)] + // todo: this writes a medium amount to the journal. we may want to add a verb "shift" to the redo log so + // we can log a very small amount. + b = (BucketBasics*) getDur().writingAtOffset((void *) this, p-(char*)this, q-p); + + // e.g. n==3, keypos==2 + // 1 4 9 + // -> + // 1 4 _ 9 + for ( int j = n; j > keypos; j-- ) // make room + b->k(j) = b->k(j-1); + } + + getDur().declareWriteIntent(&b->emptySize, 12); // [b->emptySize..b->n] is 12 bytes and we are going to write those + b->emptySize -= sizeof(_KeyNode); + b->n++; + + _KeyNode& kn = b->k(keypos); kn.prevChildBucket.Null(); kn.recordLoc = recordLoc; - kn.setKeyDataOfs((short) _alloc(key.objsize()) ); - char *p = dataAt(kn.keyDataOfs()); + kn.setKeyDataOfs((short) b->_alloc(key.objsize()) ); + char *p = b->dataAt(kn.keyDataOfs()); + getDur().declareWriteIntent(p, key.objsize()); memcpy(p, key.objdata(), key.objsize()); return true; } - /* when we delete things we just leave empty space until the node is - full and then we repack it. - */ - void BucketBasics::pack( const Ordering &order, int &refPos ) { + /** with this implementation, refPos == 0 disregards effect of refPos */ + bool BucketBasics::mayDropKey( int index, int refPos ) const { + return index > 0 && ( index != refPos ) && k( index ).isUnused() && k( index ).prevChildBucket.isNull(); + } + + int BucketBasics::packedDataSize( int refPos ) const { + if ( flags & Packed ) { + return BucketSize - emptySize - headerSize(); + } + int size = 0; + for( int j = 0; j < n; ++j ) { + if ( mayDropKey( j, refPos ) ) { + continue; + } + size += keyNode( j ).key.objsize() + sizeof( _KeyNode ); + } + return size; + } + + /** + * when we delete things we just leave empty space until the node is + * full and then we repack it. + */ + void BucketBasics::_pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const { + if ( flags & Packed ) + return; + + VERIFYTHISLOC + + /** TODO perhaps this can be optimized. for example if packing does no write, we can skip intent decl. + an empirical approach is probably best than just adding new code : perhaps the bucket would need + declaration anyway within the group commit interval, in which case we would just be adding + code and complexity without benefit. 
+ */ + thisLoc.btreemod()->_packReadyForMod(order, refPos); + } + + /** version when write intent already declared */ + void BucketBasics::_packReadyForMod( const Ordering &order, int &refPos ) { + assertWritable(); + if ( flags & Packed ) return; @@ -310,7 +432,7 @@ namespace mongo { topSize = 0; int i = 0; for ( int j = 0; j < n; j++ ) { - if( j > 0 && ( j != refPos ) && k( j ).isUnused() && k( j ).prevChildBucket.isNull() ) { + if( mayDropKey( j, refPos ) ) { continue; // key is unused and has no children - drop it } if( i != j ) { @@ -333,26 +455,104 @@ namespace mongo { n = i; int dataUsed = tdz - ofs; memcpy(data + ofs, temp + ofs, dataUsed); + + // assertWritable(); + // TEMP TEST getDur().declareWriteIntent(this, sizeof(*this)); + emptySize = tdz - dataUsed - n * sizeof(_KeyNode); assert( emptySize >= 0 ); setPacked(); + assertValid( order ); } inline void BucketBasics::truncateTo(int N, const Ordering &order, int &refPos) { + dbMutex.assertWriteLocked(); + assertWritable(); + n = N; setNotPacked(); - pack( order, refPos ); + _packReadyForMod( order, refPos ); + } + + /** + * In the standard btree algorithm, we would split based on the + * existing keys _and_ the new key. But that's more work to + * implement, so we split the existing keys and then add the new key. + * + * There are several published heuristic algorithms for doing splits, + * but basically what you want are (1) even balancing between the two + * sides and (2) a small split key so the parent can have a larger + * branching factor. + * + * We just have a simple algorithm right now: if a key includes the + * halfway point (or 10% way point) in terms of bytes, split on that key; + * otherwise split on the key immediately to the left of the halfway + * point. + * + * This function is expected to be called on a packed bucket. + */ + int BucketBasics::splitPos( int keypos ) const { + assert( n > 2 ); + int split = 0; + int rightSize = 0; + // when splitting a btree node, if the new key is greater than all the other keys, we should not do an even split, but a 90/10 split. + // see SERVER-983 + int rightSizeLimit = ( topSize + sizeof( _KeyNode ) * n ) / ( keypos == n ? 10 : 2 ); + for( int i = n - 1; i > -1; --i ) { + rightSize += keyNode( i ).key.objsize() + sizeof( _KeyNode ); + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > n - 2 ) { + split = n - 2; + } + + return split; + } + + void BucketBasics::reserveKeysFront( int nAdd ) { + assert( emptySize >= int( sizeof( _KeyNode ) * nAdd ) ); + emptySize -= sizeof( _KeyNode ) * nAdd; + for( int i = n - 1; i > -1; --i ) { + k( i + nAdd ) = k( i ); + } + n += nAdd; + } + + void BucketBasics::setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket ) { + _KeyNode &kn = k( i ); + kn.recordLoc = recordLoc; + kn.prevChildBucket = prevChildBucket; + short ofs = (short) _alloc( key.objsize() ); + kn.setKeyDataOfs( ofs ); + char *p = dataAt( ofs ); + memcpy( p, key.objdata(), key.objsize() ); + } + + void BucketBasics::dropFront( int nDrop, const Ordering &order, int &refpos ) { + for( int i = nDrop; i < n; ++i ) { + k( i - nDrop ) = k( i ); + } + n -= nDrop; + setNotPacked(); + _packReadyForMod( order, refpos ); } /* - BtreeBucket --------------------------------------------------- */ - /* return largest key in the subtree. */ + /** @return largest key in the subtree. 
*/ void BtreeBucket::findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey) { DiskLoc loc = thisLoc; while ( 1 ) { - BtreeBucket *b = loc.btree(); + const BtreeBucket *b = loc.btree(); if ( !b->nextChild.isNull() ) { loc = b->nextChild; continue; @@ -365,23 +565,34 @@ namespace mongo { break; } } - - int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o ) { + + /** + * NOTE Currently the Ordering implementation assumes a compound index will + * not have more keys than an unsigned variable has bits. The same + * assumption is used in the implementation below with respect to the 'mask' + * variable. + */ + int BtreeBucket::customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction ) { BSONObjIterator ll( l ); BSONObjIterator rr( rBegin ); vector< const BSONElement * >::const_iterator rr2 = rEnd.begin(); + vector< bool >::const_iterator inc = rEndInclusive.begin(); unsigned mask = 1; for( int i = 0; i < rBeginLen; ++i, mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = rr.next(); ++rr2; - + ++inc; + int x = lll.woCompare( rrr, false ); if ( o.descending( mask ) ) x = -x; if ( x != 0 ) return x; } + if ( rSup ) { + return -direction; + } for( ; ll.more(); mask <<= 1 ) { BSONElement lll = ll.next(); BSONElement rrr = **rr2; @@ -391,11 +602,15 @@ namespace mongo { x = -x; if ( x != 0 ) return x; + if ( !*inc ) { + return -direction; + } + ++inc; } return 0; } - bool BtreeBucket::exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order) { + bool BtreeBucket::exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); @@ -404,8 +619,8 @@ namespace mongo { while ( 1 ) { if( b.isNull() ) break; - BtreeBucket *bucket = b.btree(); - _KeyNode& kn = bucket->k(pos); + const BtreeBucket *bucket = b.btree(); + const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) return bucket->keyAt(pos).woEqual(key); b = bucket->advance(b, pos, 1, "BtreeBucket::exists"); @@ -413,22 +628,22 @@ namespace mongo { return false; } - /* @param self - don't complain about ourself already being in the index case. - @return true = there is a duplicate. - */ + /** + * @param self - don't complain about ourself already being in the index case. + * @return true = there is a duplicate. 
+ */ bool BtreeBucket::wouldCreateDup( - const IndexDetails& idx, DiskLoc thisLoc, + const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order, - DiskLoc self) - { + const DiskLoc &self) const { int pos; bool found; DiskLoc b = locate(idx, thisLoc, key, order, pos, found, minDiskLoc); while ( !b.isNull() ) { // we skip unused keys - BtreeBucket *bucket = b.btree(); - _KeyNode& kn = bucket->k(pos); + const BtreeBucket *bucket = b.btree(); + const _KeyNode& kn = bucket->k(pos); if ( kn.isUsed() ) { if( bucket->keyAt(pos).woEqual(key) ) return kn.recordLoc != self; @@ -440,7 +655,7 @@ namespace mongo { return false; } - string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ){ + string BtreeBucket::dupKeyError( const IndexDetails& idx , const BSONObj& key ) { stringstream ss; ss << "E11000 duplicate key error "; ss << "index: " << idx.indexNamespace() << " "; @@ -448,37 +663,38 @@ namespace mongo { return ss.str(); } - /* Find a key withing this btree bucket. - - When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the - key. That assures that even when there are many duplicates (e.g., 1 million) for a key, - our performance is still good. - - assertIfDup: if the key exists (ignoring the recordLoc), uassert - - pos: for existing keys k0...kn-1. - returns # it goes BEFORE. so key[pos-1] < key < key[pos] - returns n if it goes after the last existing key. - note result might be an Unused location! - */ - char foo; - bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup) { + /** + * Find a key withing this btree bucket. + * + * When duplicate keys are allowed, we use the DiskLoc of the record as if it were part of the + * key. That assures that even when there are many duplicates (e.g., 1 million) for a key, + * our performance is still good. + * + * assertIfDup: if the key exists (ignoring the recordLoc), uassert + * + * pos: for existing keys k0...kn-1. + * returns # it goes BEFORE. so key[pos-1] < key < key[pos] + * returns n if it goes after the last existing key. + * note result might be an Unused location! + */ + char foo; + bool BtreeBucket::find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const { #if defined(_EXPERIMENT1) - { - char *z = (char *) this; - int i = 0; - while( 1 ) { - i += 4096; - if( i >= BucketSize ) - break; - foo += z[i]; - } - } + { + char *z = (char *) this; + int i = 0; + while( 1 ) { + i += 4096; + if( i >= BucketSize ) + break; + foo += z[i]; + } + } #endif - + globalIndexCounters.btree( (char*)this ); - - /* binary search for this key */ + + // binary search for this key bool dupsChecked = false; int l=0; int h=n-1; @@ -486,13 +702,13 @@ namespace mongo { int m = (l+h)/2; KeyNode M = keyNode(m); int x = key.woCompare(M.key, order); - if ( x == 0 ) { + if ( x == 0 ) { if( assertIfDup ) { - if( k(m).isUnused() ) { - // ok that key is there if unused. but we need to check that there aren't other - // entries for the key then. as it is very rare that we get here, we don't put any + if( k(m).isUnused() ) { + // ok that key is there if unused. but we need to check that there aren't other + // entries for the key then. 
as it is very rare that we get here, we don't put any // coding effort in here to make this particularly fast - if( !dupsChecked ) { + if( !dupsChecked ) { dupsChecked = true; if( idx.head.btree()->exists(idx, idx.head, key, order) ) { if( idx.head.btree()->wouldCreateDup(idx, idx.head, key, order, recordLoc) ) @@ -503,7 +719,7 @@ namespace mongo { } } else { - if( M.recordLoc == recordLoc ) + if( M.recordLoc == recordLoc ) alreadyInIndex(); uasserted( ASSERT_ID_DUPKEY , dupKeyError( idx , key ) ); } @@ -537,86 +753,378 @@ namespace mongo { return false; } - void BtreeBucket::delBucket(const DiskLoc& thisLoc, IndexDetails& id) { + void BtreeBucket::delBucket(const DiskLoc thisLoc, const IndexDetails& id) { ClientCursor::informAboutToDeleteBucket(thisLoc); // slow... assert( !isHead() ); - BtreeBucket *p = parent.btreemod(); - if ( p->nextChild == thisLoc ) { - p->nextChild.Null(); - } - else { - for ( int i = 0; i < p->n; i++ ) { - if ( p->k(i).prevChildBucket == thisLoc ) { - p->k(i).prevChildBucket.Null(); - goto found; - } - } - out() << "ERROR: can't find ref to deleted bucket.\n"; - out() << "To delete:\n"; - dump(); - out() << "Parent:\n"; - p->dump(); - assert(false); - } -found: + const BtreeBucket *p = parent.btree(); + int parentIdx = indexInParent( thisLoc ); + p->childForPos( parentIdx ).writing().Null(); deallocBucket( thisLoc, id ); } - - void BtreeBucket::deallocBucket(const DiskLoc &thisLoc, IndexDetails &id) { + + void BtreeBucket::deallocBucket(const DiskLoc thisLoc, const IndexDetails &id) { #if 0 - /* as a temporary defensive measure, we zap the whole bucket, AND don't truly delete - it (meaning it is ineligible for reuse). - */ + // as a temporary defensive measure, we zap the whole bucket, AND don't truly delete + // it (meaning it is ineligible for reuse). memset(this, 0, Size()); - modified(thisLoc); #else - //defensive: + // defensive: n = -1; parent.Null(); string ns = id.indexNamespace(); - btreeStore->deleteRecord(ns.c_str(), thisLoc); + theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), thisLoc.rec(), thisLoc); #endif } - /* note: may delete the entire bucket! this invalid upon return sometimes. */ - void BtreeBucket::delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p) { - modified(thisLoc); + /** note: may delete the entire bucket! this invalid upon return sometimes. */ + void BtreeBucket::delKeyAtPos( const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order) { assert(n>0); DiskLoc left = childForPos(p); if ( n == 1 ) { if ( left.isNull() && nextChild.isNull() ) { - if ( isHead() ) - _delKeyAtPos(p); // we don't delete the top bucket ever - else - delBucket(thisLoc, id); + _delKeyAtPos(p); + if ( isHead() ) { + // we don't delete the top bucket ever + } + else { + if ( !mayBalanceWithNeighbors( thisLoc, id, order ) ) { + // An empty bucket is only allowed as a transient state. If + // there are no neighbors to balance with, we delete ourself. + // This condition is only expected in legacy btrees. + delBucket(thisLoc, id); + } + } return; } - markUnused(p); + deleteInternalKey( thisLoc, p, id, order ); return; } - if ( left.isNull() ) + if ( left.isNull() ) { _delKeyAtPos(p); - else - markUnused(p); + mayBalanceWithNeighbors( thisLoc, id, order ); + } + else { + deleteInternalKey( thisLoc, p, id, order ); + } } - int qqq = 0; + /** + * This function replaces the specified key (k) by either the prev or next + * key in the btree (k'). We require that k have either a left or right + * child. 
If k has a left child, we set k' to the prev key of k, which must + * be a leaf present in the left child. If k does not have a left child, we + * set k' to the next key of k, which must be a leaf present in the right + * child. When we replace k with k', we copy k' over k (which may cause a + * split) and then remove k' from its original location. Because k' is + * stored in a descendent of k, replacing k by k' will not modify the + * storage location of the original k', and we can easily remove k' from + * its original location. + * + * This function is only needed in cases where k has a left or right child; + * in other cases a simpler key removal implementation is possible. + * + * NOTE on legacy btree structures: + * In legacy btrees, k' can be a nonleaf. In such a case we 'delete' k by + * marking it as an unused node rather than replacing it with k'. Also, k' + * may be a leaf but marked as an unused node. In such a case we replace + * k by k', preserving the key's unused marking. This function is only + * expected to mark a key as unused when handling a legacy btree. + */ + void BtreeBucket::deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = childForPos( keypos ); + DiskLoc rchild = childForPos( keypos + 1 ); + assert( !lchild.isNull() || !rchild.isNull() ); + int advanceDirection = lchild.isNull() ? 1 : -1; + int advanceKeyOfs = keypos; + DiskLoc advanceLoc = advance( thisLoc, advanceKeyOfs, advanceDirection, __FUNCTION__ ); + + if ( !advanceLoc.btree()->childForPos( advanceKeyOfs ).isNull() || + !advanceLoc.btree()->childForPos( advanceKeyOfs + 1 ).isNull() ) { + // only expected with legacy btrees, see note above + markUnused( keypos ); + return; + } - /* remove a key from the index */ - bool BtreeBucket::unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc ) { - if ( key.objsize() > KeyMax ) { - OCCASIONALLY problem() << "unindex: key too large to index, skipping " << id.indexNamespace() << /* ' ' << key.toString() << */ endl; + KeyNode kn = advanceLoc.btree()->keyNode( advanceKeyOfs ); + setInternalKey( thisLoc, keypos, kn.recordLoc, kn.key, order, childForPos( keypos ), childForPos( keypos + 1 ), id ); + advanceLoc.btreemod()->delKeyAtPos( advanceLoc, id, advanceKeyOfs, order ); + } + + void BtreeBucket::replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id ) { + assert( n == 0 && !nextChild.isNull() ); + if ( parent.isNull() ) { + assert( id.head == thisLoc ); + id.head.writing() = nextChild; + } + else { + parent.btree()->childForPos( indexInParent( thisLoc ) ).writing() = nextChild; + } + nextChild.btree()->parent.writing() = parent; + ClientCursor::informAboutToDeleteBucket( thisLoc ); + deallocBucket( thisLoc, id ); + } + + bool BtreeBucket::canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const { + assert( leftIndex >= 0 && leftIndex < n ); + DiskLoc leftNodeLoc = childForPos( leftIndex ); + DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); + if ( leftNodeLoc.isNull() || rightNodeLoc.isNull() ) { + // TODO if this situation is possible in long term implementation, maybe we should compact somehow anyway return false; } + int pos = 0; + { + const BtreeBucket *l = leftNodeLoc.btree(); + const BtreeBucket *r = rightNodeLoc.btree(); + if ( ( headerSize() + l->packedDataSize( pos ) + r->packedDataSize( pos ) + keyNode( leftIndex ).key.objsize() + sizeof(_KeyNode) > unsigned( BucketSize ) ) ) { + return false; + } + } + return true; + } + /** + * This 
implementation must respect the meaning and value of lowWaterMark. + * Also see comments in splitPos(). + */ + int BtreeBucket::rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const { + int split = -1; + int rightSize = 0; + const BtreeBucket *l = childForPos( leftIndex ).btree(); + const BtreeBucket *r = childForPos( leftIndex + 1 ).btree(); + + int KNS = sizeof( _KeyNode ); + int rightSizeLimit = ( l->topSize + l->n * KNS + keyNode( leftIndex ).key.objsize() + KNS + r->topSize + r->n * KNS ) / 2; + // This constraint should be ensured by only calling this function + // if we go below the low water mark. + assert( rightSizeLimit < BtreeBucket::bodySize() ); + for( int i = r->n - 1; i > -1; --i ) { + rightSize += r->keyNode( i ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n + 1 + i; + break; + } + } + if ( split == -1 ) { + rightSize += keyNode( leftIndex ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = l->n; + } + } + if ( split == -1 ) { + for( int i = l->n - 1; i > -1; --i ) { + rightSize += l->keyNode( i ).key.objsize() + KNS; + if ( rightSize > rightSizeLimit ) { + split = i; + break; + } + } + } + // safeguards - we must not create an empty bucket + if ( split < 1 ) { + split = 1; + } + else if ( split > l->n + 1 + r->n - 2 ) { + split = l->n + 1 + r->n - 2; + } + + return split; + } + + void BtreeBucket::doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc leftNodeLoc = childForPos( leftIndex ); + DiskLoc rightNodeLoc = childForPos( leftIndex + 1 ); + BtreeBucket *l = leftNodeLoc.btreemod(); + BtreeBucket *r = rightNodeLoc.btreemod(); + int pos = 0; + l->_packReadyForMod( order, pos ); + r->_packReadyForMod( order, pos ); // pack r in case there are droppable keys + + int oldLNum = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < r->n; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + l->nextChild = r->nextChild; + l->fixParentPtrs( leftNodeLoc, oldLNum ); + r->delBucket( rightNodeLoc, id ); + childForPos( leftIndex + 1 ) = leftNodeLoc; + childForPos( leftIndex ) = DiskLoc(); + _delKeyAtPos( leftIndex, true ); + if ( n == 0 ) { + // will trash this and thisLoc + // TODO To ensure all leaves are of equal height, we should ensure + // this is only called on the root. + replaceWithNextChild( thisLoc, id ); + } + else { + // balance recursively - maybe we should do this even when n == 0? + mayBalanceWithNeighbors( thisLoc, id, order ); + } + } + + int BtreeBucket::indexInParent( const DiskLoc &thisLoc ) const { + assert( !parent.isNull() ); + const BtreeBucket *p = parent.btree(); + if ( p->nextChild == thisLoc ) { + return p->n; + } + else { + for( int i = 0; i < p->n; ++i ) { + if ( p->k( i ).prevChildBucket == thisLoc ) { + return i; + } + } + } + out() << "ERROR: can't find ref to child bucket.\n"; + out() << "child: " << thisLoc << "\n"; + dump(); + out() << "Parent: " << parent << "\n"; + p->dump(); + assert(false); + return -1; // just to compile + } + + bool BtreeBucket::tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const { + // If we can merge, then we must merge rather than balance to preserve + // bucket utilization constraints. 
+ if ( canMergeChildren( thisLoc, leftIndex ) ) { + return false; + } + thisLoc.btreemod()->doBalanceChildren( thisLoc, leftIndex, id, order ); + return true; + } + + void BtreeBucket::doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + // TODO maybe do some audits the same way pushBack() does? + int rAdd = l->n - split; + r->reserveKeysFront( rAdd ); + for( int i = split + 1, j = 0; i < l->n; ++i, ++j ) { + KeyNode kn = l->keyNode( i ); + r->setKey( j, kn.recordLoc, kn.key, kn.prevChildBucket ); + } + { + KeyNode kn = keyNode( leftIndex ); + r->setKey( rAdd - 1, kn.recordLoc, kn.key, l->nextChild ); // left child's right child becomes old parent key's left child + } + r->fixParentPtrs( rchild, 0, rAdd - 1 ); + { + KeyNode kn = l->keyNode( split ); + l->nextChild = kn.prevChildBucket; + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + l->truncateTo( split, order, zeropos ); + } + + void BtreeBucket::doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split, + BtreeBucket *l, const DiskLoc lchild, + BtreeBucket *r, const DiskLoc rchild, + IndexDetails &id, const Ordering &order ) { + int lN = l->n; + { + KeyNode kn = keyNode( leftIndex ); + l->pushBack( kn.recordLoc, kn.key, order, l->nextChild ); // left child's right child becomes old parent key's left child + } + for( int i = 0; i < split - lN - 1; ++i ) { + KeyNode kn = r->keyNode( i ); + l->pushBack( kn.recordLoc, kn.key, order, kn.prevChildBucket ); + } + { + KeyNode kn = r->keyNode( split - lN - 1 ); + l->nextChild = kn.prevChildBucket; + l->fixParentPtrs( lchild, lN + 1, l->n ); + setInternalKey( thisLoc, leftIndex, kn.recordLoc, kn.key, order, lchild, rchild, id ); + } + int zeropos = 0; + r->dropFront( split - lN, order, zeropos ); + } + + void BtreeBucket::doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) { + DiskLoc lchild = childForPos( leftIndex ); + DiskLoc rchild = childForPos( leftIndex + 1 ); + int zeropos = 0; + BtreeBucket *l = lchild.btreemod(); + l->_packReadyForMod( order, zeropos ); + BtreeBucket *r = rchild.btreemod(); + r->_packReadyForMod( order, zeropos ); + int split = rebalancedSeparatorPos( thisLoc, leftIndex ); + + // By definition, if we are below the low water mark and cannot merge + // then we must actively balance. + assert( split != l->n ); + if ( split < l->n ) { + doBalanceLeftToRight( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + else { + doBalanceRightToLeft( thisLoc, leftIndex, split, l, lchild, r, rchild, id, order ); + } + } + + bool BtreeBucket::mayBalanceWithNeighbors( const DiskLoc thisLoc, IndexDetails &id, const Ordering &order ) const { + if ( parent.isNull() ) { // we are root, there are no neighbors + return false; + } + + if ( packedDataSize( 0 ) >= lowWaterMark ) { + return false; + } + + const BtreeBucket *p = parent.btree(); + int parentIdx = indexInParent( thisLoc ); + + // TODO will missing neighbor case be possible long term? Should we try to merge/balance somehow in that case if so? 
+ bool mayBalanceRight = ( ( parentIdx < p->n ) && !p->childForPos( parentIdx + 1 ).isNull() ); + bool mayBalanceLeft = ( ( parentIdx > 0 ) && !p->childForPos( parentIdx - 1 ).isNull() ); + + // Balance if possible on one side - we merge only if absolutely necessary + // to preserve btree bucket utilization constraints since that's a more + // heavy duty operation (especially if we must re-split later). + if ( mayBalanceRight && + p->tryBalanceChildren( parent, parentIdx, id, order ) ) { + return true; + } + if ( mayBalanceLeft && + p->tryBalanceChildren( parent, parentIdx - 1, id, order ) ) { + return true; + } + + BtreeBucket *pm = parent.btreemod(); + if ( mayBalanceRight ) { + pm->doMergeChildren( parent, parentIdx, id, order ); + return true; + } + else if ( mayBalanceLeft ) { + pm->doMergeChildren( parent, parentIdx - 1, id, order ); + return true; + } + + return false; + } + + /** remove a key from the index */ + bool BtreeBucket::unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc ) const { int pos; bool found; DiskLoc loc = locate(id, thisLoc, key, Ordering::make(id.keyPattern()), pos, found, recordLoc, 1); if ( found ) { - loc.btree()->delKeyAtPos(loc, id, pos); + + if ( key.objsize() > KeyMax ) { + OCCASIONALLY problem() << "unindex: key too large to index but was found for " << id.indexNamespace() << " reIndex suggested" << endl; + } + + loc.btreemod()->delKeyAtPos(loc, id, pos, Ordering::make(id.keyPattern())); + return true; } return false; @@ -628,40 +1136,68 @@ found: return b; } - inline void fix(const DiskLoc& thisLoc, const DiskLoc& child) { + inline void BtreeBucket::fix(const DiskLoc thisLoc, const DiskLoc child) { if ( !child.isNull() ) { if ( insert_debug ) out() << " " << child.toString() << ".parent=" << thisLoc.toString() << endl; - child.btreemod()->parent = thisLoc; + child.btree()->parent.writing() = thisLoc; } } - /* this sucks. maybe get rid of parent ptrs. */ - void BtreeBucket::fixParentPtrs(const DiskLoc& thisLoc) { + /** this sucks. maybe get rid of parent ptrs. */ + void BtreeBucket::fixParentPtrs(const DiskLoc thisLoc, int firstIndex, int lastIndex) const { VERIFYTHISLOC - fix(thisLoc, nextChild); - for ( int i = 0; i < n; i++ ) - fix(thisLoc, k(i).prevChildBucket); + if ( lastIndex == -1 ) { + lastIndex = n; + } + for ( int i = firstIndex; i <= lastIndex; i++ ) { + fix(thisLoc, childForPos(i)); + } } - /* insert a key in this bucket, splitting if necessary. - keypos - where to insert the key i3n range 0..n. 0=make leftmost, n=make rightmost. - NOTE this function may free some data, and as a result the value passed for keypos may - be invalid after calling insertHere() - */ - void BtreeBucket::insertHere(DiskLoc thisLoc, int keypos, - DiskLoc recordLoc, const BSONObj& key, const Ordering& order, - DiskLoc lchild, DiskLoc rchild, IndexDetails& idx) - { - modified(thisLoc); + void BtreeBucket::setInternalKey( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj &key, const Ordering &order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx ) { + childForPos( keypos ).Null(); + + // This may leave the bucket empty (n == 0) which is ok only as a + // transient state. In the instant case, the implementation of + // insertHere behaves correctly when n == 0 and as a side effect + // increments n. + _delKeyAtPos( keypos, true ); + + // Ensure we do not orphan neighbor's old child. 
+ assert( childForPos( keypos ) == rchild ); + + // Just set temporarily - required to pass validation in insertHere() + childForPos( keypos ) = lchild; + + insertHere( thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx ); + } + + /** + * insert a key in this bucket, splitting if necessary. + * @keypos - where to insert the key in range 0..n. 0=make leftmost, n=make rightmost. + * NOTE this function may free some data, and as a result the value passed for keypos may + * be invalid after calling insertHere() + */ + void BtreeBucket::insertHere( const DiskLoc thisLoc, int keypos, + const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, + const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) const { if ( insert_debug ) out() << " " << thisLoc.toString() << ".insertHere " << key.toString() << '/' << recordLoc.toString() << ' ' - << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; + << lchild.toString() << ' ' << rchild.toString() << " keypos:" << keypos << endl; DiskLoc oldLoc = thisLoc; - if ( basicInsert(thisLoc, keypos, recordLoc, key, order) ) { - _KeyNode& kn = k(keypos); + if ( !basicInsert(thisLoc, keypos, recordLoc, key, order) ) { + thisLoc.btreemod()->split(thisLoc, keypos, recordLoc, key, order, lchild, rchild, idx); + return; + } + + { + const _KeyNode *_kn = &k(keypos); + _KeyNode *kn = (_KeyNode *) getDur().alreadyDeclared((_KeyNode*) _kn); // already declared intent in basicInsert() if ( keypos+1 == n ) { // last key if ( nextChild != lchild ) { out() << "ERROR nextChild != lchild" << endl; @@ -671,22 +1207,16 @@ found: out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); -#if 0 - out() << "\n\nDUMPING FULL INDEX" << endl; - bt_dmp=1; - bt_fv=1; - idx.head.btree()->fullValidate(idx.head); -#endif assert(false); } - kn.prevChildBucket = nextChild; - assert( kn.prevChildBucket == lchild ); - nextChild = rchild; + kn->prevChildBucket = nextChild; + assert( kn->prevChildBucket == lchild ); + nextChild.writing() = rchild; if ( !rchild.isNull() ) - rchild.btreemod()->parent = thisLoc; + rchild.btree()->parent.writing() = thisLoc; } else { - k(keypos).prevChildBucket = lchild; + kn->prevChildBucket = lchild; if ( k(keypos+1).prevChildBucket != lchild ) { out() << "ERROR k(keypos+1).prevChildBucket != lchild" << endl; out() << " thisLoc: " << thisLoc.toString() << ' ' << idx.indexNamespace() << endl; @@ -695,33 +1225,24 @@ found: out() << " recordLoc: " << recordLoc.toString() << " rchild: " << rchild.toString() << endl; out() << " key: " << key.toString() << endl; dump(); -#if 0 - out() << "\n\nDUMPING FULL INDEX" << endl; - bt_dmp=1; - bt_fv=1; - idx.head.btree()->fullValidate(idx.head); -#endif assert(false); } - k(keypos+1).prevChildBucket = rchild; + const DiskLoc *pc = &k(keypos+1).prevChildBucket; + *getDur().alreadyDeclared((DiskLoc*) pc) = rchild; // declared in basicInsert() if ( !rchild.isNull() ) - rchild.btreemod()->parent = thisLoc; + rchild.btree()->parent.writing() = thisLoc; } return; } + } - /* ---------- split ---------------- */ + void BtreeBucket::split(const DiskLoc thisLoc, int keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx) { + assertWritable(); if ( split_debug ) out() << " " << thisLoc.toString() << ".split" << endl; - int split = n / 2; - if ( keypos == n ) { // see SERVER-983 - split = (int) (0.9 * n); - if ( split > n 
- 2 ) - split = n - 2; - } - + int split = splitPos( keypos ); DiskLoc rLoc = addBucket(idx); BtreeBucket *r = rLoc.btreemod(); if ( split_debug ) @@ -753,15 +1274,14 @@ found: p->pushBack(splitkey.recordLoc, splitkey.key, order, thisLoc); p->nextChild = rLoc; p->assertValid( order ); - parent = idx.head = L; + parent = idx.head.writing() = L; if ( split_debug ) out() << " we were root, making new root:" << hex << parent.getOfs() << dec << endl; - rLoc.btreemod()->parent = parent; + rLoc.btree()->parent.writing() = parent; } else { - /* set this before calling _insert - if it splits it will do fixParent() logic and change the value. - */ - rLoc.btreemod()->parent = parent; + // set this before calling _insert - if it splits it will do fixParent() logic and change the value. + rLoc.btree()->parent.writing() = parent; if ( split_debug ) out() << " promoting splitkey key " << splitkey.key.toString() << endl; parent.btree()->_insert(parent, splitkey.recordLoc, splitkey.key, order, /*dupsallowed*/true, thisLoc, rLoc, idx); @@ -769,16 +1289,17 @@ found: } int newpos = keypos; + // note this may trash splitkey.key. thus we had to promote it before finishing up here. truncateTo(split, order, newpos); // note this may trash splitkey.key. thus we had to promote it before finishing up here. // add our new key, there is room now { - if ( keypos <= split ) { if ( split_debug ) out() << " keypos<split, insertHere() the new key" << endl; insertHere(thisLoc, newpos, recordLoc, key, order, lchild, rchild, idx); - } else { + } + else { int kp = keypos-split-1; assert(kp>=0); rLoc.btree()->insertHere(rLoc, kp, recordLoc, key, order, lchild, rchild, idx); @@ -789,26 +1310,27 @@ found: out() << " split end " << hex << thisLoc.getOfs() << dec << endl; } - /* start a new index off, empty */ - DiskLoc BtreeBucket::addBucket(IndexDetails& id) { - DiskLoc loc = btreeStore->insert(id.indexNamespace().c_str(), 0, BucketSize, true); + /** start a new index off, empty */ + DiskLoc BtreeBucket::addBucket(const IndexDetails& id) { + string ns = id.indexNamespace(); + DiskLoc loc = theDataFileMgr.insert(ns.c_str(), 0, BucketSize, true); BtreeBucket *b = loc.btreemod(); b->init(); return loc; } void BtreeBucket::renameIndexNamespace(const char *oldNs, const char *newNs) { - btreeStore->rename( oldNs, newNs ); + renameNamespace( oldNs, newNs ); } - DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) { + const DiskLoc BtreeBucket::getHead(const DiskLoc& thisLoc) const { DiskLoc p = thisLoc; while ( !p.btree()->isHead() ) p = p.btree()->parent; return p; } - DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) { + DiskLoc BtreeBucket::advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const { if ( keyOfs < 0 || keyOfs >= n ) { out() << "ASSERT failure BtreeBucket::advance, caller: " << caller << endl; out() << " thisLoc: " << thisLoc.toString() << endl; @@ -841,7 +1363,7 @@ found: while ( 1 ) { if ( ancestor.isNull() ) break; - BtreeBucket *an = ancestor.btree(); + const BtreeBucket *an = ancestor.btree(); for ( int i = 0; i < an->n; i++ ) { if ( an->childForPos(i+adj) == childLoc ) { keyOfs = i; @@ -857,7 +1379,7 @@ found: return DiskLoc(); } - DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& pos, bool& found, DiskLoc recordLoc, int direction) { + DiskLoc BtreeBucket::locate(const IndexDetails& idx, const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order, int& 
pos, bool& found, const DiskLoc &recordLoc, int direction) const { int p; found = find(idx, key, recordLoc, order, p, /*assertIfDup*/ false); if ( found ) { @@ -880,7 +1402,7 @@ found: return pos == n ? DiskLoc() /*theend*/ : thisLoc; } - bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) { + bool BtreeBucket::customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const { while( 1 ) { if ( l + 1 == h ) { keyOfs = ( direction > 0 ) ? h : l; @@ -889,101 +1411,123 @@ found: bestParent = make_pair( thisLoc, keyOfs ); thisLoc = next; return true; - } else { + } + else { return false; } } int m = l + ( h - l ) / 2; - int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, keyEnd, order ); + int cmp = customBSONCmp( thisLoc.btree()->keyNode( m ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ); if ( cmp < 0 ) { l = m; - } else if ( cmp > 0 ) { + } + else if ( cmp > 0 ) { h = m; - } else { + } + else { if ( direction < 0 ) { l = m; - } else { + } + else { h = m; } } - } + } } - - // find smallest/biggest value greater-equal/less-equal than specified - // starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd - // All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient - void BtreeBucket::advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction ) { + + /** + * find smallest/biggest value greater-equal/less-equal than specified + * starting thisLoc + keyOfs will be strictly less than/strictly greater than keyBegin/keyBeginLen/keyEnd + * All the direction checks below allowed me to refactor the code, but possibly separate forward and reverse implementations would be more efficient + */ + void BtreeBucket::advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const { int l,h; bool dontGoUp; if ( direction > 0 ) { l = keyOfs; h = n - 1; - dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ); - } else { + dontGoUp = ( customBSONCmp( keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + } + else { l = 0; h = keyOfs; - dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ); + dontGoUp = ( customBSONCmp( keyNode( l ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } pair< DiskLoc, int > bestParent; if ( dontGoUp ) { // this comparison result assures h > l - if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) { + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, thisLoc, keyOfs, bestParent ) ) { return; } - } else { + } + else { // go up parents until 
rightmost/leftmost node is >=/<= target or at top while( !thisLoc.btree()->parent.isNull() ) { thisLoc = thisLoc.btree()->parent; if ( direction > 0 ) { - if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ) { + if ( customBSONCmp( thisLoc.btree()->keyNode( thisLoc.btree()->n - 1 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ) { break; } - } else { - if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ) { + } + else { + if ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ) { break; - } + } } } } + customLocate( thisLoc, keyOfs, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction, bestParent ); + } + + void BtreeBucket::customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const { + if ( thisLoc.btree()->n == 0 ) { + thisLoc = DiskLoc(); + return; + } // go down until find smallest/biggest >=/<= target while( 1 ) { - l = 0; - h = thisLoc.btree()->n - 1; + int l = 0; + int h = thisLoc.btree()->n - 1; // leftmost/rightmost key may possibly be >=/<= search key bool firstCheck; if ( direction > 0 ) { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) >= 0 ); - } else { - firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) <= 0 ); + firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) >= 0 ); + } + else { + firstCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) <= 0 ); } if ( firstCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->k( 0 ).prevChildBucket; keyOfs = 0; - } else { + } + else { next = thisLoc.btree()->nextChild; keyOfs = h; } if ( !next.isNull() ) { - bestParent = make_pair( thisLoc, keyOfs ); + bestParent = pair< DiskLoc, int >( thisLoc, keyOfs ); thisLoc = next; continue; - } else { + } + else { return; } } bool secondCheck; if ( direction > 0 ) { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, keyEnd, order ) < 0 ); - } else { - secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, keyEnd, order ) > 0 ); + secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( h ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) < 0 ); + } + else { + secondCheck = ( customBSONCmp( thisLoc.btree()->keyNode( 0 ).key, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, direction ) > 0 ); } if ( secondCheck ) { DiskLoc next; if ( direction > 0 ) { next = thisLoc.btree()->nextChild; - } else { + } + else { next = thisLoc.btree()->k( 0 ).prevChildBucket; } if ( next.isNull() ) { @@ -991,23 +1535,23 @@ found: thisLoc = bestParent.first; keyOfs = bestParent.second; return; - } else { + } + else { thisLoc = next; continue; } } - if ( !customFind( l, h, keyBegin, keyBeginLen, keyEnd, order, direction, thisLoc, keyOfs, bestParent ) ) { + if ( !customFind( l, h, keyBegin, keyBeginLen, afterKey, keyEnd, keyEndInclusive, order, 
direction, thisLoc, keyOfs, bestParent ) ) { return; } } } - - /* @thisLoc disk location of *this - */ - int BtreeBucket::_insert(DiskLoc thisLoc, DiskLoc recordLoc, + + /** @thisLoc disk location of *this */ + int BtreeBucket::_insert(const DiskLoc thisLoc, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, bool dupsAllowed, - DiskLoc lChild, DiskLoc rChild, IndexDetails& idx) { + const DiskLoc lChild, const DiskLoc rChild, IndexDetails& idx) const { if ( key.objsize() > KeyMax ) { problem() << "ERROR: key too large len:" << key.objsize() << " max:" << KeyMax << ' ' << key.objsize() << ' ' << idx.indexNamespace() << endl; return 2; @@ -1018,34 +1562,34 @@ found: bool found = find(idx, key, recordLoc, order, pos, !dupsAllowed); if ( insert_debug ) { out() << " " << thisLoc.toString() << '.' << "_insert " << - key.toString() << '/' << recordLoc.toString() << - " l:" << lChild.toString() << " r:" << rChild.toString() << endl; + key.toString() << '/' << recordLoc.toString() << + " l:" << lChild.toString() << " r:" << rChild.toString() << endl; out() << " found:" << found << " pos:" << pos << " n:" << n << endl; } if ( found ) { - _KeyNode& kn = k(pos); + const _KeyNode& kn = k(pos); if ( kn.isUnused() ) { log(4) << "btree _insert: reusing unused key" << endl; massert( 10285 , "_insert: reuse key but lchild is not null", lChild.isNull()); massert( 10286 , "_insert: reuse key but rchild is not null", rChild.isNull()); - kn.setUsed(); + kn.writing().setUsed(); return 0; } - DEV { - out() << "_insert(): key already exists in index (ok for background:true)\n"; - out() << " " << idx.indexNamespace().c_str() << " thisLoc:" << thisLoc.toString() << '\n'; - out() << " " << key.toString() << '\n'; - out() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; - out() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; - out() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; + DEV { + log() << "_insert(): key already exists in index (ok for background:true)\n"; + log() << " " << idx.indexNamespace() << " thisLoc:" << thisLoc.toString() << '\n'; + log() << " " << key.toString() << '\n'; + log() << " " << "recordLoc:" << recordLoc.toString() << " pos:" << pos << endl; + log() << " old l r: " << childForPos(pos).toString() << ' ' << childForPos(pos+1).toString() << endl; + log() << " new l r: " << lChild.toString() << ' ' << rChild.toString() << endl; } alreadyInIndex(); } DEBUGGING out() << "TEMP: key: " << key.toString() << endl; - DiskLoc& child = childForPos(pos); + DiskLoc child = childForPos(pos); if ( insert_debug ) out() << " getChild(" << pos << "): " << child.toString() << endl; if ( child.isNull() || !rChild.isNull() /* means an 'internal' insert */ ) { @@ -1056,28 +1600,27 @@ found: return child.btree()->bt_insert(child, recordLoc, key, order, dupsAllowed, idx, /*toplevel*/false); } - void BtreeBucket::dump() { + void BtreeBucket::dump() const { out() << "DUMP btreebucket n:" << n; out() << " parent:" << hex << parent.getOfs() << dec; for ( int i = 0; i < n; i++ ) { out() << '\n'; KeyNode k = keyNode(i); out() << '\t' << i << '\t' << k.key.toString() << "\tleft:" << hex << - k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; + k.prevChildBucket.getOfs() << "\tRecLoc:" << k.recordLoc.toString() << dec; if ( this->k(i).isUnused() ) out() << " UNUSED"; } out() << " right:" << hex << nextChild.getOfs() << dec << endl; } - /* todo: meaning of return code unclear clean up 
-    /* todo: meaning of return code unclear clean up
-    */
-    int BtreeBucket::bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
-                            const BSONObj& key, const Ordering &order, bool dupsAllowed,
-                            IndexDetails& idx, bool toplevel)
-    {
+    /** todo: meaning of return code unclear clean up */
+    int BtreeBucket::bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+                               const BSONObj& key, const Ordering &order, bool dupsAllowed,
+                               IndexDetails& idx, bool toplevel) const {
         if ( toplevel ) {
             if ( key.objsize() > KeyMax ) {
-                problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl;
+                problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace() << ' ' << key.objsize() << ' ' << key.toString() << endl;
                 return 3;
             }
         }
@@ -1088,22 +1631,30 @@ found:
         return x;
     }
 
-    void BtreeBucket::shape(stringstream& ss) {
+    void BtreeBucket::shape(stringstream& ss) const {
         _shape(0, ss);
     }
-
-    DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ){
+
+    int BtreeBucket::getLowWaterMark() {
+        return lowWaterMark;
+    }
+
+    int BtreeBucket::getKeyMax() {
+        return KeyMax;
+    }
+
+    DiskLoc BtreeBucket::findSingle( const IndexDetails& indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const {
         int pos;
         bool found;
-        /* TODO: is it really ok here that the order is a default? */
+        // TODO: is it really ok here that the order is a default?
         Ordering o = Ordering::make(BSONObj());
         DiskLoc bucket = locate( indexdetails , indexdetails.head , key , o , pos , found , minDiskLoc );
         if ( bucket.isNull() )
             return bucket;
-        BtreeBucket *b = bucket.btree();
-        while ( 1 ){
-            _KeyNode& knraw = b->k(pos);
+        const BtreeBucket *b = bucket.btree();
+        while ( 1 ) {
+            const _KeyNode& knraw = b->k(pos);
             if ( knraw.isUsed() )
                 break;
             bucket = b->advance( bucket , pos , 1 , "findSingle" );
@@ -1125,7 +1676,7 @@ found:
 namespace mongo {
 
     void BtreeBucket::a_test(IndexDetails& id) {
-        BtreeBucket *b = id.head.btree();
+        BtreeBucket *b = id.head.btreemod();
 
         // record locs for testing
         DiskLoc A(1, 20);
@@ -1171,26 +1722,37 @@ namespace mongo {
 
     /* --- BtreeBuilder --- */
 
-    BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
-        dupsAllowed(_dupsAllowed),
-        idx(_idx),
-        n(0),
-        order( idx.keyPattern() ),
-        ordering( Ordering::make(idx.keyPattern()) )
-    {
+    BtreeBuilder::BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx) :
+        dupsAllowed(_dupsAllowed),
+        idx(_idx),
+        n(0),
+        order( idx.keyPattern() ),
+        ordering( Ordering::make(idx.keyPattern()) ) {
         first = cur = BtreeBucket::addBucket(idx);
         b = cur.btreemod();
         committed = false;
     }
 
-    void BtreeBuilder::newBucket() {
+    void BtreeBuilder::newBucket() {
         DiskLoc L = BtreeBucket::addBucket(idx);
         b->tempNext() = L;
         cur = L;
         b = cur.btreemod();
     }
 
-    void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
+    void BtreeBuilder::mayCommitProgressDurably() {
+        if ( getDur().commitIfNeeded() ) {
+            b = cur.btreemod();
+        }
+    }
+
+    void BtreeBuilder::addKey(BSONObj& key, DiskLoc loc) {
+        if ( key.objsize() > KeyMax ) {
+            problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace()
+                      << ' ' << key.objsize() << ' ' << key.toString() << endl;
+            return;
+        }
+
         if( !dupsAllowed ) {
             if( n > 0 ) {
                 int cmp = keyLast.woCompare(key, order);
@@ -1203,26 +1765,21 @@ namespace mongo {
             keyLast = key;
         }
 
-        if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ){
-            // no room
-            if ( key.objsize() > KeyMax ) {
-                problem() << "Btree::insert: key too large to index, skipping " << idx.indexNamespace().c_str() << ' ' << key.objsize() << ' ' << key.toString() << endl;
-            }
-            else {
-                // bucket was full
-                newBucket();
-                b->pushBack(loc, key, ordering, DiskLoc());
-            }
+        if ( ! b->_pushBack(loc, key, ordering, DiskLoc()) ) {
+            // bucket was full
+            newBucket();
+            b->pushBack(loc, key, ordering, DiskLoc());
         }
         n++;
+        mayCommitProgressDurably();
     }
 
-    void BtreeBuilder::buildNextLevel(DiskLoc loc) {
+    void BtreeBuilder::buildNextLevel(DiskLoc loc) {
         int levels = 1;
-        while( 1 ) {
-            if( loc.btree()->tempNext().isNull() ) {
+        while( 1 ) {
+            if( loc.btree()->tempNext().isNull() ) {
                 // only 1 bucket at this level. we are done.
-                idx.head = loc;
+                getDur().writingDiskLoc(idx.head) = loc;
                 break;
            }
            levels++;
@@ -1232,59 +1789,70 @@ namespace mongo {
             BtreeBucket *up = upLoc.btreemod();
 
             DiskLoc xloc = loc;
-            while( !xloc.isNull() ) {
+            while( !xloc.isNull() ) {
+                if ( getDur().commitIfNeeded() ) {
+                    b = cur.btreemod();
+                    up = upLoc.btreemod();
+                }
+
                 BtreeBucket *x = xloc.btreemod();
-                BSONObj k;
+                BSONObj k;
                 DiskLoc r;
                 x->popBack(r,k);
                 bool keepX = ( x->n != 0 );
                 DiskLoc keepLoc = keepX ? xloc : x->nextChild;
 
-                if ( ! up->_pushBack(r, k, ordering, keepLoc) ){
+                if ( ! up->_pushBack(r, k, ordering, keepLoc) ) {
                     // current bucket full
                     DiskLoc n = BtreeBucket::addBucket(idx);
                     up->tempNext() = n;
-                    upLoc = n;
+                    upLoc = n;
                     up = upLoc.btreemod();
                     up->pushBack(r, k, ordering, keepLoc);
                 }
 
-                DiskLoc nextLoc = x->tempNext(); /* get next in chain at current level */
+                DiskLoc nextLoc = x->tempNext(); // get next in chain at current level
                 if ( keepX ) {
-                    x->parent = upLoc;
-                } else {
+                    x->parent = upLoc;
+                }
+                else {
                     if ( !x->nextChild.isNull() )
                         x->nextChild.btreemod()->parent = upLoc;
                     x->deallocBucket( xloc, idx );
                 }
                 xloc = nextLoc;
            }
-
+
            loc = upStart;
+            mayCommitProgressDurably();
        }
 
        if( levels > 1 )
            log(2) << "btree levels: " << levels << endl;
    }
 
-    /* when all addKeys are done, we then build the higher levels of the tree */
-    void BtreeBuilder::commit() {
+    /** when all addKeys are done, we then build the higher levels of the tree */
+    void BtreeBuilder::commit() {
        buildNextLevel(first);
        committed = true;
    }
 
-    BtreeBuilder::~BtreeBuilder() {
-        if( !committed ) {
-            log(2) << "Rolling back partially built index space" << endl;
-            DiskLoc x = first;
-            while( !x.isNull() ) {
-                DiskLoc next = x.btree()->tempNext();
-                btreeStore->deleteRecord(idx.indexNamespace().c_str(), x);
-                x = next;
+    BtreeBuilder::~BtreeBuilder() {
+        DESTRUCTOR_GUARD(
+            if( !committed ) {
+                log(2) << "Rolling back partially built index space" << endl;
+                DiskLoc x = first;
+                while( !x.isNull() ) {
+                    DiskLoc next = x.btree()->tempNext();
+                    string ns = idx.indexNamespace();
+                    theDataFileMgr._deleteRecord(nsdetails(ns.c_str()), ns.c_str(), x.rec(), x);
+                    x = next;
+                    getDur().commitIfNeeded();
+                }
+                assert( idx.head.isNull() );
+                log(2) << "done rollback" << endl;
            }
-            assert( idx.head.isNull() );
-            log(2) << "done rollback" << endl;
-        }
+        )
    }
 
 }
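
Note on the insert path in this diff: bt_insert() rejects oversized keys at the top level and then hands off to _insert(), which finds the key's position in the current bucket and either reuses an unused slot, reports a duplicate, inserts here, or recurses into the child covering that position. The following standalone sketch illustrates only that "find position, then recurse or insert here" shape; the Node layout and helper names are illustrative assumptions, not MongoDB's actual BtreeBucket API, and splitting and record locations are omitted.

    // Minimal sketch of the descend-or-insert-here pattern in _insert()/bt_insert().
    // Not MongoDB's bucket format: no splits, no DiskLocs, no unused-key reuse.
    #include <initializer_list>
    #include <iostream>
    #include <memory>
    #include <vector>

    struct Node {
        std::vector<int> keys;                        // sorted keys in this bucket
        std::vector<std::unique_ptr<Node>> children;  // empty for a leaf, else keys.size()+1 entries
        bool isLeaf() const { return children.empty(); }
    };

    // Returns false if the key already exists (the "already in index" case).
    bool insertKey(Node* n, int key) {
        // locate the insertion position, analogous to find() computing 'pos'
        size_t pos = 0;
        while (pos < n->keys.size() && n->keys[pos] < key)
            ++pos;
        if (pos < n->keys.size() && n->keys[pos] == key)
            return false;                              // duplicate: caller decides whether that is an error
        if (n->isLeaf()) {
            n->keys.insert(n->keys.begin() + pos, key); // insert in this bucket (no split handling here)
            return true;
        }
        return insertKey(n->children[pos].get(), key);  // descend into the matching child
    }

    int main() {
        Node root;                                      // starts as a leaf
        for (int k : {5, 1, 9, 3}) insertKey(&root, k);
        std::cout << "duplicate rejected: " << std::boolalpha << !insertKey(&root, 9) << "\n";
    }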
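BtreeBuilder::addKey() in the diff assumes keys arrive pre-sorted: when duplicates are disallowed it compares each key against keyLast with woCompare(), and when the rightmost bucket has no room it chains a fresh bucket via newBucket()/tempNext(). Here is a simplified, self-contained version of that bulk-append idea under the assumption of fixed-capacity leaves; the class and member names are hypothetical stand-ins, not the builder's real interface.

    // Sketch of the sorted bulk-append idea behind BtreeBuilder::addKey().
    #include <stdexcept>
    #include <string>
    #include <vector>

    class BulkLeafBuilder {
    public:
        BulkLeafBuilder(bool dupsAllowed, size_t leafCapacity)
            : _dupsAllowed(dupsAllowed), _capacity(leafCapacity) { _leaves.emplace_back(); }

        void addKey(const std::string& key) {
            if (_n > 0) {
                int cmp = _last.compare(key);
                if (cmp > 0)
                    throw std::runtime_error("keys not in sorted order"); // builder requires sorted input
                if (cmp == 0 && !_dupsAllowed)
                    throw std::runtime_error("duplicate key");            // analogous to the keyLast check
            }
            _last = key;
            if (_leaves.back().size() == _capacity)
                _leaves.emplace_back();        // "bucket was full": chain a new leaf, like newBucket()
            _leaves.back().push_back(key);
            ++_n;
        }

        const std::vector<std::vector<std::string>>& leaves() const { return _leaves; }

    private:
        bool _dupsAllowed;
        size_t _capacity;
        size_t _n = 0;
        std::string _last;
        std::vector<std::vector<std::string>> _leaves; // stands in for the tempNext() chain of buckets
    };

A caller would feed already-sorted keys (for example from an external sort) and then hand the resulting leaf chain to a bottom-up level builder, which is what buildNextLevel() does in the real code.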
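buildNextLevel() walks the chain of buckets produced at one level, pops a key from each to serve as a separator pushed into a parent bucket, re-parents the children, and repeats one level up until a single bucket remains, which becomes idx.head. The sketch below shows that bottom-up construction in simplified form; it uses the first key of each right-hand child as the separator instead of popping the last key of the left-hand child, and the node shape and fan-out are assumptions, not the on-disk bucket format.

    // Sketch of the bottom-up level building done by BtreeBuilder::buildNextLevel().
    #include <memory>
    #include <string>
    #include <vector>

    struct BNode {
        std::vector<std::string> keys;
        std::vector<std::shared_ptr<BNode>> children;  // empty for leaves
    };

    // 'level' must be non-empty and each node must hold at least one key.
    std::shared_ptr<BNode> buildUp(std::vector<std::shared_ptr<BNode>> level, size_t fanout) {
        while (level.size() > 1) {                     // loop until only one bucket remains at this level
            std::vector<std::shared_ptr<BNode>> upper;
            for (auto& child : level) {
                if (upper.empty() || upper.back()->children.size() == fanout)
                    upper.push_back(std::make_shared<BNode>());   // "current bucket full": add a parent bucket
                auto& up = upper.back();
                if (!up->children.empty())
                    up->keys.push_back(child->keys.front());      // separator between previous child and this one
                up->children.push_back(child);
            }
            level = std::move(upper);                  // now build the level above this one
        }
        return level.front();                          // the single remaining bucket becomes the root/head
    }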
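The reworked ~BtreeBuilder() wraps its cleanup in DESTRUCTOR_GUARD and, if commit() was never reached, walks the tempNext() chain deleting the partially built buckets while committing periodically. The general pattern is "roll back uncommitted work in the destructor and never let an exception escape it"; a small illustration of that pattern follows, with a stand-in resource type rather than MongoDB's bucket storage.

    // Sketch of the roll-back-unless-committed destructor pattern used by BtreeBuilder.
    #include <exception>
    #include <iostream>
    #include <vector>

    class ScratchBuilder {
    public:
        void add(int block) { _blocks.push_back(block); } // pretend each int is an allocated bucket
        void commit() { _committed = true; }

        ~ScratchBuilder() {
            if (_committed)
                return;
            try {                                          // plays the role of DESTRUCTOR_GUARD
                std::cout << "rolling back " << _blocks.size() << " uncommitted blocks\n";
                for (int b : _blocks)
                    release(b);
            }
            catch (const std::exception& e) {
                std::cout << "rollback failed: " << e.what() << "\n"; // swallow; destructors must not throw
            }
        }

    private:
        static void release(int /*block*/) { /* free the underlying storage here */ }
        bool _committed = false;
        std::vector<int> _blocks;
    };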