summaryrefslogtreecommitdiff
path: root/db/btree.h
diff options
context:
space:
mode:
Diffstat (limited to 'db/btree.h')
-rw-r--r--db/btree.h585
1 files changed, 376 insertions, 209 deletions
diff --git a/db/btree.h b/db/btree.h
index 233b4dc..bced95e 100644
--- a/db/btree.h
+++ b/db/btree.h
@@ -25,8 +25,12 @@
namespace mongo {
+ const int BucketSize = 8192;
+
#pragma pack(1)
struct _KeyNode {
+ /** Signals that we are writing this _KeyNode and casts away const */
+ _KeyNode& writing() const;
DiskLoc prevChildBucket; // the lchild
DiskLoc recordLoc; // location of the record associated with the key
short keyDataOfs() const {
@@ -41,15 +45,12 @@ namespace mongo {
_kdo = s;
assert(s>=0);
}
- void setUsed() {
- recordLoc.GETOFS() &= ~1;
- }
+ void setUsed() { recordLoc.GETOFS() &= ~1; }
void setUnused() {
- /* Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
- even numbers.
- Note we need to keep its value basically the same as we use the recordLoc
- as part of the key in the index (to handle duplicate keys efficiently).
- */
+ // Setting ofs to odd is the sentinel for unused, as real recordLoc's are always
+ // even numbers.
+ // Note we need to keep its value basically the same as we use the recordLoc
+ // as part of the key in the index (to handle duplicate keys efficiently).
recordLoc.GETOFS() |= 1;
}
int isUnused() const {
@@ -63,7 +64,12 @@ namespace mongo {
class BucketBasics;
- /* wrapper - this is our in memory representation of the key. _KeyNode is the disk representation. */
+ /**
+ * wrapper - this is our in memory representation of the key.
+ * _KeyNode is the disk representation.
+ *
+ * This object and its bson key will become invalid if the key is moved.
+ */
class KeyNode {
public:
KeyNode(const BucketBasics& bb, const _KeyNode &k);
@@ -73,51 +79,111 @@ namespace mongo {
};
#pragma pack(1)
- /* this class is all about the storage management */
- class BucketBasics {
+ class BtreeData {
+ protected:
+ DiskLoc parent;
+ DiskLoc nextChild; // child bucket off and to the right of the highest key.
+ unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
+ unsigned short _reserved1; // zero
+ int flags;
+
+ // basicInsert() assumes these three are together and in this order:
+ int emptySize; // size of the empty region
+ int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
+ int n; // # of keys so far.
+
+ int reserved;
+ char data[4];
+ };
+
+ /**
+ * This class is all about the storage management
+ *
+ * Const member functions of this class are those which may be called on
+ * an object for which writing has not been signaled. Non const member
+ * functions may only be called on objects for which writing has been
+ * signaled. Note that currently some const functions write to the
+ * underlying memory representation of this bucket using optimized methods
+ * to signal write operations.
+ *
+ * DiskLoc parameters that may shadow references within the btree should
+ * be passed by value rather than by reference to non const member
+ * functions or const member functions which may perform writes. This way
+ * a callee need not worry that write operations will change or invalidate
+ * its arguments.
+ *
+ * The current policy for dealing with bson arguments is the opposite of
+ * what is described above for DiskLoc arguments. We do
+ not want to copy bson into memory as an intermediate step for
+ * btree changes, so if bson is to be moved it must be copied to the new
+ * location before the old location is invalidated.
+ */
+ class BucketBasics : public BtreeData {
friend class BtreeBuilder;
friend class KeyNode;
public:
- void dumpTree(DiskLoc thisLoc, const BSONObj &order);
- bool isHead() { return parent.isNull(); }
- void assertValid(const Ordering &order, bool force = false);
- void assertValid(const BSONObj &orderObj, bool force = false) {
- return assertValid(Ordering::make(orderObj),force);
- }
- int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0); /* traverses everything */
+ /** assert write intent declared for this bucket already */
+ void assertWritable();
- KeyNode keyNode(int i) const {
- if ( i >= n ){
+ void assertValid(const Ordering &order, bool force = false) const;
+ void assertValid(const BSONObj &orderObj, bool force = false) const { return assertValid(Ordering::make(orderObj),force); }
+
+ /**
+ * @return KeyNode for key at index i. The KeyNode will become invalid
+ * if the key is moved or reassigned, or if the node is packed.
+ */
+ const KeyNode keyNode(int i) const {
+ if ( i >= n ) {
massert( 13000 , (string)"invalid keyNode: " + BSON( "i" << i << "n" << n ).jsonString() , i < n );
}
return KeyNode(*this, k(i));
}
- protected:
+ static int headerSize() {
+ const BucketBasics *d = 0;
+ return (char*)&(d->data) - (char*)&(d->parent);
+ }
+ static int bodySize() { return BucketSize - headerSize(); }
- void modified(const DiskLoc& thisLoc);
+ // for testing
+ int nKeys() const { return n; }
+ const DiskLoc getNextChild() const { return nextChild; }
- char * dataAt(short ofs) {
- return data + ofs;
- }
+ protected:
+ char * dataAt(short ofs) { return data + ofs; }
void init(); // initialize a new node
- /* returns false if node is full and must be split
- keypos is where to insert -- inserted after that key #. so keypos=0 is the leftmost one.
- */
- bool basicInsert(const DiskLoc& thisLoc, int &keypos, const DiskLoc& recordLoc, const BSONObj& key, const Ordering &order);
-
/**
- * @return true if works, false if not enough space
+ * @return false if node is full and must be split
+ * @keypos is where to insert -- inserted before that key #. so keypos=0 is the leftmost one.
+ * keypos will be updated if keys are moved as a result of pack()
+ * This function will modify the btree bucket memory representation even
+ * though it is marked const.
*/
- bool _pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild);
- void pushBack(const DiskLoc& recordLoc, BSONObj& key, const Ordering &order, DiskLoc prevChild){
+ bool basicInsert(const DiskLoc thisLoc, int &keypos, const DiskLoc recordLoc, const BSONObj& key, const Ordering &order) const;
+
+ /** @return true if works, false if not enough space */
+ bool _pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild);
+ void pushBack(const DiskLoc recordLoc, const BSONObj& key, const Ordering &order, const DiskLoc prevChild) {
bool ok = _pushBack( recordLoc , key , order , prevChild );
assert(ok);
}
+
+ /**
+ * This is a special purpose function used by BtreeBuilder. The
+ * interface is quite dangerous if you're not careful. The bson key
+ * returned here points to bucket memory that has been invalidated but
+ * not yet reclaimed.
+ *
+ * TODO Maybe this could be replaced with two functions, one which
+ * returns the last key without deleting it and another which simply
+ * deletes the last key. Then the caller would have enough control to
+ * ensure proper memory integrity.
+ */
void popBack(DiskLoc& recLoc, BSONObj& key);
- void _delKeyAtPos(int keypos); // low level version that doesn't deal with child ptrs.
+
+ void _delKeyAtPos(int keypos, bool mayEmpty = false); // low level version that doesn't deal with child ptrs.
/* !Packed means there is deleted fragment space within the bucket.
We "repack" when we run out of space before considering the node
@@ -125,145 +191,257 @@ namespace mongo {
*/
enum Flags { Packed=1 };
- DiskLoc& childForPos(int p) {
- return p == n ? nextChild : k(p).prevChildBucket;
- }
+ const DiskLoc& childForPos(int p) const { return p == n ? nextChild : k(p).prevChildBucket; }
+ DiskLoc& childForPos(int p) { return p == n ? nextChild : k(p).prevChildBucket; }
int totalDataSize() const;
- void pack( const Ordering &order, int &refPos);
- void setNotPacked();
- void setPacked();
+ /** @return true if the key may be dropped by pack() */
+ bool mayDropKey( int index, int refPos ) const;
+
+ /**
+ * Pack the bucket to reclaim space from invalidated memory.
+ * @refPos is an index in the bucket which may be updated if we
+ * delete keys from the bucket
+ * This function may cast away const and perform a write.
+ */
+ void _pack(const DiskLoc thisLoc, const Ordering &order, int &refPos) const;
+ /** Pack when already writable */
+ void _packReadyForMod(const Ordering &order, int &refPos);
+
+ /**
+ * @return the size of non header data in this bucket if we were to
+ * call pack().
+ */
+ int packedDataSize( int refPos ) const;
+ void setNotPacked() { flags &= ~Packed; }
+ void setPacked() { flags |= Packed; }
int _alloc(int bytes);
void _unalloc(int bytes);
void truncateTo(int N, const Ordering &order, int &refPos);
+ /** drop specified number of keys from beginning of key array, and pack */
+ void dropFront(int nDrop, const Ordering &order, int &refPos);
void markUnused(int keypos);
- /* BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
- we use tempNext() when we do that to be less confusing. (one might have written a union in C)
- */
+ /**
+ * BtreeBuilder uses the parent var as a temp place to maintain a linked list chain.
+ * we use tempNext() when we do that to be less confusing. (one might have written a union in C)
+ */
+ const DiskLoc& tempNext() const { return parent; }
DiskLoc& tempNext() { return parent; }
- public:
- DiskLoc parent;
-
- string bucketSummary() const {
- stringstream ss;
- ss << " Bucket info:" << endl;
- ss << " n: " << n << endl;
- ss << " parent: " << parent.toString() << endl;
- ss << " nextChild: " << parent.toString() << endl;
- ss << " flags:" << flags << endl;
- ss << " emptySize: " << emptySize << " topSize: " << topSize << endl;
- return ss.str();
- }
-
- bool isUsed( int i ) const {
- return k(i).isUsed();
- }
+ void _shape(int level, stringstream&) const;
+ int Size() const;
+ const _KeyNode& k(int i) const { return ((const _KeyNode*)data)[i]; }
+ _KeyNode& k(int i) { return ((_KeyNode*)data)[i]; }
- protected:
- void _shape(int level, stringstream&);
- DiskLoc nextChild; // child bucket off and to the right of the highest key.
+ /** @return the key position where a split should occur on insert */
+ int splitPos( int keypos ) const;
- private:
- unsigned short _wasSize; // can be reused, value is 8192 in current pdfile version Apr2010
- unsigned short _reserved1; // zero
+ /**
+ * Adds new entries to beginning of key array, shifting existing
+ * entries to the right. After this is called, setKey() must be called
+ * on all the newly created entries in the key array.
+ */
+ void reserveKeysFront( int nAdd );
- protected:
- int Size() const;
- int flags;
- int emptySize; // size of the empty region
- int topSize; // size of the data at the top of the bucket (keys are at the beginning or 'bottom')
- int n; // # of keys so far.
- int reserved;
- const _KeyNode& k(int i) const {
- return ((_KeyNode*)data)[i];
- }
- _KeyNode& k(int i) {
- return ((_KeyNode*)data)[i];
- }
- char data[4];
+ /**
+ * Sets an existing key using the given parameters.
+ * @i index of key to set
+ */
+ void setKey( int i, const DiskLoc recordLoc, const BSONObj &key, const DiskLoc prevChildBucket );
};
-#pragma pack()
-#pragma pack(1)
+ /**
+ * This class adds functionality for manipulating buckets that are assembled
+ * in a tree. The requirements for const and non const functions and
+ * arguments are generally the same as in BtreeBucket. Because this class
+ * deals with tree structure, some functions that are marked const may
+ * trigger modification of another node in the btree or potentially of the
+ * current node. In such cases, the function's implementation explicitly
+ * casts away const when indicating an intent to write to the durability
+ * layer. The DiskLocs provided to such functions should be passed by
+ * value if they shadow pointers within the btree.
+ *
+ * To clarify enforcement of referential integrity in this implementation,
+ * we use the following pattern when deleting data we have a persistent
+ * pointer to. The pointer is cleared or removed explicitly, then the data
+ * it pointed to is cleaned up with a helper function.
+ *
+ * TODO It might make sense to put some of these functions in a class
+ * representing a full btree instead of a single btree bucket. That would
+ * allow us to use the const qualifier in a manner more consistent with
+ * standard usage. Right now the interface is for both a node and a tree,
+ * so assignment of const is sometimes nonideal.
+ *
+ * TODO There are several cases in which the this pointer is invalidated
+ * as a result of deallocation. A separate class representing a btree would
+ * alleviate some fragile cases where the implementation must currently
+ * behave correctly if the this pointer is suddenly invalidated by a
+ * callee.
+ */
class BtreeBucket : public BucketBasics {
friend class BtreeCursor;
public:
- void dump();
+ bool isHead() const { return parent.isNull(); }
+ void dumpTree(const DiskLoc &thisLoc, const BSONObj &order) const;
+ int fullValidate(const DiskLoc& thisLoc, const BSONObj &order, int *unusedCount = 0, bool strict = false) const; /* traverses everything */
- /* @return true if key exists in index
+ bool isUsed( int i ) const { return k(i).isUsed(); }
+ string bucketSummary() const;
+ void dump() const;
- order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
- BSONObj order = ((IndexDetails&)idx).keyPattern();
- likewise below in bt_insert() etc.
- */
- bool exists(const IndexDetails& idx, DiskLoc thisLoc, const BSONObj& key, const Ordering& order);
+ /**
+ * @return true if key exists in index
+ *
+ * @order - indicates order of keys in the index. this is basically the index's key pattern, e.g.:
+ * BSONObj order = ((IndexDetails&)idx).keyPattern();
+ * likewise below in bt_insert() etc.
+ */
+ bool exists(const IndexDetails& idx, const DiskLoc &thisLoc, const BSONObj& key, const Ordering& order) const;
bool wouldCreateDup(
- const IndexDetails& idx, DiskLoc thisLoc,
+ const IndexDetails& idx, const DiskLoc &thisLoc,
const BSONObj& key, const Ordering& order,
- DiskLoc self);
+ const DiskLoc &self) const;
+
+ static DiskLoc addBucket(const IndexDetails&); /* start a new index off, empty */
+ /** invalidates 'this' and thisLoc */
+ void deallocBucket(const DiskLoc thisLoc, const IndexDetails &id);
- static DiskLoc addBucket(IndexDetails&); /* start a new index off, empty */
- void deallocBucket(const DiskLoc &thisLoc, IndexDetails &id);
-
static void renameIndexNamespace(const char *oldNs, const char *newNs);
- int bt_insert(DiskLoc thisLoc, DiskLoc recordLoc,
- const BSONObj& key, const Ordering &order, bool dupsAllowed,
- IndexDetails& idx, bool toplevel = true);
+ /** This function may change the btree root */
+ int bt_insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
+ const BSONObj& key, const Ordering &order, bool dupsAllowed,
+ IndexDetails& idx, bool toplevel = true) const;
- bool unindex(const DiskLoc& thisLoc, IndexDetails& id, BSONObj& key, const DiskLoc& recordLoc);
+ /** This function may change the btree root */
+ bool unindex(const DiskLoc thisLoc, IndexDetails& id, const BSONObj& key, const DiskLoc recordLoc) const;
- /* locate may return an "unused" key that is just a marker. so be careful.
- looks for a key:recordloc pair.
+ /**
+ * locate may return an "unused" key that is just a marker. so be careful.
+ * looks for a key:recordloc pair.
+ *
+ * @found - returns true if exact match found. note you can get back a position
+ * result even if found is false.
+ */
+ DiskLoc locate(const IndexDetails &idx , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
+ int& pos, bool& found, const DiskLoc &recordLoc, int direction=1) const;
- found - returns true if exact match found. note you can get back a position
- result even if found is false.
- */
- DiskLoc locate(const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key, const Ordering &order,
- int& pos, bool& found, DiskLoc recordLoc, int direction=1);
-
/**
* find the first instance of the key
* does not handle dups
- * returned DiskLock isNull if can't find anything with that
+ * returned DiskLoc isNull if can't find anything with that
+ * @return the record location of the first match
*/
- DiskLoc findSingle( const IndexDetails& , const DiskLoc& thisLoc, const BSONObj& key );
+ DiskLoc findSingle( const IndexDetails &indexdetails , const DiskLoc& thisLoc, const BSONObj& key ) const;
+
+ /** advance one key position in the index: */
+ DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller) const;
- /* advance one key position in the index: */
- DiskLoc advance(const DiskLoc& thisLoc, int& keyOfs, int direction, const char *caller);
-
- void advanceTo(const IndexDetails &id, DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction );
-
- DiskLoc getHead(const DiskLoc& thisLoc);
+ void advanceTo(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction ) const;
+ void customLocate(DiskLoc &thisLoc, int &keyOfs, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, pair< DiskLoc, int > &bestParent ) const;
- /* get tree shape */
- void shape(stringstream&);
+ const DiskLoc getHead(const DiskLoc& thisLoc) const;
+
+ /** get tree shape */
+ void shape(stringstream&) const;
static void a_test(IndexDetails&);
- private:
- void fixParentPtrs(const DiskLoc& thisLoc);
- void delBucket(const DiskLoc& thisLoc, IndexDetails&);
- void delKeyAtPos(const DiskLoc& thisLoc, IndexDetails& id, int p);
- BSONObj keyAt(int keyOfs) {
+ static int getLowWaterMark();
+ static int getKeyMax();
+
+ protected:
+ /**
+ * Fix parent pointers for children
+ * @firstIndex first index to modify
+ * @lastIndex last index to modify (-1 means last index is n)
+ */
+ void fixParentPtrs(const DiskLoc thisLoc, int firstIndex = 0, int lastIndex = -1) const;
+
+ /** invalidates this and thisLoc */
+ void delBucket(const DiskLoc thisLoc, const IndexDetails&);
+ /** may invalidate this and thisLoc */
+ void delKeyAtPos(const DiskLoc thisLoc, IndexDetails& id, int p, const Ordering &order);
+
+ /**
+ * May balance utilization of this bucket with a neighbor, either by
+ * merging the buckets or shifting nodes.
+ * @return true iff balancing was performed.
+ * NOTE This function may invalidate thisLoc.
+ */
+ bool mayBalanceWithNeighbors(const DiskLoc thisLoc, IndexDetails &id, const Ordering &order) const;
+
+ /** @return true if balance succeeded */
+ bool tryBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order ) const;
+ void doBalanceChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order );
+ void doBalanceLeftToRight( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+ void doBalanceRightToLeft( const DiskLoc thisLoc, int leftIndex, int split,
+ BtreeBucket *l, const DiskLoc lchild,
+ BtreeBucket *r, const DiskLoc rchild,
+ IndexDetails &id, const Ordering &order );
+
+ /** may invalidate this and thisLoc */
+ void doMergeChildren( const DiskLoc thisLoc, int leftIndex, IndexDetails &id, const Ordering &order);
+
+ /** will invalidate this and thisLoc */
+ void replaceWithNextChild( const DiskLoc thisLoc, IndexDetails &id );
+
+ /** @return true iff left and right child can be merged into one node */
+ bool canMergeChildren( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ /**
+ * @return index of the rebalanced separator; the index value is
+ * determined as if we had an array
+ * <left bucket keys array>.push( <old separator> ).concat( <right bucket keys array> )
+ * This is only expected to be called if the left and right child
+ * cannot be merged.
+ * This function is expected to be called on packed buckets, see also
+ * comments for splitPos().
+ */
+ int rebalancedSeparatorPos( const DiskLoc &thisLoc, int leftIndex ) const;
+
+ int indexInParent( const DiskLoc &thisLoc ) const;
+ BSONObj keyAt(int keyOfs) const {
return keyOfs >= n ? BSONObj() : keyNode(keyOfs).key;
}
static BtreeBucket* allocTemp(); /* caller must release with free() */
- void insertHere(DiskLoc thisLoc, int keypos,
- DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
- DiskLoc lchild, DiskLoc rchild, IndexDetails&);
- int _insert(DiskLoc thisLoc, DiskLoc recordLoc,
+
+ /** split bucket */
+ void split(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj& key,
+ const Ordering& order, const DiskLoc lchild, const DiskLoc rchild, IndexDetails& idx);
+
+ void insertHere(const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj& key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx) const;
+
+ int _insert(const DiskLoc thisLoc, const DiskLoc recordLoc,
const BSONObj& key, const Ordering &order, bool dupsAllowed,
- DiskLoc lChild, DiskLoc rChild, IndexDetails&);
- bool find(const IndexDetails& idx, const BSONObj& key, DiskLoc recordLoc, const Ordering &order, int& pos, bool assertIfDup);
- bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent );
+ const DiskLoc lChild, const DiskLoc rChild, IndexDetails &idx) const;
+ bool find(const IndexDetails& idx, const BSONObj& key, const DiskLoc &recordLoc, const Ordering &order, int& pos, bool assertIfDup) const;
+ bool customFind( int l, int h, const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive, const Ordering &order, int direction, DiskLoc &thisLoc, int &keyOfs, pair< DiskLoc, int > &bestParent ) const;
static void findLargestKey(const DiskLoc& thisLoc, DiskLoc& largestLoc, int& largestKey);
- static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, const vector< const BSONElement * > &rEnd, const Ordering &o );
+ static int customBSONCmp( const BSONObj &l, const BSONObj &rBegin, int rBeginLen, bool rSup, const vector< const BSONElement * > &rEnd, const vector< bool > &rEndInclusive, const Ordering &o, int direction );
+ static void fix(const DiskLoc thisLoc, const DiskLoc child);
+
+ /** Replaces an existing key with the new specified key, splitting if necessary */
+ void setInternalKey( const DiskLoc thisLoc, int keypos,
+ const DiskLoc recordLoc, const BSONObj &key, const Ordering &order,
+ const DiskLoc lchild, const DiskLoc rchild, IndexDetails &idx);
+
+ /**
+ * Deletes the specified key, replacing it with the key immediately
+ * preceding or succeeding it in the btree. Either the left or right
+ * child of the specified key must be non null.
+ */
+ void deleteInternalKey( const DiskLoc thisLoc, int keypos, IndexDetails &id, const Ordering &order );
public:
- // simply builds and returns a dup key error message string
+ /** simply builds and returns a dup key error message string */
static string dupKeyError( const IndexDetails& idx , const BSONObj& key );
};
#pragma pack()
@@ -271,76 +449,59 @@ namespace mongo {
class BtreeCursor : public Cursor {
public:
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails&, const BSONObj &startKey, const BSONObj &endKey, bool endKeyInclusive, int direction );
-
BtreeCursor( NamespaceDetails *_d, int _idxNo, const IndexDetails& _id, const shared_ptr< FieldRangeVector > &_bounds, int _direction );
- ~BtreeCursor(){
- }
- virtual bool ok() {
- return !bucket.isNull();
- }
- bool eof() {
- return !ok();
- }
+ virtual bool ok() { return !bucket.isNull(); }
virtual bool advance();
-
virtual void noteLocation(); // updates keyAtKeyOfs...
virtual void checkLocation();
virtual bool supportGetMore() { return true; }
virtual bool supportYields() { return true; }
- /* used for multikey index traversal to avoid sending back dups. see Matcher::matches().
- if a multikey index traversal:
- if loc has already been sent, returns true.
- otherwise, marks loc as sent.
- @return true if the loc has not been seen
- */
+ /**
+ * used for multikey index traversal to avoid sending back dups. see Matcher::matches().
+ * if a multikey index traversal:
+ * if loc has already been sent, returns true.
+ * otherwise, marks loc as sent.
+ * @return true if the loc has not been seen
+ */
virtual bool getsetdup(DiskLoc loc) {
- if( multikey ) {
- pair<set<DiskLoc>::iterator, bool> p = dups.insert(loc);
+ if( _multikey ) {
+ pair<set<DiskLoc>::iterator, bool> p = _dups.insert(loc);
return !p.second;
}
return false;
}
- _KeyNode& _currKeyNode() {
+ virtual bool modifiedKeys() const { return _multikey; }
+ virtual bool isMultiKey() const { return _multikey; }
+
+ const _KeyNode& _currKeyNode() const {
assert( !bucket.isNull() );
- _KeyNode& kn = bucket.btree()->k(keyOfs);
+ const _KeyNode& kn = bucket.btree()->k(keyOfs);
assert( kn.isUsed() );
return kn;
}
- KeyNode currKeyNode() const {
+ const KeyNode currKeyNode() const {
assert( !bucket.isNull() );
return bucket.btree()->keyNode(keyOfs);
}
- virtual BSONObj currKey() const {
- return currKeyNode().key;
- }
- virtual BSONObj indexKeyPattern() {
- return indexDetails.keyPattern();
- }
+ virtual BSONObj currKey() const { return currKeyNode().key; }
+ virtual BSONObj indexKeyPattern() { return indexDetails.keyPattern(); }
virtual void aboutToDeleteBucket(const DiskLoc& b) {
if ( bucket == b )
keyOfs = -1;
}
- virtual DiskLoc currLoc() {
- return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc();
- }
- virtual DiskLoc refLoc() {
- return currLoc();
- }
- virtual Record* _current() {
- return currLoc().rec();
- }
- virtual BSONObj current() {
- return BSONObj(_current());
- }
+ virtual DiskLoc currLoc() { return !bucket.isNull() ? _currKeyNode().recordLoc : DiskLoc(); }
+ virtual DiskLoc refLoc() { return currLoc(); }
+ virtual Record* _current() { return currLoc().rec(); }
+ virtual BSONObj current() { return BSONObj(_current()); }
virtual string toString() {
string s = string("BtreeCursor ") + indexDetails.indexName();
- if ( direction < 0 ) s += " reverse";
- if ( bounds_.get() && bounds_->size() > 1 ) s += " multi";
+ if ( _direction < 0 ) s += " reverse";
+ if ( _bounds.get() && _bounds->size() > 1 ) s += " multi";
return s;
}
@@ -351,77 +512,81 @@ namespace mongo {
virtual BSONObj prettyIndexBounds() const {
if ( !_independentFieldRanges ) {
return BSON( "start" << prettyKey( startKey ) << "end" << prettyKey( endKey ) );
- } else {
- return bounds_->obj();
+ }
+ else {
+ return _bounds->obj();
}
}
-
+
void forgetEndKey() { endKey = BSONObj(); }
virtual CoveredIndexMatcher *matcher() const { return _matcher.get(); }
-
- virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) {
- _matcher = matcher;
- }
- // for debugging only
- DiskLoc getBucket() const { return bucket; }
-
+ virtual void setMatcher( shared_ptr< CoveredIndexMatcher > matcher ) { _matcher = matcher; }
+
+ virtual long long nscanned() { return _nscanned; }
+
+ /** for debugging only */
+ const DiskLoc getBucket() const { return bucket; }
+
private:
- /* Our btrees may (rarely) have "unused" keys when items are deleted.
- Skip past them.
- */
+ /**
+ * Our btrees may (rarely) have "unused" keys when items are deleted.
+ * Skip past them.
+ */
bool skipUnusedKeys( bool mayJump );
bool skipOutOfRangeKeysAndCheckEnd();
void skipAndCheck();
void checkEnd();
- // selective audits on construction
+ /** selective audits on construction */
void audit();
- // set initial bucket
+ /** set initial bucket */
void init();
- void advanceTo( const BSONObj &keyBegin, int keyBeginLen, const vector< const BSONElement * > &keyEnd);
-
+ /** if afterKey is true, we want the first key with values of the keyBegin fields greater than keyBegin */
+ void advanceTo( const BSONObj &keyBegin, int keyBeginLen, bool afterKey, const vector< const BSONElement * > &keyEnd, const vector< bool > &keyEndInclusive );
+
friend class BtreeBucket;
- set<DiskLoc> dups;
- NamespaceDetails *d;
- int idxNo;
-
+
+ set<DiskLoc> _dups;
+ NamespaceDetails * const d;
+ const int idxNo;
BSONObj startKey;
BSONObj endKey;
- bool endKeyInclusive_;
-
- bool multikey; // note this must be updated every getmore batch in case someone added a multikey...
-
+ bool _endKeyInclusive;
+ bool _multikey; // this must be updated every getmore batch in case someone added a multikey
const IndexDetails& indexDetails;
- BSONObj order;
- Ordering _ordering;
+ const BSONObj _order;
+ const Ordering _ordering;
DiskLoc bucket;
int keyOfs;
- int direction; // 1=fwd,-1=reverse
+ const int _direction; // 1=fwd,-1=reverse
BSONObj keyAtKeyOfs; // so we can tell if things moved around on us between the query and the getMore call
DiskLoc locAtKeyOfs;
- shared_ptr< FieldRangeVector > bounds_;
+ const shared_ptr< FieldRangeVector > _bounds;
auto_ptr< FieldRangeVector::Iterator > _boundsIterator;
const IndexSpec& _spec;
shared_ptr< CoveredIndexMatcher > _matcher;
bool _independentFieldRanges;
+ long long _nscanned;
};
- inline bool IndexDetails::hasKey(const BSONObj& key) {
+ inline bool IndexDetails::hasKey(const BSONObj& key) {
return head.btree()->exists(*this, head, key, Ordering::make(keyPattern()));
}
- inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
+ inline bool IndexDetails::wouldCreateDup(const BSONObj& key, DiskLoc self) {
return head.btree()->wouldCreateDup(*this, head, key, Ordering::make(keyPattern()), self);
}
- /* build btree from the bottom up */
- /* _ TODO dropDups */
+ /**
+ * build btree from the bottom up
+ * _ TODO dropDups
+ */
class BtreeBuilder {
- bool dupsAllowed;
+ bool dupsAllowed;
IndexDetails& idx;
unsigned long long n;
BSONObj keyLast;
@@ -434,18 +599,20 @@ namespace mongo {
void newBucket();
void buildNextLevel(DiskLoc);
+ void mayCommitProgressDurably();
public:
~BtreeBuilder();
BtreeBuilder(bool _dupsAllowed, IndexDetails& _idx);
- /* keys must be added in order */
+ /** keys must be added in order */
void addKey(BSONObj& key, DiskLoc loc);
- /* commit work. if not called, destructor will clean up partially completed work
- (in case exception has happened).
- */
+ /**
+ * commit work. if not called, destructor will clean up partially completed work
+ * (in case exception has happened).
+ */
void commit();
unsigned long long getn() { return n; }